diff options
Diffstat (limited to 'ml/dlib/dlib/optimization')
38 files changed, 14987 insertions, 0 deletions
diff --git a/ml/dlib/dlib/optimization/elastic_net.h b/ml/dlib/dlib/optimization/elastic_net.h new file mode 100644 index 000000000..6c4b6d0b4 --- /dev/null +++ b/ml/dlib/dlib/optimization/elastic_net.h @@ -0,0 +1,389 @@ +// Copyright (C) 2016 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_ElASTIC_NET_Hh_ +#define DLIB_ElASTIC_NET_Hh_ + +#include "../matrix.h" +#include "elastic_net_abstract.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class elastic_net + { + public: + + template <typename EXP> + explicit elastic_net( + const matrix_exp<EXP>& XX + ) : eps(1e-5), max_iterations(50000), verbose(false) + { + // make sure requires clause is not broken + DLIB_ASSERT(XX.size() > 0 && + XX.nr() == XX.nc(), + "\t elastic_net::elastic_net(XX)" + << " \n\t XX must be a non-empty square matrix." + << " \n\t XX.nr(): " << XX.nr() + << " \n\t XX.nc(): " << XX.nc() + << " \n\t this: " << this + ); + + + // If the number of columns in X is big and in particular bigger than the number of + // rows then we can get rid of them by doing some SVD magic. Doing this doesn't + // make the final results of anything change but makes all the matrices have + // dimensions that are X.nr() in size, which can be much smaller. + matrix<double,0,1> s; + svd3(XX,u,eig_vals,eig_vects); + s = sqrt(eig_vals); + X = eig_vects*diagm(s); + u = eig_vects*inv(diagm(s)); + + + + samples.resize(X.nr()*2); + + for (size_t i = 0; i < samples.size(); ++i) + index.push_back(i); + active_size = index.size(); + + + // setup the training samples used in the SVM optimizer below + for (size_t i = 0; i < samples.size(); ++i) + { + auto& x = samples[i]; + const long idx = i/2; + if (i%2 == 0) + x.label = +1; + else + x.label = -1; + + x.r = idx%X.nr(); + } + } + + template <typename EXP1, typename EXP2> + elastic_net( + const matrix_exp<EXP1>& XX, + const matrix_exp<EXP2>& XY + ) : elastic_net(XX) + { + // make sure requires clause is not broken + DLIB_ASSERT(XX.size() > 0 && + XX.nr() == XX.nc() && + is_col_vector(XY) && + XX.nc() == XY.size() , + "\t elastic_net::elastic_net(XX,XY)" + << " \n\t Invalid inputs were given to this function." + << " \n\t XX.size(): " << XX.size() + << " \n\t is_col_vector(XY): " << is_col_vector(XY) + << " \n\t XX.nr(): " << XX.nr() + << " \n\t XX.nc(): " << XX.nc() + << " \n\t XY.size(): " << XY.size() + << " \n\t this: " << this + ); + + set_xy(XY); + } + + long size ( + ) const { return u.nr(); } + + template <typename EXP> + void set_xy( + const matrix_exp<EXP>& XY + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(is_col_vector(XY) && + XY.size() == size(), + "\t void elastic_net::set_y(Y)" + << " \n\t Invalid inputs were given to this function." + << " \n\t is_col_vector(XY): " << is_col_vector(XY) + << " \n\t size(): " << size() + << " \n\t XY.size(): " << XY.size() + << " \n\t this: " << this + ); + + Y = trans(u)*XY; + // We can use the ynorm after it has been projected because the only place Y + // appears in the algorithm is in terms of dot products with w and x vectors. + // But those vectors are always in the span of X and therefore we only see the + // part of the norm of Y that is in the span of X (and hence u since u and X + // have the same span by construction) + ynorm = length_squared(Y); + xdoty = X*Y; + eig_vects_xdoty = trans(eig_vects)*xdoty; + + w.set_size(Y.size()); + // zero out any memory of previous solutions + alpha.assign(X.nr()*2, 0); + } + + bool have_target_values ( + ) const { return Y.size() != 0; } + + void set_epsilon( + double eps_ + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(eps_ > 0, + "\t void elastic_net::set_epsilon()" + << " \n\t eps_ must be greater than 0" + << " \n\t eps_: " << eps_ + << " \n\t this: " << this + ); + + eps = eps_; + } + + unsigned long get_max_iterations ( + ) const { return max_iterations; } + + void set_max_iterations ( + unsigned long max_iter + ) + { + max_iterations = max_iter; + } + + void be_verbose ( + ) + { + verbose = true; + } + + void be_quiet ( + ) + { + verbose = false; + } + + double get_epsilon ( + ) const { return eps; } + + matrix<double,0,1> operator() ( + double ridge_lambda, + double lasso_budget = std::numeric_limits<double>::infinity() + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(have_target_values() && + ridge_lambda > 0 && + lasso_budget > 0 , + "\t matrix<double,0,1> elastic_net::operator()()" + << " \n\t Invalid inputs were given to this function." + << " \n\t have_target_values(): " << have_target_values() + << " \n\t ridge_lambda: " << ridge_lambda + << " \n\t lasso_budget: " << lasso_budget + << " \n\t this: " << this + ); + + + // First check if lasso_budget is so big that it isn't even active. We do this + // by doing just ridge regression and checking the result. + matrix<double,0,1> betas = eig_vects*tmp(inv(diagm(eig_vals + ridge_lambda))*eig_vects_xdoty); + if (sum(abs(betas)) <= lasso_budget) + return betas; + + + // Set w back to 0. We will compute the w corresponding to what is currently + // in alpha layer on. This way w and alpha are always in sync. + w = 0; + wy_mult = 0; + wdoty = 0; + + + // return dot(w,x) + auto dot = [&](const matrix<double,0,1>& w, const en_sample2& x) + { + const double xmul = -x.label*(1/lasso_budget); + // Do the base dot product but don't forget to add in the -(1/t)*y part from the svm reduction paper + double val = rowm(X,x.r)*w + xmul*wdoty + wy_mult*xdoty(x.r) + xmul*wy_mult*ynorm; + + return val; + }; + + + // perform w += scale*x; + auto add_to = [&](matrix<double,0,1>& w, double scale, const en_sample2& x) + { + const double xmul = -x.label*(1/lasso_budget); + wy_mult += scale*xmul; + wdoty += scale*xdoty(x.r); + w += scale*trans(rowm(X,x.r)); + + }; + + const double Dii = ridge_lambda; + + // setup the training samples used in the SVM optimizer below + for (size_t i = 0; i < samples.size(); ++i) + { + auto& x = samples[i]; + + const double xmul = -x.label*(1/lasso_budget); + x.xdotx = xmul*xmul*ynorm; + for (long c = 0; c < X.nc(); ++c) + x.xdotx += std::pow(X(x.r,c)+xmul*Y(c), 2.0) - std::pow(xmul*Y(c),2.0); + + // compute the correct w given whatever might be in alpha. + if (alpha[i] != 0) + add_to(w, x.label*alpha[i], samples[i]); + } + + + // Now run the optimizer + double PG_max_prev = std::numeric_limits<double>::infinity(); + double PG_min_prev = -std::numeric_limits<double>::infinity(); + + + unsigned int iter; + for (iter = 0; iter < max_iterations; ++iter) + { + // randomly shuffle the indices + for (unsigned long i = 0; i < active_size; ++i) + { + // pick a random index >= i + const long j = i + rnd.get_random_32bit_number()%(active_size-i); + std::swap(index[i], index[j]); + } + + double PG_max = -std::numeric_limits<double>::infinity(); + double PG_min = std::numeric_limits<double>::infinity(); + for (size_t ii = 0; ii < active_size; ++ii) + { + const auto i = index[ii]; + const auto& x = samples[i]; + double G = x.label*dot(w, x) - 1 + Dii*alpha[i]; + + double PG = 0; + if (alpha[i] == 0) + { + if (G > PG_max_prev) + { + // shrink the active set of training examples + --active_size; + std::swap(index[ii], index[active_size]); + --ii; + continue; + } + + if (G < 0) + PG = G; + } + else + { + PG = G; + } + + if (PG > PG_max) + PG_max = PG; + if (PG < PG_min) + PG_min = PG; + + // if PG != 0 + if (std::abs(PG) > 1e-12) + { + const double alpha_old = alpha[i]; + alpha[i] = std::max(alpha[i] - G/(x.xdotx+Dii), (double)0.0); + const double delta = (alpha[i]-alpha_old)*x.label; + add_to(w, delta, x); + } + } + + if (verbose) + { + using namespace std; + cout << "gap: " << PG_max - PG_min << endl; + cout << "active_size: " << active_size << endl; + cout << "iter: " << iter << endl; + cout << endl; + } + + if (PG_max - PG_min <= eps) + { + // stop if we are within eps tolerance and the last iteration + // was over all the samples + if (active_size == index.size()) + break; + + // Turn off shrinking on the next iteration. We will stop if the + // tolerance is still <= eps when shrinking is off. + active_size = index.size(); + PG_max_prev = std::numeric_limits<double>::infinity(); + PG_min_prev = -std::numeric_limits<double>::infinity(); + } + else + { + PG_max_prev = PG_max; + PG_min_prev = PG_min; + if (PG_max_prev <= 0) + PG_max_prev = std::numeric_limits<double>::infinity(); + if (PG_min_prev >= 0) + PG_min_prev = -std::numeric_limits<double>::infinity(); + } + + + // recalculate wdoty every so often to avoid drift. + if (iter%100 == 0) + wdoty = dlib::dot(Y, w); + } + + + betas.set_size(alpha.size()/2); + for (long i = 0; i < betas.size(); ++i) + betas(i) = lasso_budget*(alpha[2*i] - alpha[2*i+1]); + betas /= sum(mat(alpha)); + return betas; + } + + + private: + + struct en_sample2 + { + // X location + long r; + + + double label; + + double xdotx; + }; + + std::vector<en_sample2> samples; + std::vector<double> alpha; + double ynorm; + matrix<double> X; + matrix<double,0,1> Y; + matrix<double,0,1> xdoty; + double wdoty; + double wy_mult; // logically, the real w is what is in the w vector + wy_mult*Y + matrix<double,0,1> w; + std::vector<long> index; + unsigned long active_size; + + matrix<double,0,1> eig_vects_xdoty; + matrix<double,0,1> eig_vals; + matrix<double> eig_vects; + matrix<double> u; + + dlib::rand rnd; + + + double eps; + unsigned long max_iterations; + bool verbose; + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_ElASTIC_NET_Hh_ + + diff --git a/ml/dlib/dlib/optimization/elastic_net_abstract.h b/ml/dlib/dlib/optimization/elastic_net_abstract.h new file mode 100644 index 000000000..8ae69e37e --- /dev/null +++ b/ml/dlib/dlib/optimization/elastic_net_abstract.h @@ -0,0 +1,190 @@ +// Copyright (C) 2016 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_ElASTIC_NET_ABSTRACT_Hh_ +#ifdef DLIB_ElASTIC_NET_ABSTRACT_Hh_ + +#include "../matrix.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class elastic_net + { + /*! + WHAT THIS OBJECT REPRESENTS + This object is a tool for solving the following optimization problem: + + min_w: length_squared(X*w - Y) + ridge_lambda*length_squared(w) + such that: sum(abs(w)) <= lasso_budget + + That is, it solves the elastic net optimization problem. This object also + has the special property that you can quickly obtain different solutions + for different settings of ridge_lambda, lasso_budget, and target Y values. + + This is because a large amount of work is precomputed in the constructor. + The solver will also remember the previous solution and will use that to + warm start subsequent invocations. Therefore, you can efficiently get + solutions for a wide range of regularization parameters. + + + The particular algorithm used to solve it is described in the paper: + Zhou, Quan, et al. "A reduction of the elastic net to support vector + machines with an application to gpu computing." arXiv preprint + arXiv:1409.1976 (2014). APA + + And for the SVM solver sub-component we use the algorithm from: + Hsieh, Cho-Jui, et al. "A dual coordinate descent method for large-scale + linear SVM." Proceedings of the 25th international conference on Machine + learning. ACM, 2008. + !*/ + + public: + + template <typename EXP> + explicit elastic_net( + const matrix_exp<EXP>& XX + ); + /*! + requires + - XX.size() != 0 + - XX.nr() == XX.nc() + ensures + - #get_epsilon() == 1e-5 + - #get_max_iterations() == 50000 + - This object will not be verbose unless be_verbose() is called. + - #size() == XX.nc() + - #have_target_values() == false + - We interpret XX as trans(X)*X where X is as defined in the objective + function discussed above in WHAT THIS OBJECT REPRESENTS. + !*/ + + template <typename EXP1, typename EXP2> + elastic_net( + const matrix_exp<EXP1>& XX, + const matrix_exp<EXP2>& XY + ); + /*! + requires + - XX.size() != 0 + - XX.nr() == XX.nc() + - is_col_vector(XY) + - XX.nc() == Y.size() + ensures + - constructs this object by calling the elastic_net(XX) constructor and + then calling this->set_xy(XY). + - #have_target_values() == true + - We interpret XX as trans(X)*X where X is as defined in the objective + function discussed above in WHAT THIS OBJECT REPRESENTS. Similarly, XY + should be trans(X)*Y. + !*/ + + long size ( + ) const; + /*! + ensures + - returns the dimensionality of the data loaded into this object. That is, + how many elements are in the optimal w vector? This function returns + that number. + !*/ + + bool have_target_values ( + ) const; + /*! + ensures + - returns true if set_xy() has been called and false otherwise. + !*/ + + template <typename EXP> + void set_xy( + const matrix_exp<EXP>& XY + ); + /*! + requires + - is_col_vector(Y) + - Y.size() == size() + ensures + - #have_target_values() == true + - Sets the target values of the regression. Note that we expect the given + matrix, XY, to be equal to trans(X)*Y, where X and Y have the definitions + discussed above in WHAT THIS OBJECT REPRESENTS. + !*/ + + void set_epsilon( + double eps + ); + /*! + requires + - eps > 0 + ensures + - #get_epsilon() == eps + !*/ + + double get_epsilon ( + ) const; + /*! + ensures + - returns the error epsilon that determines when the solver should stop. + Smaller values may result in a more accurate solution but take longer to + execute. + !*/ + + unsigned long get_max_iterations ( + ) const; + /*! + ensures + - returns the maximum number of iterations the optimizer is allowed to run + before it is required to stop and return a result. + !*/ + + void set_max_iterations ( + unsigned long max_iter + ); + /*! + ensures + - #get_max_iterations() == max_iter + !*/ + + void be_verbose ( + ); + /*! + ensures + - This object will print status messages to standard out so that a + user can observe the progress of the algorithm. + !*/ + + void be_quiet ( + ); + /*! + ensures + - this object will not print anything to standard out. + !*/ + + + matrix<double,0,1> operator() ( + double ridge_lambda, + double lasso_budget = std::numeric_limits<double>::infinity() + ); + /*! + requires + - have_target_values() == true + - ridge_lambda > 0 + - lasso_budget > 0 + ensures + - Solves the optimization problem described in the WHAT THIS OBJECT + REPRESENTS section above and returns the optimal w. + - The returned vector has size() elements. + - if (lasso_budget == infinity) then + - The lasso constraint is ignored + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_ElASTIC_NET_ABSTRACT_Hh_ + + diff --git a/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp.h b/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp.h new file mode 100644 index 000000000..3dd7fd56f --- /dev/null +++ b/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp.h @@ -0,0 +1,337 @@ +// Copyright (C) 2011 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_Hh_ +#define DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_Hh_ + +#include "find_max_factor_graph_nmplp_abstract.h" +#include <vector> +#include <map> +#include "../matrix.h" +#include "../hash.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + class simple_hash_map + { + public: + + simple_hash_map( + ) : + scan_dist(6) + { + data.resize(5000); + } + + void insert ( + const unsigned long a, + const unsigned long b, + const unsigned long value + ) + /*! + requires + - a != std::numeric_limits<unsigned long>::max() + ensures + - #(*this)(a,b) == value + !*/ + { + const uint32 h = murmur_hash3_2(a,b)%(data.size()-scan_dist); + + const unsigned long empty_bucket = std::numeric_limits<unsigned long>::max(); + + for (uint32 i = 0; i < scan_dist; ++i) + { + if (data[i+h].key1 == empty_bucket) + { + data[i+h].key1 = a; + data[i+h].key2 = b; + data[i+h].value = value; + return; + } + } + + // if we get this far it means the hash table is filling up. So double its size. + std::vector<bucket> new_data; + new_data.resize(data.size()*2); + new_data.swap(data); + for (uint32 i = 0; i < new_data.size(); ++i) + { + if (new_data[i].key1 != empty_bucket) + { + insert(new_data[i].key1, new_data[i].key2, new_data[i].value); + } + } + + insert(a,b,value); + } + + unsigned long operator() ( + const unsigned long a, + const unsigned long b + ) const + /*! + requires + - this->insert(a,b,some_value) has been called + ensures + - returns the value stored at key (a,b) + !*/ + { + DLIB_ASSERT(a != b, "An invalid map_problem was given to find_max_factor_graph_nmplp()." + << "\nNode " << a << " is listed as being a neighbor with itself, which is illegal."); + + uint32 h = murmur_hash3_2(a,b)%(data.size()-scan_dist); + + + for (unsigned long i = 0; i < scan_dist; ++i) + { + if (data[h].key1 == a && data[h].key2 == b) + { + return data[h].value; + } + ++h; + } + + + // this should never happen (since this function requires (a,b) to be in the hash table + DLIB_ASSERT(false, "An invalid map_problem was given to find_max_factor_graph_nmplp()." + << "\nThe nodes in the map_problem are inconsistent because node "<<a<<" is in the neighbor list" + << "\nof node "<<b<< " but node "<<b<<" isn't in the neighbor list of node "<<a<<". The neighbor relationship" + << "\nis supposed to be symmetric." + ); + return 0; + } + + private: + + struct bucket + { + // having max() in key1 indicates that the bucket isn't used. + bucket() : key1(std::numeric_limits<unsigned long>::max()) {} + unsigned long key1; + unsigned long key2; + unsigned long value; + }; + + std::vector<bucket> data; + const unsigned int scan_dist; + }; + } + +// ---------------------------------------------------------------------------------------- + + template < + typename map_problem + > + void find_max_factor_graph_nmplp ( + const map_problem& prob, + std::vector<unsigned long>& map_assignment, + unsigned long max_iter, + double eps + ) + { + // make sure requires clause is not broken + DLIB_ASSERT( eps > 0, + "\t void find_max_factor_graph_nmplp()" + << "\n\t eps must be greater than zero" + << "\n\t eps: " << eps + ); + + /* + This function is an implementation of the NMPLP algorithm introduced in the + following papers: + Fixing Max-Product: Convergent Message Passing Algorithms for MAP LP-Relaxations (2008) + by Amir Globerson and Tommi Jaakkola + + Introduction to dual decomposition for inference (2011) + by David Sontag, Amir Globerson, and Tommi Jaakkola + + In particular, this function implements the star MPLP update equations shown as + equation 1.20 from the paper Introduction to dual decomposition for inference + (the method was called NMPLP in the first paper). It should also be noted that + the original description of the NMPLP in the first paper had an error in the + equations and the second paper contains corrected equations, which is what this + function uses. + */ + + typedef typename map_problem::node_iterator node_iterator; + typedef typename map_problem::neighbor_iterator neighbor_iterator; + + map_assignment.resize(prob.number_of_nodes()); + + + if (prob.number_of_nodes() == 0) + return; + + + std::vector<double> delta_elements; + delta_elements.reserve(prob.number_of_nodes()*prob.num_states(prob.begin())*3); + + impl::simple_hash_map delta_idx; + + + + // Initialize delta to zero and fill up the hash table with the appropriate values + // so we can index into delta later on. + for (node_iterator i = prob.begin(); i != prob.end(); ++i) + { + const unsigned long id_i = prob.node_id(i); + + for (neighbor_iterator j = prob.begin(i); j != prob.end(i); ++j) + { + const unsigned long id_j = prob.node_id(j); + delta_idx.insert(id_i, id_j, delta_elements.size()); + + const unsigned long num_states_xj = prob.num_states(j); + for (unsigned long xj = 0; xj < num_states_xj; ++xj) + delta_elements.push_back(0); + } + } + + + std::vector<double> gamma_i; + std::vector<std::vector<double> > gamma_ji; + std::vector<std::vector<double> > delta_to_j_no_i; + // These arrays will end up with a length equal to the maximum number of neighbors + // of any node in the graph. So reserve a bigish number of slots so that we are + // very unlikely to need to preform an expensive reallocation during the + // optimization. + gamma_ji.reserve(10000); + delta_to_j_no_i.reserve(10000); + + + double max_change = eps + 1; + // Now do the main body of the optimization. + unsigned long iter; + for (iter = 0; iter < max_iter && max_change > eps; ++iter) + { + max_change = -std::numeric_limits<double>::infinity(); + + for (node_iterator i = prob.begin(); i != prob.end(); ++i) + { + const unsigned long id_i = prob.node_id(i); + const unsigned long num_states_xi = prob.num_states(i); + gamma_i.assign(num_states_xi, 0); + + double num_neighbors = 0; + + unsigned int jcnt = 0; + // first we fill in the gamma vectors + for (neighbor_iterator j = prob.begin(i); j != prob.end(i); ++j) + { + // Make sure these arrays are big enough to hold all the neighbor + // information. + if (jcnt >= gamma_ji.size()) + { + gamma_ji.resize(gamma_ji.size()+1); + delta_to_j_no_i.resize(delta_to_j_no_i.size()+1); + } + + ++num_neighbors; + const unsigned long id_j = prob.node_id(j); + const unsigned long num_states_xj = prob.num_states(j); + + gamma_ji[jcnt].assign(num_states_xi, -std::numeric_limits<double>::infinity()); + delta_to_j_no_i[jcnt].assign(num_states_xj, 0); + + // compute delta_j^{-i} and store it in delta_to_j_no_i[jcnt] + for (neighbor_iterator k = prob.begin(j); k != prob.end(j); ++k) + { + const unsigned long id_k = prob.node_id(k); + if (id_k==id_i) + continue; + const double* const delta_kj = &delta_elements[delta_idx(id_k,id_j)]; + for (unsigned long xj = 0; xj < num_states_xj; ++xj) + { + delta_to_j_no_i[jcnt][xj] += delta_kj[xj]; + } + } + + // now compute gamma values + for (unsigned long xi = 0; xi < num_states_xi; ++xi) + { + for (unsigned long xj = 0; xj < num_states_xj; ++xj) + { + gamma_ji[jcnt][xi] = std::max(gamma_ji[jcnt][xi], prob.factor_value(i,j,xi,xj) + delta_to_j_no_i[jcnt][xj]); + } + gamma_i[xi] += gamma_ji[jcnt][xi]; + } + ++jcnt; + } + + // now update the delta values + jcnt = 0; + for (neighbor_iterator j = prob.begin(i); j != prob.end(i); ++j) + { + const unsigned long id_j = prob.node_id(j); + const unsigned long num_states_xj = prob.num_states(j); + + // messages from j to i + double* const delta_ji = &delta_elements[delta_idx(id_j,id_i)]; + + // messages from i to j + double* const delta_ij = &delta_elements[delta_idx(id_i,id_j)]; + + for (unsigned long xj = 0; xj < num_states_xj; ++xj) + { + double best_val = -std::numeric_limits<double>::infinity(); + + for (unsigned long xi = 0; xi < num_states_xi; ++xi) + { + double val = prob.factor_value(i,j,xi,xj) + 2/(num_neighbors+1)*gamma_i[xi] -gamma_ji[jcnt][xi]; + if (val > best_val) + best_val = val; + } + best_val = -0.5*delta_to_j_no_i[jcnt][xj] + 0.5*best_val; + + if (std::abs(delta_ij[xj] - best_val) > max_change) + max_change = std::abs(delta_ij[xj] - best_val); + + delta_ij[xj] = best_val; + } + + for (unsigned long xi = 0; xi < num_states_xi; ++xi) + { + double new_val = -1/(num_neighbors+1)*gamma_i[xi] + gamma_ji[jcnt][xi]; + if (std::abs(delta_ji[xi] - new_val) > max_change) + max_change = std::abs(delta_ji[xi] - new_val); + delta_ji[xi] = new_val; + } + ++jcnt; + } + } + } + + + // now decode the "beliefs" + std::vector<double> b; + for (node_iterator i = prob.begin(); i != prob.end(); ++i) + { + const unsigned long id_i = prob.node_id(i); + b.assign(prob.num_states(i), 0); + + for (neighbor_iterator k = prob.begin(i); k != prob.end(i); ++k) + { + const unsigned long id_k = prob.node_id(k); + + for (unsigned long xi = 0; xi < b.size(); ++xi) + { + const double* const delta_ki = &delta_elements[delta_idx(id_k,id_i)]; + b[xi] += delta_ki[xi]; + } + } + + map_assignment[id_i] = index_of_max(mat(b)); + } + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_Hh_ + diff --git a/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp_abstract.h b/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp_abstract.h new file mode 100644 index 000000000..3dd9aead0 --- /dev/null +++ b/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp_abstract.h @@ -0,0 +1,365 @@ +// Copyright (C) 2011 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_ABSTRACT_Hh_ +#ifdef DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_ABSTRACT_Hh_ + +#include <vector> + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class map_problem + { + /*! + WHAT THIS OBJECT REPRESENTS + This object represents a factor graph or graphical model. In + particular, this object defines the interface a MAP problem on + a factor graph must implement if it is to be solved using the + find_max_factor_graph_nmplp() routine defined at the bottom of this file. + + Note that there is no dlib::map_problem object. What you are + looking at here is simply the interface definition for a map problem. + You must implement your own version of this object for the problem + you wish to solve and then pass it to the find_max_factor_graph_nmplp() routine. + + + Note also that a factor graph should not have any nodes which are + neighbors with themselves. Additionally, the graph is undirected. This + mean that if A is a neighbor of B then B must be a neighbor of A for + the map problem to be valid. + + + Finally, note that the "neighbor" relationship between nodes means the + following: Two nodes are neighbors if and only if there is a potential + function (implemented by the factor_value() method) which operates on + the nodes. + !*/ + + public: + + class node_iterator + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a simple forward iterator for iterating over + the nodes/variables in this factor graph. + + Note that you can't dereference the iterator and + obtain a value. That is, the iterator is opaque to + the user. It is used only as an argument to the other + methods defined in this interface. + !*/ + + public: + node_iterator( + ); + /*! + ensures + - constructs an iterator in an undefined state + !*/ + + node_iterator( + const node_iterator& item + ); + /*! + ensures + - #*this is a copy of item + !*/ + + node_iterator& operator= ( + const node_iterator& item + ); + /*! + ensures + - #*this is a copy of item + - returns #*this + !*/ + + bool operator== ( + const node_iterator& item + ) const; + /*! + ensures + - returns true if *this and item both reference + the same node in the factor graph and false + otherwise. + !*/ + + bool operator!= ( + const node_iterator& item + ) const; + /*! + ensures + - returns false if *this and item both reference + the same node in the factor graph and true + otherwise. + !*/ + + node_iterator& operator++( + ); + /*! + ensures + - advances *this to the next node in the factor graph. + - returns a reference to the updated *this + (i.e. this is the ++object form of the increment operator) + !*/ + }; + + class neighbor_iterator + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a simple forward iterator for iterating over + the nodes/variables in this factor graph. This version + of the iterator is used for iterating over the neighbors + of another node in the graph. + !*/ + + public: + neighbor_iterator( + ); + /*! + ensures + - constructs an iterator in an undefined state + !*/ + + neighbor_iterator( + const neighbor_iterator& item + ); + /*! + ensures + - #*this is a copy of item + !*/ + + neighbor_iterator& operator= ( + const neighbor_iterator& item + ); + /*! + ensures + - #*this is a copy of item + - returns #*this + !*/ + + bool operator== ( + const neighbor_iterator& item + ) const; + /*! + ensures + - returns true if *this and item both reference + the same node in the factor graph and false + otherwise. + !*/ + + bool operator!= ( + const neighbor_iterator& item + ) const; + /*! + ensures + - returns false if *this and item both reference + the same node in the factor graph and true + otherwise. + !*/ + + neighbor_iterator& operator++( + ); + /*! + ensures + - advances *this to the next node in the factor graph. + - returns a reference to the updated *this + (i.e. this is the ++object form of the increment operator) + !*/ + }; + + unsigned long number_of_nodes ( + ) const; + /*! + ensures + - returns the number of nodes in the factor graph. Or in other words, + returns the number of variables in the MAP problem. + !*/ + + node_iterator begin( + ) const; + /*! + ensures + - returns an iterator to the first node in the graph. If no such + node exists then returns end(). + !*/ + + node_iterator end( + ) const; + /*! + ensures + - returns an iterator to one past the last node in the graph. + !*/ + + neighbor_iterator begin( + const node_iterator& it + ) const; + /*! + requires + - it == a valid iterator (i.e. it must be in the range [begin(), end())) + ensures + - returns an iterator to the first neighboring node of the node + referenced by it. If no such node exists then returns end(it). + !*/ + + neighbor_iterator begin( + const neighbor_iterator& it + ) const; + /*! + requires + - it == a valid iterator. (i.e. it must be in the range + [begin(i), end(i)) where i is some valid iterator. ) + ensures + - returns an iterator to the first neighboring node of the node + referenced by it. If no such node exists then returns end(it). + !*/ + + neighbor_iterator end( + const node_iterator& it + ) const; + /*! + requires + - it == a valid iterator (i.e. it must be in the range [begin(), end())) + ensures + - returns an iterator to one past the last neighboring node of the node + referenced by it. + !*/ + + neighbor_iterator end( + const neighbor_iterator& it + ) const; + /*! + requires + - it == a valid iterator. (i.e. it must be in the range + [begin(i), end(i)) where i is some valid iterator. ) + ensures + - returns an iterator to one past the last neighboring node of the node + referenced by it. + !*/ + + unsigned long node_id ( + const node_iterator& it + ) const; + /*! + requires + - it == a valid iterator (i.e. it must be in the range [begin(), end())) + ensures + - returns a number ID such that: + - 0 <= ID < number_of_nodes() + - ID == a number which uniquely identifies the node pointed to by it. + !*/ + + unsigned long node_id ( + const neighbor_iterator& it + ) const; + /*! + requires + - it == a valid iterator. (i.e. it must be in the range + [begin(i), end(i)) where i is some valid iterator. ) + ensures + - returns a number ID such that: + - 0 <= ID < number_of_nodes() + - ID == a number which uniquely identifies the node pointed to by it. + !*/ + + unsigned long num_states ( + const node_iterator& it + ) const; + /*! + requires + - it == a valid iterator (i.e. it must be in the range [begin(), end())) + ensures + - returns the number of states attainable by the node/variable referenced by it. + !*/ + + unsigned long num_states ( + const neighbor_iterator& it + ) const; + /*! + requires + - it == a valid iterator. (i.e. it must be in the range + [begin(i), end(i)) where i is some valid iterator. ) + ensures + - returns the number of states attainable by the node/variable referenced by it. + !*/ + + // The next four functions all have the same contract. + double factor_value (const node_iterator& it1, const node_iterator& it2, unsigned long s1, unsigned long s2) const; + double factor_value (const neighbor_iterator& it1, const node_iterator& it2, unsigned long s1, unsigned long s2) const; + double factor_value (const node_iterator& it1, const neighbor_iterator& it2, unsigned long s1, unsigned long s2) const; + double factor_value (const neighbor_iterator& it1, const neighbor_iterator& it2, unsigned long s1, unsigned long s2) const; + /*! + requires + - it1 == a valid iterator + - it2 == a valid iterator + - 0 <= s1 < num_states(it1) + - 0 <= s2 < num_states(it2) + - it1 and it2 reference nodes which are neighbors in the factor graph + ensures + - returns the value of the factor/potential function for the given pair of + nodes, defined by it1 and it2, for the case where they take on the values + s1 and s2 respectively. + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename map_problem + > + void find_max_factor_graph_nmplp ( + const map_problem& prob, + std::vector<unsigned long>& map_assignment, + unsigned long max_iter, + double eps + ); + /*! + requires + - for all valid i: prob.num_states(i) >= 2 + - map_problem == an object with an interface compatible with the map_problem + object defined at the top of this file. + - eps > 0 + ensures + - This function is a tool for approximately solving the given MAP problem in a graphical + model or factor graph with pairwise potential functions. That is, it attempts + to solve a certain kind of optimization problem which can be defined as follows: + maximize: f(X) + where X is a set of integer valued variables and f(X) can be written as the + sum of functions which each involve only two variables from X. In reference + to the prob object, the nodes in prob represent the variables in X and the + functions which are summed are represented by prob.factor_value(). + - #map_assignment == the result of the optimization. + - #map_assignment.size() == prob.number_of_nodes() + - for all valid i: + - #map_assignment[prob.node_id(i)] < prob.num_states(i) + - #map_assignment[prob.node_id(i)] == The approximate MAP assignment for node/variable i. + - eps controls the stopping condition, smaller values of eps lead to more accurate + solutions of the relaxed linear program but may take more iterations. Note that + the algorithm will never execute more than max_iter iterations regardless of + the setting of eps. + - If the graph is tree-structured then this routine always gives the exact solution + to the MAP problem. However, for graphs with cycles, the solution may be approximate. + + + - This function is an implementation of the NMPLP algorithm introduced in the + following papers: + Fixing Max-Product: Convergent Message Passing Algorithms for MAP LP-Relaxations (2008) + by Amir Globerson and Tommi Jaakkola + + Introduction to dual decomposition for inference (2011) + by David Sontag, Amir Globerson, and Tommi Jaakkola + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_ABSTRACT_Hh_ + + diff --git a/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi.h b/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi.h new file mode 100644 index 000000000..f7cbcb8d9 --- /dev/null +++ b/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi.h @@ -0,0 +1,232 @@ +// Copyright (C) 2011 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_Hh_ +#define DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_Hh_ + +#include "find_max_factor_graph_viterbi_abstract.h" +#include <vector> +#include "../matrix.h" +#include "../array2d.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + struct viterbi_data + { + viterbi_data() :val(-std::numeric_limits<double>::infinity()), back_index(0) {} + double val; + unsigned long back_index; + }; + + template <long NC> + inline bool advance_state( + matrix<unsigned long,1,NC>& node_states, + unsigned long num_states + ) + /*! + ensures + - advances node_states to the next state by adding 1 + to node_states(node_states.size()-1) and carrying any + rollover (modulo num_states). Stores the result into #node_states. + - if (#node_states is all zeros) then + - returns false + - else + - returns true + !*/ + { + for (long i = node_states.size()-1; i >= 0; --i) + { + node_states(i) += 1; + if (node_states(i) < num_states) + return true; + + node_states(i) = 0; + } + return false; + } + } + +// ---------------------------------------------------------------------------------------- + + template < + typename map_problem + > + void find_max_factor_graph_viterbi ( + const map_problem& prob, + std::vector<unsigned long>& map_assignment + ) + { + using namespace dlib::impl; + const unsigned long order = prob.order(); + const unsigned long num_states = prob.num_states(); + + + DLIB_ASSERT(prob.num_states() > 0, + "\t void find_max_factor_graph_viterbi()" + << "\n\t The nodes in a factor graph have to be able to take on more than 0 states." + ); + DLIB_ASSERT(std::pow(num_states,(double)order) < std::numeric_limits<unsigned long>::max(), + "\t void find_max_factor_graph_viterbi()" + << "\n\t The order is way too large for this algorithm to handle." + << "\n\t order: " << order + << "\n\t num_states: " << num_states + << "\n\t std::pow(num_states,order): " << std::pow(num_states,(double)order) + << "\n\t std::numeric_limits<unsigned long>::max(): " << std::numeric_limits<unsigned long>::max() + ); + + if (prob.number_of_nodes() == 0) + { + map_assignment.clear(); + return; + } + + if (order == 0) + { + map_assignment.resize(prob.number_of_nodes()); + for (unsigned long i = 0; i < map_assignment.size(); ++i) + { + matrix<unsigned long,1,1> node_state; + unsigned long best_state = 0; + double best_val = -std::numeric_limits<double>::infinity(); + for (unsigned long s = 0; s < num_states; ++s) + { + node_state(0) = s; + const double temp = prob.factor_value(i,node_state); + if (temp > best_val) + { + best_val = temp; + best_state = s; + } + } + map_assignment[i] = best_state; + } + return; + } + + + const unsigned long trellis_size = static_cast<unsigned long>(std::pow(num_states,(double)order)); + unsigned long init_ring_size = 1; + + array2d<impl::viterbi_data> trellis; + trellis.set_size(prob.number_of_nodes(), trellis_size); + + + for (unsigned long node = 0; node < prob.number_of_nodes(); ++node) + { + + if (node < order) + { + matrix<unsigned long,1,0> node_states; + node_states.set_size(std::min<int>(node, order) + 1); + node_states = 0; + + unsigned long idx = 0; + if (node == 0) + { + do + { + trellis[node][idx].val = prob.factor_value(node,node_states); + ++idx; + } while(advance_state(node_states,num_states)); + } + else + { + init_ring_size *= num_states; + do + { + const unsigned long back_index = idx%init_ring_size; + trellis[node][idx].val = prob.factor_value(node,node_states) + trellis[node-1][back_index].val; + trellis[node][idx].back_index = back_index; + ++idx; + } while(advance_state(node_states,num_states)); + + } + } + else if (order == 1) + { + /* + WHAT'S THE DEAL WITH THIS PREPROCESSOR MACRO? + Well, if we can declare the dimensions of node_states as a compile + time constant then this function runs significantly faster. So this macro + is here to let us do that. It just lets us avoid replicating this code + block in the following if statements for different order sizes. + */ +#define DLIB_FMFGV_WORK \ + node_states = 0; \ + unsigned long count = 0; \ + for (unsigned long i = 0; i < trellis_size; ++i) \ + { \ + unsigned long back_index = 0; \ + double best_score = -std::numeric_limits<double>::infinity(); \ + for (unsigned long s = 0; s < num_states; ++s) \ + { \ + const double temp = prob.factor_value(node,node_states) + trellis[node-1][count%trellis_size].val; \ + if (temp > best_score) \ + { \ + best_score = temp; \ + back_index = count%trellis_size; \ + } \ + advance_state(node_states,num_states); \ + ++count; \ + } \ + trellis[node][i].val = best_score; \ + trellis[node][i].back_index = back_index; \ + } + + matrix<unsigned long,1,2> node_states; + DLIB_FMFGV_WORK + } + else if (order == 2) + { + matrix<unsigned long,1,3> node_states; + DLIB_FMFGV_WORK + } + else if (order == 3) + { + matrix<unsigned long,1,4> node_states; + DLIB_FMFGV_WORK + } + else + { + // The general case, here we don't define the size of node_states at compile time. + matrix<unsigned long,1,0> node_states(order+1); + DLIB_FMFGV_WORK + } + } + + + map_assignment.resize(prob.number_of_nodes()); + // Figure out which state of the last node has the biggest value. + unsigned long back_index = 0; + double best_val = -std::numeric_limits<double>::infinity(); + for (long i = 0; i < trellis.nc(); ++i) + { + if (trellis[trellis.nr()-1][i].val > best_val) + { + best_val = trellis[trellis.nr()-1][i].val; + back_index = i; + } + } + // Follow the back links to find the decoding. + for (long node = map_assignment.size()-1; node >= 0; --node) + { + map_assignment[node] = back_index/init_ring_size; + back_index = trellis[node][back_index].back_index; + if (node < (long)order) + init_ring_size /= num_states; + } + + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_Hh_ + + diff --git a/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi_abstract.h b/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi_abstract.h new file mode 100644 index 000000000..c19e4c7eb --- /dev/null +++ b/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi_abstract.h @@ -0,0 +1,131 @@ +// Copyright (C) 2011 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_ABSTRACT_Hh_ +#ifdef DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_ABSTRACT_Hh_ + +#include <vector> +#include "../matrix.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class map_problem + { + /*! + WHAT THIS OBJECT REPRESENTS + This object represents a chain-structured factor graph or graphical + model. In particular, this object defines the interface a MAP problem + on a factor graph must implement if it is to be solved using the + find_max_factor_graph_viterbi() routine defined at the bottom of this file. + + Note that there is no dlib::map_problem object. What you are looking + at here is simply the interface definition for a map problem. You must + implement your own version of this object for the problem you wish to + solve and then pass it to the find_max_factor_graph_viterbi() routine. + !*/ + + public: + + unsigned long order ( + ) const; + /*! + ensures + - returns the order of this model. The order has the following interpretation: + This model can represent a high order Markov chain. If order()==1 then map_problem + represents a basic chain-structured graph where nodes only depend on their immediate + neighbors. However, high order Markov models can also be used by setting order() > 1. + !*/ + + unsigned long num_states ( + ) const; + /*! + ensures + - returns the number of states attainable by each variable/node in the graph. + !*/ + + unsigned long number_of_nodes ( + ) const; + /*! + ensures + - returns the number of nodes in the factor graph. Or in other words, + returns the number of variables in the MAP problem. + !*/ + + template < + typename EXP + > + double factor_value ( + unsigned long node_id, + const matrix_exp<EXP>& node_states + ) const; + /*! + requires + - EXP::type == unsigned long + (i.e. node_states contains unsigned longs) + - node_id < number_of_nodes() + - node_states.size() == min(node_id, order()) + 1 + - is_vector(node_states) == true + - max(node_states) < num_states() + ensures + - In a chain-structured graph, each node has a potential function associated with + it. The potential function operates on the variable given by the node as well + as the order() previous variables. Therefore, factor_value() returns the value + of the factor/potential function associated with node node_id where the following + nodes take on the values defined below: + - node_states(0) == the value of the node with ID node_id + - node_states(i) == the value of the node with ID node_id-i + - It is ok for this function to return a value of -std::numeric_limits<double>::infinity(). + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename map_problem + > + void find_max_factor_graph_viterbi ( + const map_problem& prob, + std::vector<unsigned long>& map_assignment + ); + /*! + requires + - prob.num_states() > 0 + - std::pow(prob.num_states(), prob.order()) < std::numeric_limits<unsigned long>::max() + (i.e. The Viterbi algorithm is exponential in the order of the map problem. So don't + make order too large.) + - map_problem == an object with an interface compatible with the map_problem + object defined at the top of this file. + ensures + - This function is a tool for exactly solving the MAP problem in a chain-structured + graphical model or factor graph. That is, it attempts to solve a certain kind of + optimization problem which can be defined as follows: + - Let X denote a set of prob.number_of_nodes() integer valued variables, each taking + a value in the range [0, prob.num_states()). + - Let X(i) = the ith variable in X. + - Let F(i) = factor_value_i(X(i), X(i-1), ..., X(i-prob.order())) + (This is the value returned by prob.factor_value(i, node_states). Note that + each factor's value function operates on at most prob.order()+1 variables. + Moreover, the variables are adjacent and hence the graph is "chain-structured".) + + Then this function finds the assignments to the X variables which + maximizes: sum over all valid i: F(i) + + - #map_assignment == the result of the optimization. + - #map_assignment.size() == prob.number_of_nodes() + - for all valid i: + - #map_assignment[i] < prob.num_states() + - #map_assignment[i] == The MAP assignment for node/variable i. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_ABSTRACT_Hh_ + + + diff --git a/ml/dlib/dlib/optimization/find_max_parse_cky.h b/ml/dlib/dlib/optimization/find_max_parse_cky.h new file mode 100644 index 000000000..79614792a --- /dev/null +++ b/ml/dlib/dlib/optimization/find_max_parse_cky.h @@ -0,0 +1,414 @@ +// Copyright (C) 2012 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_FIND_MAX_PaRSE_CKY_Hh_ +#define DLIB_FIND_MAX_PaRSE_CKY_Hh_ + +#include "find_max_parse_cky_abstract.h" +#include <vector> +#include <string> +#include <sstream> +#include "../serialize.h" +#include "../array2d.h" + +namespace dlib +{ + +// ----------------------------------------------------------------------------------------- + + template <typename T> + struct constituent + { + unsigned long begin, end, k; + T left_tag; + T right_tag; + }; + + template <typename T> + void serialize( + const constituent<T>& item, + std::ostream& out + ) + { + serialize(item.begin, out); + serialize(item.end, out); + serialize(item.k, out); + serialize(item.left_tag, out); + serialize(item.right_tag, out); + } + + template <typename T> + void deserialize( + constituent<T>& item, + std::istream& in + ) + { + deserialize(item.begin, in); + deserialize(item.end, in); + deserialize(item.k, in); + deserialize(item.left_tag, in); + deserialize(item.right_tag, in); + } + +// ----------------------------------------------------------------------------------------- + + const unsigned long END_OF_TREE = 0xFFFFFFFF; + +// ----------------------------------------------------------------------------------------- + + template <typename T> + struct parse_tree_element + { + constituent<T> c; + T tag; // id for the constituent corresponding to this level of the tree + + unsigned long left; + unsigned long right; + double score; + }; + + template <typename T> + void serialize ( + const parse_tree_element<T>& item, + std::ostream& out + ) + { + serialize(item.c, out); + serialize(item.tag, out); + serialize(item.left, out); + serialize(item.right, out); + serialize(item.score, out); + } + + template <typename T> + void deserialize ( + parse_tree_element<T>& item, + std::istream& in + ) + { + deserialize(item.c, in); + deserialize(item.tag, in); + deserialize(item.left, in); + deserialize(item.right, in); + deserialize(item.score, in); + } + +// ----------------------------------------------------------------------------------------- + + namespace impl + { + template <typename T> + unsigned long fill_parse_tree( + std::vector<parse_tree_element<T> >& parse_tree, + const T& tag, + const array2d<std::map<T, parse_tree_element<T> > >& back, + long r, long c + ) + /*! + requires + - back[r][c].size() == 0 || back[r][c].count(tag) != 0 + !*/ + { + // base case of the recursion + if (back[r][c].size() == 0) + { + return END_OF_TREE; + } + + const unsigned long idx = parse_tree.size(); + const parse_tree_element<T>& item = back[r][c].find(tag)->second; + parse_tree.push_back(item); + + const long k = item.c.k; + const unsigned long idx_left = fill_parse_tree(parse_tree, item.c.left_tag, back, r, k-1); + const unsigned long idx_right = fill_parse_tree(parse_tree, item.c.right_tag, back, k, c); + parse_tree[idx].left = idx_left; + parse_tree[idx].right = idx_right; + return idx; + } + } + + template <typename T, typename production_rule_function> + void find_max_parse_cky ( + const std::vector<T>& sequence, + const production_rule_function& production_rules, + std::vector<parse_tree_element<T> >& parse_tree + ) + { + parse_tree.clear(); + if (sequence.size() == 0) + return; + + array2d<std::map<T,double> > table(sequence.size(), sequence.size()); + array2d<std::map<T,parse_tree_element<T> > > back(sequence.size(), sequence.size()); + typedef typename std::map<T,double>::iterator itr; + typedef typename std::map<T,parse_tree_element<T> >::iterator itr_b; + + for (long r = 0; r < table.nr(); ++r) + table[r][r][sequence[r]] = 0; + + std::vector<std::pair<T,double> > possible_tags; + + for (long r = table.nr()-2; r >= 0; --r) + { + for (long c = r+1; c < table.nc(); ++c) + { + for (long k = r; k < c; ++k) + { + for (itr i = table[k+1][c].begin(); i != table[k+1][c].end(); ++i) + { + for (itr j = table[r][k].begin(); j != table[r][k].end(); ++j) + { + constituent<T> con; + con.begin = r; + con.end = c+1; + con.k = k+1; + con.left_tag = j->first; + con.right_tag = i->first; + possible_tags.clear(); + production_rules(sequence, con, possible_tags); + for (unsigned long m = 0; m < possible_tags.size(); ++m) + { + const double score = possible_tags[m].second + i->second + j->second; + itr match = table[r][c].find(possible_tags[m].first); + if (match == table[r][c].end() || score > match->second) + { + table[r][c][possible_tags[m].first] = score; + parse_tree_element<T> item; + item.c = con; + item.score = score; + item.tag = possible_tags[m].first; + item.left = END_OF_TREE; + item.right = END_OF_TREE; + back[r][c][possible_tags[m].first] = item; + } + } + } + } + } + } + } + + + // now use back pointers to build the parse trees + const long r = 0; + const long c = back.nc()-1; + if (back[r][c].size() != 0) + { + + // find the max scoring element in back[r][c] + itr_b max_i = back[r][c].begin(); + itr_b i = max_i; + ++i; + for (; i != back[r][c].end(); ++i) + { + if (i->second.score > max_i->second.score) + max_i = i; + } + + parse_tree.reserve(c); + impl::fill_parse_tree(parse_tree, max_i->second.tag, back, r, c); + } + } + +// ----------------------------------------------------------------------------------------- + + class parse_tree_to_string_error : public error + { + public: + parse_tree_to_string_error(const std::string& str): error(str) {} + }; + + namespace impl + { + template <bool enabled, typename T> + typename enable_if_c<enabled>::type conditional_print( + const T& item, + std::ostream& out + ) { out << item << " "; } + + template <bool enabled, typename T> + typename disable_if_c<enabled>::type conditional_print( + const T& , + std::ostream& + ) { } + + template <bool print_tag, bool skip_tag, typename T, typename U > + void print_parse_tree_helper ( + const std::vector<parse_tree_element<T> >& tree, + const std::vector<U>& words, + unsigned long i, + const T& tag_to_skip, + std::ostream& out + ) + { + if (!skip_tag || tree[i].tag != tag_to_skip) + out << "["; + + bool left_recurse = false; + + // Only print if we are supposed to. Doing it this funny way avoids compiler + // errors in parse_tree_to_string() for the case where tag isn't + // printable. + if (!skip_tag || tree[i].tag != tag_to_skip) + conditional_print<print_tag>(tree[i].tag, out); + + if (tree[i].left < tree.size()) + { + left_recurse = true; + print_parse_tree_helper<print_tag,skip_tag>(tree, words, tree[i].left, tag_to_skip, out); + } + else + { + if ((tree[i].c.begin) < words.size()) + { + out << words[tree[i].c.begin] << " "; + } + else + { + std::ostringstream sout; + sout << "Parse tree refers to element " << tree[i].c.begin + << " of sequence which is only of size " << words.size() << "."; + throw parse_tree_to_string_error(sout.str()); + } + } + + if (left_recurse == true) + out << " "; + + if (tree[i].right < tree.size()) + { + print_parse_tree_helper<print_tag,skip_tag>(tree, words, tree[i].right, tag_to_skip, out); + } + else + { + if (tree[i].c.k < words.size()) + { + out << words[tree[i].c.k]; + } + else + { + std::ostringstream sout; + sout << "Parse tree refers to element " << tree[i].c.k + << " of sequence which is only of size " << words.size() << "."; + throw parse_tree_to_string_error(sout.str()); + } + } + + + if (!skip_tag || tree[i].tag != tag_to_skip) + out << "]"; + } + } + +// ----------------------------------------------------------------------------------------- + + template <typename T, typename U> + std::string parse_tree_to_string ( + const std::vector<parse_tree_element<T> >& tree, + const std::vector<U>& words, + const unsigned long root_idx = 0 + ) + { + if (root_idx >= tree.size()) + return ""; + + std::ostringstream sout; + impl::print_parse_tree_helper<false,false>(tree, words, root_idx, tree[root_idx].tag, sout); + return sout.str(); + } + +// ----------------------------------------------------------------------------------------- + + template <typename T, typename U> + std::string parse_tree_to_string_tagged ( + const std::vector<parse_tree_element<T> >& tree, + const std::vector<U>& words, + const unsigned long root_idx = 0 + ) + { + if (root_idx >= tree.size()) + return ""; + + std::ostringstream sout; + impl::print_parse_tree_helper<true,false>(tree, words, root_idx, tree[root_idx].tag, sout); + return sout.str(); + } + +// ----------------------------------------------------------------------------------------- + + template <typename T, typename U> + std::string parse_trees_to_string ( + const std::vector<parse_tree_element<T> >& tree, + const std::vector<U>& words, + const T& tag_to_skip + ) + { + if (tree.size() == 0) + return ""; + + std::ostringstream sout; + impl::print_parse_tree_helper<false,true>(tree, words, 0, tag_to_skip, sout); + return sout.str(); + } + +// ----------------------------------------------------------------------------------------- + + template <typename T, typename U> + std::string parse_trees_to_string_tagged ( + const std::vector<parse_tree_element<T> >& tree, + const std::vector<U>& words, + const T& tag_to_skip + ) + { + if (tree.size() == 0) + return ""; + + std::ostringstream sout; + impl::print_parse_tree_helper<true,true>(tree, words, 0, tag_to_skip, sout); + return sout.str(); + } + +// ----------------------------------------------------------------------------------------- + + namespace impl + { + template <typename T> + void helper_find_trees_without_tag ( + const std::vector<parse_tree_element<T> >& tree, + const T& tag, + std::vector<unsigned long>& tree_roots, + unsigned long idx + ) + { + if (idx < tree.size()) + { + if (tree[idx].tag != tag) + { + tree_roots.push_back(idx); + } + else + { + helper_find_trees_without_tag(tree, tag, tree_roots, tree[idx].left); + helper_find_trees_without_tag(tree, tag, tree_roots, tree[idx].right); + } + } + } + } + + template <typename T> + void find_trees_not_rooted_with_tag ( + const std::vector<parse_tree_element<T> >& tree, + const T& tag, + std::vector<unsigned long>& tree_roots + ) + { + tree_roots.clear(); + impl::helper_find_trees_without_tag(tree, tag, tree_roots, 0); + } + +// ----------------------------------------------------------------------------------------- + +} + +#endif // DLIB_FIND_MAX_PaRSE_CKY_Hh_ + diff --git a/ml/dlib/dlib/optimization/find_max_parse_cky_abstract.h b/ml/dlib/dlib/optimization/find_max_parse_cky_abstract.h new file mode 100644 index 000000000..52ffc787b --- /dev/null +++ b/ml/dlib/dlib/optimization/find_max_parse_cky_abstract.h @@ -0,0 +1,388 @@ +// Copyright (C) 2012 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_ +#ifdef DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_ + +#include <vector> +#include <string> +#include "../algs.h" +#include "../serialize.h" + +namespace dlib +{ + +// ----------------------------------------------------------------------------------------- + + template < + typename T + > + struct constituent + { + /*! + WHAT THIS OBJECT REPRESENTS + This object represents the linguistic idea of a constituent, that is, a + group of words that functions as a single unit. In particular, it + represents a combination of two constituents into a new constituent. + + Additionally, a constituent object represents a range of words relative to + some std::vector of words. The range is from [begin, end) (i.e. including + begin but not including end, so using the normal C++ iterator notation). + Moreover, a constituent is always composed of two parts, each having a tag. + Therefore, the left part is composed of the words in the range [begin,k) + and has tag left_tag while the right part of the constituent contains the + words in the range [k,end) and has the tag right_tag. + + The tags are user defined objects of type T. In general, they are used to + represent syntactic categories such as noun phrase, verb phrase, etc. + !*/ + + unsigned long begin, end, k; + T left_tag; + T right_tag; + }; + + template < + typename T + > + void serialize( + const constituent<T>& item, + std::ostream& out + ); + /*! + provides serialization support + !*/ + + template < + typename T + > + void deserialize( + constituent<T>& item, + std::istream& in + ); + /*! + provides deserialization support + !*/ + +// ----------------------------------------------------------------------------------------- + + /*!A END_OF_TREE is used to indicate that parse_tree_element::left or + parse_tree_element::right doesn't point to another subtree. + !*/ + const unsigned long END_OF_TREE = 0xFFFFFFFF; + +// ----------------------------------------------------------------------------------------- + + template < + typename T + > + struct parse_tree_element + { + /*! + WHAT THIS OBJECT REPRESENTS + This object is used to represent a node in a binary parse tree. An entire + parse tree is represented by a std::vector of parse_tree_element objects. + We follow the convention that the first element of this vector is always + the root of the entire tree. + + The fields of this object have the following interpretations: + - c == the constituent spanned by this node in the parse tree. + Therefore, the node spans the words in the range [c.begin, c.end). + - tag == the syntactic category of this node in the parse tree. + - score == the score or log likelihood for this parse tree. In + general, this is the sum of scores of all the production rules used + to build the tree rooted at the current node. + - let PT denote the vector of parse_tree_elements that defines an + entire parse tree. Then we have: + - if (left != END_OF_TREE) then + - PT[left] == the left sub-tree of the current node. + - PT[left] spans the words [c.begin, c.k) + - PT[left].tag == c.left_tag + - else + - there is no left sub-tree + + - if (right != END_OF_TREE) then + - PT[right] == the right sub-tree of the current node. + - PT[right] spans the words [c.k, c.end) + - PT[right].tag == c.right_tag + - else + - there is no right sub-tree + !*/ + + constituent<T> c; + T tag; + double score; + + unsigned long left; + unsigned long right; + }; + + template < + typename T + > + void serialize ( + const parse_tree_element<T>& item, + std::ostream& out + ); + /*! + provides serialization support + !*/ + + template < + typename T + > + void deserialize ( + parse_tree_element<T>& item, + std::istream& in + ); + /*! + provides deserialization support + !*/ + +// ----------------------------------------------------------------------------------------- +// ----------------------------------------------------------------------------------------- + + void example_production_rule_function ( + const std::vector<T>& words, + const constituent<T>& c, + std::vector<std::pair<T,double> >& possible_tags + ) + /*! + requires + - 0 <= c.begin < c.k < c.end <= words.size() + - possible_tags.size() == 0 + ensures + - Finds all the syntactic categories that can be used to label c and puts those + categories, along with their scores, into possible_tags. Or in other words, + this function determines which production rules can be used to turn the left + and right sub-constituents in c into a single constituent. The contents of c + have the following interpretations: + - The left sub-constituent has syntactic category c.left_tag + - for all i such that c.begin <= i < c.k: + - words[i] is part of the left sub-constituent. + - The right sub-constituent has syntactic category c.right_tag + - for all i such that c.k <= i < c.end: + - words[i] is part of the right sub-constituent. + + - Note that example_production_rule_function() is not a real function. It is + here just to show you how to define production rule producing functions for + use with the find_max_parse_cky() routine defined below. + !*/ + + template < + typename T, + typename production_rule_function + > + void find_max_parse_cky ( + const std::vector<T>& words, + const production_rule_function& production_rules, + std::vector<parse_tree_element<T> >& parse_tree + ); + /*! + requires + - production_rule_function == a function or function object with the same + interface as example_production_rule_function defined above. + - It must be possible to store T objects in a std::map. + ensures + - Uses the CKY algorithm to find the most probable/highest scoring binary parse + tree of the given vector of words. + - if (#parse_tree.size() == 0) then + - There is no parse tree, using the given production_rules, that can cover + the given word sequence. + - else + - #parse_tree == the highest scoring parse tree that covers all the + elements of words. + - #parse_tree[0] == the root node of the parse tree. + - #parse_tree[0].score == the score of the parse tree. This is the sum of + the scores of all production rules used to construct the tree. + - #parse_tree[0].begin == 0 + - #parse_tree[0].end == words.size() + - This function uses production_rules() to find out what the allowed production + rules are. That is, production_rules() defines all properties of the grammar + used by find_max_parse_cky(). + !*/ + +// ----------------------------------------------------------------------------------------- +// ----------------------------------------------------------------------------------------- + + class parse_tree_to_string_error : public error + { + /*! + WHAT THIS OBJECT REPRESENTS + This is the exception thrown by parse_tree_to_string() and + parse_tree_to_string_tagged() if the inputs are discovered to be invalid. + !*/ + }; + +// ----------------------------------------------------------------------------------------- + + template < + typename T, + typename U + > + std::string parse_tree_to_string ( + const std::vector<parse_tree_element<T> >& tree, + const std::vector<U>& words, + const unsigned long root_idx = 0 + ); + /*! + requires + - It must be possible to print U objects to an ostream using operator<< + (typically, U would be something like std::string) + ensures + - Interprets tree as a parse tree defined over the given sequence of words. + - returns a bracketed string that represents the parse tree over the words. + For example, suppose the following parse tree is input: + + /\ + / \ + /\ \ + / \ \ + the dog ran + + Then the output would be the string "[[the dog] ran]" + - Only the sub-tree rooted at tree[root_idx] will be output. If root_idx >= + tree.size() then the empty string is returned. + throws + - parse_tree_to_string_error + This exception is thrown if an invalid tree is detected. This might happen + if the tree refers to elements of words that don't exist because words is + shorted than it is supposed to be. + !*/ + +// ----------------------------------------------------------------------------------------- + + template < + typename T, + typename U + > + std::string parse_tree_to_string_tagged ( + const std::vector<parse_tree_element<T> >& tree, + const std::vector<U>& words, + const unsigned long root_idx = 0 + ); + /*! + requires + - It must be possible to print T objects to an ostream using operator<< + - It must be possible to print U objects to an ostream using operator<< + (typically, U would be something like std::string) + ensures + - This function does the same thing as parse_tree_to_string() except that it + also includes the parse_tree_element::tag object in the output. Therefore, + the tag of each bracket will be included as the first token inside the + bracket. For example, suppose the following parse tree is input (where tags + are shown at the vertices): + + S + /\ + NP \ + /\ \ + / \ \ + the dog ran + + Then the output would be the string "[S [NP the dog] ran]" + - Only the sub-tree rooted at tree[root_idx] will be output. If root_idx >= + tree.size() then the empty string is returned. + throws + - parse_tree_to_string_error + This exception is thrown if an invalid tree is detected. This might happen + if the tree refers to elements of words that don't exist because words is + shorted than it is supposed to be. + !*/ + +// ----------------------------------------------------------------------------------------- + + template < + typename T, + typename U + > + std::string parse_trees_to_string ( + const std::vector<parse_tree_element<T> >& tree, + const std::vector<U>& words, + const T& tag_to_skip + ); + /*! + requires + - It must be possible to print U objects to an ostream using operator<< + (typically, U would be something like std::string) + ensures + - This function behaves just like parse_tree_to_string() except that it will + not print the brackets (i.e. []) for the top most parts of the tree which + have tags equal to tag_to_skip. It will however print all the words. + Therefore, this function only includes brackets on the subtrees which begin + with a tag other than tag_to_skip. + throws + - parse_tree_to_string_error + This exception is thrown if an invalid tree is detected. This might happen + if the tree refers to elements of words that don't exist because words is + shorted than it is supposed to be. + !*/ + +// ----------------------------------------------------------------------------------------- + + template < + typename T, + typename U + > + std::string parse_trees_to_string_tagged ( + const std::vector<parse_tree_element<T> >& tree, + const std::vector<U>& words, + const T& tag_to_skip + ); + /*! + requires + - It must be possible to print T objects to an ostream using operator<< + - It must be possible to print U objects to an ostream using operator<< + (typically, U would be something like std::string) + ensures + - This function behaves just like parse_tree_to_string_tagged() except that it + will not print the brackets (i.e. []) for the top most parts of the tree + which have tags equal to tag_to_skip. It will however print all the words. + Therefore, this function only includes brackets on the subtrees which begin + with a tag other than tag_to_skip. + throws + - parse_tree_to_string_error + This exception is thrown if an invalid tree is detected. This might happen + if the tree refers to elements of words that don't exist because words is + shorted than it is supposed to be. + !*/ + +// ----------------------------------------------------------------------------------------- + + template < + typename T + > + void find_trees_not_rooted_with_tag ( + const std::vector<parse_tree_element<T> >& tree, + const T& tag, + std::vector<unsigned long>& tree_roots + ); + /*! + requires + - objects of type T must be comparable using operator== + ensures + - Finds all the largest non-overlapping trees in tree that are not rooted with + the given tag. + - find_trees_not_rooted_with_tag() is useful when you want to cut a parse tree + into a bunch of sub-trees and you know that the top level of the tree is all + composed of the same kind of tag. So if you want to just "slice off" the top + of the tree where this tag lives then this function is useful for doing that. + - #tree_roots.size() == the number of sub-trees found. + - for all valid i: + - tree[#tree_roots[i]].tag != tag + - To make the operation of this function clearer, here are a few examples of + what it will do: + - if (tree[0].tag != tag) then + - #tree_roots.size() == 0 + - #tree_roots[0] == 0 + - else if (tree[0].tag == tag but its immediate children's tags are not equal to tag) then + - #tree_roots.size() == 2 + - #tree_roots[0] == tree[0].left + - #tree_roots[1] == tree[0].right + !*/ + +// ----------------------------------------------------------------------------------------- + +} + +#endif // DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_ + diff --git a/ml/dlib/dlib/optimization/find_optimal_parameters.h b/ml/dlib/dlib/optimization/find_optimal_parameters.h new file mode 100644 index 000000000..0884778c9 --- /dev/null +++ b/ml/dlib/dlib/optimization/find_optimal_parameters.h @@ -0,0 +1,117 @@ +// Copyright (C) 2016 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_fIND_OPTIMAL_PARAMETERS_Hh_ +#define DLIB_fIND_OPTIMAL_PARAMETERS_Hh_ + +#include "../matrix.h" +#include "find_optimal_parameters_abstract.h" +#include "optimization_bobyqa.h" +#include "optimization_line_search.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + double find_optimal_parameters ( + double initial_search_radius, + double eps, + const unsigned int max_f_evals, + matrix<double,0,1>& x, + const matrix<double,0,1>& x_lower, + const matrix<double,0,1>& x_upper, + const funct& f + ) + { + DLIB_CASSERT(x.size() == x_lower.size() && x_lower.size() == x_upper.size() && x.size() > 0, + "\t double find_optimal_parameters()" + << "\n\t x.size(): " << x.size() + << "\n\t x_lower.size(): " << x_lower.size() + << "\n\t x_upper.size(): " << x_upper.size() + ); + + // check the requirements. Also split the assert up so that the error message isn't huge. + DLIB_CASSERT(max_f_evals > 1 && eps > 0 && initial_search_radius > eps, + "\t double find_optimal_parameters()" + << "\n\t Invalid arguments have been given to this function" + << "\n\t initial_search_radius: " << initial_search_radius + << "\n\t eps: " << eps + << "\n\t max_f_evals: " << max_f_evals + ); + + DLIB_CASSERT( min(x_upper - x_lower) > 0 && + min(x - x_lower) >= 0 && min(x_upper - x) >= 0, + "\t double find_optimal_parameters()" + << "\n\t The bounds constraints have to make sense and also contain the starting point." + << "\n\t min(x_upper - x_lower): " << min(x_upper - x_lower) + << "\n\t min(x - x_lower) >= 0 && min(x_upper - x) >= 0: " << (min(x - x_lower) >= 0 && min(x_upper - x) >= 0) + ); + + // if the search radius is too big then shrink it so it fits inside the bounds. + if (initial_search_radius*2 >= min(x_upper-x_lower)) + initial_search_radius = 0.5*min(x_upper-x_lower)*0.99; + + + double objective_val = std::numeric_limits<double>::infinity(); + size_t num_iter_used = 0; + if (x.size() == 1) + { + // BOBYQA requires x to have at least 2 variables in it. So we can't call it in + // this case. Instead we call find_min_single_variable(). + matrix<double,0,1> temp(1); + auto ff = [&](const double& xx) + { + temp = xx; + double obj = f(temp); + ++num_iter_used; + // keep track of the best x. + if (obj < objective_val) + { + objective_val = obj; + x = temp; + } + return obj; + }; + try + { + double dx = x(0); + find_min_single_variable(ff, dx, x_lower(0), x_upper(0), eps, max_f_evals, initial_search_radius); + } catch (optimize_single_variable_failure& ) + { + } + } + else + { + auto ff = [&](const matrix<double,0,1>& xx) + { + double obj = f(xx); + ++num_iter_used; + // keep track of the best x. + if (obj < objective_val) + { + objective_val = obj; + x = xx; + } + return obj; + }; + try + { + matrix<double,0,1> start_x = x; + find_min_bobyqa(ff, start_x, 2*x.size()+1, x_lower, x_upper, initial_search_radius, eps, max_f_evals); + } catch (bobyqa_failure& ) + { + } + } + + return objective_val; + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_fIND_OPTIMAL_PARAMETERS_Hh_ + diff --git a/ml/dlib/dlib/optimization/find_optimal_parameters_abstract.h b/ml/dlib/dlib/optimization/find_optimal_parameters_abstract.h new file mode 100644 index 000000000..96dcee89b --- /dev/null +++ b/ml/dlib/dlib/optimization/find_optimal_parameters_abstract.h @@ -0,0 +1,58 @@ +// Copyright (C) 2016 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_fIND_OPTIMAL_PARAMETERS_ABSTRACT_Hh_ +#ifdef DLIB_fIND_OPTIMAL_PARAMETERS_ABSTRACT_Hh_ + +#include "../matrix.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + double find_optimal_parameters ( + double initial_search_radius, + double eps, + const unsigned int max_f_evals, + matrix<double,0,1>& x, + const matrix<double,0,1>& x_lower, + const matrix<double,0,1>& x_upper, + const funct& f + ); + /*! + requires + - f(x) must be a valid expression that evaluates to a double + - x.size() == x_lower.size() == x_upper.size() + - x.size() > 0 + - 0 < eps < initial_search_radius + - max_f_evals > 1 + - min(x_upper - x_lower) > 0 + - min(x - x_lower) >= 0 && min(x_upper - x) >= 0 + (i.e. the given x should be within the bounds defined by x_lower and x_upper) + ensures + - Performs a constrained minimization of the function f() starting from + the initial point x. + - This function does not require derivatives of f(). Instead, it uses + derivative free methods to find the best setting of x. In particular, it + will begin by searching within a sphere of radius initial_search_radius + around x and will continue searching until either f() has been called + max_f_evals times or the search area has been shrunk to less than eps radius. + - #x == the value of x (within the bounds defined by x_lower and x_upper) that + was found to minimize f(). More precisely, it will always be true that: + - min(#x - x_lower) >= 0 && min(x_upper - #x) >= 0 + - returns f(#x). + throws + - No exception is thrown for executing max_f_evals iterations. This function + will simply output the best x it has seen if it runs out of iterations. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_fIND_OPTIMAL_PARAMETERS_ABSTRACT_Hh_ + + diff --git a/ml/dlib/dlib/optimization/isotonic_regression.h b/ml/dlib/dlib/optimization/isotonic_regression.h new file mode 100644 index 000000000..e89e70b48 --- /dev/null +++ b/ml/dlib/dlib/optimization/isotonic_regression.h @@ -0,0 +1,169 @@ +// Copyright (C) 2018 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_ISOTONIC_ReGRESSION_H_ +#define DLIB_ISOTONIC_ReGRESSION_H_ + +#include "isotonic_regression_abstract.h" +#include <vector> +#include <utility> + +namespace dlib +{ + class isotonic_regression + { + public: + + template < + typename const_iterator, + typename iterator + > + void operator() ( + const_iterator begin, + const_iterator end, + iterator obegin + ) + { + do_isotonic_regression(begin, end); + + // unpack blocks to output + for (auto& block : blocks) + { + for (size_t k = 0; k < block.num; ++k) + set_val(*obegin++, block.avg); + } + + blocks.clear(); + } + + void operator() ( + std::vector<double>& vect + ) { (*this)(vect.begin(), vect.end(), vect.begin()); } + + template <typename T, typename U> + void operator() ( + std::vector<std::pair<T,U>>& vect + ) { (*this)(vect.begin(), vect.end(), vect.begin()); } + + + template < + typename const_iterator, + typename iterator + > + void fit_with_linear_output_interpolation ( + const_iterator begin, + const_iterator end, + iterator obegin + ) + { + do_isotonic_regression(begin, end); + + // Unpack blocks to output, but here instead of producing the step function + // output we linearly interpolate. Note that this actually fits the data less + // than the step-function, but in many applications might be closer to what you + // really when when using isotonic_regression than the step function. + for (size_t i = 0; i < blocks.size(); ++i) + { + auto& block = blocks[i]; + + double prev = (blocks.front().avg + block.avg)/2; + if (i > 0) + prev = (blocks[i-1].avg+block.avg)/2; + + double next = (blocks.back().avg + block.avg)/2; + if (i+1 < blocks.size()) + next = (blocks[i+1].avg+block.avg)/2; + + for (size_t k = 0; k < block.num; ++k) + { + const auto mid = block.num/2.0; + if (k < mid) + { + const double alpha = k/mid; + set_val(*obegin++, (1-alpha)*prev + alpha*block.avg); + } + else + { + const double alpha = k/mid-1; + set_val(*obegin++, alpha*next + (1-alpha)*block.avg); + } + } + } + + blocks.clear(); + } + + void fit_with_linear_output_interpolation ( + std::vector<double>& vect + ) { fit_with_linear_output_interpolation(vect.begin(), vect.end(), vect.begin()); } + + template <typename T, typename U> + void fit_with_linear_output_interpolation ( + std::vector<std::pair<T,U>>& vect + ) { fit_with_linear_output_interpolation(vect.begin(), vect.end(), vect.begin()); } + + private: + + template < + typename const_iterator + > + void do_isotonic_regression ( + const_iterator begin, + const_iterator end + ) + { + blocks.clear(); + + // Do the actual isotonic regression. The output is a step-function and is + // stored in the vector of blocks. + for (auto i = begin; i != end; ++i) + { + blocks.emplace_back(get_val(*i)); + while (blocks.size() > 1 && prev_block().avg > current_block().avg) + { + // merge the last two blocks. + prev_block() = prev_block() + current_block(); + blocks.pop_back(); + } + } + } + + + template <typename T> + static double get_val(const T& v) { return v;} + + template <typename T, typename U> + static double get_val(const std::pair<T,U>& v) { return v.second;} + + template <typename T> + static void set_val(T& v, double val) { v = val;} + + template <typename T, typename U> + static void set_val(std::pair<T,U>& v, double val) { v.second = val;} + + + + struct block_t + { + block_t(double val) : num(1), avg(val) {} + block_t(size_t n, double val) : num(n), avg(val) {} + + size_t num; + double avg; + + inline block_t operator+(const block_t& rhs) const + { + return block_t(num+rhs.num, + (num*avg + rhs.num*rhs.avg)/(num+rhs.num)); + } + }; + + inline block_t& prev_block() { return blocks[blocks.size()-2]; } + inline block_t& current_block() { return blocks.back(); } + + std::vector<block_t> blocks; + }; +} + +#endif // DLIB_ISOTONIC_ReGRESSION_H_ + + diff --git a/ml/dlib/dlib/optimization/isotonic_regression_abstract.h b/ml/dlib/dlib/optimization/isotonic_regression_abstract.h new file mode 100644 index 000000000..b00334bef --- /dev/null +++ b/ml/dlib/dlib/optimization/isotonic_regression_abstract.h @@ -0,0 +1,128 @@ +// Copyright (C) 2018 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_ISOTONIC_ReGRESSION_ABSTRACT_H_ +#ifdef DLIB_ISOTONIC_ReGRESSION_ABSTRACT_H_ + +#include <vector> +#include <utility> + +namespace dlib +{ + class isotonic_regression + { + /*! + WHAT THIS OBJECT REPRESENTS + This object is a tool for performing 1-D isotonic regression. That is, it + finds the least squares fit of a non-parametric curve to some user supplied + data, subject to the constraint that the fitted curve is non-decreasing. + + This is done using the fast O(n) pool adjacent violators algorithm. + !*/ + + public: + + template < + typename const_iterator, + typename iterator + > + void operator() ( + const_iterator begin, + const_iterator end, + iterator obegin + ); + /*! + requires + - [begin,end) is an iterator range of float or doubles or a range of + std::pair<T,double> or std::pair<T,float> where T an be anything. + - obegin points to an iterator range at least std::distance(begin,end). + - obegin points to an iterator range of objects of type float, double, std::pair<T,float>, or std::pair<T,double>. + ensures + - Given the range of real values stored in [begin,end), this method performs isotonic regression + on this data and writes the results to obegin. To be specific: + - let IN refer to the input values stored in the iterator range [begin,end). + - let OUT refer to the output values stored in the iterator range [obegin, obegin+std::distance(begin,end)). + - This function populates OUT with values such that the sum_i of + (IN[i]-OUT[i])^2 is minimized, subject to the constraint that + OUT[i] <= OUT[i+1], i.e. that OUT is monotonic. + - It is OK for [begin,end) to overlap with the range pointed to by obegin. + That is, this function can run in-place. + - Note that when the inputs or outputs are std::pairs this algorithm only + looks at the .second field of the pair. It therefore still treats these + iterator ranges as ranges of reals since it only looks at the .second + field, which is a real number. The .first field is entirely ignored. + !*/ + + void operator() ( + std::vector<double>& vect + ) { (*this)(vect.begin(), vect.end(), vect.begin()); } + /*! + ensures + - performs in-place isotonic regression. Therefore, #vect will contain the + isotonic regression of vect. + - #vect.size() == vect.size() + !*/ + + template <typename T, typename U> + void operator() ( + std::vector<std::pair<T,U>>& vect + ) { (*this)(vect.begin(), vect.end(), vect.begin()); } + /*! + ensures + - performs in-place isotonic regression. Therefore, #vect will contain the + isotonic regression of vect. + - #vect.size() == vect.size() + !*/ + + + template < + typename const_iterator, + typename iterator + > + void fit_with_linear_output_interpolation ( + const_iterator begin, + const_iterator end, + iterator obegin + ); + /*! + requires + - [begin,end) is an iterator range of float or doubles or a range of + std::pair<T,double> or std::pair<T,float> where T an be anything. + - obegin points to an iterator range at least std::distance(begin,end). + - obegin points to an iterator range of objects of type float, double, std::pair<T,float>, or std::pair<T,double>. + ensures + - This function behaves just like (*this)(begin,end,obegin) except that the + output is interpolated. To explain, not that the optimal output of + isotonic regression is a step function. However, in many applications + that isn't really what you want. You want something smoother. So + fit_with_linear_output_interpolation() does isotonic regression and then + linearly interpolates the step function into a piecewise linear function. + !*/ + + void fit_with_linear_output_interpolation ( + std::vector<double>& vect + ) { fit_with_linear_output_interpolation(vect.begin(), vect.end(), vect.begin()); } + /*! + ensures + - performs in-place isotonic regression. Therefore, #vect will contain the + isotonic regression of vect. + - #vect.size() == vect.size() + !*/ + + template <typename T, typename U> + void fit_with_linear_output_interpolation ( + std::vector<std::pair<T,U>>& vect + ) { fit_with_linear_output_interpolation(vect.begin(), vect.end(), vect.begin()); } + /*! + ensures + - performs in-place isotonic regression. Therefore, #vect will contain the + isotonic regression of vect. + - #vect.size() == vect.size() + !*/ + + }; +} + +#endif // DLIB_ISOTONIC_ReGRESSION_ABSTRACT_H_ + + + diff --git a/ml/dlib/dlib/optimization/max_cost_assignment.h b/ml/dlib/dlib/optimization/max_cost_assignment.h new file mode 100644 index 000000000..db6c6f0d7 --- /dev/null +++ b/ml/dlib/dlib/optimization/max_cost_assignment.h @@ -0,0 +1,288 @@ +// Copyright (C) 2011 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_MAX_COST_ASSIgNMENT_Hh_ +#define DLIB_MAX_COST_ASSIgNMENT_Hh_ + +#include "max_cost_assignment_abstract.h" +#include "../matrix.h" +#include <vector> +#include <deque> + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template <typename EXP> + typename EXP::type assignment_cost ( + const matrix_exp<EXP>& cost, + const std::vector<long>& assignment + ) + { + DLIB_ASSERT(cost.nr() == cost.nc(), + "\t type assignment_cost(cost,assignment)" + << "\n\t cost.nr(): " << cost.nr() + << "\n\t cost.nc(): " << cost.nc() + ); +#ifdef ENABLE_ASSERTS + // can't call max on an empty vector. So put an if here to guard against it. + if (assignment.size() > 0) + { + DLIB_ASSERT(0 <= min(mat(assignment)) && max(mat(assignment)) < cost.nr(), + "\t type assignment_cost(cost,assignment)" + << "\n\t cost.nr(): " << cost.nr() + << "\n\t cost.nc(): " << cost.nc() + << "\n\t min(assignment): " << min(mat(assignment)) + << "\n\t max(assignment): " << max(mat(assignment)) + ); + } +#endif + + typename EXP::type temp = 0; + for (unsigned long i = 0; i < assignment.size(); ++i) + { + temp += cost(i, assignment[i]); + } + return temp; + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <typename EXP> + inline void compute_slack( + const long x, + std::vector<typename EXP::type>& slack, + std::vector<long>& slackx, + const matrix_exp<EXP>& cost, + const std::vector<typename EXP::type>& lx, + const std::vector<typename EXP::type>& ly + ) + { + for (long y = 0; y < cost.nc(); ++y) + { + if (lx[x] + ly[y] - cost(x,y) < slack[y]) + { + slack[y] = lx[x] + ly[y] - cost(x,y); + slackx[y] = x; + } + } + } + } + +// ---------------------------------------------------------------------------------------- + + template <typename EXP> + std::vector<long> max_cost_assignment ( + const matrix_exp<EXP>& cost_ + ) + { + const_temp_matrix<EXP> cost(cost_); + typedef typename EXP::type type; + // This algorithm only works if the elements of the cost matrix can be reliably + // compared using operator==. However, comparing for equality with floating point + // numbers is not a stable operation. So you need to use an integer cost matrix. + COMPILE_TIME_ASSERT(std::numeric_limits<type>::is_integer); + DLIB_ASSERT(cost.nr() == cost.nc(), + "\t std::vector<long> max_cost_assignment(cost)" + << "\n\t cost.nr(): " << cost.nr() + << "\n\t cost.nc(): " << cost.nc() + ); + + using namespace dlib::impl; + /* + I based the implementation of this algorithm on the description of the + Hungarian algorithm on the following websites: + http://www.math.uwo.ca/~mdawes/courses/344/kuhn-munkres.pdf + http://www.topcoder.com/tc?module=Static&d1=tutorials&d2=hungarianAlgorithm + + Note that this is the fast O(n^3) version of the algorithm. + */ + + if (cost.size() == 0) + return std::vector<long>(); + + std::vector<type> lx, ly; + std::vector<long> xy; + std::vector<long> yx; + std::vector<char> S, T; + std::vector<type> slack; + std::vector<long> slackx; + std::vector<long> aug_path; + + + + + // Initially, nothing is matched. + xy.assign(cost.nc(), -1); + yx.assign(cost.nc(), -1); + /* + We maintain the following invariant: + Vertex x is matched to vertex xy[x] and + vertex y is matched to vertex yx[y]. + + A value of -1 means a vertex isn't matched to anything. Moreover, + x corresponds to rows of the cost matrix and y corresponds to the + columns of the cost matrix. So we are matching X to Y. + */ + + + // Create an initial feasible labeling. Moreover, in the following + // code we will always have: + // for all valid x and y: lx[x] + ly[y] >= cost(x,y) + lx.resize(cost.nc()); + ly.assign(cost.nc(),0); + for (long x = 0; x < cost.nr(); ++x) + lx[x] = max(rowm(cost,x)); + + // Now grow the match set by picking edges from the equality subgraph until + // we have a complete matching. + for (long match_size = 0; match_size < cost.nc(); ++match_size) + { + std::deque<long> q; + + // Empty out the S and T sets + S.assign(cost.nc(), false); + T.assign(cost.nc(), false); + + // clear out old slack values + slack.assign(cost.nc(), std::numeric_limits<type>::max()); + slackx.resize(cost.nc()); + /* + slack and slackx are maintained such that we always + have the following (once they get initialized by compute_slack() below): + - for all y: + - let x == slackx[y] + - slack[y] == lx[x] + ly[y] - cost(x,y) + */ + + aug_path.assign(cost.nc(), -1); + + for (long x = 0; x < cost.nc(); ++x) + { + // If x is not matched to anything + if (xy[x] == -1) + { + q.push_back(x); + S[x] = true; + + compute_slack(x, slack, slackx, cost, lx, ly); + break; + } + } + + + long x_start = 0; + long y_start = 0; + + // Find an augmenting path. + bool found_augmenting_path = false; + while (!found_augmenting_path) + { + while (q.size() > 0 && !found_augmenting_path) + { + const long x = q.front(); + q.pop_front(); + for (long y = 0; y < cost.nc(); ++y) + { + if (cost(x,y) == lx[x] + ly[y] && !T[y]) + { + // if vertex y isn't matched with anything + if (yx[y] == -1) + { + y_start = y; + x_start = x; + found_augmenting_path = true; + break; + } + + T[y] = true; + q.push_back(yx[y]); + + aug_path[yx[y]] = x; + S[yx[y]] = true; + compute_slack(yx[y], slack, slackx, cost, lx, ly); + } + } + } + + if (found_augmenting_path) + break; + + + // Since we didn't find an augmenting path we need to improve the + // feasible labeling stored in lx and ly. We also need to keep the + // slack updated accordingly. + type delta = std::numeric_limits<type>::max(); + for (unsigned long i = 0; i < T.size(); ++i) + { + if (!T[i]) + delta = std::min(delta, slack[i]); + } + for (unsigned long i = 0; i < T.size(); ++i) + { + if (S[i]) + lx[i] -= delta; + + if (T[i]) + ly[i] += delta; + else + slack[i] -= delta; + } + + + + q.clear(); + for (long y = 0; y < cost.nc(); ++y) + { + if (!T[y] && slack[y] == 0) + { + // if vertex y isn't matched with anything + if (yx[y] == -1) + { + x_start = slackx[y]; + y_start = y; + found_augmenting_path = true; + break; + } + else + { + T[y] = true; + if (!S[yx[y]]) + { + q.push_back(yx[y]); + + aug_path[yx[y]] = slackx[y]; + S[yx[y]] = true; + compute_slack(yx[y], slack, slackx, cost, lx, ly); + } + } + } + } + } // end while (!found_augmenting_path) + + // Flip the edges along the augmenting path. This means we will add one more + // item to our matching. + for (long cx = x_start, cy = y_start, ty; + cx != -1; + cx = aug_path[cx], cy = ty) + { + ty = xy[cx]; + yx[cy] = cx; + xy[cx] = cy; + } + + } + + + return xy; + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_MAX_COST_ASSIgNMENT_Hh_ + + diff --git a/ml/dlib/dlib/optimization/max_cost_assignment_abstract.h b/ml/dlib/dlib/optimization/max_cost_assignment_abstract.h new file mode 100644 index 000000000..bbdb0abfb --- /dev/null +++ b/ml/dlib/dlib/optimization/max_cost_assignment_abstract.h @@ -0,0 +1,63 @@ +// Copyright (C) 2011 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_MAX_COST_ASSIgNMENT_ABSTRACT_Hh_ +#ifdef DLIB_MAX_COST_ASSIgNMENT_ABSTRACT_Hh_ + +#include "../matrix.h" +#include <vector> + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template <typename EXP> + typename EXP::type assignment_cost ( + const matrix_exp<EXP>& cost, + const std::vector<long>& assignment + ); + /*! + requires + - cost.nr() == cost.nc() + - for all valid i: + - 0 <= assignment[i] < cost.nr() + ensures + - Interprets cost as a cost assignment matrix. That is, cost(i,j) + represents the cost of assigning i to j. + - Interprets assignment as a particular set of assignments. That is, + i is assigned to assignment[i]. + - returns the cost of the given assignment. That is, returns + a number which is: + sum over i: cost(i,assignment[i]) + !*/ + +// ---------------------------------------------------------------------------------------- + + template <typename EXP> + std::vector<long> max_cost_assignment ( + const matrix_exp<EXP>& cost + ); + /*! + requires + - EXP::type == some integer type (e.g. int) + (i.e. cost must contain integers rather than floats or doubles) + - cost.nr() == cost.nc() + ensures + - Finds and returns the solution to the following optimization problem: + + Maximize: f(A) == assignment_cost(cost, A) + Subject to the following constraints: + - The elements of A are unique. That is, there aren't any + elements of A which are equal. + - A.size() == cost.nr() + + - This function implements the O(N^3) version of the Hungarian algorithm + where N is the number of rows in the cost matrix. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_MAX_COST_ASSIgNMENT_ABSTRACT_Hh_ + diff --git a/ml/dlib/dlib/optimization/max_sum_submatrix.h b/ml/dlib/dlib/optimization/max_sum_submatrix.h new file mode 100644 index 000000000..1986cc26b --- /dev/null +++ b/ml/dlib/dlib/optimization/max_sum_submatrix.h @@ -0,0 +1,285 @@ +// Copyright (C) 2011 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_MAX_SUM_SUBMaTRIX_Hh_ +#define DLIB_MAX_SUM_SUBMaTRIX_Hh_ + +#include "max_sum_submatrix_abstract.h" +#include "../matrix.h" +#include <vector> +#include <queue> +#include "../geometry.h" + +namespace dlib +{ + namespace impl + { + + // ------------------------------------------------------------------------------------ + + template <typename T> + struct range_set + { + int top_min; + int top_max; + int bottom_min; + int bottom_max; + T weight; + + bool operator<(const range_set& item) const { return weight < item.weight; } + }; + + // ------------------------------------------------------------------------------------ + + template <typename T> + bool is_terminal_set ( + const range_set<T>& item + ) + { + return (item.top_min >= item.top_max && + item.bottom_min >= item.bottom_max); + } + + // ------------------------------------------------------------------------------------ + + template <typename T> + void split ( + const range_set<T>& rset, + range_set<T>& a, + range_set<T>& b + ) + { + if (rset.top_max - rset.top_min > rset.bottom_max - rset.bottom_min) + { + // split top + const int middle = (rset.top_max + rset.top_min)/2; + a.top_min = rset.top_min; + a.top_max = middle; + b.top_min = middle+1; + b.top_max = rset.top_max; + + a.bottom_min = rset.bottom_min; + a.bottom_max = rset.bottom_max; + b.bottom_min = rset.bottom_min; + b.bottom_max = rset.bottom_max; + } + else + { + // split bottom + const int middle = (rset.bottom_max + rset.bottom_min)/2; + a.bottom_min = rset.bottom_min; + a.bottom_max = middle; + b.bottom_min = middle+1; + b.bottom_max = rset.bottom_max; + + a.top_min = rset.top_min; + a.top_max = rset.top_max; + b.top_min = rset.top_min; + b.top_max = rset.top_max; + } + } + + // ------------------------------------------------------------------------------------ + + template <typename EXP, typename T> + void find_best_column_range ( + const matrix_exp<EXP>& sum_pos, + const matrix_exp<EXP>& sum_neg, + const range_set<T>& row_range, + T& weight, + int& left, + int& right + ) + { + left = 0; + right = -1; + weight = 0; + T cur_sum = 0; + int cur_pos = 0; + for (long c = 0; c < sum_pos.nc(); ++c) + { + // compute the value for the current column + T temp = sum_pos(row_range.bottom_max+1,c) - sum_pos(row_range.top_min,c); + if (row_range.top_max <= row_range.bottom_min) + temp += sum_neg(row_range.bottom_min+1,c) - sum_neg(row_range.top_max,c); + + + cur_sum += temp; + if (cur_sum > weight) + { + left = cur_pos; + right = c; + weight = cur_sum; + } + + if (cur_sum <= 0) + { + cur_sum = 0; + cur_pos = c+1; + } + + } + } + } + +// ---------------------------------------------------------------------------------------- + + template <typename EXP> + std::vector<rectangle> max_sum_submatrix( + const matrix_exp<EXP>& mat, + unsigned long max_rects, + double thresh_ = 0 + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(thresh_ >= 0 && mat.size() > 0, + "\t std::vector<rectangle> max_sum_submatrix()" + << "\n\t Invalid arguments were given to this function." + << "\n\t mat.size(): " << mat.size() + << "\n\t thresh_: " << thresh_ + ); + + /* + This function is basically an implementation of the efficient subwindow search (I-ESS) + algorithm presented in the following paper: + Efficient Algorithms for Subwindow Search in Object Detection and Localization + by Senjian An, Patrick Peursum, Wanquan Liu and Svetha Venkatesh + In CVPR 2009 + + */ + + + if (max_rects == 0) + return std::vector<rectangle>(); + + using namespace dlib::impl; + typedef typename EXP::type element_type; + typedef typename promote<element_type>::type scalar_type; + + const scalar_type thresh = static_cast<scalar_type>(thresh_); + + + matrix<scalar_type> sum_pos; + matrix<scalar_type> sum_neg; + sum_pos.set_size(mat.nr()+1, mat.nc()); + sum_neg.set_size(mat.nr()+1, mat.nc()); + // integrate over the rows. + for (long c = 0; c < mat.nc(); ++c) + { + sum_pos(0,c) = 0; + sum_neg(0,c) = 0; + } + for (long r = 0; r < mat.nr(); ++r) + { + for (long c = 0; c < mat.nc(); ++c) + { + if (mat(r,c) > 0) + { + sum_pos(r+1,c) = mat(r,c) + sum_pos(r,c); + sum_neg(r+1,c) = sum_neg(r,c); + } + else + { + sum_pos(r+1,c) = sum_pos(r,c); + sum_neg(r+1,c) = mat(r,c) + sum_neg(r,c); + } + } + } + + std::priority_queue<range_set<scalar_type> > q; + + // the range_sets will represent ranges of columns + range_set<scalar_type> universe_set; + universe_set.bottom_min = 0; + universe_set.top_min = 0; + universe_set.bottom_max = mat.nr()-1; + universe_set.top_max = mat.nr()-1; + universe_set.weight = sum(rowm(dlib::mat(sum_pos),mat.nr())); + + q.push(universe_set); + + std::vector<rectangle> results; + std::vector<scalar_type> temp_pos(mat.nc()); + std::vector<scalar_type> temp_neg(mat.nc()); + + while (q.size() > 0) + { + if (is_terminal_set(q.top())) + { + int left, right; + scalar_type weight; + find_best_column_range(sum_pos, sum_neg, q.top(), weight, left, right); + + rectangle rect(left, q.top().top_min, + right, q.top().bottom_min); + + if (weight <= thresh) + break; + + results.push_back(rect); + + if (results.size() >= max_rects) + break; + + q = std::priority_queue<range_set<scalar_type> >(); + // We are going to blank out the weights we just used. So adjust the sum images appropriately. + for (long c = rect.left(); c <= rect.right(); ++c) + { + temp_pos[c] = sum_pos(rect.bottom()+1,c) - sum_pos(rect.top(),c); + temp_neg[c] = sum_neg(rect.bottom()+1,c) - sum_neg(rect.top(),c); + } + // blank out the area inside the rectangle + for (long r = rect.top(); r <= rect.bottom(); ++r) + { + for (long c = rect.left(); c <= rect.right(); ++c) + { + sum_pos(r+1,c) = sum_pos(r,c); + sum_neg(r+1,c) = sum_neg(r,c); + } + } + // account for the area below the rectangle + for (long r = rect.bottom()+2; r < sum_pos.nr(); ++r) + { + for (long c = rect.left(); c <= rect.right(); ++c) + { + sum_pos(r,c) -= temp_pos[c]; + sum_neg(r,c) -= temp_neg[c]; + } + } + + + universe_set.weight = sum(rowm(dlib::mat(sum_pos),mat.nr())); + if (universe_set.weight <= thresh) + break; + + q.push(universe_set); + continue; + } + + range_set<scalar_type> a, b; + split(q.top(), a,b); + q.pop(); + + // these variables are not used at this point in the algorithm. + int a_left, a_right; + int b_left, b_right; + + find_best_column_range(sum_pos, sum_neg, a, a.weight, a_left, a_right); + find_best_column_range(sum_pos, sum_neg, b, b.weight, b_left, b_right); + + if (a.weight > thresh) + q.push(a); + if (b.weight > thresh) + q.push(b); + + } + + + return results; + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_MAX_SUM_SUBMaTRIX_Hh_ + diff --git a/ml/dlib/dlib/optimization/max_sum_submatrix_abstract.h b/ml/dlib/dlib/optimization/max_sum_submatrix_abstract.h new file mode 100644 index 000000000..6714dd7fe --- /dev/null +++ b/ml/dlib/dlib/optimization/max_sum_submatrix_abstract.h @@ -0,0 +1,49 @@ +// Copyright (C) 2011 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_MAX_SUM_SUBMaTRIX_ABSTRACT_Hh_ +#ifdef DLIB_MAX_SUM_SUBMaTRIX_ABSTRACT_Hh_ + +#include "../matrix.h" +#include <vector> +#include "../geometry.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename EXP + > + std::vector<rectangle> max_sum_submatrix( + const matrix_exp<EXP>& mat, + unsigned long max_rects, + double thresh = 0 + ); + /*! + requires + - thresh >= 0 + - mat.size() != 0 + ensures + - This function finds the submatrix within mat which has the largest sum. It then + zeros out that submatrix and repeats the process until no more maximal submatrices can + be found. The std::vector returned will be ordered so that the rectangles with the + largest sum come first. + - Each submatrix must have a sum greater than thresh. If no such submatrix exists then + the algorithm terminates and returns an empty std::vector. + - At most max_rects rectangles are returned. + + - This function is basically an implementation of the efficient subwindow search (I-ESS) + algorithm presented in the following paper: + Efficient Algorithms for Subwindow Search in Object Detection and Localization + by Senjian An, Patrick Peursum, Wanquan Liu and Svetha Venkatesh + In CVPR 2009 + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_MAX_SUM_SUBMaTRIX_ABSTRACT_Hh_ + + diff --git a/ml/dlib/dlib/optimization/optimization.h b/ml/dlib/dlib/optimization/optimization.h new file mode 100644 index 000000000..561d64376 --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization.h @@ -0,0 +1,714 @@ +// Copyright (C) 2008 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_OPTIMIZATIOn_H_ +#define DLIB_OPTIMIZATIOn_H_ + +#include <cmath> +#include <limits> +#include "optimization_abstract.h" +#include "optimization_search_strategies.h" +#include "optimization_stop_strategies.h" +#include "optimization_line_search.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// Functions that transform other functions +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + template <typename funct> + class central_differences + { + public: + central_differences(const funct& f_, double eps_ = 1e-7) : f(f_), eps(eps_){} + + template <typename T> + typename T::matrix_type operator()(const T& x) const + { + // T must be some sort of dlib matrix + COMPILE_TIME_ASSERT(is_matrix<T>::value); + + typename T::matrix_type der(x.size()); + typename T::matrix_type e(x); + for (long i = 0; i < x.size(); ++i) + { + const double old_val = e(i); + + e(i) += eps; + const double delta_plus = f(e); + e(i) = old_val - eps; + const double delta_minus = f(e); + + der(i) = (delta_plus - delta_minus)/((old_val+eps)-(old_val-eps)); + + // and finally restore the old value of this element + e(i) = old_val; + } + + return der; + } + + template <typename T, typename U> + typename U::matrix_type operator()(const T& item, const U& x) const + { + // U must be some sort of dlib matrix + COMPILE_TIME_ASSERT(is_matrix<U>::value); + + typename U::matrix_type der(x.size()); + typename U::matrix_type e(x); + for (long i = 0; i < x.size(); ++i) + { + const double old_val = e(i); + + e(i) += eps; + const double delta_plus = f(item,e); + e(i) = old_val - eps; + const double delta_minus = f(item,e); + + der(i) = (delta_plus - delta_minus)/((old_val+eps)-(old_val-eps)); + + // and finally restore the old value of this element + e(i) = old_val; + } + + return der; + } + + + double operator()(const double& x) const + { + return (f(x+eps)-f(x-eps))/((x+eps)-(x-eps)); + } + + private: + const funct& f; + const double eps; + }; + + template <typename funct> + const central_differences<funct> derivative(const funct& f) { return central_differences<funct>(f); } + template <typename funct> + const central_differences<funct> derivative(const funct& f, double eps) + { + DLIB_ASSERT ( + eps > 0, + "\tcentral_differences derivative(f,eps)" + << "\n\tYou must give an epsilon > 0" + << "\n\teps: " << eps + ); + return central_differences<funct>(f,eps); + } + +// ---------------------------------------------------------------------------------------- + + template <typename funct, typename EXP1, typename EXP2> + struct clamped_function_object + { + clamped_function_object( + const funct& f_, + const matrix_exp<EXP1>& x_lower_, + const matrix_exp<EXP2>& x_upper_ + ) : f(f_), x_lower(x_lower_), x_upper(x_upper_) + { + } + + template <typename T> + double operator() ( + const T& x + ) const + { + return f(clamp(x,x_lower,x_upper)); + } + + const funct& f; + const matrix_exp<EXP1>& x_lower; + const matrix_exp<EXP2>& x_upper; + }; + + template <typename funct, typename EXP1, typename EXP2> + clamped_function_object<funct,EXP1,EXP2> clamp_function( + const funct& f, + const matrix_exp<EXP1>& x_lower, + const matrix_exp<EXP2>& x_upper + ) { return clamped_function_object<funct,EXP1,EXP2>(f,x_lower,x_upper); } + +// ---------------------------------------------------------------------------------------- + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// Functions that perform unconstrained optimization +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + template < + typename search_strategy_type, + typename stop_strategy_type, + typename funct, + typename funct_der, + typename T + > + double find_min ( + search_strategy_type search_strategy, + stop_strategy_type stop_strategy, + const funct& f, + const funct_der& der, + T& x, + double min_f + ) + { + COMPILE_TIME_ASSERT(is_matrix<T>::value); + // The starting point (i.e. x) must be a column vector. + COMPILE_TIME_ASSERT(T::NC <= 1); + + DLIB_CASSERT ( + is_col_vector(x), + "\tdouble find_min()" + << "\n\tYou have to supply column vectors to this function" + << "\n\tx.nc(): " << x.nc() + ); + + + T g, s; + + double f_value = f(x); + g = der(x); + + if (!is_finite(f_value)) + throw error("The objective function generated non-finite outputs"); + if (!is_finite(g)) + throw error("The objective function generated non-finite outputs"); + + while(stop_strategy.should_continue_search(x, f_value, g) && f_value > min_f) + { + s = search_strategy.get_next_direction(x, f_value, g); + + double alpha = line_search( + make_line_search_function(f,x,s, f_value), + f_value, + make_line_search_function(der,x,s, g), + dot(g,s), // compute initial gradient for the line search + search_strategy.get_wolfe_rho(), search_strategy.get_wolfe_sigma(), min_f, + search_strategy.get_max_line_search_iterations()); + + // Take the search step indicated by the above line search + x += alpha*s; + + if (!is_finite(f_value)) + throw error("The objective function generated non-finite outputs"); + if (!is_finite(g)) + throw error("The objective function generated non-finite outputs"); + } + + return f_value; + } + +// ---------------------------------------------------------------------------------------- + + template < + typename search_strategy_type, + typename stop_strategy_type, + typename funct, + typename funct_der, + typename T + > + double find_max ( + search_strategy_type search_strategy, + stop_strategy_type stop_strategy, + const funct& f, + const funct_der& der, + T& x, + double max_f + ) + { + COMPILE_TIME_ASSERT(is_matrix<T>::value); + // The starting point (i.e. x) must be a column vector. + COMPILE_TIME_ASSERT(T::NC <= 1); + + DLIB_CASSERT ( + is_col_vector(x), + "\tdouble find_max()" + << "\n\tYou have to supply column vectors to this function" + << "\n\tx.nc(): " << x.nc() + ); + + T g, s; + + // This function is basically just a copy of find_min() but with - put in the right places + // to flip things around so that it ends up looking for the max rather than the min. + + double f_value = -f(x); + g = -der(x); + + if (!is_finite(f_value)) + throw error("The objective function generated non-finite outputs"); + if (!is_finite(g)) + throw error("The objective function generated non-finite outputs"); + + while(stop_strategy.should_continue_search(x, f_value, g) && f_value > -max_f) + { + s = search_strategy.get_next_direction(x, f_value, g); + + double alpha = line_search( + negate_function(make_line_search_function(f,x,s, f_value)), + f_value, + negate_function(make_line_search_function(der,x,s, g)), + dot(g,s), // compute initial gradient for the line search + search_strategy.get_wolfe_rho(), search_strategy.get_wolfe_sigma(), -max_f, + search_strategy.get_max_line_search_iterations() + ); + + // Take the search step indicated by the above line search + x += alpha*s; + + // Don't forget to negate these outputs from the line search since they are + // from the unnegated versions of f() and der() + g *= -1; + f_value *= -1; + + if (!is_finite(f_value)) + throw error("The objective function generated non-finite outputs"); + if (!is_finite(g)) + throw error("The objective function generated non-finite outputs"); + } + + return -f_value; + } + +// ---------------------------------------------------------------------------------------- + + template < + typename search_strategy_type, + typename stop_strategy_type, + typename funct, + typename T + > + double find_min_using_approximate_derivatives ( + search_strategy_type search_strategy, + stop_strategy_type stop_strategy, + const funct& f, + T& x, + double min_f, + double derivative_eps = 1e-7 + ) + { + COMPILE_TIME_ASSERT(is_matrix<T>::value); + // The starting point (i.e. x) must be a column vector. + COMPILE_TIME_ASSERT(T::NC <= 1); + + DLIB_CASSERT ( + is_col_vector(x) && derivative_eps > 0, + "\tdouble find_min_using_approximate_derivatives()" + << "\n\tYou have to supply column vectors to this function" + << "\n\tx.nc(): " << x.nc() + << "\n\tderivative_eps: " << derivative_eps + ); + + T g, s; + + double f_value = f(x); + g = derivative(f,derivative_eps)(x); + + if (!is_finite(f_value)) + throw error("The objective function generated non-finite outputs"); + if (!is_finite(g)) + throw error("The objective function generated non-finite outputs"); + + while(stop_strategy.should_continue_search(x, f_value, g) && f_value > min_f) + { + s = search_strategy.get_next_direction(x, f_value, g); + + double alpha = line_search( + make_line_search_function(f,x,s,f_value), + f_value, + derivative(make_line_search_function(f,x,s),derivative_eps), + dot(g,s), // Sometimes the following line is a better way of determining the initial gradient. + //derivative(make_line_search_function(f,x,s),derivative_eps)(0), + search_strategy.get_wolfe_rho(), search_strategy.get_wolfe_sigma(), min_f, + search_strategy.get_max_line_search_iterations() + ); + + // Take the search step indicated by the above line search + x += alpha*s; + + g = derivative(f,derivative_eps)(x); + + if (!is_finite(f_value)) + throw error("The objective function generated non-finite outputs"); + if (!is_finite(g)) + throw error("The objective function generated non-finite outputs"); + } + + return f_value; + } + +// ---------------------------------------------------------------------------------------- + + template < + typename search_strategy_type, + typename stop_strategy_type, + typename funct, + typename T + > + double find_max_using_approximate_derivatives ( + search_strategy_type search_strategy, + stop_strategy_type stop_strategy, + const funct& f, + T& x, + double max_f, + double derivative_eps = 1e-7 + ) + { + COMPILE_TIME_ASSERT(is_matrix<T>::value); + // The starting point (i.e. x) must be a column vector. + COMPILE_TIME_ASSERT(T::NC <= 1); + + DLIB_CASSERT ( + is_col_vector(x) && derivative_eps > 0, + "\tdouble find_max_using_approximate_derivatives()" + << "\n\tYou have to supply column vectors to this function" + << "\n\tx.nc(): " << x.nc() + << "\n\tderivative_eps: " << derivative_eps + ); + + // Just negate the necessary things and call the find_min version of this function. + return -find_min_using_approximate_derivatives( + search_strategy, + stop_strategy, + negate_function(f), + x, + -max_f, + derivative_eps + ); + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// Functions for box constrained optimization +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + template <typename T, typename U, typename V> + T zero_bounded_variables ( + const double eps, + T vect, + const T& x, + const T& gradient, + const U& x_lower, + const V& x_upper + ) + { + for (long i = 0; i < gradient.size(); ++i) + { + const double tol = eps*std::abs(x(i)); + // if x(i) is an active bound constraint + if (x_lower(i)+tol >= x(i) && gradient(i) > 0) + vect(i) = 0; + else if (x_upper(i)-tol <= x(i) && gradient(i) < 0) + vect(i) = 0; + } + return vect; + } + +// ---------------------------------------------------------------------------------------- + + template <typename T, typename U, typename V> + T gap_step_assign_bounded_variables ( + const double eps, + T vect, + const T& x, + const T& gradient, + const U& x_lower, + const V& x_upper + ) + { + for (long i = 0; i < gradient.size(); ++i) + { + const double tol = eps*std::abs(x(i)); + // If x(i) is an active bound constraint then we should set its search + // direction such that a single step along the direction either does nothing or + // closes the gap of size tol before hitting the bound exactly. + if (x_lower(i)+tol >= x(i) && gradient(i) > 0) + vect(i) = x_lower(i)-x(i); + else if (x_upper(i)-tol <= x(i) && gradient(i) < 0) + vect(i) = x_upper(i)-x(i); + } + return vect; + } + +// ---------------------------------------------------------------------------------------- + + template < + typename search_strategy_type, + typename stop_strategy_type, + typename funct, + typename funct_der, + typename T, + typename EXP1, + typename EXP2 + > + double find_min_box_constrained ( + search_strategy_type search_strategy, + stop_strategy_type stop_strategy, + const funct& f, + const funct_der& der, + T& x, + const matrix_exp<EXP1>& x_lower, + const matrix_exp<EXP2>& x_upper + ) + { + /* + The implementation of this function is more or less based on the discussion in + the paper Projected Newton-type Methods in Machine Learning by Mark Schmidt, et al. + */ + + // make sure the requires clause is not violated + COMPILE_TIME_ASSERT(is_matrix<T>::value); + // The starting point (i.e. x) must be a column vector. + COMPILE_TIME_ASSERT(T::NC <= 1); + + DLIB_CASSERT ( + is_col_vector(x) && is_col_vector(x_lower) && is_col_vector(x_upper) && + x.size() == x_lower.size() && x.size() == x_upper.size(), + "\tdouble find_min_box_constrained()" + << "\n\t The inputs to this function must be equal length column vectors." + << "\n\t is_col_vector(x): " << is_col_vector(x) + << "\n\t is_col_vector(x_upper): " << is_col_vector(x_upper) + << "\n\t is_col_vector(x_upper): " << is_col_vector(x_upper) + << "\n\t x.size(): " << x.size() + << "\n\t x_lower.size(): " << x_lower.size() + << "\n\t x_upper.size(): " << x_upper.size() + ); + DLIB_ASSERT ( + min(x_upper-x_lower) >= 0, + "\tdouble find_min_box_constrained()" + << "\n\t You have to supply proper box constraints to this function." + << "\n\r min(x_upper-x_lower): " << min(x_upper-x_lower) + ); + + + T g, s; + double f_value = f(x); + g = der(x); + + if (!is_finite(f_value)) + throw error("The objective function generated non-finite outputs"); + if (!is_finite(g)) + throw error("The objective function generated non-finite outputs"); + + // gap_eps determines how close we have to get to a bound constraint before we + // start basically dropping it from the optimization and consider it to be an + // active constraint. + const double gap_eps = 1e-8; + + double last_alpha = 1; + while(stop_strategy.should_continue_search(x, f_value, g)) + { + s = search_strategy.get_next_direction(x, f_value, zero_bounded_variables(gap_eps, g, x, g, x_lower, x_upper)); + s = gap_step_assign_bounded_variables(gap_eps, s, x, g, x_lower, x_upper); + + double alpha = backtracking_line_search( + make_line_search_function(clamp_function(f,x_lower,x_upper), x, s, f_value), + f_value, + dot(g,s), // compute gradient for the line search + last_alpha, + search_strategy.get_wolfe_rho(), + search_strategy.get_max_line_search_iterations()); + + // Do a trust region style thing for alpha. The idea is that if we take a + // small step then we are likely to take another small step. So we reuse the + // alpha from the last iteration unless the line search didn't shrink alpha at + // all, in that case, we start with a bigger alpha next time. + if (alpha == last_alpha) + last_alpha = std::min(last_alpha*10,1.0); + else + last_alpha = alpha; + + // Take the search step indicated by the above line search + x = dlib::clamp(x + alpha*s, x_lower, x_upper); + g = der(x); + + if (!is_finite(f_value)) + throw error("The objective function generated non-finite outputs"); + if (!is_finite(g)) + throw error("The objective function generated non-finite outputs"); + } + + return f_value; + } + +// ---------------------------------------------------------------------------------------- + + template < + typename search_strategy_type, + typename stop_strategy_type, + typename funct, + typename funct_der, + typename T + > + double find_min_box_constrained ( + search_strategy_type search_strategy, + stop_strategy_type stop_strategy, + const funct& f, + const funct_der& der, + T& x, + double x_lower, + double x_upper + ) + { + // The starting point (i.e. x) must be a column vector. + COMPILE_TIME_ASSERT(T::NC <= 1); + + typedef typename T::type scalar_type; + return find_min_box_constrained(search_strategy, + stop_strategy, + f, + der, + x, + uniform_matrix<scalar_type>(x.size(),1,x_lower), + uniform_matrix<scalar_type>(x.size(),1,x_upper) ); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename search_strategy_type, + typename stop_strategy_type, + typename funct, + typename funct_der, + typename T, + typename EXP1, + typename EXP2 + > + double find_max_box_constrained ( + search_strategy_type search_strategy, + stop_strategy_type stop_strategy, + const funct& f, + const funct_der& der, + T& x, + const matrix_exp<EXP1>& x_lower, + const matrix_exp<EXP2>& x_upper + ) + { + // make sure the requires clause is not violated + COMPILE_TIME_ASSERT(is_matrix<T>::value); + // The starting point (i.e. x) must be a column vector. + COMPILE_TIME_ASSERT(T::NC <= 1); + + DLIB_CASSERT ( + is_col_vector(x) && is_col_vector(x_lower) && is_col_vector(x_upper) && + x.size() == x_lower.size() && x.size() == x_upper.size(), + "\tdouble find_max_box_constrained()" + << "\n\t The inputs to this function must be equal length column vectors." + << "\n\t is_col_vector(x): " << is_col_vector(x) + << "\n\t is_col_vector(x_upper): " << is_col_vector(x_upper) + << "\n\t is_col_vector(x_upper): " << is_col_vector(x_upper) + << "\n\t x.size(): " << x.size() + << "\n\t x_lower.size(): " << x_lower.size() + << "\n\t x_upper.size(): " << x_upper.size() + ); + DLIB_ASSERT ( + min(x_upper-x_lower) >= 0, + "\tdouble find_max_box_constrained()" + << "\n\t You have to supply proper box constraints to this function." + << "\n\r min(x_upper-x_lower): " << min(x_upper-x_lower) + ); + + // This function is basically just a copy of find_min_box_constrained() but with - put + // in the right places to flip things around so that it ends up looking for the max + // rather than the min. + + T g, s; + double f_value = -f(x); + g = -der(x); + + if (!is_finite(f_value)) + throw error("The objective function generated non-finite outputs"); + if (!is_finite(g)) + throw error("The objective function generated non-finite outputs"); + + // gap_eps determines how close we have to get to a bound constraint before we + // start basically dropping it from the optimization and consider it to be an + // active constraint. + const double gap_eps = 1e-8; + + double last_alpha = 1; + while(stop_strategy.should_continue_search(x, f_value, g)) + { + s = search_strategy.get_next_direction(x, f_value, zero_bounded_variables(gap_eps, g, x, g, x_lower, x_upper)); + s = gap_step_assign_bounded_variables(gap_eps, s, x, g, x_lower, x_upper); + + double alpha = backtracking_line_search( + negate_function(make_line_search_function(clamp_function(f,x_lower,x_upper), x, s, f_value)), + f_value, + dot(g,s), // compute gradient for the line search + last_alpha, + search_strategy.get_wolfe_rho(), + search_strategy.get_max_line_search_iterations()); + + // Do a trust region style thing for alpha. The idea is that if we take a + // small step then we are likely to take another small step. So we reuse the + // alpha from the last iteration unless the line search didn't shrink alpha at + // all, in that case, we start with a bigger alpha next time. + if (alpha == last_alpha) + last_alpha = std::min(last_alpha*10,1.0); + else + last_alpha = alpha; + + // Take the search step indicated by the above line search + x = dlib::clamp(x + alpha*s, x_lower, x_upper); + g = -der(x); + + // Don't forget to negate the output from the line search since it is from the + // unnegated version of f() + f_value *= -1; + + if (!is_finite(f_value)) + throw error("The objective function generated non-finite outputs"); + if (!is_finite(g)) + throw error("The objective function generated non-finite outputs"); + } + + return -f_value; + } + +// ---------------------------------------------------------------------------------------- + + template < + typename search_strategy_type, + typename stop_strategy_type, + typename funct, + typename funct_der, + typename T + > + double find_max_box_constrained ( + search_strategy_type search_strategy, + stop_strategy_type stop_strategy, + const funct& f, + const funct_der& der, + T& x, + double x_lower, + double x_upper + ) + { + // The starting point (i.e. x) must be a column vector. + COMPILE_TIME_ASSERT(T::NC <= 1); + + typedef typename T::type scalar_type; + return find_max_box_constrained(search_strategy, + stop_strategy, + f, + der, + x, + uniform_matrix<scalar_type>(x.size(),1,x_lower), + uniform_matrix<scalar_type>(x.size(),1,x_upper) ); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATIOn_H_ + diff --git a/ml/dlib/dlib/optimization/optimization_abstract.h b/ml/dlib/dlib/optimization/optimization_abstract.h new file mode 100644 index 000000000..f3c42c2b4 --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_abstract.h @@ -0,0 +1,468 @@ +// Copyright (C) 2008 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_OPTIMIZATIOn_ABSTRACT_ +#ifdef DLIB_OPTIMIZATIOn_ABSTRACT_ + +#include <cmath> +#include <limits> +#include "../matrix/matrix_abstract.h" +#include "../algs.h" +#include "optimization_search_strategies_abstract.h" +#include "optimization_stop_strategies_abstract.h" +#include "optimization_line_search_abstract.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// Functions that transform other functions +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + class central_differences; + /*! + This is a function object that represents the derivative of some other + function. + + Note that if funct is a function of a double then the derivative of + funct is just a double but if funct is a function of a dlib::matrix (i.e. a + function of many variables) then its derivative is a gradient vector (a column + vector in particular). + !*/ + + template < + typename funct + > + const central_differences<funct> derivative( + const funct& f, + double eps + ); + /*! + requires + - f == a function that returns a scalar + - f must have one of the following forms: + - double f(double) + - double f(dlib::matrix) (where the matrix is a column vector) + - double f(T, dlib::matrix) (where the matrix is a column vector. In + this case the derivative of f is taken with respect to the second argument.) + - eps > 0 + ensures + - returns a function that represents the derivative of the function f. It + is approximated numerically by: + (f(x+eps)-f(x-eps))/(2*eps) + !*/ + + template < + typename funct + > + const central_differences<funct> derivative( + const funct& f + ); + /*! + ensures + - returns derivative(f, 1e-7) + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename funct, + typename EXP1, + typename EXP2 + > + clamped_function_object<funct,EXP1,EXP2> clamp_function ( + const funct& f, + const matrix_exp<EXP1>& x_lower, + const matrix_exp<EXP2>& x_upper + ); + /*! + requires + - f == a function that takes a matrix and returns a scalar value. Moreover, f + must be capable of taking in matrices with the same dimensions as x_lower and + x_upper. So f(x_lower) must be a valid expression that evaluates to a scalar + value. + - x_lower.nr() == x_upper.nr() && x_lower.nc() == x_upper.nc() + (i.e. x_lower and x_upper must have the same dimensions) + - x_lower and x_upper must contain the same type of elements. + ensures + - returns a function object that represents the function g(x) where + g(x) == f(clamp(x,x_lower,x_upper)) + !*/ + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// Functions that perform unconstrained optimization +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + template < + typename search_strategy_type, + typename stop_strategy_type, + typename funct, + typename funct_der, + typename T + > + double find_min ( + search_strategy_type search_strategy, + stop_strategy_type stop_strategy, + const funct& f, + const funct_der& der, + T& x, + double min_f + ); + /*! + requires + - search_strategy == an object that defines a search strategy such as one + of the objects from dlib/optimization/optimization_search_strategies_abstract.h + - stop_strategy == an object that defines a stop strategy such as one of + the objects from dlib/optimization/optimization_stop_strategies_abstract.h + - f(x) must be a valid expression that evaluates to a double + - der(x) must be a valid expression that evaluates to the derivative of f() at x. + - is_col_vector(x) == true + ensures + - Performs an unconstrained minimization of the function f() using the given + search_strategy and starting from the initial point x. + - The function is optimized until stop_strategy decides that an acceptable + point has been found or f(#x) < min_f. + - #x == the value of x that was found to minimize f() + - returns f(#x). + - When this function makes calls to f() and der() it always does so by + first calling f() and then calling der(). That is, these two functions + are always called in pairs with f() being called first and then der() + being called second. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename search_strategy_type, + typename stop_strategy_type, + typename funct, + typename funct_der, + typename T + > + double find_max ( + search_strategy_type search_strategy, + stop_strategy_type stop_strategy, + const funct& f, + const funct_der& der, + T& x, + double max_f + ); + /*! + requires + - search_strategy == an object that defines a search strategy such as one + of the objects from dlib/optimization/optimization_search_strategies_abstract.h + - stop_strategy == an object that defines a stop strategy such as one of + the objects from dlib/optimization/optimization_stop_strategies_abstract.h + - f(x) must be a valid expression that evaluates to a double + - der(x) must be a valid expression that evaluates to the derivative of f() at x. + - is_col_vector(x) == true + ensures + - Performs an unconstrained maximization of the function f() using the given + search_strategy and starting from the initial point x. + - The function is optimized until stop_strategy decides that an acceptable + point has been found or f(#x) > max_f. + - #x == the value of x that was found to maximize f() + - returns f(#x). + - When this function makes calls to f() and der() it always does so by + first calling f() and then calling der(). That is, these two functions + are always called in pairs with f() being called first and then der() + being called second. + - Note that this function solves the maximization problem by converting it + into a minimization problem. Therefore, the values of f and its derivative + reported to the stopping strategy will be negated. That is, stop_strategy + will see -f() and -der(). All this really means is that the status messages + from a stopping strategy in verbose mode will display a negated objective + value. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename search_strategy_type, + typename stop_strategy_type, + typename funct, + typename T + > + double find_min_using_approximate_derivatives ( + search_strategy_type search_strategy, + stop_strategy_type stop_strategy, + const funct& f, + T& x, + double min_f, + double derivative_eps = 1e-7 + ); + /*! + requires + - search_strategy == an object that defines a search strategy such as one + of the objects from dlib/optimization/optimization_search_strategies_abstract.h + - stop_strategy == an object that defines a stop strategy such as one of + the objects from dlib/optimization/optimization_stop_strategies_abstract.h + - f(x) must be a valid expression that evaluates to a double + - is_col_vector(x) == true + - derivative_eps > 0 + ensures + - Performs an unconstrained minimization of the function f() using the given + search_strategy and starting from the initial point x. + - The function is optimized until stop_strategy decides that an acceptable + point has been found or f(#x) < min_f. + - #x == the value of x that was found to minimize f() + - returns f(#x). + - Uses the dlib::derivative(f,derivative_eps) function to compute gradient + information. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename search_strategy_type, + typename stop_strategy_type, + typename funct, + typename T + > + double find_max_using_approximate_derivatives ( + search_strategy_type search_strategy, + stop_strategy_type stop_strategy, + const funct& f, + T& x, + double max_f, + double derivative_eps = 1e-7 + ); + /*! + requires + - search_strategy == an object that defines a search strategy such as one + of the objects from dlib/optimization/optimization_search_strategies_abstract.h + - stop_strategy == an object that defines a stop strategy such as one of + the objects from dlib/optimization/optimization_stop_strategies_abstract.h + - f(x) must be a valid expression that evaluates to a double + - is_col_vector(x) == true + - derivative_eps > 0 + ensures + - Performs an unconstrained maximization of the function f() using the given + search_strategy and starting from the initial point x. + - The function is optimized until stop_strategy decides that an acceptable + point has been found or f(#x) > max_f. + - #x == the value of x that was found to maximize f() + - returns f(#x). + - Uses the dlib::derivative(f,derivative_eps) function to compute gradient + information. + - Note that this function solves the maximization problem by converting it + into a minimization problem. Therefore, the values of f and its derivative + reported to the stopping strategy will be negated. That is, stop_strategy + will see -f() and -der(). All this really means is that the status messages + from a stopping strategy in verbose mode will display a negated objective + value. + !*/ + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// Functions that perform box constrained optimization +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + template < + typename search_strategy_type, + typename stop_strategy_type, + typename funct, + typename funct_der, + typename T, + typename EXP1, + typename EXP2 + > + double find_min_box_constrained ( + search_strategy_type search_strategy, + stop_strategy_type stop_strategy, + const funct& f, + const funct_der& der, + T& x, + const matrix_exp<EXP1>& x_lower, + const matrix_exp<EXP2>& x_upper + ); + /*! + requires + - search_strategy == an object that defines a search strategy such as one + of the objects from dlib/optimization/optimization_search_strategies_abstract.h + - stop_strategy == an object that defines a stop strategy such as one of + the objects from dlib/optimization/optimization_stop_strategies_abstract.h + - f(x) must be a valid expression that evaluates to a double + - der(x) must be a valid expression that evaluates to the derivative of f() at x. + - is_col_vector(x) == true + - is_col_vector(x_lower) == true + - is_col_vector(x_upper) == true + - x.size() == x_lower.size() == x_upper.size() + (i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality) + - min(x_upper-x_lower) >= 0 + (i.e. x_upper must contain upper bounds relative to x_lower) + ensures + - Performs a box constrained minimization of the function f() using the given + search_strategy and starting from the initial point x. That is, we try to + find the x value that minimizes f(x) but is also within the box constraints + specified by x_lower and x_upper. That is, we ensure that #x satisfies: + - min(#x - x_lower) >= 0 && min(x_upper - #x) >= 0 + - This function uses a backtracking line search along with a gradient projection + step to handle the box constraints. + - The function is optimized until stop_strategy decides that an acceptable + point has been found. + - #x == the value of x that was found to minimize f() within the given box + constraints. + - returns f(#x). + - The last call to f() will be made with f(#x). + - When calling f() and der(), the input passed to them will always be inside + the box constraints defined by x_lower and x_upper. + - When calling der(x), it will always be the case that the last call to f() was + made with the same x value. This means that you can reuse any intermediate + results from the previous call to f(x) inside der(x) rather than recomputing + them inside der(x). + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename search_strategy_type, + typename stop_strategy_type, + typename funct, + typename funct_der, + typename T + > + double find_min_box_constrained ( + search_strategy_type search_strategy, + stop_strategy_type stop_strategy, + const funct& f, + const funct_der& der, + T& x, + const double x_lower, + const double x_upper + ); + /*! + requires + - search_strategy == an object that defines a search strategy such as one + of the objects from dlib/optimization/optimization_search_strategies_abstract.h + - stop_strategy == an object that defines a stop strategy such as one of + the objects from dlib/optimization/optimization_stop_strategies_abstract.h + - f(x) must be a valid expression that evaluates to a double + - der(x) must be a valid expression that evaluates to the derivative of f() at x. + - is_col_vector(x) == true + - x_lower < x_upper + ensures + - This function is identical to find_min_box_constrained() as defined above + except that it takes x_lower and x_upper as doubles rather than column + vectors. In this case, all variables have the same lower bound of x_lower + and similarly have the same upper bound of x_upper. Therefore, this is just + a convenience function for calling find_max_box_constrained() when all + variables have the same bound constraints. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename search_strategy_type, + typename stop_strategy_type, + typename funct, + typename funct_der, + typename T, + typename EXP1, + typename EXP2 + > + double find_max_box_constrained ( + search_strategy_type search_strategy, + stop_strategy_type stop_strategy, + const funct& f, + const funct_der& der, + T& x, + const matrix_exp<EXP1>& x_lower, + const matrix_exp<EXP2>& x_upper + ); + /*! + requires + - search_strategy == an object that defines a search strategy such as one + of the objects from dlib/optimization/optimization_search_strategies_abstract.h + - stop_strategy == an object that defines a stop strategy such as one of + the objects from dlib/optimization/optimization_stop_strategies_abstract.h + - f(x) must be a valid expression that evaluates to a double + - der(x) must be a valid expression that evaluates to the derivative of f() at x. + - is_col_vector(x) == true + - is_col_vector(x_lower) == true + - is_col_vector(x_upper) == true + - x.size() == x_lower.size() == x_upper.size() + (i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality) + - min(x_upper-x_lower) >= 0 + (i.e. x_upper must contain upper bounds relative to x_lower) + ensures + - Performs a box constrained maximization of the function f() using the given + search_strategy and starting from the initial point x. That is, we try to + find the x value that maximizes f(x) but is also within the box constraints + specified by x_lower and x_upper. That is, we ensure that #x satisfies: + - min(#x - x_lower) >= 0 && min(x_upper - #x) >= 0 + - This function uses a backtracking line search along with a gradient projection + step to handle the box constraints. + - The function is optimized until stop_strategy decides that an acceptable + point has been found. + - #x == the value of x that was found to maximize f() within the given box + constraints. + - returns f(#x). + - The last call to f() will be made with f(#x). + - When calling f() and der(), the input passed to them will always be inside + the box constraints defined by x_lower and x_upper. + - When calling der(x), it will always be the case that the last call to f() was + made with the same x value. This means that you can reuse any intermediate + results from the previous call to f(x) inside der(x) rather than recomputing + them inside der(x). + - Note that this function solves the maximization problem by converting it + into a minimization problem. Therefore, the values of f and its derivative + reported to the stopping strategy will be negated. That is, stop_strategy + will see -f() and -der(). All this really means is that the status messages + from a stopping strategy in verbose mode will display a negated objective + value. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename search_strategy_type, + typename stop_strategy_type, + typename funct, + typename funct_der, + typename T + > + double find_max_box_constrained ( + search_strategy_type search_strategy, + stop_strategy_type stop_strategy, + const funct& f, + const funct_der& der, + T& x, + const double x_lower, + const double x_upper + ); + /*! + requires + - search_strategy == an object that defines a search strategy such as one + of the objects from dlib/optimization/optimization_search_strategies_abstract.h + - stop_strategy == an object that defines a stop strategy such as one of + the objects from dlib/optimization/optimization_stop_strategies_abstract.h + - f(x) must be a valid expression that evaluates to a double + - der(x) must be a valid expression that evaluates to the derivative of f() at x. + - is_col_vector(x) == true + - x_lower < x_upper + ensures + - This function is identical to find_max_box_constrained() as defined above + except that it takes x_lower and x_upper as doubles rather than column + vectors. In this case, all variables have the same lower bound of x_lower + and similarly have the same upper bound of x_upper. Therefore, this is just + a convenience function for calling find_max_box_constrained() when all + variables have the same bound constraints. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATIOn_ABSTRACT_ + + + diff --git a/ml/dlib/dlib/optimization/optimization_bobyqa.h b/ml/dlib/dlib/optimization/optimization_bobyqa.h new file mode 100644 index 000000000..6fbc40c06 --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_bobyqa.h @@ -0,0 +1,3423 @@ +// Copyright (C) 2009 M.J.D. Powell, Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_OPTIMIZATIOn_BOBYQA_Hh_ +#define DLIB_OPTIMIZATIOn_BOBYQA_Hh_ + +/* + The code in this file is derived from Powell's BOBYQA Fortran code. + It was created by running f2c on the original Fortran code and then + massaging the resulting C code into what you can see below. + + + The following paper, published in 2009 by Powell, describes the + detailed workings of the BOBYQA algorithm. + + The BOBYQA algorithm for bound constrained optimization + without derivatives by M.J.D. Powell +*/ + +#include <algorithm> +#include <cmath> +#include <memory> + +#include "../matrix.h" +#include "optimization_bobyqa_abstract.h" +#include "optimization.h" + +// ---------------------------------------------------------------------------------------- + +namespace dlib +{ + + class bobyqa_failure : public error { + public: bobyqa_failure(const std::string& s):error(s){} + }; + +// ---------------------------------------------------------------------------------------- + + class bobyqa_implementation + { + typedef long integer; + typedef double doublereal; + + public: + + template < + typename funct, + typename T, + typename U + > + double find_min ( + const funct& f, + T& x, + long npt, + const U& xl_, + const U& xu_, + const double rhobeg, + const double rhoend, + const long max_f_evals + ) const + { + const unsigned long n = x.size(); + const unsigned long w_size = (npt+5)*(npt+n)+3*n*(n+5)/2; + std::unique_ptr<doublereal[]> w(new doublereal[w_size]); + + // make these temporary matrices becuse U might be some + // kind of matrix_exp that doesn't support taking the address + // of the first element. + matrix<double,0,1> xl(xl_); + matrix<double,0,1> xu(xu_); + + + return bobyqa_ (f, + x.size(), + npt, + &x(0), + &xl(0), + &xu(0), + rhobeg, + rhoend, + max_f_evals, + w.get() ); + } + + private: + + + template <typename funct> + doublereal bobyqa_( + const funct& calfun, + const integer n, + const integer npt, + doublereal *x, + const doublereal *xl, + const doublereal *xu, + const doublereal rhobeg, + const doublereal rhoend, + const integer maxfun, + doublereal *w + ) const + { + + /* System generated locals */ + integer i__1; + doublereal d__1, d__2; + + /* Local variables */ + integer j, id_, np, iw, igo, ihq, ixb, ixa, ifv, isl, jsl, ipq, ivl, ixn, ixo, ixp, isu, jsu, ndim; + doublereal temp, zero; + integer ibmat, izmat; + + + /* This subroutine seeks the least value of a function of many variables, */ + /* by applying a trust region method that forms quadratic models by */ + /* interpolation. There is usually some freedom in the interpolation */ + /* conditions, which is taken up by minimizing the Frobenius norm of */ + /* the change to the second derivative of the model, beginning with the */ + /* zero matrix. The values of the variables are constrained by upper and */ + /* lower bounds. The arguments of the subroutine are as follows. */ + + /* N must be set to the number of variables and must be at least two. */ + /* NPT is the number of interpolation conditions. Its value must be in */ + /* the interval [N+2,(N+1)(N+2)/2]. Choices that exceed 2*N+1 are not */ + /* recommended. */ + /* Initial values of the variables must be set in X(1),X(2),...,X(N). They */ + /* will be changed to the values that give the least calculated F. */ + /* For I=1,2,...,N, XL(I) and XU(I) must provide the lower and upper */ + /* bounds, respectively, on X(I). The construction of quadratic models */ + /* requires XL(I) to be strictly less than XU(I) for each I. Further, */ + /* the contribution to a model from changes to the I-th variable is */ + /* damaged severely by rounding errors if XU(I)-XL(I) is too small. */ + /* RHOBEG and RHOEND must be set to the initial and final values of a trust */ + /* region radius, so both must be positive with RHOEND no greater than */ + /* RHOBEG. Typically, RHOBEG should be about one tenth of the greatest */ + /* expected change to a variable, while RHOEND should indicate the */ + /* accuracy that is required in the final values of the variables. An */ + /* error return occurs if any of the differences XU(I)-XL(I), I=1,...,N, */ + /* is less than 2*RHOBEG. */ + /* MAXFUN must be set to an upper bound on the number of calls of CALFUN. */ + /* The array W will be used for working space. Its length must be at least */ + /* (NPT+5)*(NPT+N)+3*N*(N+5)/2. */ + + /* Parameter adjustments */ + --w; + --xu; + --xl; + --x; + + /* Function Body */ + np = n + 1; + + /* Return if the value of NPT is unacceptable. */ + if (npt < n + 2 || npt > (n + 2) * np / 2) { + throw bobyqa_failure("Return from BOBYQA because NPT is not in the required interval"); + //goto L40; + } + + /* Partition the working space array, so that different parts of it can */ + /* be treated separately during the calculation of BOBYQB. The partition */ + /* requires the first (NPT+2)*(NPT+N)+3*N*(N+5)/2 elements of W plus the */ + /* space that is taken by the last array in the argument list of BOBYQB. */ + + ndim = npt + n; + ixb = 1; + ixp = ixb + n; + ifv = ixp + n * npt; + ixo = ifv + npt; + igo = ixo + n; + ihq = igo + n; + ipq = ihq + n * np / 2; + ibmat = ipq + npt; + izmat = ibmat + ndim * n; + isl = izmat + npt * (npt - np); + isu = isl + n; + ixn = isu + n; + ixa = ixn + n; + id_ = ixa + n; + ivl = id_ + n; + iw = ivl + ndim; + + /* Return if there is insufficient space between the bounds. Modify the */ + /* initial X if necessary in order to avoid conflicts between the bounds */ + /* and the construction of the first quadratic model. The lower and upper */ + /* bounds on moves from the updated X are set now, in the ISL and ISU */ + /* partitions of W, in order to provide useful and exact information about */ + /* components of X that become within distance RHOBEG from their bounds. */ + + zero = 0.; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + temp = xu[j] - xl[j]; + if (temp < rhobeg + rhobeg) { + throw bobyqa_failure("Return from BOBYQA because one of the differences in x_lower and x_upper is less than 2*rho_begin"); + //goto L40; + } + jsl = isl + j - 1; + jsu = jsl + n; + w[jsl] = xl[j] - x[j]; + w[jsu] = xu[j] - x[j]; + if (w[jsl] >= -(rhobeg)) { + if (w[jsl] >= zero) { + x[j] = xl[j]; + w[jsl] = zero; + w[jsu] = temp; + } else { + x[j] = xl[j] + rhobeg; + w[jsl] = -(rhobeg); + /* Computing MAX */ + d__1 = xu[j] - x[j]; + w[jsu] = std::max(d__1,rhobeg); + } + } else if (w[jsu] <= rhobeg) { + if (w[jsu] <= zero) { + x[j] = xu[j]; + w[jsl] = -temp; + w[jsu] = zero; + } else { + x[j] = xu[j] - rhobeg; + /* Computing MIN */ + d__1 = xl[j] - x[j], d__2 = -(rhobeg); + w[jsl] = std::min(d__1,d__2); + w[jsu] = rhobeg; + } + } + /* L30: */ + } + + /* Make the call of BOBYQB. */ + + return bobyqb_(calfun, n, npt, &x[1], &xl[1], &xu[1], rhobeg, rhoend, maxfun, &w[ + ixb], &w[ixp], &w[ifv], &w[ixo], &w[igo], &w[ihq], &w[ipq], &w[ + ibmat], &w[izmat], ndim, &w[isl], &w[isu], &w[ixn], &w[ixa], &w[ + id_], &w[ivl], &w[iw]); + //L40: + ; + } /* bobyqa_ */ + + // ---------------------------------------------------------------------------------------- + + template <typename funct> + doublereal bobyqb_( + const funct& calfun, + const integer n, + const integer npt, + doublereal *x, + const doublereal *xl, + const doublereal *xu, + const doublereal rhobeg, + const doublereal rhoend, + const integer maxfun, + doublereal *xbase, + doublereal *xpt, + doublereal *fval, + doublereal *xopt, + doublereal *gopt, + doublereal *hq, + doublereal *pq, + doublereal *bmat, + doublereal *zmat, + const integer ndim, + doublereal *sl, + doublereal *su, + doublereal *xnew, + doublereal *xalt, + doublereal *d__, + doublereal *vlag, + doublereal *w + ) const + { + /* System generated locals */ + integer xpt_dim1, xpt_offset, bmat_dim1, bmat_offset, zmat_dim1, + zmat_offset, i__1, i__2, i__3; + doublereal d__1, d__2, d__3, d__4; + + /* Local variables */ + doublereal f = 0; + integer i__, j, k, ih, nf, jj, nh, ip, jp; + doublereal dx; + integer np; + doublereal den = 0, one = 0, ten = 0, dsq = 0, rho = 0, sum = 0, two = 0, diff = 0, half = 0, beta = 0, gisq = 0; + integer knew = 0; + doublereal temp, suma, sumb, bsum, fopt; + integer kopt = 0, nptm; + doublereal zero, curv; + integer ksav; + doublereal gqsq = 0, dist = 0, sumw = 0, sumz = 0, diffa = 0, diffb = 0, diffc = 0, hdiag = 0; + integer kbase; + doublereal alpha = 0, delta = 0, adelt = 0, denom = 0, fsave = 0, bdtol = 0, delsq = 0; + integer nresc, nfsav; + doublereal ratio = 0, dnorm = 0, vquad = 0, pqold = 0, tenth = 0; + integer itest; + doublereal sumpq, scaden; + doublereal errbig, cauchy, fracsq, biglsq, densav; + doublereal bdtest; + doublereal crvmin, frhosq; + doublereal distsq; + integer ntrits; + doublereal xoptsq; + + + + /* The arguments N, NPT, X, XL, XU, RHOBEG, RHOEND, IPRINT and MAXFUN */ + /* are identical to the corresponding arguments in SUBROUTINE BOBYQA. */ + /* XBASE holds a shift of origin that should reduce the contributions */ + /* from rounding errors to values of the model and Lagrange functions. */ + /* XPT is a two-dimensional array that holds the coordinates of the */ + /* interpolation points relative to XBASE. */ + /* FVAL holds the values of F at the interpolation points. */ + /* XOPT is set to the displacement from XBASE of the trust region centre. */ + /* GOPT holds the gradient of the quadratic model at XBASE+XOPT. */ + /* HQ holds the explicit second derivatives of the quadratic model. */ + /* PQ contains the parameters of the implicit second derivatives of the */ + /* quadratic model. */ + /* BMAT holds the last N columns of H. */ + /* ZMAT holds the factorization of the leading NPT by NPT submatrix of H, */ + /* this factorization being ZMAT times ZMAT^T, which provides both the */ + /* correct rank and positive semi-definiteness. */ + /* NDIM is the first dimension of BMAT and has the value NPT+N. */ + /* SL and SU hold the differences XL-XBASE and XU-XBASE, respectively. */ + /* All the components of every XOPT are going to satisfy the bounds */ + /* SL(I) .LEQ. XOPT(I) .LEQ. SU(I), with appropriate equalities when */ + /* XOPT is on a constraint boundary. */ + /* XNEW is chosen by SUBROUTINE TRSBOX or ALTMOV. Usually XBASE+XNEW is the */ + /* vector of variables for the next call of CALFUN. XNEW also satisfies */ + /* the SL and SU constraints in the way that has just been mentioned. */ + /* XALT is an alternative to XNEW, chosen by ALTMOV, that may replace XNEW */ + /* in order to increase the denominator in the updating of UPDATE. */ + /* D is reserved for a trial step from XOPT, which is usually XNEW-XOPT. */ + /* VLAG contains the values of the Lagrange functions at a new point X. */ + /* They are part of a product that requires VLAG to be of length NDIM. */ + /* W is a one-dimensional array that is used for working space. Its length */ + /* must be at least 3*NDIM = 3*(NPT+N). */ + + /* Set some constants. */ + + /* Parameter adjustments */ + zmat_dim1 = npt; + zmat_offset = 1 + zmat_dim1; + zmat -= zmat_offset; + xpt_dim1 = npt; + xpt_offset = 1 + xpt_dim1; + xpt -= xpt_offset; + --x; + --xl; + --xu; + --xbase; + --fval; + --xopt; + --gopt; + --hq; + --pq; + bmat_dim1 = ndim; + bmat_offset = 1 + bmat_dim1; + bmat -= bmat_offset; + --sl; + --su; + --xnew; + --xalt; + --d__; + --vlag; + --w; + + /* Function Body */ + half = .5; + one = 1.; + ten = 10.; + tenth = .1; + two = 2.; + zero = 0.; + np = n + 1; + nptm = npt - np; + nh = n * np / 2; + + /* The call of PRELIM sets the elements of XBASE, XPT, FVAL, GOPT, HQ, PQ, */ + /* BMAT and ZMAT for the first iteration, with the corresponding values of */ + /* of NF and KOPT, which are the number of calls of CALFUN so far and the */ + /* index of the interpolation point at the trust region centre. Then the */ + /* initial XOPT is set too. The branch to label 720 occurs if MAXFUN is */ + /* less than NPT. GOPT will be updated if KOPT is different from KBASE. */ + + prelim_(calfun, n, npt, &x[1], &xl[1], &xu[1], rhobeg, maxfun, &xbase[1], + &xpt[xpt_offset], &fval[1], &gopt[1], &hq[1], &pq[1], &bmat[bmat_offset], + &zmat[zmat_offset], ndim, &sl[1], &su[1], nf, kopt); + xoptsq = zero; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + xopt[i__] = xpt[kopt + i__ * xpt_dim1]; + /* L10: */ + /* Computing 2nd power */ + d__1 = xopt[i__]; + xoptsq += d__1 * d__1; + } + fsave = fval[1]; + if (nf < npt) { + throw bobyqa_failure("Return from BOBYQA because the objective function has been called max_f_evals times."); + //goto L720; + } + kbase = 1; + + /* Complete the settings that are required for the iterative procedure. */ + + rho = rhobeg; + delta = rho; + nresc = nf; + ntrits = 0; + diffa = zero; + diffb = zero; + itest = 0; + nfsav = nf; + + /* Update GOPT if necessary before the first iteration and after each */ + /* call of RESCUE that makes a call of CALFUN. */ + +L20: + if (kopt != kbase) { + ih = 0; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + for (i__ = 1; i__ <= i__2; ++i__) { + ++ih; + if (i__ < j) { + gopt[j] += hq[ih] * xopt[i__]; + } + /* L30: */ + gopt[i__] += hq[ih] * xopt[j]; + } + } + if (nf > npt) { + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + temp = zero; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + /* L40: */ + temp += xpt[k + j * xpt_dim1] * xopt[j]; + } + temp = pq[k] * temp; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + /* L50: */ + gopt[i__] += temp * xpt[k + i__ * xpt_dim1]; + } + } + } + } + + /* Generate the next point in the trust region that provides a small value */ + /* of the quadratic model subject to the constraints on the variables. */ + /* The integer NTRITS is set to the number "trust region" iterations that */ + /* have occurred since the last "alternative" iteration. If the length */ + /* of XNEW-XOPT is less than HALF*RHO, however, then there is a branch to */ + /* label 650 or 680 with NTRITS=-1, instead of calculating F at XNEW. */ + +L60: + trsbox_(n, npt, &xpt[xpt_offset], &xopt[1], &gopt[1], &hq[1], &pq[1], &sl[1], + &su[1], delta, &xnew[1], &d__[1], &w[1], &w[np], &w[np + n], + &w[np + (n << 1)], &w[np + n * 3], &dsq, &crvmin); + /* Computing MIN */ + d__1 = delta, d__2 = std::sqrt(dsq); + dnorm = std::min(d__1,d__2); + if (dnorm < half * rho) { + ntrits = -1; + /* Computing 2nd power */ + d__1 = ten * rho; + distsq = d__1 * d__1; + if (nf <= nfsav + 2) { + goto L650; + } + + /* The following choice between labels 650 and 680 depends on whether or */ + /* not our work with the current RHO seems to be complete. Either RHO is */ + /* decreased or termination occurs if the errors in the quadratic model at */ + /* the last three interpolation points compare favourably with predictions */ + /* of likely improvements to the model within distance HALF*RHO of XOPT. */ + + /* Computing MAX */ + d__1 = std::max(diffa,diffb); + errbig = std::max(d__1,diffc); + frhosq = rho * .125 * rho; + if (crvmin > zero && errbig > frhosq * crvmin) { + goto L650; + } + bdtol = errbig / rho; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + bdtest = bdtol; + if (xnew[j] == sl[j]) { + bdtest = w[j]; + } + if (xnew[j] == su[j]) { + bdtest = -w[j]; + } + if (bdtest < bdtol) { + curv = hq[(j + j * j) / 2]; + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + /* L70: */ + /* Computing 2nd power */ + d__1 = xpt[k + j * xpt_dim1]; + curv += pq[k] * (d__1 * d__1); + } + bdtest += half * curv * rho; + if (bdtest < bdtol) { + goto L650; + } + } + /* L80: */ + } + goto L680; + } + ++ntrits; + + /* Severe cancellation is likely to occur if XOPT is too far from XBASE. */ + /* If the following test holds, then XBASE is shifted so that XOPT becomes */ + /* zero. The appropriate changes are made to BMAT and to the second */ + /* derivatives of the current model, beginning with the changes to BMAT */ + /* that do not depend on ZMAT. VLAG is used temporarily for working space. */ + +L90: + if (dsq <= xoptsq * .001) { + fracsq = xoptsq * .25; + sumpq = zero; + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + sumpq += pq[k]; + sum = -half * xoptsq; + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + /* L100: */ + sum += xpt[k + i__ * xpt_dim1] * xopt[i__]; + } + w[npt + k] = sum; + temp = fracsq - half * sum; + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + w[i__] = bmat[k + i__ * bmat_dim1]; + vlag[i__] = sum * xpt[k + i__ * xpt_dim1] + temp * xopt[i__]; + ip = npt + i__; + i__3 = i__; + for (j = 1; j <= i__3; ++j) { + /* L110: */ + bmat[ip + j * bmat_dim1] = bmat[ip + j * bmat_dim1] + w[ + i__] * vlag[j] + vlag[i__] * w[j]; + } + } + } + + /* Then the revisions of BMAT that depend on ZMAT are calculated. */ + + i__3 = nptm; + for (jj = 1; jj <= i__3; ++jj) { + sumz = zero; + sumw = zero; + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + sumz += zmat[k + jj * zmat_dim1]; + vlag[k] = w[npt + k] * zmat[k + jj * zmat_dim1]; + /* L120: */ + sumw += vlag[k]; + } + i__2 = n; + for (j = 1; j <= i__2; ++j) { + sum = (fracsq * sumz - half * sumw) * xopt[j]; + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + /* L130: */ + sum += vlag[k] * xpt[k + j * xpt_dim1]; + } + w[j] = sum; + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + /* L140: */ + bmat[k + j * bmat_dim1] += sum * zmat[k + jj * zmat_dim1]; + } + } + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + ip = i__ + npt; + temp = w[i__]; + i__2 = i__; + for (j = 1; j <= i__2; ++j) { + /* L150: */ + bmat[ip + j * bmat_dim1] += temp * w[j]; + } + } + } + + /* The following instructions complete the shift, including the changes */ + /* to the second derivative parameters of the quadratic model. */ + + ih = 0; + i__2 = n; + for (j = 1; j <= i__2; ++j) { + w[j] = -half * sumpq * xopt[j]; + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + w[j] += pq[k] * xpt[k + j * xpt_dim1]; + /* L160: */ + xpt[k + j * xpt_dim1] -= xopt[j]; + } + i__1 = j; + for (i__ = 1; i__ <= i__1; ++i__) { + ++ih; + hq[ih] = hq[ih] + w[i__] * xopt[j] + xopt[i__] * w[j]; + /* L170: */ + bmat[npt + i__ + j * bmat_dim1] = bmat[npt + j + i__ * + bmat_dim1]; + } + } + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + xbase[i__] += xopt[i__]; + xnew[i__] -= xopt[i__]; + sl[i__] -= xopt[i__]; + su[i__] -= xopt[i__]; + /* L180: */ + xopt[i__] = zero; + } + xoptsq = zero; + } + if (ntrits == 0) { + goto L210; + } + goto L230; + + /* XBASE is also moved to XOPT by a call of RESCUE. This calculation is */ + /* more expensive than the previous shift, because new matrices BMAT and */ + /* ZMAT are generated from scratch, which may include the replacement of */ + /* interpolation points whose positions seem to be causing near linear */ + /* dependence in the interpolation conditions. Therefore RESCUE is called */ + /* only if rounding errors have reduced by at least a factor of two the */ + /* denominator of the formula for updating the H matrix. It provides a */ + /* useful safeguard, but is not invoked in most applications of BOBYQA. */ + +L190: + nfsav = nf; + kbase = kopt; + rescue_(calfun, n, npt, &xl[1], &xu[1], maxfun, &xbase[1], &xpt[ + xpt_offset], &fval[1], &xopt[1], &gopt[1], &hq[1], &pq[1], &bmat[ + bmat_offset], &zmat[zmat_offset], ndim, &sl[1], &su[1], nf, delta, + kopt, &vlag[1], &w[1], &w[n + np], &w[ndim + np]); + + /* XOPT is updated now in case the branch below to label 720 is taken. */ + /* Any updating of GOPT occurs after the branch below to label 20, which */ + /* leads to a trust region iteration as does the branch to label 60. */ + + xoptsq = zero; + if (kopt != kbase) { + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + xopt[i__] = xpt[kopt + i__ * xpt_dim1]; + /* L200: */ + /* Computing 2nd power */ + d__1 = xopt[i__]; + xoptsq += d__1 * d__1; + } + } + if (nf < 0) { + nf = maxfun; + throw bobyqa_failure("Return from BOBYQA because the objective function has been called max_f_evals times."); + //goto L720; + } + nresc = nf; + if (nfsav < nf) { + nfsav = nf; + goto L20; + } + if (ntrits > 0) { + goto L60; + } + + /* Pick two alternative vectors of variables, relative to XBASE, that */ + /* are suitable as new positions of the KNEW-th interpolation point. */ + /* Firstly, XNEW is set to the point on a line through XOPT and another */ + /* interpolation point that minimizes the predicted value of the next */ + /* denominator, subject to ||XNEW - XOPT|| .LEQ. ADELT and to the SL */ + /* and SU bounds. Secondly, XALT is set to the best feasible point on */ + /* a constrained version of the Cauchy step of the KNEW-th Lagrange */ + /* function, the corresponding value of the square of this function */ + /* being returned in CAUCHY. The choice between these alternatives is */ + /* going to be made when the denominator is calculated. */ + +L210: + altmov_(n, npt, &xpt[xpt_offset], &xopt[1], &bmat[bmat_offset], &zmat[zmat_offset], + ndim, &sl[1], &su[1], kopt, knew, adelt, &xnew[1], + &xalt[1], alpha, cauchy, &w[1], &w[np], &w[ndim + 1]); + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + /* L220: */ + d__[i__] = xnew[i__] - xopt[i__]; + } + + /* Calculate VLAG and BETA for the current choice of D. The scalar */ + /* product of D with XPT(K,.) is going to be held in W(NPT+K) for */ + /* use when VQUAD is calculated. */ + +L230: + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + suma = zero; + sumb = zero; + sum = zero; + i__2 = n; + for (j = 1; j <= i__2; ++j) { + suma += xpt[k + j * xpt_dim1] * d__[j]; + sumb += xpt[k + j * xpt_dim1] * xopt[j]; + /* L240: */ + sum += bmat[k + j * bmat_dim1] * d__[j]; + } + w[k] = suma * (half * suma + sumb); + vlag[k] = sum; + /* L250: */ + w[npt + k] = suma; + } + beta = zero; + i__1 = nptm; + for (jj = 1; jj <= i__1; ++jj) { + sum = zero; + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + /* L260: */ + sum += zmat[k + jj * zmat_dim1] * w[k]; + } + beta -= sum * sum; + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + /* L270: */ + vlag[k] += sum * zmat[k + jj * zmat_dim1]; + } + } + dsq = zero; + bsum = zero; + dx = zero; + i__2 = n; + for (j = 1; j <= i__2; ++j) { + /* Computing 2nd power */ + d__1 = d__[j]; + dsq += d__1 * d__1; + sum = zero; + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + /* L280: */ + sum += w[k] * bmat[k + j * bmat_dim1]; + } + bsum += sum * d__[j]; + jp = npt + j; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + /* L290: */ + sum += bmat[jp + i__ * bmat_dim1] * d__[i__]; + } + vlag[jp] = sum; + bsum += sum * d__[j]; + /* L300: */ + dx += d__[j] * xopt[j]; + } + beta = dx * dx + dsq * (xoptsq + dx + dx + half * dsq) + beta - bsum; + vlag[kopt] += one; + + /* If NTRITS is zero, the denominator may be increased by replacing */ + /* the step D of ALTMOV by a Cauchy step. Then RESCUE may be called if */ + /* rounding errors have damaged the chosen denominator. */ + + if (ntrits == 0) { + /* Computing 2nd power */ + d__1 = vlag[knew]; + denom = d__1 * d__1 + alpha * beta; + if (denom < cauchy && cauchy > zero) { + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + xnew[i__] = xalt[i__]; + /* L310: */ + d__[i__] = xnew[i__] - xopt[i__]; + } + cauchy = zero; + goto L230; + } + /* Computing 2nd power */ + d__1 = vlag[knew]; + if (denom <= half * (d__1 * d__1)) { + if (nf > nresc) { + goto L190; + } + throw bobyqa_failure("Return from BOBYQA because of much cancellation in a denominator."); + //goto L720; + } + + /* Alternatively, if NTRITS is positive, then set KNEW to the index of */ + /* the next interpolation point to be deleted to make room for a trust */ + /* region step. Again RESCUE may be called if rounding errors have damaged */ + /* the chosen denominator, which is the reason for attempting to select */ + /* KNEW before calculating the next value of the objective function. */ + + } else { + delsq = delta * delta; + scaden = zero; + biglsq = zero; + knew = 0; + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + if (k == kopt) { + goto L350; + } + hdiag = zero; + i__1 = nptm; + for (jj = 1; jj <= i__1; ++jj) { + /* L330: */ + /* Computing 2nd power */ + d__1 = zmat[k + jj * zmat_dim1]; + hdiag += d__1 * d__1; + } + /* Computing 2nd power */ + d__1 = vlag[k]; + den = beta * hdiag + d__1 * d__1; + distsq = zero; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + /* L340: */ + /* Computing 2nd power */ + d__1 = xpt[k + j * xpt_dim1] - xopt[j]; + distsq += d__1 * d__1; + } + /* Computing MAX */ + /* Computing 2nd power */ + d__3 = distsq / delsq; + d__1 = one, d__2 = d__3 * d__3; + temp = std::max(d__1,d__2); + if (temp * den > scaden) { + scaden = temp * den; + knew = k; + denom = den; + } + /* Computing MAX */ + /* Computing 2nd power */ + d__3 = vlag[k]; + d__1 = biglsq, d__2 = temp * (d__3 * d__3); + biglsq = std::max(d__1,d__2); +L350: + ; + } + if (scaden <= half * biglsq) { + if (nf > nresc) { + goto L190; + } + throw bobyqa_failure("Return from BOBYQA because of much cancellation in a denominator."); + //goto L720; + } + } + + /* Put the variables for the next calculation of the objective function */ + /* in XNEW, with any adjustments for the bounds. */ + + + /* Calculate the value of the objective function at XBASE+XNEW, unless */ + /* the limit on the number of calculations of F has been reached. */ + +L360: + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + /* Computing MIN */ + /* Computing MAX */ + d__3 = xl[i__], d__4 = xbase[i__] + xnew[i__]; + d__1 = std::max(d__3,d__4), d__2 = xu[i__]; + x[i__] = std::min(d__1,d__2); + if (xnew[i__] == sl[i__]) { + x[i__] = xl[i__]; + } + if (xnew[i__] == su[i__]) { + x[i__] = xu[i__]; + } + /* L380: */ + } + if (nf >= maxfun) { + throw bobyqa_failure("Return from BOBYQA because the objective function has been called max_f_evals times."); + //goto L720; + } + ++nf; + f = calfun(mat(&x[1], n)); + if (ntrits == -1) { + fsave = f; + goto L720; + } + + /* Use the quadratic model to predict the change in F due to the step D, */ + /* and set DIFF to the error of this prediction. */ + + fopt = fval[kopt]; + vquad = zero; + ih = 0; + i__2 = n; + for (j = 1; j <= i__2; ++j) { + vquad += d__[j] * gopt[j]; + i__1 = j; + for (i__ = 1; i__ <= i__1; ++i__) { + ++ih; + temp = d__[i__] * d__[j]; + if (i__ == j) { + temp = half * temp; + } + /* L410: */ + vquad += hq[ih] * temp; + } + } + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + /* L420: */ + /* Computing 2nd power */ + d__1 = w[npt + k]; + vquad += half * pq[k] * (d__1 * d__1); + } + diff = f - fopt - vquad; + diffc = diffb; + diffb = diffa; + diffa = std::abs(diff); + if (dnorm > rho) { + nfsav = nf; + } + + /* Pick the next value of DELTA after a trust region step. */ + + if (ntrits > 0) { + if (vquad >= zero) { + throw bobyqa_failure("Return from BOBYQA because a trust region step has failed to reduce Q."); + //goto L720; + } + ratio = (f - fopt) / vquad; + if (ratio <= tenth) { + /* Computing MIN */ + d__1 = half * delta; + delta = std::min(d__1,dnorm); + } else if (ratio <= .7) { + /* Computing MAX */ + d__1 = half * delta; + delta = std::max(d__1,dnorm); + } else { + /* Computing MAX */ + d__1 = half * delta, d__2 = dnorm + dnorm; + delta = std::max(d__1,d__2); + } + if (delta <= rho * 1.5) { + delta = rho; + } + + /* Recalculate KNEW and DENOM if the new F is less than FOPT. */ + + if (f < fopt) { + ksav = knew; + densav = denom; + delsq = delta * delta; + scaden = zero; + biglsq = zero; + knew = 0; + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + hdiag = zero; + i__2 = nptm; + for (jj = 1; jj <= i__2; ++jj) { + /* L440: */ + /* Computing 2nd power */ + d__1 = zmat[k + jj * zmat_dim1]; + hdiag += d__1 * d__1; + } + /* Computing 2nd power */ + d__1 = vlag[k]; + den = beta * hdiag + d__1 * d__1; + distsq = zero; + i__2 = n; + for (j = 1; j <= i__2; ++j) { + /* L450: */ + /* Computing 2nd power */ + d__1 = xpt[k + j * xpt_dim1] - xnew[j]; + distsq += d__1 * d__1; + } + /* Computing MAX */ + /* Computing 2nd power */ + d__3 = distsq / delsq; + d__1 = one, d__2 = d__3 * d__3; + temp = std::max(d__1,d__2); + if (temp * den > scaden) { + scaden = temp * den; + knew = k; + denom = den; + } + /* L460: */ + /* Computing MAX */ + /* Computing 2nd power */ + d__3 = vlag[k]; + d__1 = biglsq, d__2 = temp * (d__3 * d__3); + biglsq = std::max(d__1,d__2); + } + if (scaden <= half * biglsq) { + knew = ksav; + denom = densav; + } + } + } + + /* Update BMAT and ZMAT, so that the KNEW-th interpolation point can be */ + /* moved. Also update the second derivative terms of the model. */ + + update_(n, npt, &bmat[bmat_offset], &zmat[zmat_offset], ndim, &vlag[1], + beta, denom, knew, &w[1]); + ih = 0; + pqold = pq[knew]; + pq[knew] = zero; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + temp = pqold * xpt[knew + i__ * xpt_dim1]; + i__2 = i__; + for (j = 1; j <= i__2; ++j) { + ++ih; + /* L470: */ + hq[ih] += temp * xpt[knew + j * xpt_dim1]; + } + } + i__2 = nptm; + for (jj = 1; jj <= i__2; ++jj) { + temp = diff * zmat[knew + jj * zmat_dim1]; + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + /* L480: */ + pq[k] += temp * zmat[k + jj * zmat_dim1]; + } + } + + /* Include the new interpolation point, and make the changes to GOPT at */ + /* the old XOPT that are caused by the updating of the quadratic model. */ + + fval[knew] = f; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + xpt[knew + i__ * xpt_dim1] = xnew[i__]; + /* L490: */ + w[i__] = bmat[knew + i__ * bmat_dim1]; + } + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + suma = zero; + i__2 = nptm; + for (jj = 1; jj <= i__2; ++jj) { + /* L500: */ + suma += zmat[knew + jj * zmat_dim1] * zmat[k + jj * zmat_dim1]; + } + sumb = zero; + i__2 = n; + for (j = 1; j <= i__2; ++j) { + /* L510: */ + sumb += xpt[k + j * xpt_dim1] * xopt[j]; + } + temp = suma * sumb; + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + /* L520: */ + w[i__] += temp * xpt[k + i__ * xpt_dim1]; + } + } + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + /* L530: */ + gopt[i__] += diff * w[i__]; + } + + /* Update XOPT, GOPT and KOPT if the new calculated F is less than FOPT. */ + + if (f < fopt) { + kopt = knew; + xoptsq = zero; + ih = 0; + i__2 = n; + for (j = 1; j <= i__2; ++j) { + xopt[j] = xnew[j]; + /* Computing 2nd power */ + d__1 = xopt[j]; + xoptsq += d__1 * d__1; + i__1 = j; + for (i__ = 1; i__ <= i__1; ++i__) { + ++ih; + if (i__ < j) { + gopt[j] += hq[ih] * d__[i__]; + } + /* L540: */ + gopt[i__] += hq[ih] * d__[j]; + } + } + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + temp = zero; + i__2 = n; + for (j = 1; j <= i__2; ++j) { + /* L550: */ + temp += xpt[k + j * xpt_dim1] * d__[j]; + } + temp = pq[k] * temp; + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + /* L560: */ + gopt[i__] += temp * xpt[k + i__ * xpt_dim1]; + } + } + } + + /* Calculate the parameters of the least Frobenius norm interpolant to */ + /* the current data, the gradient of this interpolant at XOPT being put */ + /* into VLAG(NPT+I), I=1,2,...,N. */ + + if (ntrits > 0) { + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + vlag[k] = fval[k] - fval[kopt]; + /* L570: */ + w[k] = zero; + } + i__2 = nptm; + for (j = 1; j <= i__2; ++j) { + sum = zero; + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + /* L580: */ + sum += zmat[k + j * zmat_dim1] * vlag[k]; + } + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + /* L590: */ + w[k] += sum * zmat[k + j * zmat_dim1]; + } + } + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + sum = zero; + i__2 = n; + for (j = 1; j <= i__2; ++j) { + /* L600: */ + sum += xpt[k + j * xpt_dim1] * xopt[j]; + } + w[k + npt] = w[k]; + /* L610: */ + w[k] = sum * w[k]; + } + gqsq = zero; + gisq = zero; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + sum = zero; + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + /* L620: */ + sum = sum + bmat[k + i__ * bmat_dim1] * vlag[k] + xpt[k + i__ + * xpt_dim1] * w[k]; + } + if (xopt[i__] == sl[i__]) { + /* Computing MIN */ + d__2 = zero, d__3 = gopt[i__]; + /* Computing 2nd power */ + d__1 = std::min(d__2,d__3); + gqsq += d__1 * d__1; + /* Computing 2nd power */ + d__1 = std::min(zero,sum); + gisq += d__1 * d__1; + } else if (xopt[i__] == su[i__]) { + /* Computing MAX */ + d__2 = zero, d__3 = gopt[i__]; + /* Computing 2nd power */ + d__1 = std::max(d__2,d__3); + gqsq += d__1 * d__1; + /* Computing 2nd power */ + d__1 = std::max(zero,sum); + gisq += d__1 * d__1; + } else { + /* Computing 2nd power */ + d__1 = gopt[i__]; + gqsq += d__1 * d__1; + gisq += sum * sum; + } + /* L630: */ + vlag[npt + i__] = sum; + } + + /* Test whether to replace the new quadratic model by the least Frobenius */ + /* norm interpolant, making the replacement if the test is satisfied. */ + + ++itest; + if (gqsq < ten * gisq) { + itest = 0; + } + if (itest >= 3) { + i__1 = std::max(npt,nh); + for (i__ = 1; i__ <= i__1; ++i__) { + if (i__ <= n) { + gopt[i__] = vlag[npt + i__]; + } + if (i__ <= npt) { + pq[i__] = w[npt + i__]; + } + if (i__ <= nh) { + hq[i__] = zero; + } + itest = 0; + /* L640: */ + } + } + } + + /* If a trust region step has provided a sufficient decrease in F, then */ + /* branch for another trust region calculation. The case NTRITS=0 occurs */ + /* when the new interpolation point was reached by an alternative step. */ + + if (ntrits == 0) { + goto L60; + } + if (f <= fopt + tenth * vquad) { + goto L60; + } + + /* Alternatively, find out if the interpolation points are close enough */ + /* to the best point so far. */ + + /* Computing MAX */ + /* Computing 2nd power */ + d__3 = two * delta; + /* Computing 2nd power */ + d__4 = ten * rho; + d__1 = d__3 * d__3, d__2 = d__4 * d__4; + distsq = std::max(d__1,d__2); +L650: + knew = 0; + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + sum = zero; + i__2 = n; + for (j = 1; j <= i__2; ++j) { + /* L660: */ + /* Computing 2nd power */ + d__1 = xpt[k + j * xpt_dim1] - xopt[j]; + sum += d__1 * d__1; + } + if (sum > distsq) { + knew = k; + distsq = sum; + } + /* L670: */ + } + + /* If KNEW is positive, then ALTMOV finds alternative new positions for */ + /* the KNEW-th interpolation point within distance ADELT of XOPT. It is */ + /* reached via label 90. Otherwise, there is a branch to label 60 for */ + /* another trust region iteration, unless the calculations with the */ + /* current RHO are complete. */ + + if (knew > 0) { + dist = std::sqrt(distsq); + if (ntrits == -1) { + /* Computing MIN */ + d__1 = tenth * delta, d__2 = half * dist; + delta = std::min(d__1,d__2); + if (delta <= rho * 1.5) { + delta = rho; + } + } + ntrits = 0; + /* Computing MAX */ + /* Computing MIN */ + d__2 = tenth * dist; + d__1 = std::min(d__2,delta); + adelt = std::max(d__1,rho); + dsq = adelt * adelt; + goto L90; + } + if (ntrits == -1) { + goto L680; + } + if (ratio > zero) { + goto L60; + } + if (std::max(delta,dnorm) > rho) { + goto L60; + } + + /* The calculations with the current value of RHO are complete. Pick the */ + /* next values of RHO and DELTA. */ + +L680: + if (rho > rhoend) { + delta = half * rho; + ratio = rho / rhoend; + if (ratio <= 16.) { + rho = rhoend; + } else if (ratio <= 250.) { + rho = std::sqrt(ratio) * rhoend; + } else { + rho = tenth * rho; + } + delta = std::max(delta,rho); + ntrits = 0; + nfsav = nf; + goto L60; + } + + /* Return from the calculation, after another Newton-Raphson step, if */ + /* it is too short to have been tried before. */ + + if (ntrits == -1) { + goto L360; + } +L720: + if (fval[kopt] <= fsave) { + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + /* Computing MIN */ + /* Computing MAX */ + d__3 = xl[i__], d__4 = xbase[i__] + xopt[i__]; + d__1 = std::max(d__3,d__4), d__2 = xu[i__]; + x[i__] = std::min(d__1,d__2); + if (xopt[i__] == sl[i__]) { + x[i__] = xl[i__]; + } + if (xopt[i__] == su[i__]) { + x[i__] = xu[i__]; + } + /* L730: */ + } + f = fval[kopt]; + } + + return f; + } /* bobyqb_ */ + + // ---------------------------------------------------------------------------------------- + + void altmov_( + const integer n, + const integer npt, + const doublereal *xpt, + const doublereal *xopt, + const doublereal *bmat, + const doublereal *zmat, + const integer ndim, + const doublereal *sl, + const doublereal *su, + const integer kopt, + const integer knew, + const doublereal adelt, + doublereal *xnew, + doublereal *xalt, + doublereal& alpha, + doublereal& cauchy, + doublereal *glag, + doublereal *hcol, + doublereal *w + ) const + { + /* System generated locals */ + integer xpt_dim1, xpt_offset, bmat_dim1, bmat_offset, zmat_dim1, + zmat_offset, i__1, i__2; + doublereal d__1, d__2, d__3, d__4; + + + /* Local variables */ + integer i__, j, k; + doublereal ha, gw, one, diff, half; + integer ilbd, isbd; + doublereal slbd; + integer iubd; + doublereal vlag, subd, temp; + integer ksav = 0; + doublereal step = 0, zero = 0, curv = 0; + integer iflag; + doublereal scale = 0, csave = 0, tempa = 0, tempb = 0, tempd = 0, const__ = 0, sumin = 0, + ggfree = 0; + integer ibdsav = 0; + doublereal dderiv = 0, bigstp = 0, predsq = 0, presav = 0, distsq = 0, stpsav = 0, wfixsq = 0, wsqsav = 0; + + + /* The arguments N, NPT, XPT, XOPT, BMAT, ZMAT, NDIM, SL and SU all have */ + /* the same meanings as the corresponding arguments of BOBYQB. */ + /* KOPT is the index of the optimal interpolation point. */ + /* KNEW is the index of the interpolation point that is going to be moved. */ + /* ADELT is the current trust region bound. */ + /* XNEW will be set to a suitable new position for the interpolation point */ + /* XPT(KNEW,.). Specifically, it satisfies the SL, SU and trust region */ + /* bounds and it should provide a large denominator in the next call of */ + /* UPDATE. The step XNEW-XOPT from XOPT is restricted to moves along the */ + /* straight lines through XOPT and another interpolation point. */ + /* XALT also provides a large value of the modulus of the KNEW-th Lagrange */ + /* function subject to the constraints that have been mentioned, its main */ + /* difference from XNEW being that XALT-XOPT is a constrained version of */ + /* the Cauchy step within the trust region. An exception is that XALT is */ + /* not calculated if all components of GLAG (see below) are zero. */ + /* ALPHA will be set to the KNEW-th diagonal element of the H matrix. */ + /* CAUCHY will be set to the square of the KNEW-th Lagrange function at */ + /* the step XALT-XOPT from XOPT for the vector XALT that is returned, */ + /* except that CAUCHY is set to zero if XALT is not calculated. */ + /* GLAG is a working space vector of length N for the gradient of the */ + /* KNEW-th Lagrange function at XOPT. */ + /* HCOL is a working space vector of length NPT for the second derivative */ + /* coefficients of the KNEW-th Lagrange function. */ + /* W is a working space vector of length 2N that is going to hold the */ + /* constrained Cauchy step from XOPT of the Lagrange function, followed */ + /* by the downhill version of XALT when the uphill step is calculated. */ + + /* Set the first NPT components of W to the leading elements of the */ + /* KNEW-th column of the H matrix. */ + + /* Parameter adjustments */ + zmat_dim1 = npt; + zmat_offset = 1 + zmat_dim1; + zmat -= zmat_offset; + xpt_dim1 = npt; + xpt_offset = 1 + xpt_dim1; + xpt -= xpt_offset; + --xopt; + bmat_dim1 = ndim; + bmat_offset = 1 + bmat_dim1; + bmat -= bmat_offset; + --sl; + --su; + --xnew; + --xalt; + --glag; + --hcol; + --w; + + /* Function Body */ + half = .5; + one = 1.; + zero = 0.; + const__ = one + std::sqrt(2.); + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + /* L10: */ + hcol[k] = zero; + } + i__1 = npt - n - 1; + for (j = 1; j <= i__1; ++j) { + temp = zmat[knew + j * zmat_dim1]; + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + /* L20: */ + hcol[k] += temp * zmat[k + j * zmat_dim1]; + } + } + alpha = hcol[knew]; + ha = half * alpha; + + /* Calculate the gradient of the KNEW-th Lagrange function at XOPT. */ + + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + /* L30: */ + glag[i__] = bmat[knew + i__ * bmat_dim1]; + } + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + temp = zero; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + /* L40: */ + temp += xpt[k + j * xpt_dim1] * xopt[j]; + } + temp = hcol[k] * temp; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + /* L50: */ + glag[i__] += temp * xpt[k + i__ * xpt_dim1]; + } + } + + /* Search for a large denominator along the straight lines through XOPT */ + /* and another interpolation point. SLBD and SUBD will be lower and upper */ + /* bounds on the step along each of these lines in turn. PREDSQ will be */ + /* set to the square of the predicted denominator for each line. PRESAV */ + /* will be set to the largest admissible value of PREDSQ that occurs. */ + + presav = zero; + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + if (k == kopt) { + goto L80; + } + dderiv = zero; + distsq = zero; + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + temp = xpt[k + i__ * xpt_dim1] - xopt[i__]; + dderiv += glag[i__] * temp; + /* L60: */ + distsq += temp * temp; + } + subd = adelt / std::sqrt(distsq); + slbd = -subd; + ilbd = 0; + iubd = 0; + sumin = std::min(one,subd); + + /* Revise SLBD and SUBD if necessary because of the bounds in SL and SU. */ + + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + temp = xpt[k + i__ * xpt_dim1] - xopt[i__]; + if (temp > zero) { + if (slbd * temp < sl[i__] - xopt[i__]) { + slbd = (sl[i__] - xopt[i__]) / temp; + ilbd = -i__; + } + if (subd * temp > su[i__] - xopt[i__]) { + /* Computing MAX */ + d__1 = sumin, d__2 = (su[i__] - xopt[i__]) / temp; + subd = std::max(d__1,d__2); + iubd = i__; + } + } else if (temp < zero) { + if (slbd * temp > su[i__] - xopt[i__]) { + slbd = (su[i__] - xopt[i__]) / temp; + ilbd = i__; + } + if (subd * temp < sl[i__] - xopt[i__]) { + /* Computing MAX */ + d__1 = sumin, d__2 = (sl[i__] - xopt[i__]) / temp; + subd = std::max(d__1,d__2); + iubd = -i__; + } + } + /* L70: */ + } + + /* Seek a large modulus of the KNEW-th Lagrange function when the index */ + /* of the other interpolation point on the line through XOPT is KNEW. */ + + if (k == knew) { + diff = dderiv - one; + step = slbd; + vlag = slbd * (dderiv - slbd * diff); + isbd = ilbd; + temp = subd * (dderiv - subd * diff); + if (std::abs(temp) > std::abs(vlag)) { + step = subd; + vlag = temp; + isbd = iubd; + } + tempd = half * dderiv; + tempa = tempd - diff * slbd; + tempb = tempd - diff * subd; + if (tempa * tempb < zero) { + temp = tempd * tempd / diff; + if (std::abs(temp) > std::abs(vlag)) { + step = tempd / diff; + vlag = temp; + isbd = 0; + } + } + + /* Search along each of the other lines through XOPT and another point. */ + + } else { + step = slbd; + vlag = slbd * (one - slbd); + isbd = ilbd; + temp = subd * (one - subd); + if (std::abs(temp) > std::abs(vlag)) { + step = subd; + vlag = temp; + isbd = iubd; + } + if (subd > half) { + if (std::abs(vlag) < .25) { + step = half; + vlag = .25; + isbd = 0; + } + } + vlag *= dderiv; + } + + /* Calculate PREDSQ for the current line search and maintain PRESAV. */ + + temp = step * (one - step) * distsq; + predsq = vlag * vlag * (vlag * vlag + ha * temp * temp); + if (predsq > presav) { + presav = predsq; + ksav = k; + stpsav = step; + ibdsav = isbd; + } +L80: + ; + } + + /* Construct XNEW in a way that satisfies the bound constraints exactly. */ + + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + temp = xopt[i__] + stpsav * (xpt[ksav + i__ * xpt_dim1] - xopt[i__]); + /* L90: */ + /* Computing MAX */ + /* Computing MIN */ + d__3 = su[i__]; + d__1 = sl[i__], d__2 = std::min(d__3,temp); + xnew[i__] = std::max(d__1,d__2); + } + if (ibdsav < 0) { + xnew[-ibdsav] = sl[-ibdsav]; + } + if (ibdsav > 0) { + xnew[ibdsav] = su[ibdsav]; + } + + /* Prepare for the iterative method that assembles the constrained Cauchy */ + /* step in W. The sum of squares of the fixed components of W is formed in */ + /* WFIXSQ, and the free components of W are set to BIGSTP. */ + + bigstp = adelt + adelt; + iflag = 0; +L100: + wfixsq = zero; + ggfree = zero; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + w[i__] = zero; + /* Computing MIN */ + d__1 = xopt[i__] - sl[i__], d__2 = glag[i__]; + tempa = std::min(d__1,d__2); + /* Computing MAX */ + d__1 = xopt[i__] - su[i__], d__2 = glag[i__]; + tempb = std::max(d__1,d__2); + if (tempa > zero || tempb < zero) { + w[i__] = bigstp; + /* Computing 2nd power */ + d__1 = glag[i__]; + ggfree += d__1 * d__1; + } + /* L110: */ + } + if (ggfree == zero) { + cauchy = zero; + goto L200; + } + + /* Investigate whether more components of W can be fixed. */ + +L120: + temp = adelt * adelt - wfixsq; + if (temp > zero) { + wsqsav = wfixsq; + step = std::sqrt(temp / ggfree); + ggfree = zero; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (w[i__] == bigstp) { + temp = xopt[i__] - step * glag[i__]; + if (temp <= sl[i__]) { + w[i__] = sl[i__] - xopt[i__]; + /* Computing 2nd power */ + d__1 = w[i__]; + wfixsq += d__1 * d__1; + } else if (temp >= su[i__]) { + w[i__] = su[i__] - xopt[i__]; + /* Computing 2nd power */ + d__1 = w[i__]; + wfixsq += d__1 * d__1; + } else { + /* Computing 2nd power */ + d__1 = glag[i__]; + ggfree += d__1 * d__1; + } + } + /* L130: */ + } + if (wfixsq > wsqsav && ggfree > zero) { + goto L120; + } + } + + /* Set the remaining free components of W and all components of XALT, */ + /* except that W may be scaled later. */ + + gw = zero; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (w[i__] == bigstp) { + w[i__] = -step * glag[i__]; + /* Computing MAX */ + /* Computing MIN */ + d__3 = su[i__], d__4 = xopt[i__] + w[i__]; + d__1 = sl[i__], d__2 = std::min(d__3,d__4); + xalt[i__] = std::max(d__1,d__2); + } else if (w[i__] == zero) { + xalt[i__] = xopt[i__]; + } else if (glag[i__] > zero) { + xalt[i__] = sl[i__]; + } else { + xalt[i__] = su[i__]; + } + /* L140: */ + gw += glag[i__] * w[i__]; + } + + /* Set CURV to the curvature of the KNEW-th Lagrange function along W. */ + /* Scale W by a factor less than one if that can reduce the modulus of */ + /* the Lagrange function at XOPT+W. Set CAUCHY to the final value of */ + /* the square of this function. */ + + curv = zero; + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + temp = zero; + i__2 = n; + for (j = 1; j <= i__2; ++j) { + /* L150: */ + temp += xpt[k + j * xpt_dim1] * w[j]; + } + /* L160: */ + curv += hcol[k] * temp * temp; + } + if (iflag == 1) { + curv = -curv; + } + if (curv > -gw && curv < -const__ * gw) { + scale = -gw / curv; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + temp = xopt[i__] + scale * w[i__]; + /* L170: */ + /* Computing MAX */ + /* Computing MIN */ + d__3 = su[i__]; + d__1 = sl[i__], d__2 = std::min(d__3,temp); + xalt[i__] = std::max(d__1,d__2); + } + /* Computing 2nd power */ + d__1 = half * gw * scale; + cauchy = d__1 * d__1; + } else { + /* Computing 2nd power */ + d__1 = gw + half * curv; + cauchy = d__1 * d__1; + } + + /* If IFLAG is zero, then XALT is calculated as before after reversing */ + /* the sign of GLAG. Thus two XALT vectors become available. The one that */ + /* is chosen is the one that gives the larger value of CAUCHY. */ + + if (iflag == 0) { + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + glag[i__] = -glag[i__]; + /* L180: */ + w[n + i__] = xalt[i__]; + } + csave = cauchy; + iflag = 1; + goto L100; + } + if (csave > cauchy) { + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + /* L190: */ + xalt[i__] = w[n + i__]; + } + cauchy = csave; + } +L200: + ; + } /* altmov_ */ + + // ---------------------------------------------------------------------------------------- + + template <typename funct> + void prelim_( + const funct& calfun, + const integer n, + const integer npt, + doublereal *x, + const doublereal *xl, + const doublereal *xu, + const doublereal rhobeg, + const integer maxfun, + doublereal *xbase, + doublereal *xpt, + doublereal *fval, + doublereal *gopt, + doublereal *hq, + doublereal *pq, + doublereal *bmat, + doublereal *zmat, + const integer ndim, + const doublereal *sl, + const doublereal *su, + integer& nf, + integer& kopt + ) const + { + /* System generated locals */ + integer xpt_dim1, xpt_offset, bmat_dim1, bmat_offset, zmat_dim1, + zmat_offset, i__1, i__2; + doublereal d__1, d__2, d__3, d__4; + + + /* Local variables */ + doublereal f; + integer i__, j, k, ih, np, nfm; + doublereal one; + integer nfx = 0, ipt = 0, jpt = 0; + doublereal two = 0, fbeg = 0, diff = 0, half = 0, temp = 0, zero = 0, recip = 0, stepa = 0, stepb = 0; + integer itemp; + doublereal rhosq; + + + + /* The arguments N, NPT, X, XL, XU, RHOBEG, IPRINT and MAXFUN are the */ + /* same as the corresponding arguments in SUBROUTINE BOBYQA. */ + /* The arguments XBASE, XPT, FVAL, HQ, PQ, BMAT, ZMAT, NDIM, SL and SU */ + /* are the same as the corresponding arguments in BOBYQB, the elements */ + /* of SL and SU being set in BOBYQA. */ + /* GOPT is usually the gradient of the quadratic model at XOPT+XBASE, but */ + /* it is set by PRELIM to the gradient of the quadratic model at XBASE. */ + /* If XOPT is nonzero, BOBYQB will change it to its usual value later. */ + /* NF is maintaned as the number of calls of CALFUN so far. */ + /* KOPT will be such that the least calculated value of F so far is at */ + /* the point XPT(KOPT,.)+XBASE in the space of the variables. */ + + /* SUBROUTINE PRELIM sets the elements of XBASE, XPT, FVAL, GOPT, HQ, PQ, */ + /* BMAT and ZMAT for the first iteration, and it maintains the values of */ + /* NF and KOPT. The vector X is also changed by PRELIM. */ + + /* Set some constants. */ + + /* Parameter adjustments */ + zmat_dim1 = npt; + zmat_offset = 1 + zmat_dim1; + zmat -= zmat_offset; + xpt_dim1 = npt; + xpt_offset = 1 + xpt_dim1; + xpt -= xpt_offset; + --x; + --xl; + --xu; + --xbase; + --fval; + --gopt; + --hq; + --pq; + bmat_dim1 = ndim; + bmat_offset = 1 + bmat_dim1; + bmat -= bmat_offset; + --sl; + --su; + + /* Function Body */ + half = .5; + one = 1.; + two = 2.; + zero = 0.; + rhosq = rhobeg * rhobeg; + recip = one / rhosq; + np = n + 1; + + /* Set XBASE to the initial vector of variables, and set the initial */ + /* elements of XPT, BMAT, HQ, PQ and ZMAT to zero. */ + + i__1 = n; + for (j = 1; j <= i__1; ++j) { + xbase[j] = x[j]; + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + /* L10: */ + xpt[k + j * xpt_dim1] = zero; + } + i__2 = ndim; + for (i__ = 1; i__ <= i__2; ++i__) { + /* L20: */ + bmat[i__ + j * bmat_dim1] = zero; + } + } + i__2 = n * np / 2; + for (ih = 1; ih <= i__2; ++ih) { + /* L30: */ + hq[ih] = zero; + } + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + pq[k] = zero; + i__1 = npt - np; + for (j = 1; j <= i__1; ++j) { + /* L40: */ + zmat[k + j * zmat_dim1] = zero; + } + } + + /* Begin the initialization procedure. NF becomes one more than the number */ + /* of function values so far. The coordinates of the displacement of the */ + /* next initial interpolation point from XBASE are set in XPT(NF+1,.). */ + + nf = 0; +L50: + nfm = nf; + nfx = nf - n; + ++(nf); + if (nfm <= n << 1) { + if (nfm >= 1 && nfm <= n) { + stepa = rhobeg; + if (su[nfm] == zero) { + stepa = -stepa; + } + xpt[nf + nfm * xpt_dim1] = stepa; + } else if (nfm > n) { + stepa = xpt[nf - n + nfx * xpt_dim1]; + stepb = -(rhobeg); + if (sl[nfx] == zero) { + /* Computing MIN */ + d__1 = two * rhobeg, d__2 = su[nfx]; + stepb = std::min(d__1,d__2); + } + if (su[nfx] == zero) { + /* Computing MAX */ + d__1 = -two * rhobeg, d__2 = sl[nfx]; + stepb = std::max(d__1,d__2); + } + xpt[nf + nfx * xpt_dim1] = stepb; + } + } else { + itemp = (nfm - np) / n; + jpt = nfm - itemp * n - n; + ipt = jpt + itemp; + if (ipt > n) { + itemp = jpt; + jpt = ipt - n; + ipt = itemp; + } + xpt[nf + ipt * xpt_dim1] = xpt[ipt + 1 + ipt * xpt_dim1]; + xpt[nf + jpt * xpt_dim1] = xpt[jpt + 1 + jpt * xpt_dim1]; + } + + /* Calculate the next value of F. The least function value so far and */ + /* its index are required. */ + + i__1 = n; + for (j = 1; j <= i__1; ++j) { + /* Computing MIN */ + /* Computing MAX */ + d__3 = xl[j], d__4 = xbase[j] + xpt[nf + j * xpt_dim1]; + d__1 = std::max(d__3,d__4), d__2 = xu[j]; + x[j] = std::min(d__1,d__2); + if (xpt[nf + j * xpt_dim1] == sl[j]) { + x[j] = xl[j]; + } + if (xpt[nf + j * xpt_dim1] == su[j]) { + x[j] = xu[j]; + } + /* L60: */ + } + f = calfun(mat(&x[1],n)); + fval[nf] = f; + if (nf == 1) { + fbeg = f; + kopt = 1; + } else if (f < fval[kopt]) { + kopt = nf; + } + + /* Set the nonzero initial elements of BMAT and the quadratic model in the */ + /* cases when NF is at most 2*N+1. If NF exceeds N+1, then the positions */ + /* of the NF-th and (NF-N)-th interpolation points may be switched, in */ + /* order that the function value at the first of them contributes to the */ + /* off-diagonal second derivative terms of the initial quadratic model. */ + + if (nf <= (n << 1) + 1) { + if (nf >= 2 && nf <= n + 1) { + gopt[nfm] = (f - fbeg) / stepa; + if (npt < nf + n) { + bmat[nfm * bmat_dim1 + 1] = -one / stepa; + bmat[nf + nfm * bmat_dim1] = one / stepa; + bmat[npt + nfm + nfm * bmat_dim1] = -half * rhosq; + } + } else if (nf >= n + 2) { + ih = nfx * (nfx + 1) / 2; + temp = (f - fbeg) / stepb; + diff = stepb - stepa; + hq[ih] = two * (temp - gopt[nfx]) / diff; + gopt[nfx] = (gopt[nfx] * stepb - temp * stepa) / diff; + if (stepa * stepb < zero) { + if (f < fval[nf - n]) { + fval[nf] = fval[nf - n]; + fval[nf - n] = f; + if (kopt == nf) { + kopt = nf - n; + } + xpt[nf - n + nfx * xpt_dim1] = stepb; + xpt[nf + nfx * xpt_dim1] = stepa; + } + } + bmat[nfx * bmat_dim1 + 1] = -(stepa + stepb) / (stepa * stepb); + bmat[nf + nfx * bmat_dim1] = -half / xpt[nf - n + nfx * + xpt_dim1]; + bmat[nf - n + nfx * bmat_dim1] = -bmat[nfx * bmat_dim1 + 1] - + bmat[nf + nfx * bmat_dim1]; + zmat[nfx * zmat_dim1 + 1] = std::sqrt(two) / (stepa * stepb); + zmat[nf + nfx * zmat_dim1] = std::sqrt(half) / rhosq; + zmat[nf - n + nfx * zmat_dim1] = -zmat[nfx * zmat_dim1 + 1] - + zmat[nf + nfx * zmat_dim1]; + } + + /* Set the off-diagonal second derivatives of the Lagrange functions and */ + /* the initial quadratic model. */ + + } else { + ih = ipt * (ipt - 1) / 2 + jpt; + zmat[nfx * zmat_dim1 + 1] = recip; + zmat[nf + nfx * zmat_dim1] = recip; + zmat[ipt + 1 + nfx * zmat_dim1] = -recip; + zmat[jpt + 1 + nfx * zmat_dim1] = -recip; + temp = xpt[nf + ipt * xpt_dim1] * xpt[nf + jpt * xpt_dim1]; + hq[ih] = (fbeg - fval[ipt + 1] - fval[jpt + 1] + f) / temp; + } + if (nf < npt && nf < maxfun) { + goto L50; + } + + } /* prelim_ */ + + // ---------------------------------------------------------------------------------------- + + template <typename funct> + void rescue_ ( + const funct& calfun, + const integer n, + const integer npt, + const doublereal *xl, + const doublereal *xu, + const integer maxfun, + doublereal *xbase, + doublereal *xpt, + doublereal *fval, + doublereal *xopt, + doublereal *gopt, + doublereal *hq, + doublereal *pq, + doublereal *bmat, + doublereal *zmat, + const integer ndim, + doublereal *sl, + doublereal *su, + integer& nf, + const doublereal delta, + integer& kopt, + doublereal *vlag, + doublereal * ptsaux, + doublereal *ptsid, + doublereal *w + ) const + { + /* System generated locals */ + integer xpt_dim1, xpt_offset, bmat_dim1, bmat_offset, zmat_dim1, + zmat_offset, i__1, i__2, i__3; + doublereal d__1, d__2, d__3, d__4; + + + /* Local variables */ + doublereal f; + integer i__, j, k, ih, jp, ip, iq, np, iw; + doublereal xp = 0, xq = 0, den = 0; + integer ihp = 0; + doublereal one; + integer ihq, jpn, kpt; + doublereal sum = 0, diff = 0, half = 0, beta = 0; + integer kold; + doublereal winc; + integer nrem, knew; + doublereal temp, bsum; + integer nptm; + doublereal zero = 0, hdiag = 0, fbase = 0, sfrac = 0, denom = 0, vquad = 0, sumpq = 0; + doublereal dsqmin, distsq, vlmxsq; + + + + /* The arguments N, NPT, XL, XU, IPRINT, MAXFUN, XBASE, XPT, FVAL, XOPT, */ + /* GOPT, HQ, PQ, BMAT, ZMAT, NDIM, SL and SU have the same meanings as */ + /* the corresponding arguments of BOBYQB on the entry to RESCUE. */ + /* NF is maintained as the number of calls of CALFUN so far, except that */ + /* NF is set to -1 if the value of MAXFUN prevents further progress. */ + /* KOPT is maintained so that FVAL(KOPT) is the least calculated function */ + /* value. Its correct value must be given on entry. It is updated if a */ + /* new least function value is found, but the corresponding changes to */ + /* XOPT and GOPT have to be made later by the calling program. */ + /* DELTA is the current trust region radius. */ + /* VLAG is a working space vector that will be used for the values of the */ + /* provisional Lagrange functions at each of the interpolation points. */ + /* They are part of a product that requires VLAG to be of length NDIM. */ + /* PTSAUX is also a working space array. For J=1,2,...,N, PTSAUX(1,J) and */ + /* PTSAUX(2,J) specify the two positions of provisional interpolation */ + /* points when a nonzero step is taken along e_J (the J-th coordinate */ + /* direction) through XBASE+XOPT, as specified below. Usually these */ + /* steps have length DELTA, but other lengths are chosen if necessary */ + /* in order to satisfy the given bounds on the variables. */ + /* PTSID is also a working space array. It has NPT components that denote */ + /* provisional new positions of the original interpolation points, in */ + /* case changes are needed to restore the linear independence of the */ + /* interpolation conditions. The K-th point is a candidate for change */ + /* if and only if PTSID(K) is nonzero. In this case let p and q be the */ + /* integer parts of PTSID(K) and (PTSID(K)-p) multiplied by N+1. If p */ + /* and q are both positive, the step from XBASE+XOPT to the new K-th */ + /* interpolation point is PTSAUX(1,p)*e_p + PTSAUX(1,q)*e_q. Otherwise */ + /* the step is PTSAUX(1,p)*e_p or PTSAUX(2,q)*e_q in the cases q=0 or */ + /* p=0, respectively. */ + /* The first NDIM+NPT elements of the array W are used for working space. */ + /* The final elements of BMAT and ZMAT are set in a well-conditioned way */ + /* to the values that are appropriate for the new interpolation points. */ + /* The elements of GOPT, HQ and PQ are also revised to the values that are */ + /* appropriate to the final quadratic model. */ + + /* Set some constants. */ + + /* Parameter adjustments */ + zmat_dim1 = npt; + zmat_offset = 1 + zmat_dim1; + zmat -= zmat_offset; + xpt_dim1 = npt; + xpt_offset = 1 + xpt_dim1; + xpt -= xpt_offset; + --xl; + --xu; + --xbase; + --fval; + --xopt; + --gopt; + --hq; + --pq; + bmat_dim1 = ndim; + bmat_offset = 1 + bmat_dim1; + bmat -= bmat_offset; + --sl; + --su; + --vlag; + ptsaux -= 3; + --ptsid; + --w; + + /* Function Body */ + half = .5; + one = 1.; + zero = 0.; + np = n + 1; + sfrac = half / (doublereal) np; + nptm = npt - np; + + /* Shift the interpolation points so that XOPT becomes the origin, and set */ + /* the elements of ZMAT to zero. The value of SUMPQ is required in the */ + /* updating of HQ below. The squares of the distances from XOPT to the */ + /* other interpolation points are set at the end of W. Increments of WINC */ + /* may be added later to these squares to balance the consideration of */ + /* the choice of point that is going to become current. */ + + sumpq = zero; + winc = zero; + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + distsq = zero; + i__2 = n; + for (j = 1; j <= i__2; ++j) { + xpt[k + j * xpt_dim1] -= xopt[j]; + /* L10: */ + /* Computing 2nd power */ + d__1 = xpt[k + j * xpt_dim1]; + distsq += d__1 * d__1; + } + sumpq += pq[k]; + w[ndim + k] = distsq; + winc = std::max(winc,distsq); + i__2 = nptm; + for (j = 1; j <= i__2; ++j) { + /* L20: */ + zmat[k + j * zmat_dim1] = zero; + } + } + + /* Update HQ so that HQ and PQ define the second derivatives of the model */ + /* after XBASE has been shifted to the trust region centre. */ + + ih = 0; + i__2 = n; + for (j = 1; j <= i__2; ++j) { + w[j] = half * sumpq * xopt[j]; + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + /* L30: */ + w[j] += pq[k] * xpt[k + j * xpt_dim1]; + } + i__1 = j; + for (i__ = 1; i__ <= i__1; ++i__) { + ++ih; + /* L40: */ + hq[ih] = hq[ih] + w[i__] * xopt[j] + w[j] * xopt[i__]; + } + } + + /* Shift XBASE, SL, SU and XOPT. Set the elements of BMAT to zero, and */ + /* also set the elements of PTSAUX. */ + + i__1 = n; + for (j = 1; j <= i__1; ++j) { + xbase[j] += xopt[j]; + sl[j] -= xopt[j]; + su[j] -= xopt[j]; + xopt[j] = zero; + /* Computing MIN */ + d__1 = delta, d__2 = su[j]; + ptsaux[(j << 1) + 1] = std::min(d__1,d__2); + /* Computing MAX */ + d__1 = -(delta), d__2 = sl[j]; + ptsaux[(j << 1) + 2] = std::max(d__1,d__2); + if (ptsaux[(j << 1) + 1] + ptsaux[(j << 1) + 2] < zero) { + temp = ptsaux[(j << 1) + 1]; + ptsaux[(j << 1) + 1] = ptsaux[(j << 1) + 2]; + ptsaux[(j << 1) + 2] = temp; + } + if ((d__2 = ptsaux[(j << 1) + 2], std::abs(d__2)) < half * (d__1 = ptsaux[( + j << 1) + 1], std::abs(d__1))) { + ptsaux[(j << 1) + 2] = half * ptsaux[(j << 1) + 1]; + } + i__2 = ndim; + for (i__ = 1; i__ <= i__2; ++i__) { + /* L50: */ + bmat[i__ + j * bmat_dim1] = zero; + } + } + fbase = fval[kopt]; + + /* Set the identifiers of the artificial interpolation points that are */ + /* along a coordinate direction from XOPT, and set the corresponding */ + /* nonzero elements of BMAT and ZMAT. */ + + ptsid[1] = sfrac; + i__2 = n; + for (j = 1; j <= i__2; ++j) { + jp = j + 1; + jpn = jp + n; + ptsid[jp] = (doublereal) j + sfrac; + if (jpn <= npt) { + ptsid[jpn] = (doublereal) j / (doublereal) np + sfrac; + temp = one / (ptsaux[(j << 1) + 1] - ptsaux[(j << 1) + 2]); + bmat[jp + j * bmat_dim1] = -temp + one / ptsaux[(j << 1) + 1]; + bmat[jpn + j * bmat_dim1] = temp + one / ptsaux[(j << 1) + 2]; + bmat[j * bmat_dim1 + 1] = -bmat[jp + j * bmat_dim1] - bmat[jpn + + j * bmat_dim1]; + zmat[j * zmat_dim1 + 1] = std::sqrt(2.) / (d__1 = ptsaux[(j << 1) + 1] + * ptsaux[(j << 1) + 2], std::abs(d__1)); + zmat[jp + j * zmat_dim1] = zmat[j * zmat_dim1 + 1] * ptsaux[(j << + 1) + 2] * temp; + zmat[jpn + j * zmat_dim1] = -zmat[j * zmat_dim1 + 1] * ptsaux[(j + << 1) + 1] * temp; + } else { + bmat[j * bmat_dim1 + 1] = -one / ptsaux[(j << 1) + 1]; + bmat[jp + j * bmat_dim1] = one / ptsaux[(j << 1) + 1]; + /* Computing 2nd power */ + d__1 = ptsaux[(j << 1) + 1]; + bmat[j + npt + j * bmat_dim1] = -half * (d__1 * d__1); + } + /* L60: */ + } + + /* Set any remaining identifiers with their nonzero elements of ZMAT. */ + + if (npt >= n + np) { + i__2 = npt; + for (k = np << 1; k <= i__2; ++k) { + iw = (integer) (((doublereal) (k - np) - half) / (doublereal) (n) + ); + ip = k - np - iw * n; + iq = ip + iw; + if (iq > n) { + iq -= n; + } + ptsid[k] = (doublereal) ip + (doublereal) iq / (doublereal) np + + sfrac; + temp = one / (ptsaux[(ip << 1) + 1] * ptsaux[(iq << 1) + 1]); + zmat[(k - np) * zmat_dim1 + 1] = temp; + zmat[ip + 1 + (k - np) * zmat_dim1] = -temp; + zmat[iq + 1 + (k - np) * zmat_dim1] = -temp; + /* L70: */ + zmat[k + (k - np) * zmat_dim1] = temp; + } + } + nrem = npt; + kold = 1; + knew = kopt; + + /* Reorder the provisional points in the way that exchanges PTSID(KOLD) */ + /* with PTSID(KNEW). */ + +L80: + i__2 = n; + for (j = 1; j <= i__2; ++j) { + temp = bmat[kold + j * bmat_dim1]; + bmat[kold + j * bmat_dim1] = bmat[knew + j * bmat_dim1]; + /* L90: */ + bmat[knew + j * bmat_dim1] = temp; + } + i__2 = nptm; + for (j = 1; j <= i__2; ++j) { + temp = zmat[kold + j * zmat_dim1]; + zmat[kold + j * zmat_dim1] = zmat[knew + j * zmat_dim1]; + /* L100: */ + zmat[knew + j * zmat_dim1] = temp; + } + ptsid[kold] = ptsid[knew]; + ptsid[knew] = zero; + w[ndim + knew] = zero; + --nrem; + if (knew != kopt) { + temp = vlag[kold]; + vlag[kold] = vlag[knew]; + vlag[knew] = temp; + + /* Update the BMAT and ZMAT matrices so that the status of the KNEW-th */ + /* interpolation point can be changed from provisional to original. The */ + /* branch to label 350 occurs if all the original points are reinstated. */ + /* The nonnegative values of W(NDIM+K) are required in the search below. */ + + update_(n, npt, &bmat[bmat_offset], &zmat[zmat_offset], ndim, &vlag[1], + beta, denom, knew, &w[1]); + if (nrem == 0) { + goto L350; + } + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + /* L110: */ + w[ndim + k] = (d__1 = w[ndim + k], std::abs(d__1)); + } + } + + /* Pick the index KNEW of an original interpolation point that has not */ + /* yet replaced one of the provisional interpolation points, giving */ + /* attention to the closeness to XOPT and to previous tries with KNEW. */ + +L120: + dsqmin = zero; + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + if (w[ndim + k] > zero) { + if (dsqmin == zero || w[ndim + k] < dsqmin) { + knew = k; + dsqmin = w[ndim + k]; + } + } + /* L130: */ + } + if (dsqmin == zero) { + goto L260; + } + + /* Form the W-vector of the chosen original interpolation point. */ + + i__2 = n; + for (j = 1; j <= i__2; ++j) { + /* L140: */ + w[npt + j] = xpt[knew + j * xpt_dim1]; + } + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + sum = zero; + if (k == kopt) { + } else if (ptsid[k] == zero) { + i__1 = n; + for (j = 1; j <= i__1; ++j) { + /* L150: */ + sum += w[npt + j] * xpt[k + j * xpt_dim1]; + } + } else { + ip = (integer) ptsid[k]; + if (ip > 0) { + sum = w[npt + ip] * ptsaux[(ip << 1) + 1]; + } + iq = (integer) ((doublereal) np * ptsid[k] - (doublereal) (ip * + np)); + if (iq > 0) { + iw = 1; + if (ip == 0) { + iw = 2; + } + sum += w[npt + iq] * ptsaux[iw + (iq << 1)]; + } + } + /* L160: */ + w[k] = half * sum * sum; + } + + /* Calculate VLAG and BETA for the required updating of the H matrix if */ + /* XPT(KNEW,.) is reinstated in the set of interpolation points. */ + + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + sum = zero; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + /* L170: */ + sum += bmat[k + j * bmat_dim1] * w[npt + j]; + } + /* L180: */ + vlag[k] = sum; + } + beta = zero; + i__2 = nptm; + for (j = 1; j <= i__2; ++j) { + sum = zero; + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + /* L190: */ + sum += zmat[k + j * zmat_dim1] * w[k]; + } + beta -= sum * sum; + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + /* L200: */ + vlag[k] += sum * zmat[k + j * zmat_dim1]; + } + } + bsum = zero; + distsq = zero; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + sum = zero; + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + /* L210: */ + sum += bmat[k + j * bmat_dim1] * w[k]; + } + jp = j + npt; + bsum += sum * w[jp]; + i__2 = ndim; + for (ip = npt + 1; ip <= i__2; ++ip) { + /* L220: */ + sum += bmat[ip + j * bmat_dim1] * w[ip]; + } + bsum += sum * w[jp]; + vlag[jp] = sum; + /* L230: */ + /* Computing 2nd power */ + d__1 = xpt[knew + j * xpt_dim1]; + distsq += d__1 * d__1; + } + beta = half * distsq * distsq + beta - bsum; + vlag[kopt] += one; + + /* KOLD is set to the index of the provisional interpolation point that is */ + /* going to be deleted to make way for the KNEW-th original interpolation */ + /* point. The choice of KOLD is governed by the avoidance of a small value */ + /* of the denominator in the updating calculation of UPDATE. */ + + denom = zero; + vlmxsq = zero; + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + if (ptsid[k] != zero) { + hdiag = zero; + i__2 = nptm; + for (j = 1; j <= i__2; ++j) { + /* L240: */ + /* Computing 2nd power */ + d__1 = zmat[k + j * zmat_dim1]; + hdiag += d__1 * d__1; + } + /* Computing 2nd power */ + d__1 = vlag[k]; + den = beta * hdiag + d__1 * d__1; + if (den > denom) { + kold = k; + denom = den; + } + } + /* L250: */ + /* Computing MAX */ + /* Computing 2nd power */ + d__3 = vlag[k]; + d__1 = vlmxsq, d__2 = d__3 * d__3; + vlmxsq = std::max(d__1,d__2); + } + if (denom <= vlmxsq * .01) { + w[ndim + knew] = -w[ndim + knew] - winc; + goto L120; + } + goto L80; + + /* When label 260 is reached, all the final positions of the interpolation */ + /* points have been chosen although any changes have not been included yet */ + /* in XPT. Also the final BMAT and ZMAT matrices are complete, but, apart */ + /* from the shift of XBASE, the updating of the quadratic model remains to */ + /* be done. The following cycle through the new interpolation points begins */ + /* by putting the new point in XPT(KPT,.) and by setting PQ(KPT) to zero, */ + /* except that a RETURN occurs if MAXFUN prohibits another value of F. */ + +L260: + i__1 = npt; + for (kpt = 1; kpt <= i__1; ++kpt) { + if (ptsid[kpt] == zero) { + goto L340; + } + if (nf >= maxfun) { + nf = -1; + goto L350; + } + ih = 0; + i__2 = n; + for (j = 1; j <= i__2; ++j) { + w[j] = xpt[kpt + j * xpt_dim1]; + xpt[kpt + j * xpt_dim1] = zero; + temp = pq[kpt] * w[j]; + i__3 = j; + for (i__ = 1; i__ <= i__3; ++i__) { + ++ih; + /* L270: */ + hq[ih] += temp * w[i__]; + } + } + pq[kpt] = zero; + ip = (integer) ptsid[kpt]; + iq = (integer) ((doublereal) np * ptsid[kpt] - (doublereal) (ip * np)) + ; + if (ip > 0) { + xp = ptsaux[(ip << 1) + 1]; + xpt[kpt + ip * xpt_dim1] = xp; + } + if (iq > 0) { + xq = ptsaux[(iq << 1) + 1]; + if (ip == 0) { + xq = ptsaux[(iq << 1) + 2]; + } + xpt[kpt + iq * xpt_dim1] = xq; + } + + /* Set VQUAD to the value of the current model at the new point. */ + + vquad = fbase; + if (ip > 0) { + ihp = (ip + ip * ip) / 2; + vquad += xp * (gopt[ip] + half * xp * hq[ihp]); + } + if (iq > 0) { + ihq = (iq + iq * iq) / 2; + vquad += xq * (gopt[iq] + half * xq * hq[ihq]); + if (ip > 0) { + iw = std::max(ihp,ihq) - (i__3 = ip - iq, std::abs(i__3)); + vquad += xp * xq * hq[iw]; + } + } + i__3 = npt; + for (k = 1; k <= i__3; ++k) { + temp = zero; + if (ip > 0) { + temp += xp * xpt[k + ip * xpt_dim1]; + } + if (iq > 0) { + temp += xq * xpt[k + iq * xpt_dim1]; + } + /* L280: */ + vquad += half * pq[k] * temp * temp; + } + + /* Calculate F at the new interpolation point, and set DIFF to the factor */ + /* that is going to multiply the KPT-th Lagrange function when the model */ + /* is updated to provide interpolation to the new function value. */ + + i__3 = n; + for (i__ = 1; i__ <= i__3; ++i__) { + /* Computing MIN */ + /* Computing MAX */ + d__3 = xl[i__], d__4 = xbase[i__] + xpt[kpt + i__ * xpt_dim1]; + d__1 = std::max(d__3,d__4), d__2 = xu[i__]; + w[i__] = std::min(d__1,d__2); + if (xpt[kpt + i__ * xpt_dim1] == sl[i__]) { + w[i__] = xl[i__]; + } + if (xpt[kpt + i__ * xpt_dim1] == su[i__]) { + w[i__] = xu[i__]; + } + /* L290: */ + } + ++(nf); + f = calfun(mat(&w[1],n)); + fval[kpt] = f; + if (f < fval[kopt]) { + kopt = kpt; + } + diff = f - vquad; + + /* Update the quadratic model. The RETURN from the subroutine occurs when */ + /* all the new interpolation points are included in the model. */ + + i__3 = n; + for (i__ = 1; i__ <= i__3; ++i__) { + /* L310: */ + gopt[i__] += diff * bmat[kpt + i__ * bmat_dim1]; + } + i__3 = npt; + for (k = 1; k <= i__3; ++k) { + sum = zero; + i__2 = nptm; + for (j = 1; j <= i__2; ++j) { + /* L320: */ + sum += zmat[k + j * zmat_dim1] * zmat[kpt + j * zmat_dim1]; + } + temp = diff * sum; + if (ptsid[k] == zero) { + pq[k] += temp; + } else { + ip = (integer) ptsid[k]; + iq = (integer) ((doublereal) np * ptsid[k] - (doublereal) (ip + * np)); + ihq = (iq * iq + iq) / 2; + if (ip == 0) { + /* Computing 2nd power */ + d__1 = ptsaux[(iq << 1) + 2]; + hq[ihq] += temp * (d__1 * d__1); + } else { + ihp = (ip * ip + ip) / 2; + /* Computing 2nd power */ + d__1 = ptsaux[(ip << 1) + 1]; + hq[ihp] += temp * (d__1 * d__1); + if (iq > 0) { + /* Computing 2nd power */ + d__1 = ptsaux[(iq << 1) + 1]; + hq[ihq] += temp * (d__1 * d__1); + iw = std::max(ihp,ihq) - (i__2 = iq - ip, std::abs(i__2)); + hq[iw] += temp * ptsaux[(ip << 1) + 1] * ptsaux[(iq << + 1) + 1]; + } + } + } + /* L330: */ + } + ptsid[kpt] = zero; +L340: + ; + } +L350: + ; + } /* rescue_ */ + + // ---------------------------------------------------------------------------------------- + + void trsbox_( + const integer n, + const integer npt, + const doublereal *xpt, + const doublereal *xopt, + const doublereal *gopt, + const doublereal *hq, + const doublereal *pq, + const doublereal *sl, + const doublereal *su, + const doublereal delta, + doublereal *xnew, + doublereal *d__, + doublereal *gnew, + doublereal *xbdi, + doublereal *s, + doublereal *hs, + doublereal *hred, + doublereal *dsq, + doublereal *crvmin + ) const + { + /* System generated locals */ + integer xpt_dim1, xpt_offset, i__1, i__2; + doublereal d__1, d__2, d__3, d__4; + + /* Local variables */ + integer i__, j, k, ih; + doublereal ds; + integer iu; + doublereal dhd, dhs, cth, one, shs, sth, ssq, half, beta, sdec, blen; + integer iact = 0, nact = 0; + doublereal angt, qred; + integer isav; + doublereal temp = 0, zero = 0, xsav = 0, xsum = 0, angbd = 0, dredg = 0, sredg = 0; + integer iterc; + doublereal resid = 0, delsq = 0, ggsav = 0, tempa = 0, tempb = 0, + redmax = 0, dredsq = 0, redsav = 0, onemin = 0, gredsq = 0, rednew = 0; + integer itcsav = 0; + doublereal rdprev = 0, rdnext = 0, stplen = 0, stepsq = 0; + integer itermax = 0; + + + /* The arguments N, NPT, XPT, XOPT, GOPT, HQ, PQ, SL and SU have the same */ + /* meanings as the corresponding arguments of BOBYQB. */ + /* DELTA is the trust region radius for the present calculation, which */ + /* seeks a small value of the quadratic model within distance DELTA of */ + /* XOPT subject to the bounds on the variables. */ + /* XNEW will be set to a new vector of variables that is approximately */ + /* the one that minimizes the quadratic model within the trust region */ + /* subject to the SL and SU constraints on the variables. It satisfies */ + /* as equations the bounds that become active during the calculation. */ + /* D is the calculated trial step from XOPT, generated iteratively from an */ + /* initial value of zero. Thus XNEW is XOPT+D after the final iteration. */ + /* GNEW holds the gradient of the quadratic model at XOPT+D. It is updated */ + /* when D is updated. */ + /* XBDI is a working space vector. For I=1,2,...,N, the element XBDI(I) is */ + /* set to -1.0, 0.0, or 1.0, the value being nonzero if and only if the */ + /* I-th variable has become fixed at a bound, the bound being SL(I) or */ + /* SU(I) in the case XBDI(I)=-1.0 or XBDI(I)=1.0, respectively. This */ + /* information is accumulated during the construction of XNEW. */ + /* The arrays S, HS and HRED are also used for working space. They hold the */ + /* current search direction, and the changes in the gradient of Q along S */ + /* and the reduced D, respectively, where the reduced D is the same as D, */ + /* except that the components of the fixed variables are zero. */ + /* DSQ will be set to the square of the length of XNEW-XOPT. */ + /* CRVMIN is set to zero if D reaches the trust region boundary. Otherwise */ + /* it is set to the least curvature of H that occurs in the conjugate */ + /* gradient searches that are not restricted by any constraints. The */ + /* value CRVMIN=-1.0D0 is set, however, if all of these searches are */ + /* constrained. */ + + /* A version of the truncated conjugate gradient is applied. If a line */ + /* search is restricted by a constraint, then the procedure is restarted, */ + /* the values of the variables that are at their bounds being fixed. If */ + /* the trust region boundary is reached, then further changes may be made */ + /* to D, each one being in the two dimensional space that is spanned */ + /* by the current D and the gradient of Q at XOPT+D, staying on the trust */ + /* region boundary. Termination occurs when the reduction in Q seems to */ + /* be close to the greatest reduction that can be achieved. */ + + /* Set some constants. */ + + /* Parameter adjustments */ + xpt_dim1 = npt; + xpt_offset = 1 + xpt_dim1; + xpt -= xpt_offset; + --xopt; + --gopt; + --hq; + --pq; + --sl; + --su; + --xnew; + --d__; + --gnew; + --xbdi; + --s; + --hs; + --hred; + + /* Function Body */ + half = .5; + one = 1.; + onemin = -1.; + zero = 0.; + + /* The sign of GOPT(I) gives the sign of the change to the I-th variable */ + /* that will reduce Q from its value at XOPT. Thus XBDI(I) shows whether */ + /* or not to fix the I-th variable at one of its bounds initially, with */ + /* NACT being set to the number of fixed variables. D and GNEW are also */ + /* set for the first iteration. DELSQ is the upper bound on the sum of */ + /* squares of the free variables. QRED is the reduction in Q so far. */ + + iterc = 0; + nact = 0; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + xbdi[i__] = zero; + if (xopt[i__] <= sl[i__]) { + if (gopt[i__] >= zero) { + xbdi[i__] = onemin; + } + } else if (xopt[i__] >= su[i__]) { + if (gopt[i__] <= zero) { + xbdi[i__] = one; + } + } + if (xbdi[i__] != zero) { + ++nact; + } + d__[i__] = zero; + /* L10: */ + gnew[i__] = gopt[i__]; + } + delsq = delta * delta; + qred = zero; + *crvmin = onemin; + + /* Set the next search direction of the conjugate gradient method. It is */ + /* the steepest descent direction initially and when the iterations are */ + /* restarted because a variable has just been fixed by a bound, and of */ + /* course the components of the fixed variables are zero. ITERMAX is an */ + /* upper bound on the indices of the conjugate gradient iterations. */ + +L20: + beta = zero; +L30: + stepsq = zero; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (xbdi[i__] != zero) { + s[i__] = zero; + } else if (beta == zero) { + s[i__] = -gnew[i__]; + } else { + s[i__] = beta * s[i__] - gnew[i__]; + } + /* L40: */ + /* Computing 2nd power */ + d__1 = s[i__]; + stepsq += d__1 * d__1; + } + if (stepsq == zero) { + goto L190; + } + if (beta == zero) { + gredsq = stepsq; + itermax = iterc + n - nact; + } + if (gredsq * delsq <= qred * 1e-4 * qred) { + goto L190; + } + + /* Multiply the search direction by the second derivative matrix of Q and */ + /* calculate some scalars for the choice of steplength. Then set BLEN to */ + /* the length of the the step to the trust region boundary and STPLEN to */ + /* the steplength, ignoring the simple bounds. */ + + goto L210; +L50: + resid = delsq; + ds = zero; + shs = zero; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (xbdi[i__] == zero) { + /* Computing 2nd power */ + d__1 = d__[i__]; + resid -= d__1 * d__1; + ds += s[i__] * d__[i__]; + shs += s[i__] * hs[i__]; + } + /* L60: */ + } + if (resid <= zero) { + goto L90; + } + temp = std::sqrt(stepsq * resid + ds * ds); + if (ds < zero) { + blen = (temp - ds) / stepsq; + } else { + blen = resid / (temp + ds); + } + stplen = blen; + if (shs > zero) { + /* Computing MIN */ + d__1 = blen, d__2 = gredsq / shs; + stplen = std::min(d__1,d__2); + } + + /* Reduce STPLEN if necessary in order to preserve the simple bounds, */ + /* letting IACT be the index of the new constrained variable. */ + + iact = 0; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (s[i__] != zero) { + xsum = xopt[i__] + d__[i__]; + if (s[i__] > zero) { + temp = (su[i__] - xsum) / s[i__]; + } else { + temp = (sl[i__] - xsum) / s[i__]; + } + if (temp < stplen) { + stplen = temp; + iact = i__; + } + } + /* L70: */ + } + + /* Update CRVMIN, GNEW and D. Set SDEC to the decrease that occurs in Q. */ + + sdec = zero; + if (stplen > zero) { + ++iterc; + temp = shs / stepsq; + if (iact == 0 && temp > zero) { + *crvmin = std::min(*crvmin,temp); + if (*crvmin == onemin) { + *crvmin = temp; + } + } + ggsav = gredsq; + gredsq = zero; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + gnew[i__] += stplen * hs[i__]; + if (xbdi[i__] == zero) { + /* Computing 2nd power */ + d__1 = gnew[i__]; + gredsq += d__1 * d__1; + } + /* L80: */ + d__[i__] += stplen * s[i__]; + } + /* Computing MAX */ + d__1 = stplen * (ggsav - half * stplen * shs); + sdec = std::max(d__1,zero); + qred += sdec; + } + + /* Restart the conjugate gradient method if it has hit a new bound. */ + + if (iact > 0) { + ++nact; + xbdi[iact] = one; + if (s[iact] < zero) { + xbdi[iact] = onemin; + } + /* Computing 2nd power */ + d__1 = d__[iact]; + delsq -= d__1 * d__1; + if (delsq <= zero) { + goto L90; + } + goto L20; + } + + /* If STPLEN is less than BLEN, then either apply another conjugate */ + /* gradient iteration or RETURN. */ + + if (stplen < blen) { + if (iterc == itermax) { + goto L190; + } + if (sdec <= qred * .01) { + goto L190; + } + beta = gredsq / ggsav; + goto L30; + } +L90: + *crvmin = zero; + + /* Prepare for the alternative iteration by calculating some scalars */ + /* and by multiplying the reduced D by the second derivative matrix of */ + /* Q, where S holds the reduced D in the call of GGMULT. */ + +L100: + if (nact >= n - 1) { + goto L190; + } + dredsq = zero; + dredg = zero; + gredsq = zero; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (xbdi[i__] == zero) { + /* Computing 2nd power */ + d__1 = d__[i__]; + dredsq += d__1 * d__1; + dredg += d__[i__] * gnew[i__]; + /* Computing 2nd power */ + d__1 = gnew[i__]; + gredsq += d__1 * d__1; + s[i__] = d__[i__]; + } else { + s[i__] = zero; + } + /* L110: */ + } + itcsav = iterc; + goto L210; + + /* Let the search direction S be a linear combination of the reduced D */ + /* and the reduced G that is orthogonal to the reduced D. */ + +L120: + ++iterc; + temp = gredsq * dredsq - dredg * dredg; + if (temp <= qred * 1e-4 * qred) { + goto L190; + } + temp = std::sqrt(temp); + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (xbdi[i__] == zero) { + s[i__] = (dredg * d__[i__] - dredsq * gnew[i__]) / temp; + } else { + s[i__] = zero; + } + /* L130: */ + } + sredg = -temp; + + /* By considering the simple bounds on the variables, calculate an upper */ + /* bound on the tangent of half the angle of the alternative iteration, */ + /* namely ANGBD, except that, if already a free variable has reached a */ + /* bound, there is a branch back to label 100 after fixing that variable. */ + + angbd = one; + iact = 0; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (xbdi[i__] == zero) { + tempa = xopt[i__] + d__[i__] - sl[i__]; + tempb = su[i__] - xopt[i__] - d__[i__]; + if (tempa <= zero) { + ++nact; + xbdi[i__] = onemin; + goto L100; + } else if (tempb <= zero) { + ++nact; + xbdi[i__] = one; + goto L100; + } + /* Computing 2nd power */ + d__1 = d__[i__]; + /* Computing 2nd power */ + d__2 = s[i__]; + ssq = d__1 * d__1 + d__2 * d__2; + /* Computing 2nd power */ + d__1 = xopt[i__] - sl[i__]; + temp = ssq - d__1 * d__1; + if (temp > zero) { + temp = std::sqrt(temp) - s[i__]; + if (angbd * temp > tempa) { + angbd = tempa / temp; + iact = i__; + xsav = onemin; + } + } + /* Computing 2nd power */ + d__1 = su[i__] - xopt[i__]; + temp = ssq - d__1 * d__1; + if (temp > zero) { + temp = std::sqrt(temp) + s[i__]; + if (angbd * temp > tempb) { + angbd = tempb / temp; + iact = i__; + xsav = one; + } + } + } + /* L140: */ + } + + /* Calculate HHD and some curvatures for the alternative iteration. */ + + goto L210; +L150: + shs = zero; + dhs = zero; + dhd = zero; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (xbdi[i__] == zero) { + shs += s[i__] * hs[i__]; + dhs += d__[i__] * hs[i__]; + dhd += d__[i__] * hred[i__]; + } + /* L160: */ + } + + /* Seek the greatest reduction in Q for a range of equally spaced values */ + /* of ANGT in [0,ANGBD], where ANGT is the tangent of half the angle of */ + /* the alternative iteration. */ + + redmax = zero; + isav = 0; + redsav = zero; + iu = (integer) (angbd * 17. + 3.1); + i__1 = iu; + for (i__ = 1; i__ <= i__1; ++i__) { + angt = angbd * (doublereal) i__ / (doublereal) iu; + sth = (angt + angt) / (one + angt * angt); + temp = shs + angt * (angt * dhd - dhs - dhs); + rednew = sth * (angt * dredg - sredg - half * sth * temp); + if (rednew > redmax) { + redmax = rednew; + isav = i__; + rdprev = redsav; + } else if (i__ == isav + 1) { + rdnext = rednew; + } + /* L170: */ + redsav = rednew; + } + + /* Return if the reduction is zero. Otherwise, set the sine and cosine */ + /* of the angle of the alternative iteration, and calculate SDEC. */ + + if (isav == 0) { + goto L190; + } + if (isav < iu) { + temp = (rdnext - rdprev) / (redmax + redmax - rdprev - rdnext); + angt = angbd * ((doublereal) isav + half * temp) / (doublereal) iu; + } + cth = (one - angt * angt) / (one + angt * angt); + sth = (angt + angt) / (one + angt * angt); + temp = shs + angt * (angt * dhd - dhs - dhs); + sdec = sth * (angt * dredg - sredg - half * sth * temp); + if (sdec <= zero) { + goto L190; + } + + /* Update GNEW, D and HRED. If the angle of the alternative iteration */ + /* is restricted by a bound on a free variable, that variable is fixed */ + /* at the bound. */ + + dredg = zero; + gredsq = zero; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + gnew[i__] = gnew[i__] + (cth - one) * hred[i__] + sth * hs[i__]; + if (xbdi[i__] == zero) { + d__[i__] = cth * d__[i__] + sth * s[i__]; + dredg += d__[i__] * gnew[i__]; + /* Computing 2nd power */ + d__1 = gnew[i__]; + gredsq += d__1 * d__1; + } + /* L180: */ + hred[i__] = cth * hred[i__] + sth * hs[i__]; + } + qred += sdec; + if (iact > 0 && isav == iu) { + ++nact; + xbdi[iact] = xsav; + goto L100; + } + + /* If SDEC is sufficiently small, then RETURN after setting XNEW to */ + /* XOPT+D, giving careful attention to the bounds. */ + + if (sdec > qred * .01) { + goto L120; + } +L190: + *dsq = zero; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + /* Computing MAX */ + /* Computing MIN */ + d__3 = xopt[i__] + d__[i__], d__4 = su[i__]; + d__1 = std::min(d__3,d__4), d__2 = sl[i__]; + xnew[i__] = std::max(d__1,d__2); + if (xbdi[i__] == onemin) { + xnew[i__] = sl[i__]; + } + if (xbdi[i__] == one) { + xnew[i__] = su[i__]; + } + d__[i__] = xnew[i__] - xopt[i__]; + /* L200: */ + /* Computing 2nd power */ + d__1 = d__[i__]; + *dsq += d__1 * d__1; + } + return; + /* The following instructions multiply the current S-vector by the second */ + /* derivative matrix of the quadratic model, putting the product in HS. */ + /* They are reached from three different parts of the software above and */ + /* they can be regarded as an external subroutine. */ + +L210: + ih = 0; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + hs[j] = zero; + i__2 = j; + for (i__ = 1; i__ <= i__2; ++i__) { + ++ih; + if (i__ < j) { + hs[j] += hq[ih] * s[i__]; + } + /* L220: */ + hs[i__] += hq[ih] * s[j]; + } + } + i__2 = npt; + for (k = 1; k <= i__2; ++k) { + if (pq[k] != zero) { + temp = zero; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + /* L230: */ + temp += xpt[k + j * xpt_dim1] * s[j]; + } + temp *= pq[k]; + i__1 = n; + for (i__ = 1; i__ <= i__1; ++i__) { + /* L240: */ + hs[i__] += temp * xpt[k + i__ * xpt_dim1]; + } + } + /* L250: */ + } + if (*crvmin != zero) { + goto L50; + } + if (iterc > itcsav) { + goto L150; + } + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + /* L260: */ + hred[i__] = hs[i__]; + } + goto L120; + } /* trsbox_ */ + + // ---------------------------------------------------------------------------------------- + + void update_( + const integer n, + const integer npt, + doublereal *bmat, + doublereal *zmat, + const integer ndim, + doublereal *vlag, + const doublereal beta, + const doublereal denom, + const integer knew, + doublereal *w + ) const + { + /* System generated locals */ + integer bmat_dim1, bmat_offset, zmat_dim1, zmat_offset, i__1, i__2; + doublereal d__1, d__2, d__3; + + /* Local variables */ + integer i__, j, k, jp; + doublereal one, tau, temp; + integer nptm; + doublereal zero, alpha, tempa, tempb, ztest; + + + /* The arrays BMAT and ZMAT are updated, as required by the new position */ + /* of the interpolation point that has the index KNEW. The vector VLAG has */ + /* N+NPT components, set on entry to the first NPT and last N components */ + /* of the product Hw in equation (4.11) of the Powell (2006) paper on */ + /* NEWUOA. Further, BETA is set on entry to the value of the parameter */ + /* with that name, and DENOM is set to the denominator of the updating */ + /* formula. Elements of ZMAT may be treated as zero if their moduli are */ + /* at most ZTEST. The first NDIM elements of W are used for working space. */ + + /* Set some constants. */ + + /* Parameter adjustments */ + zmat_dim1 = npt; + zmat_offset = 1 + zmat_dim1; + zmat -= zmat_offset; + bmat_dim1 = ndim; + bmat_offset = 1 + bmat_dim1; + bmat -= bmat_offset; + --vlag; + --w; + + /* Function Body */ + one = 1.; + zero = 0.; + nptm = npt - n - 1; + ztest = zero; + i__1 = npt; + for (k = 1; k <= i__1; ++k) { + i__2 = nptm; + for (j = 1; j <= i__2; ++j) { + /* L10: */ + /* Computing MAX */ + d__2 = ztest, d__3 = (d__1 = zmat[k + j * zmat_dim1], std::abs(d__1)); + ztest = std::max(d__2,d__3); + } + } + ztest *= 1e-20; + + /* Apply the rotations that put zeros in the KNEW-th row of ZMAT. */ + + i__2 = nptm; + for (j = 2; j <= i__2; ++j) { + if ((d__1 = zmat[knew + j * zmat_dim1], std::abs(d__1)) > ztest) { + /* Computing 2nd power */ + d__1 = zmat[knew + zmat_dim1]; + /* Computing 2nd power */ + d__2 = zmat[knew + j * zmat_dim1]; + temp = std::sqrt(d__1 * d__1 + d__2 * d__2); + tempa = zmat[knew + zmat_dim1] / temp; + tempb = zmat[knew + j * zmat_dim1] / temp; + i__1 = npt; + for (i__ = 1; i__ <= i__1; ++i__) { + temp = tempa * zmat[i__ + zmat_dim1] + tempb * zmat[i__ + j * + zmat_dim1]; + zmat[i__ + j * zmat_dim1] = tempa * zmat[i__ + j * zmat_dim1] + - tempb * zmat[i__ + zmat_dim1]; + /* L20: */ + zmat[i__ + zmat_dim1] = temp; + } + } + zmat[knew + j * zmat_dim1] = zero; + /* L30: */ + } + + /* Put the first NPT components of the KNEW-th column of HLAG into W, */ + /* and calculate the parameters of the updating formula. */ + + i__2 = npt; + for (i__ = 1; i__ <= i__2; ++i__) { + w[i__] = zmat[knew + zmat_dim1] * zmat[i__ + zmat_dim1]; + /* L40: */ + } + alpha = w[knew]; + tau = vlag[knew]; + vlag[knew] -= one; + + /* Complete the updating of ZMAT. */ + + temp = std::sqrt(denom); + tempb = zmat[knew + zmat_dim1] / temp; + tempa = tau / temp; + i__2 = npt; + for (i__ = 1; i__ <= i__2; ++i__) { + /* L50: */ + zmat[i__ + zmat_dim1] = tempa * zmat[i__ + zmat_dim1] - tempb * vlag[ + i__]; + } + + /* Finally, update the matrix BMAT. */ + + i__2 = n; + for (j = 1; j <= i__2; ++j) { + jp = npt + j; + w[jp] = bmat[knew + j * bmat_dim1]; + tempa = (alpha * vlag[jp] - tau * w[jp]) / denom; + tempb = (-(beta) * w[jp] - tau * vlag[jp]) / denom; + i__1 = jp; + for (i__ = 1; i__ <= i__1; ++i__) { + bmat[i__ + j * bmat_dim1] = bmat[i__ + j * bmat_dim1] + tempa * + vlag[i__] + tempb * w[i__]; + if (i__ > npt) { + bmat[jp + (i__ - npt) * bmat_dim1] = bmat[i__ + j * + bmat_dim1]; + } + /* L60: */ + } + } + } /* update_ */ + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename funct, + typename T, + typename U + > + double find_min_bobyqa ( + const funct& f, + T& x, + long npt, + const U& x_lower, + const U& x_upper, + const double rho_begin, + const double rho_end, + const long max_f_evals + ) + { + // The starting point (i.e. x) must be a column vector. + COMPILE_TIME_ASSERT(T::NC <= 1); + + // check the requirements. Also split the assert up so that the error message isn't huge. + DLIB_CASSERT(is_col_vector(x) && is_col_vector(x_lower) && is_col_vector(x_upper) && + x.size() == x_lower.size() && x_lower.size() == x_upper.size() && + x.size() > 1 && max_f_evals > 1, + "\tdouble find_min_bobyqa()" + << "\n\t Invalid arguments have been given to this function" + << "\n\t is_col_vector(x): " << is_col_vector(x) + << "\n\t is_col_vector(x_lower): " << is_col_vector(x_lower) + << "\n\t is_col_vector(x_upper): " << is_col_vector(x_upper) + << "\n\t x.size(): " << x.size() + << "\n\t x_lower.size(): " << x_lower.size() + << "\n\t x_upper.size(): " << x_upper.size() + << "\n\t max_f_evals: " << max_f_evals + ); + + DLIB_CASSERT(x.size() + 2 <= npt && npt <= (x.size()+1)*(x.size()+2)/2 && + 0 < rho_end && rho_end < rho_begin && + min(x_upper - x_lower) > 2*rho_begin && + min(x - x_lower) >= 0 && min(x_upper - x) >= 0, + "\tdouble find_min_bobyqa()" + << "\n\t Invalid arguments have been given to this function" + << "\n\t ntp in valid range: " << (x.size() + 2 <= npt && npt <= (x.size()+1)*(x.size()+2)/2) + << "\n\t npt: " << npt + << "\n\t rho_begin: " << rho_begin + << "\n\t rho_end: " << rho_end + << "\n\t min(x_upper - x_lower) > 2*rho_begin: " << (min(x_upper - x_lower) > 2*rho_begin) + << "\n\t min(x - x_lower) >= 0 && min(x_upper - x) >= 0: " << (min(x - x_lower) >= 0 && min(x_upper - x) >= 0) + ); + + + bobyqa_implementation impl; + return impl.find_min(f, x, npt, x_lower, x_upper, rho_begin, rho_end, max_f_evals); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct, + typename T, + typename U + > + double find_max_bobyqa ( + const funct& f, + T& x, + long npt, + const U& x_lower, + const U& x_upper, + const double rho_begin, + const double rho_end, + const long max_f_evals + ) + { + // The starting point (i.e. x) must be a column vector. + COMPILE_TIME_ASSERT(T::NC <= 1); + + return -find_min_bobyqa(negate_function(f), x, npt, x_lower, x_upper, rho_begin, rho_end, max_f_evals); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATIOn_BOBYQA_Hh_ + diff --git a/ml/dlib/dlib/optimization/optimization_bobyqa_abstract.h b/ml/dlib/dlib/optimization/optimization_bobyqa_abstract.h new file mode 100644 index 000000000..46f9436af --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_bobyqa_abstract.h @@ -0,0 +1,120 @@ +// Copyright (C) 2009 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_OPTIMIZATIOn_BOBYQA_ABSTRACT_Hh_ +#ifdef DLIB_OPTIMIZATIOn_BOBYQA_ABSTRACT_Hh_ + +#include "../matrix.h" + +// ---------------------------------------------------------------------------------------- + +/* + This file defines the dlib interface to the BOBYQA software developed by M.J.D Powell. + BOBYQA is a method for optimizing a function in the absence of derivative information. + Powell described it as a method that seeks the least value of a function of many + variables, by applying a trust region method that forms quadratic models by + interpolation. There is usually some freedom in the interpolation conditions, + which is taken up by minimizing the Frobenius norm of the change to the second + derivative of the model, beginning with the zero matrix. The values of the variables + are constrained by upper and lower bounds. + + + The following paper, published in 2009 by Powell, describes the + detailed working of the BOBYQA algorithm. + + The BOBYQA algorithm for bound constrained optimization + without derivatives by M.J.D. Powell +*/ + +// ---------------------------------------------------------------------------------------- + +namespace dlib +{ + class bobyqa_failure : public error; + /*! + This is the exception class used by the functions defined in this file. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename funct, + typename T, + typename U + > + double find_min_bobyqa ( + const funct& f, + T& x, + long npt, + const U& x_lower, + const U& x_upper, + const double rho_begin, + const double rho_end, + const long max_f_evals + ); + /*! + requires + - f(x) must be a valid expression that evaluates to a double + - is_col_vector(x) == true + - is_col_vector(x_lower) == true + - is_col_vector(x_upper) == true + - x.size() == x_lower.size() == x_upper.size() + - x.size() > 1 + - x.size() + 2 <= npt <= (x.size()+1)*(x.size()+2)/2 + - 0 < rho_end < rho_begin + - min(x_upper - x_lower) > 2*rho_begin + (i.e. the lower and upper bounds on each x element must be larger than 2*rho_begin) + - min(x - x_lower) >= 0 && min(x_upper - x) >= 0 + (i.e. the given x should be within the bounds defined by x_lower and x_upper) + - max_f_evals > 1 + ensures + - Performs a constrained minimization of the function f() starting from + the initial point x. + - The BOBYQA algorithm uses a number of interpolating points to perform its + work. The npt argument controls how many points get used. Typically, + a good value to use is 2*x.size()+1. + - #x == the value of x (within the bounds defined by x_lower and x_upper) that + was found to minimize f(). More precisely: + - min(#x - x_lower) >= 0 && min(x_upper - #x) >= 0 + - returns f(#x). + - rho_begin and rho_end are used as the initial and final values of a trust + region radius. Typically, rho_begin should be about one tenth of the greatest + expected change to a variable, while rho_end should indicate the accuracy that + is required in the final values of the variables. + throws + - bobyqa_failure + This exception is thrown if the algorithm is unable to make progress towards + solving the problem. This may occur because the algorithm detects excessive + numerical errors or because max_f_evals of f() have occurred without reaching + convergence. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename funct, + typename T, + typename U + > + double find_max_bobyqa ( + const funct& f, + T& x, + long npt, + const U& x_lower, + const U& x_upper, + const double rho_begin, + const double rho_end, + const long max_f_evals + ); + /*! + This function is identical to the find_min_bobyqa() routine defined above + except that it negates the f() function before performing optimization. + Thus this function will attempt to find the maximizer of f() rather than + the minimizer. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATIOn_BOBYQA_ABSTRACT_Hh_ + diff --git a/ml/dlib/dlib/optimization/optimization_least_squares.h b/ml/dlib/dlib/optimization/optimization_least_squares.h new file mode 100644 index 000000000..6d12a919d --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_least_squares.h @@ -0,0 +1,345 @@ +// Copyright (C) 2010 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_OPTIMIZATION_LEAST_SQuARES_H_h_ +#define DLIB_OPTIMIZATION_LEAST_SQuARES_H_h_ + +#include "../matrix.h" +#include "optimization_trust_region.h" +#include "optimization_least_squares_abstract.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename column_vector_type, + typename funct_type, + typename funct_der_type, + typename vector_type + > + class least_squares_function_model + { + public: + least_squares_function_model ( + const funct_type& f_, + const funct_der_type& der_, + const vector_type& list_ + ) : f(f_), der(der_), list(list_) + { + S = 0; + last_f = 0; + last_f2 = 0; + + r.set_size(list.size(),1); + } + + const funct_type& f; + const funct_der_type& der; + const vector_type& list; + + typedef typename column_vector_type::type type; + typedef typename column_vector_type::mem_manager_type mem_manager_type; + typedef typename column_vector_type::layout_type layout_type; + const static long NR = column_vector_type::NR; + + typedef column_vector_type column_vector; + typedef matrix<type,NR,NR,mem_manager_type,layout_type> general_matrix; + + + type operator() ( + const column_vector& x + ) const + { + type result = 0; + for (long i = 0; i < list.size(); ++i) + { + const type temp = f(list(i), x); + // save the residual for later + r(i) = temp; + result += temp*temp; + } + + last_f = 0.5*result; + return 0.5*result; + } + + void get_derivative_and_hessian ( + const column_vector& x, + column_vector& d, + general_matrix& h + ) const + { + J.set_size(list.size(), x.size()); + + // compute the Jacobian + for (long i = 0; i < list.size(); ++i) + { + set_rowm(J,i) = trans(der(list(i), x)); + } + + // Compute the Levenberg-Marquardt gradient and hessian + d = trans(J)*r; + h = trans(J)*J; + + if (S.size() == 0) + { + S.set_size(x.size(), x.size()); + S = 0; + } + + // If this isn't the first iteration then check if using + // a quasi-newton update helps things. + if (last_r.size() != 0) + { + + s = x - last_x; + y = d - last_d; + yy = d - trans(last_J)*r; + + const type ys = trans(y)*s; + vtemp = yy - S*s; + const type temp2 = std::abs(trans(s)*S*s); + type scale = (temp2 != 0) ? std::min<type>(1, std::abs(dot(s,yy))/temp2) : 1; + + if (ys != 0) + S = scale*S + (vtemp*trans(y) + y*trans(vtemp))/(ys) - dot(vtemp,s)/ys/ys*y*trans(y); + else + S *= scale; + + // check how well both the models fit the last change we saw in f() + const type measured_delta = last_f2 - last_f; + s = -s; + const type h_predicted_delta = 0.5*trans(s)*h*s + trans(d)*s; + const type s_predicted_delta = 0.5*trans(s)*(h+S)*s + trans(d)*s; + + const type h_error = std::abs((h_predicted_delta/measured_delta) - 1); + const type s_error = std::abs((s_predicted_delta/measured_delta) - 1); + + if (s_error < h_error && h_error > 0.01) + { + h += make_symmetric(S); + } + else if (s_error > 10) + { + S = 0; + } + + // put r into last_r + r.swap(last_r); + } + else + { + // put r into last_r + last_r = r; + } + + J.swap(last_J); + last_x = x; + last_d = d; + + last_f2 = last_f; + } + + mutable type last_f; // value of function we saw in last operator() + mutable type last_f2; // value of last_f we saw in get_derivative_and_hessian() + mutable matrix<type,0,1,mem_manager_type,layout_type> r; + mutable column_vector vtemp; + mutable column_vector s, y, yy; + + mutable general_matrix S; + mutable column_vector last_x; + mutable column_vector last_d; + mutable matrix<type,0,1,mem_manager_type,layout_type> last_r; + mutable matrix<type,0,NR,mem_manager_type,layout_type> last_J; + mutable matrix<type,0,NR,mem_manager_type,layout_type> J; + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename column_vector_type, + typename funct_type, + typename funct_der_type, + typename vector_type + > + least_squares_function_model<column_vector_type,funct_type,funct_der_type,vector_type> least_squares_model ( + const funct_type& f, + const funct_der_type& der, + const vector_type& list + ) + { + return least_squares_function_model<column_vector_type,funct_type,funct_der_type,vector_type>(f,der,list); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename stop_strategy_type, + typename funct_type, + typename funct_der_type, + typename vector_type, + typename T + > + double solve_least_squares ( + stop_strategy_type stop_strategy, + const funct_type& f, + const funct_der_type& der, + const vector_type& list, + T& x, + double radius = 1 + ) + { + // The starting point (i.e. x) must be a column vector. + COMPILE_TIME_ASSERT(T::NC <= 1); + + // make sure requires clause is not broken + DLIB_ASSERT(is_vector(mat(list)) && list.size() > 0 && + is_col_vector(x) && radius > 0, + "\t double solve_least_squares()" + << "\n\t invalid arguments were given to this function" + << "\n\t is_vector(list): " << is_vector(mat(list)) + << "\n\t list.size(): " << list.size() + << "\n\t is_col_vector(x): " << is_col_vector(x) + << "\n\t radius: " << radius + ); + + return find_min_trust_region(stop_strategy, + least_squares_model<T>(f, der, mat(list)), + x, + radius); + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + template < + typename column_vector_type, + typename funct_type, + typename funct_der_type, + typename vector_type + > + class least_squares_lm_function_model + { + public: + least_squares_lm_function_model ( + const funct_type& f_, + const funct_der_type& der_, + const vector_type& list_ + ) : f(f_), der(der_), list(list_) + { + r.set_size(list.size(),1); + } + + const funct_type& f; + const funct_der_type& der; + const vector_type& list; + + typedef typename column_vector_type::type type; + typedef typename column_vector_type::mem_manager_type mem_manager_type; + typedef typename column_vector_type::layout_type layout_type; + const static long NR = column_vector_type::NR; + + typedef column_vector_type column_vector; + typedef matrix<type,NR,NR,mem_manager_type,layout_type> general_matrix; + + mutable matrix<type,0,1,mem_manager_type,layout_type> r; + mutable column_vector vtemp; + + type operator() ( + const column_vector& x + ) const + { + type result = 0; + for (long i = 0; i < list.size(); ++i) + { + const type temp = f(list(i), x); + // save the residual for later + r(i) = temp; + result += temp*temp; + } + + return 0.5*result; + } + + void get_derivative_and_hessian ( + const column_vector& x, + column_vector& d, + general_matrix& h + ) const + { + d = 0; + h = 0; + for (long i = 0; i < list.size(); ++i) + { + vtemp = der(list(i), x); + d += r(i)*vtemp; + h += vtemp*trans(vtemp); + } + } + + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename column_vector_type, + typename funct_type, + typename funct_der_type, + typename vector_type + > + least_squares_lm_function_model<column_vector_type,funct_type,funct_der_type,vector_type> least_squares_lm_model ( + const funct_type& f, + const funct_der_type& der, + const vector_type& list + ) + { + return least_squares_lm_function_model<column_vector_type,funct_type,funct_der_type,vector_type>(f,der,list); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename stop_strategy_type, + typename funct_type, + typename funct_der_type, + typename vector_type, + typename T + > + double solve_least_squares_lm ( + stop_strategy_type stop_strategy, + const funct_type& f, + const funct_der_type& der, + const vector_type& list, + T& x, + double radius = 1 + ) + { + // The starting point (i.e. x) must be a column vector. + COMPILE_TIME_ASSERT(T::NC <= 1); + + // make sure requires clause is not broken + DLIB_ASSERT(is_vector(mat(list)) && list.size() > 0 && + is_col_vector(x) && radius > 0, + "\t double solve_least_squares_lm()" + << "\n\t invalid arguments were given to this function" + << "\n\t is_vector(list): " << is_vector(mat(list)) + << "\n\t list.size(): " << list.size() + << "\n\t is_col_vector(x): " << is_col_vector(x) + << "\n\t radius: " << radius + ); + + return find_min_trust_region(stop_strategy, + least_squares_lm_model<T>(f, der, mat(list)), + x, + radius); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATION_LEAST_SQuARES_H_h_ + + diff --git a/ml/dlib/dlib/optimization/optimization_least_squares_abstract.h b/ml/dlib/dlib/optimization/optimization_least_squares_abstract.h new file mode 100644 index 000000000..aaffb2221 --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_least_squares_abstract.h @@ -0,0 +1,112 @@ +// Copyright (C) 2010 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_OPTIMIZATIOn_LEAST_SQUARES_ABSTRACT_ +#ifdef DLIB_OPTIMIZATIOn_LEAST_SQUARES_ABSTRACT_ + +#include "../matrix/matrix_abstract.h" +#include "optimization_trust_region_abstract.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename stop_strategy_type, + typename funct_type, + typename funct_der_type, + typename vector_type, + typename T + > + double solve_least_squares ( + stop_strategy_type stop_strategy, + const funct_type& f, + const funct_der_type& der, + const vector_type& list, + T& x, + double radius = 1 + ); + /*! + requires + - stop_strategy == an object that defines a stop strategy such as one of + the objects from dlib/optimization/optimization_stop_strategies_abstract.h + - list == a matrix or something convertible to a matrix via mat() + such as a std::vector. + - is_vector(list) == true + - list.size() > 0 + - is_col_vector(x) == true + - radius > 0 + - for all valid i: + - f(list(i),x) must be a valid expression that evaluates to a floating point value. + - der(list(i),x) must be a valid expression that evaluates to the derivative of f(list(i),x) + with respect to x. This derivative must take the form of a column vector. + ensures + - This function performs an unconstrained minimization of the least squares + function g(x) defined by: + - g(x) = sum over all i: 0.5*pow( f(list(i),x), 2 ) + - This method combines the Levenberg-Marquardt method with a quasi-newton method + for approximating the second order terms of the hessian and is appropriate for + large residual problems (i.e. problems where the f() function isn't driven to 0). + In particular, it uses the method of Dennis, Gay, and Welsch as described in + Numerical Optimization by Nocedal and Wright (second edition). + - Since this is a trust region algorithm, the radius parameter defines the initial + size of the trust region. + - The function is optimized until stop_strategy decides that an acceptable + point has been found or the trust region subproblem fails to make progress. + - #x == the value of x that was found to minimize g() + - returns g(#x). + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename stop_strategy_type, + typename funct_type, + typename funct_der_type, + typename vector_type, + typename T + > + double solve_least_squares_lm ( + stop_strategy_type stop_strategy, + const funct_type& f, + const funct_der_type& der, + const vector_type& list, + T& x, + double radius = 1 + ); + /*! + requires + - stop_strategy == an object that defines a stop strategy such as one of + the objects from dlib/optimization/optimization_stop_strategies_abstract.h + - list == a matrix or something convertible to a matrix via mat() + such as a std::vector. + - is_vector(list) == true + - list.size() > 0 + - is_col_vector(x) == true + - radius > 0 + - for all valid i: + - f(list(i),x) must be a valid expression that evaluates to a floating point value. + - der(list(i),x) must be a valid expression that evaluates to the derivative of f(list(i),x) + with respect to x. This derivative must take the form of a column vector. + ensures + - This function performs an unconstrained minimization of the least squares + function g(x) defined by: + - g(x) = sum over all i: 0.5*pow( f(list(i),x), 2 ) + - This method implements a plain Levenberg-Marquardt approach for approximating + the hessian of g(). Therefore, it is most appropriate for small residual problems + (i.e. problems where f() goes to 0 at the solution). + - Since this is a trust region algorithm, the radius parameter defines the initial + size of the trust region. + - The function is optimized until stop_strategy decides that an acceptable + point has been found or the trust region subproblem fails to make progress. + - #x == the value of x that was found to minimize g() + - returns g(#x). + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATIOn_LEAST_SQUARES_ABSTRACT_ + + diff --git a/ml/dlib/dlib/optimization/optimization_line_search.h b/ml/dlib/dlib/optimization/optimization_line_search.h new file mode 100644 index 000000000..a91e3df84 --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_line_search.h @@ -0,0 +1,888 @@ +// Copyright (C) 2008 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_OPTIMIZATIOn_LINE_SEARCH_H_ +#define DLIB_OPTIMIZATIOn_LINE_SEARCH_H_ + +#include <cmath> +#include <limits> +#include "../matrix.h" +#include "../algs.h" +#include "optimization_line_search_abstract.h" +#include <utility> + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template <typename funct, typename T> + class line_search_funct + { + public: + line_search_funct(const funct& f_, const T& start_, const T& direction_) + : f(f_),start(start_), direction(direction_), matrix_r(0), scalar_r(0) + {} + + line_search_funct(const funct& f_, const T& start_, const T& direction_, T& r) + : f(f_),start(start_), direction(direction_), matrix_r(&r), scalar_r(0) + {} + + line_search_funct(const funct& f_, const T& start_, const T& direction_, double& r) + : f(f_),start(start_), direction(direction_), matrix_r(0), scalar_r(&r) + {} + + double operator()(const double& x) const + { + return get_value(f(start + x*direction)); + } + + private: + + double get_value (const double& r) const + { + // save a copy of this value for later + if (scalar_r) + *scalar_r = r; + + return r; + } + + template <typename U> + double get_value (const U& r) const + { + // U should be a matrix type + COMPILE_TIME_ASSERT(is_matrix<U>::value); + + // save a copy of this value for later + if (matrix_r) + *matrix_r = r; + + return dot(r,direction); + } + + const funct& f; + const T& start; + const T& direction; + T* matrix_r; + double* scalar_r; + }; + + template <typename funct, typename T> + const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction) + { + COMPILE_TIME_ASSERT(is_matrix<T>::value); + DLIB_ASSERT ( + is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(), + "\tline_search_funct make_line_search_function(f,start,direction)" + << "\n\tYou have to supply column vectors to this function" + << "\n\tstart.nc(): " << start.nc() + << "\n\tdirection.nc(): " << direction.nc() + << "\n\tstart.nr(): " << start.nr() + << "\n\tdirection.nr(): " << direction.nr() + ); + return line_search_funct<funct,T>(f,start,direction); + } + +// ---------------------------------------------------------------------------------------- + + template <typename funct, typename T> + const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction, double& f_out) + { + COMPILE_TIME_ASSERT(is_matrix<T>::value); + DLIB_ASSERT ( + is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(), + "\tline_search_funct make_line_search_function(f,start,direction)" + << "\n\tYou have to supply column vectors to this function" + << "\n\tstart.nc(): " << start.nc() + << "\n\tdirection.nc(): " << direction.nc() + << "\n\tstart.nr(): " << start.nr() + << "\n\tdirection.nr(): " << direction.nr() + ); + return line_search_funct<funct,T>(f,start,direction, f_out); + } + +// ---------------------------------------------------------------------------------------- + + template <typename funct, typename T> + const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction, T& grad_out) + { + COMPILE_TIME_ASSERT(is_matrix<T>::value); + DLIB_ASSERT ( + is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(), + "\tline_search_funct make_line_search_function(f,start,direction)" + << "\n\tYou have to supply column vectors to this function" + << "\n\tstart.nc(): " << start.nc() + << "\n\tdirection.nc(): " << direction.nc() + << "\n\tstart.nr(): " << start.nr() + << "\n\tdirection.nr(): " << direction.nr() + ); + return line_search_funct<funct,T>(f,start,direction,grad_out); + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + inline double poly_min_extrap ( + double f0, + double d0, + double f1, + double d1, + double limit = 1 + ) + { + const double n = 3*(f1 - f0) - 2*d0 - d1; + const double e = d0 + d1 - 2*(f1 - f0); + + + // find the minimum of the derivative of the polynomial + + double temp = std::max(n*n - 3*e*d0,0.0); + + if (temp < 0) + return 0.5; + + temp = std::sqrt(temp); + + if (std::abs(e) <= std::numeric_limits<double>::epsilon()) + return 0.5; + + // figure out the two possible min values + double x1 = (temp - n)/(3*e); + double x2 = -(temp + n)/(3*e); + + // compute the value of the interpolating polynomial at these two points + double y1 = f0 + d0*x1 + n*x1*x1 + e*x1*x1*x1; + double y2 = f0 + d0*x2 + n*x2*x2 + e*x2*x2*x2; + + // pick the best point + double x; + if (y1 < y2) + x = x1; + else + x = x2; + + // now make sure the minimum is within the allowed range of [0,limit] + return put_in_range(0,limit,x); + } + +// ---------------------------------------------------------------------------------------- + + inline double poly_min_extrap ( + double f0, + double d0, + double f1 + ) + { + const double temp = 2*(f1 - f0 - d0); + if (std::abs(temp) <= d0*std::numeric_limits<double>::epsilon()) + return 0.5; + + const double alpha = -d0/temp; + + // now make sure the minimum is within the allowed range of (0,1) + return put_in_range(0,1,alpha); + } + +// ---------------------------------------------------------------------------------------- + + inline double poly_min_extrap ( + double f0, + double d0, + double x1, + double f_x1, + double x2, + double f_x2 + ) + { + DLIB_ASSERT(0 < x1 && x1 < x2,"Invalid inputs were given to this function"); + // The contents of this function follow the equations described on page 58 of the + // book Numerical Optimization by Nocedal and Wright, second edition. + matrix<double,2,2> m; + matrix<double,2,1> v; + + const double aa2 = x2*x2; + const double aa1 = x1*x1; + m = aa2, -aa1, + -aa2*x2, aa1*x1; + v = f_x1 - f0 - d0*x1, + f_x2 - f0 - d0*x2; + + + double temp = aa2*aa1*(x1-x2); + + // just take a guess if this happens + if (temp == 0 || std::fpclassify(temp) == FP_SUBNORMAL) + { + return x1/2.0; + } + + matrix<double,2,1> temp2; + temp2 = m*v/temp; + const double a = temp2(0); + const double b = temp2(1); + + temp = b*b - 3*a*d0; + if (temp < 0 || a == 0) + { + // This is probably a line so just pick the lowest point + if (f0 < f_x2) + return 0; + else + return x2; + } + temp = (-b + std::sqrt(temp))/(3*a); + return put_in_range(0, x2, temp); + } + +// ---------------------------------------------------------------------------------------- + + inline double lagrange_poly_min_extrap ( + double p1, + double p2, + double p3, + double f1, + double f2, + double f3 + ) + { + DLIB_ASSERT(p1 < p2 && p2 < p3 && f1 >= f2 && f2 <= f3, + " p1: " << p1 + << " p2: " << p2 + << " p3: " << p3 + << " f1: " << f1 + << " f2: " << f2 + << " f3: " << f3); + + // This formula is out of the book Nonlinear Optimization by Andrzej Ruszczynski. See section 5.2. + double temp1 = f1*(p3*p3 - p2*p2) + f2*(p1*p1 - p3*p3) + f3*(p2*p2 - p1*p1); + double temp2 = 2*(f1*(p3 - p2) + f2*(p1 - p3) + f3*(p2 - p1) ); + + if (temp2 == 0) + { + return p2; + } + + const double result = temp1/temp2; + + // do a final sanity check to make sure the result is in the right range + if (p1 <= result && result <= p3) + { + return result; + } + else + { + return std::min(std::max(p1,result),p3); + } + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct, + typename funct_der + > + double line_search ( + const funct& f, + const double f0, + const funct_der& der, + const double d0, + double rho, + double sigma, + double min_f, + unsigned long max_iter + ) + { + DLIB_ASSERT ( + 0 < rho && rho < sigma && sigma < 1 && max_iter > 0, + "\tdouble line_search()" + << "\n\tYou have given invalid arguments to this function" + << "\n\t sigma: " << sigma + << "\n\t rho: " << rho + << "\n\t max_iter: " << max_iter + ); + + // The bracketing phase of this function is implemented according to block 2.6.2 from + // the book Practical Methods of Optimization by R. Fletcher. The sectioning + // phase is an implementation of 2.6.4 from the same book. + + // 1 <= tau1a < tau1b. Controls the alpha jump size during the bracketing phase of + // the search. + const double tau1a = 1.4; + const double tau1b = 9; + + // it must be the case that 0 < tau2 < tau3 <= 1/2 for the algorithm to function + // correctly but the specific values of tau2 and tau3 aren't super important. + const double tau2 = 1.0/10.0; + const double tau3 = 1.0/2.0; + + + // Stop right away and return a step size of 0 if the gradient is 0 at the starting point + if (std::abs(d0) <= std::abs(f0)*std::numeric_limits<double>::epsilon()) + return 0; + + // Stop right away if the current value is good enough according to min_f + if (f0 <= min_f) + return 0; + + // Figure out a reasonable upper bound on how large alpha can get. + const double mu = (min_f-f0)/(rho*d0); + + + double alpha = 1; + if (mu < 0) + alpha = -alpha; + alpha = put_in_range(0, 0.65*mu, alpha); + + + double last_alpha = 0; + double last_val = f0; + double last_val_der = d0; + + // The bracketing stage will find a range of points [a,b] + // that contains a reasonable solution to the line search + double a, b; + + // These variables will hold the values and derivatives of f(a) and f(b) + double a_val, b_val, a_val_der, b_val_der; + + // This thresh value represents the Wolfe curvature condition + const double thresh = std::abs(sigma*d0); + + unsigned long itr = 0; + // do the bracketing stage to find the bracket range [a,b] + while (true) + { + ++itr; + const double val = f(alpha); + const double val_der = der(alpha); + + // we are done with the line search since we found a value smaller + // than the minimum f value + if (val <= min_f) + return alpha; + + if (val > f0 + rho*alpha*d0 || val >= last_val) + { + a_val = last_val; + a_val_der = last_val_der; + b_val = val; + b_val_der = val_der; + + a = last_alpha; + b = alpha; + break; + } + + if (std::abs(val_der) <= thresh) + return alpha; + + // if we are stuck not making progress then quit with the current alpha + if (last_alpha == alpha || itr >= max_iter) + return alpha; + + if (val_der >= 0) + { + a_val = val; + a_val_der = val_der; + b_val = last_val; + b_val_der = last_val_der; + + a = alpha; + b = last_alpha; + break; + } + + + + const double temp = alpha; + // Pick a larger range [first, last]. We will pick the next alpha in that + // range. + double first, last; + if (mu > 0) + { + first = std::min(mu, alpha + tau1a*(alpha - last_alpha)); + last = std::min(mu, alpha + tau1b*(alpha - last_alpha)); + } + else + { + first = std::max(mu, alpha + tau1a*(alpha - last_alpha)); + last = std::max(mu, alpha + tau1b*(alpha - last_alpha)); + } + + + + // pick a point between first and last by doing some kind of interpolation + if (last_alpha < alpha) + alpha = last_alpha + (alpha-last_alpha)*poly_min_extrap(last_val, last_val_der, val, val_der, 1e10); + else + alpha = alpha + (last_alpha-alpha)*poly_min_extrap(val, val_der, last_val, last_val_der, 1e10); + + alpha = put_in_range(first,last,alpha); + + last_alpha = temp; + + last_val = val; + last_val_der = val_der; + + } + + + // Now do the sectioning phase from 2.6.4 + while (true) + { + ++itr; + double first = a + tau2*(b-a); + double last = b - tau3*(b-a); + + // use interpolation to pick alpha between first and last + alpha = a + (b-a)*poly_min_extrap(a_val, a_val_der, b_val, b_val_der); + alpha = put_in_range(first,last,alpha); + + const double val = f(alpha); + const double val_der = der(alpha); + + // we are done with the line search since we found a value smaller + // than the minimum f value or we ran out of iterations. + if (val <= min_f || itr >= max_iter) + return alpha; + + // stop if the interval gets so small that it isn't shrinking any more due to rounding error + if (a == first || b == last) + { + return b; + } + + // If alpha has basically become zero then just stop. Think of it like this, + // if we take the largest possible alpha step will the objective function + // change at all? If not then there isn't any point looking for a better + // alpha. + const double max_possible_alpha = std::max(std::abs(a),std::abs(b)); + if (std::abs(max_possible_alpha*d0) <= std::abs(f0)*std::numeric_limits<double>::epsilon()) + return alpha; + + + if (val > f0 + rho*alpha*d0 || val >= a_val) + { + b = alpha; + b_val = val; + b_val_der = val_der; + } + else + { + if (std::abs(val_der) <= thresh) + return alpha; + + if ( (b-a)*val_der >= 0) + { + b = a; + b_val = a_val; + b_val_der = a_val_der; + } + + a = alpha; + a_val = val; + a_val_der = val_der; + } + } + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + double backtracking_line_search ( + const funct& f, + double f0, + double d0, + double alpha, + double rho, + unsigned long max_iter + ) + { + DLIB_ASSERT ( + 0 < rho && rho < 1 && max_iter > 0, + "\tdouble backtracking_line_search()" + << "\n\tYou have given invalid arguments to this function" + << "\n\t rho: " << rho + << "\n\t max_iter: " << max_iter + ); + + // make sure alpha is going in the right direction. That is, it should be opposite + // the direction of the gradient. + if ((d0 > 0 && alpha > 0) || + (d0 < 0 && alpha < 0)) + { + alpha *= -1; + } + + bool have_prev_alpha = false; + double prev_alpha = 0; + double prev_val = 0; + unsigned long iter = 0; + while (true) + { + ++iter; + const double val = f(alpha); + if (val <= f0 + alpha*rho*d0 || iter >= max_iter) + { + return alpha; + } + else + { + // Interpolate a new alpha. We also make sure the step by which we + // reduce alpha is not super small. + double step; + if (!have_prev_alpha) + { + if (d0 < 0) + step = alpha*put_in_range(0.1,0.9, poly_min_extrap(f0, d0, val)); + else + step = alpha*put_in_range(0.1,0.9, poly_min_extrap(f0, -d0, val)); + have_prev_alpha = true; + } + else + { + if (d0 < 0) + step = put_in_range(0.1*alpha,0.9*alpha, poly_min_extrap(f0, d0, alpha, val, prev_alpha, prev_val)); + else + step = put_in_range(0.1*alpha,0.9*alpha, -poly_min_extrap(f0, -d0, -alpha, val, -prev_alpha, prev_val)); + } + + prev_alpha = alpha; + prev_val = val; + + alpha = step; + } + } + } + +// ---------------------------------------------------------------------------------------- + + class optimize_single_variable_failure : public error { + public: optimize_single_variable_failure(const std::string& s):error(s){} + }; + +// ---------------------------------------------------------------------------------------- + + template <typename funct> + double find_min_single_variable ( + const funct& f, + double& starting_point, + const double begin = -1e200, + const double end = 1e200, + const double eps = 1e-3, + const long max_iter = 100, + const double initial_search_radius = 1 + ) + { + DLIB_CASSERT( eps > 0 && + max_iter > 1 && + begin <= starting_point && starting_point <= end && + initial_search_radius > 0, + "eps: " << eps + << "\n max_iter: "<< max_iter + << "\n begin: "<< begin + << "\n end: "<< end + << "\n starting_point: "<< starting_point + << "\n initial_search_radius: "<< initial_search_radius + ); + + double search_radius = initial_search_radius; + + double p1=0, p2=0, p3=0, f1=0, f2=0, f3=0; + long f_evals = 1; + + if (begin == end) + { + return f(starting_point); + } + + using std::abs; + using std::min; + using std::max; + + // find three bracketing points such that f1 > f2 < f3. Do this by generating a sequence + // of points expanding away from 0. Also note that, in the following code, it is always the + // case that p1 < p2 < p3. + + + + // The first thing we do is get a starting set of 3 points that are inside the [begin,end] bounds + p1 = max(starting_point-search_radius, begin); + p3 = min(starting_point+search_radius, end); + f1 = f(p1); + f3 = f(p3); + + if (starting_point == p1 || starting_point == p3) + { + p2 = (p1+p3)/2; + f2 = f(p2); + } + else + { + p2 = starting_point; + f2 = f(starting_point); + } + + f_evals += 2; + + // Now we have 3 points on the function. Start looking for a bracketing set such that + // f1 > f2 < f3 is the case. + while ( !(f1 > f2 && f2 < f3)) + { + // check for hitting max_iter or if the interval is now too small + if (f_evals >= max_iter) + { + throw optimize_single_variable_failure( + "The max number of iterations of single variable optimization have been reached\n" + "without converging."); + } + if (p3-p1 < eps) + { + if (f1 < min(f2,f3)) + { + starting_point = p1; + return f1; + } + + if (f2 < min(f1,f3)) + { + starting_point = p2; + return f2; + } + + starting_point = p3; + return f3; + } + + // If the left most points are identical in function value then expand out the + // left a bit, unless it's already at bound or we would drop that left most + // point anyway because it's bad. + if (f1==f2 && f1<f3 && p1!=begin) + { + p1 = max(p1 - search_radius, begin); + f1 = f(p1); + ++f_evals; + search_radius *= 2; + continue; + } + if (f2==f3 && f3<f1 && p3!=end) + { + p3 = min(p3 + search_radius, end); + f3 = f(p3); + ++f_evals; + search_radius *= 2; + continue; + } + + + // if f1 is small then take a step to the left + if (f1 <= f3) + { + // check if the minimum is butting up against the bounds and if so then pick + // a point between p1 and p2 in the hopes that shrinking the interval will + // be a good thing to do. Or if p1 and p2 aren't differentiated then try and + // get them to obtain different values. + if (p1 == begin || (f1 == f2 && (end-begin) < search_radius )) + { + p3 = p2; + f3 = f2; + + p2 = (p1+p2)/2.0; + f2 = f(p2); + } + else + { + // pick a new point to the left of our current bracket + p3 = p2; + f3 = f2; + + p2 = p1; + f2 = f1; + + p1 = max(p1 - search_radius, begin); + f1 = f(p1); + + search_radius *= 2; + } + + } + // otherwise f3 is small and we should take a step to the right + else + { + // check if the minimum is butting up against the bounds and if so then pick + // a point between p2 and p3 in the hopes that shrinking the interval will + // be a good thing to do. Or if p2 and p3 aren't differentiated then try and + // get them to obtain different values. + if (p3 == end || (f2 == f3 && (end-begin) < search_radius)) + { + p1 = p2; + f1 = f2; + + p2 = (p3+p2)/2.0; + f2 = f(p2); + } + else + { + // pick a new point to the right of our current bracket + p1 = p2; + f1 = f2; + + p2 = p3; + f2 = f3; + + p3 = min(p3 + search_radius, end); + f3 = f(p3); + + search_radius *= 2; + } + } + + ++f_evals; + } + + + // Loop until we have done the max allowable number of iterations or + // the bracketing window is smaller than eps. + // Within this loop we maintain the invariant that: f1 > f2 < f3 and p1 < p2 < p3 + const double tau = 0.1; + while( f_evals < max_iter && p3-p1 > eps) + { + double p_min = lagrange_poly_min_extrap(p1,p2,p3, f1,f2,f3); + + + // make sure p_min isn't too close to the three points we already have + if (p_min < p2) + { + const double min_dist = (p2-p1)*tau; + if (abs(p1-p_min) < min_dist) + { + p_min = p1 + min_dist; + } + else if (abs(p2-p_min) < min_dist) + { + p_min = p2 - min_dist; + } + } + else + { + const double min_dist = (p3-p2)*tau; + if (abs(p2-p_min) < min_dist) + { + p_min = p2 + min_dist; + } + else if (abs(p3-p_min) < min_dist) + { + p_min = p3 - min_dist; + } + } + + // make sure one side of the bracket isn't super huge compared to the other + // side. If it is then contract it. + const double bracket_ratio = abs(p1-p2)/abs(p2-p3); + if ( !( bracket_ratio < 10 && bracket_ratio > 0.1) ) + { + // Force p_min to be on a reasonable side. But only if lagrange_poly_min_extrap() + // didn't put it on a good side already. + if (bracket_ratio > 1 && p_min > p2) + p_min = (p1+p2)/2; + else if (p_min < p2) + p_min = (p2+p3)/2; + } + + + const double f_min = f(p_min); + + + // Remove one of the endpoints of our bracket depending on where the new point falls. + if (p_min < p2) + { + if (f1 > f_min && f_min < f2) + { + p3 = p2; + f3 = f2; + p2 = p_min; + f2 = f_min; + } + else + { + p1 = p_min; + f1 = f_min; + } + } + else + { + if (f2 > f_min && f_min < f3) + { + p1 = p2; + f1 = f2; + p2 = p_min; + f2 = f_min; + } + else + { + p3 = p_min; + f3 = f_min; + } + } + + + ++f_evals; + } + + if (f_evals >= max_iter) + { + throw optimize_single_variable_failure( + "The max number of iterations of single variable optimization have been reached\n" + "without converging."); + } + + starting_point = p2; + return f2; + } + +// ---------------------------------------------------------------------------------------- + + template <typename funct> + class negate_function_object + { + public: + negate_function_object(const funct& f_) : f(f_){} + + template <typename T> + double operator()(const T& x) const + { + return -f(x); + } + + private: + const funct& f; + }; + + template <typename funct> + const negate_function_object<funct> negate_function(const funct& f) { return negate_function_object<funct>(f); } + +// ---------------------------------------------------------------------------------------- + + template <typename funct> + double find_max_single_variable ( + const funct& f, + double& starting_point, + const double begin = -1e200, + const double end = 1e200, + const double eps = 1e-3, + const long max_iter = 100, + const double initial_search_radius = 1 + ) + { + return -find_min_single_variable(negate_function(f), starting_point, begin, end, eps, max_iter, initial_search_radius); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATIOn_LINE_SEARCH_H_ + diff --git a/ml/dlib/dlib/optimization/optimization_line_search_abstract.h b/ml/dlib/dlib/optimization/optimization_line_search_abstract.h new file mode 100644 index 000000000..2aa221da4 --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_line_search_abstract.h @@ -0,0 +1,361 @@ +// Copyright (C) 2008 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_OPTIMIZATIOn_ABSTRACT_ +#ifdef DLIB_OPTIMIZATIOn_ABSTRACT_ + +#include <cmath> +#include <limits> +#include "../matrix/matrix_abstract.h" +#include "../algs.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename funct, + typename T + > + class line_search_funct; + /*! + This object is a function object that represents a line search function. + + Moreover, it represents a function with the signature: + double l(double x) + !*/ + + template < + typename funct, + typename T + > + const line_search_funct<funct,T> make_line_search_function ( + const funct& f, + const T& start, + const T& direction + ); + /*! + requires + - is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size() + (i.e. start and direction should be column vectors of the same size) + - f must return either a double or a column vector the same length as start + - f(start + 1.5*direction) should be a valid expression + ensures + - if (f returns a double) then + - returns a line search function that computes l(x) == f(start + x*direction) + - else + - returns a line search function that computes l(x) == dot(f(start + x*direction),direction). + That is, we assume f is the derivative of some other function and that what + f returns is a gradient vector. + So the following two expressions both create the derivative of l(x): + - derivative(make_line_search_function(funct,start,direction)) + - make_line_search_function(derivative(funct),start,direction) + !*/ + + template < + typename funct, + typename T + > + const line_search_funct<funct,T> make_line_search_function ( + const funct& f, + const T& start, + const T& direction, + double& f_out + ); + /*! + This function is identical to the above three argument version of make_line_search_function() + except that, if f() outputs a double, every time f() is evaluated its output is also stored + into f_out. + !*/ + + template < + typename funct, + typename T + > + const line_search_funct<funct,T> make_line_search_function ( + const funct& f, + const T& start, + const T& direction, + T& gradient_out + ); + /*! + This function is identical to the above three argument version of make_line_search_function() + except that, if f() outputs a column vector, every time f() is evaluated its output is also + stored into gradient_out. + !*/ + +// ---------------------------------------------------------------------------------------- + + inline double poly_min_extrap ( + double f0, + double d0, + double f1, + double d1, + double limit = 1 + ); + /*! + ensures + - let c(x) be a 3rd degree polynomial such that: + - c(0) == f0 + - c(1) == f1 + - derivative of c(x) at x==0 is d0 + - derivative of c(x) at x==1 is d1 + - returns the point in the range [0,limit] that minimizes the polynomial c(x) + !*/ + +// ---------------------------------------------------------------------------------------- + + inline double poly_min_extrap ( + double f0, + double d0, + double f1 + ); + /*! + ensures + - let c(x) be a 2nd degree polynomial such that: + - c(0) == f0 + - c(1) == f1 + - derivative of c(x) at x==0 is d0 + - returns the point in the range [0,1] that minimizes the polynomial c(x) + !*/ + +// ---------------------------------------------------------------------------------------- + + inline double poly_min_extrap ( + double f0, + double d0, + double x1, + double f_x1, + double x2, + double f_x2 + ) + /*! + requires + - 0 < x1 < x2 + ensures + - let f(x) be a 3rd degree polynomial such that: + - f(0) == f0 + - derivative of f(x) at x==0 is d0 + - f(x1) == f_x1 + - f(x2) == f_x2 + - returns the point in the range [0,x2] that minimizes the polynomial f(x) + !*/ + +// ---------------------------------------------------------------------------------------- + + inline double lagrange_poly_min_extrap ( + double p1, + double p2, + double p3, + double f1, + double f2, + double f3 + ); + /*! + requires + - f1 >= f2 <= f3 + - p1 < p2 < p3 + ensures + - let c(x) be the second order Lagrange polynomial that interpolates the + points p1, p2, and p3 where c(p1)==f1, c(p2)==f2, and c(p3)==f3 + - this function returns the point in the range [p1,p3] that minimizes + the polynomial c(x) + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename funct, + typename funct_der + > + double line_search ( + const funct& f, + const double f0, + const funct_der& der, + const double d0, + double rho, + double sigma, + double min_f, + unsigned long max_iter + ) + /*! + requires + - 0 < rho < sigma < 1 + - f and der are scalar functions of scalars + (e.g. line_search_funct objects) + - der is the derivative of f + - f0 == f(0) + - d0 == der(0) + - max_iter > 0 + ensures + - Performs a line search and uses the strong Wolfe conditions to decide when + the search can stop. + - rho == the parameter of the Wolfe sufficient decrease condition + - sigma == the parameter of the Wolfe curvature condition + - max_iter == the maximum number of iterations allowable. After this + many evaluations of f() line_search() is guaranteed to terminate. + - returns a value alpha such that f(alpha) is significantly closer to + the minimum of f than f(0). + - It is assumed that the minimum possible value of f(x) is min_f. So if + an alpha is found such that f(alpha) <= min_f then the search stops + immediately. + - This function is also optimized for the case where der(0) is negative. I.e. + positive values of the argument to f() should be in a descent direction. + - When this function makes calls to f() and der() it always does so by + first calling f() and then calling der(). That is, these two functions + are always called in pairs with f() being called first and then der() + being called second. + !*/ + + /* + A good discussion of the Wolfe conditions and line search algorithms in + general can be found in the book Practical Methods of Optimization by R. Fletcher + and also in the more recent book Numerical Optimization by Nocedal and Wright. + */ + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + double backtracking_line_search ( + const funct& f, + double f0, + double d0, + double alpha, + double rho, + unsigned long max_iter + ); + /*! + requires + - 0 < rho < 1 + - f is a scalar function of scalars + (e.g. a line_search_funct object) + - f0 == f(0) + - d0 == the derivative of f() at f(0). + - max_iter > 0 + ensures + - Performs a backtracking line search and uses the Armijo sufficient decrease + rule to decide when the search can stop. + - rho == the parameter of the sufficient decrease condition. + - max_iter == the maximum number of iterations allowable. After this many + evaluations of f() backtracking_line_search() is guaranteed to terminate. + - The line search starts with the input alpha value and then backtracks until + it finds a good enough alpha value. Once found, it returns the alpha value + such that f(alpha) is significantly closer to the minimum of f than f(0). + - The returned value of alpha will always be the last value of alpha which was + passed to f(). That is, it will always be the case that the last call to f() + made by backtracking_line_search() was f(alpha) where alpha is the return + value from this function. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + const negate_function_object<funct> negate_function( + const funct& f + ); + /*! + requires + - f == a function that returns a scalar + ensures + - returns a function that represents the negation of f. That is, + the returned function object represents g(x) == -f(x) + !*/ + +// ---------------------------------------------------------------------------------------- + + class optimize_single_variable_failure : public error; + /*! + This is the exception class used by the functions defined below. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + double find_min_single_variable ( + const funct& f, + double& starting_point, + const double begin = -1e200, + const double end = 1e200, + const double eps = 1e-3, + const long max_iter = 100, + const double initial_search_radius = 1 + ) + /*! + requires + - eps > 0 + - max_iter > 1 + - begin <= starting_point <= end + - f must be a function of a double that returns a double + (e.g. f(starting_point) should be a valid expression that evaluates to a double) + - initial_search_radius > 0 + ensures + - Finds a point P such that: + - P is a local minimum of the function f(). + - begin <= P <= end + - Evaluates f() no more than max_iter times + - Stops searching when the window around the minimum point is smaller than eps. + The search will begin with the given starting_point and expand out to the + left and right by initial_search_radius sized steps. So if you think the + minimum is likely to be found within X distance from the starting_point then + set initial_search_radius to X. + - #starting_point == P + - returns f(P) + throws + - optimize_single_variable_failure + This exception is thrown if max_iter iterations are performed without + determining the min point to the requested accuracy of eps. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + double find_max_single_variable ( + const funct& f, + double& starting_point, + const double begin = -1e200, + const double end = 1e200, + const double eps = 1e-3, + const long max_iter = 100, + const double initial_search_radius = 1 + ) + /*! + requires + - eps > 0 + - max_iter > 1 + - begin <= starting_point <= end + - f must be a function of a double that returns a double + (e.g. f(starting_point) should be a valid expression that evaluates to a double) + - initial_search_radius > 0 + ensures + - Finds a point P such that: + - P is a local maximum of the function f(). + - begin <= P <= end + - Evaluates f() no more than max_iter times + - Stops searching when the window around the maximum point is smaller than eps. + The search will begin with the given starting_point and expand out to the + left and right by initial_search_radius sized steps. So if you think the + maximum is likely to be found within X distance from the starting_point then + set initial_search_radius to X. + - #starting_point == P + - returns f(P) + throws + - optimize_single_variable_failure + This exception is thrown if max_iter iterations are performed without + determining the max point to the requested accuracy of eps. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATIOn_ABSTRACT_ + diff --git a/ml/dlib/dlib/optimization/optimization_oca.h b/ml/dlib/dlib/optimization/optimization_oca.h new file mode 100644 index 000000000..4ca9cd7ae --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_oca.h @@ -0,0 +1,407 @@ +// Copyright (C) 2010 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_OPTIMIZATIoN_OCA_Hh_ +#define DLIB_OPTIMIZATIoN_OCA_Hh_ + +#include "optimization_oca_abstract.h" + +#include "../matrix.h" +#include "optimization_solve_qp_using_smo.h" +#include <vector> +#include "../sequence.h" + +// ---------------------------------------------------------------------------------------- + +namespace dlib +{ + template <typename matrix_type> + class oca_problem + { + public: + typedef typename matrix_type::type scalar_type; + + virtual ~oca_problem() {} + + virtual bool risk_has_lower_bound ( + scalar_type& + ) const { return false; } + + virtual bool optimization_status ( + scalar_type , + scalar_type , + scalar_type , + scalar_type , + unsigned long, + unsigned long + ) const = 0; + + virtual scalar_type get_c ( + ) const = 0; + + virtual long get_num_dimensions ( + ) const = 0; + + virtual void get_risk ( + matrix_type& current_solution, + scalar_type& risk_value, + matrix_type& risk_subgradient + ) const = 0; + + }; + +// ---------------------------------------------------------------------------------------- + + class oca + { + public: + + oca () + { + sub_eps = 1e-2; + sub_max_iter = 50000; + + inactive_thresh = 20; + } + + void set_subproblem_epsilon ( + double eps_ + ) { sub_eps = eps_; } + + double get_subproblem_epsilon ( + ) const { return sub_eps; } + + void set_subproblem_max_iterations ( + unsigned long sub_max_iter_ + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(sub_max_iter_ > 0, + "\t void oca::set_subproblem_max_iterations" + << "\n\t max iterations must be greater than 0" + << "\n\t sub_max_iter_: " << sub_max_iter_ + << "\n\t this: " << this + ); + + sub_max_iter = sub_max_iter_; + } + + unsigned long get_subproblem_max_iterations ( + ) const { return sub_max_iter; } + + void set_inactive_plane_threshold ( + unsigned long inactive_thresh_ + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(inactive_thresh_ > 0, + "\t void oca::set_inactive_plane_threshold" + << "\n\t inactive threshold must be greater than 0" + << "\n\t inactive_thresh_: " << inactive_thresh_ + << "\n\t this: " << this + ); + + inactive_thresh = inactive_thresh_; + } + + unsigned long get_inactive_plane_threshold ( + ) const { return inactive_thresh; } + + template < + typename matrix_type + > + typename matrix_type::type operator() ( + const oca_problem<matrix_type>& problem, + matrix_type& w, + unsigned long num_nonnegative = 0, + unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max() + ) const + { + matrix_type empty_prior; + return oca_impl(problem, w, empty_prior, false, num_nonnegative, force_weight_to_1, 0); + } + + template < + typename matrix_type + > + typename matrix_type::type solve_with_elastic_net ( + const oca_problem<matrix_type>& problem, + matrix_type& w, + double lasso_lambda, + unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max() + ) const + { + matrix_type empty_prior; + return oca_impl(problem, w, empty_prior, false, 0, force_weight_to_1, lasso_lambda); + } + + template < + typename matrix_type + > + typename matrix_type::type operator() ( + const oca_problem<matrix_type>& problem, + matrix_type& w, + const matrix_type& prior + ) const + { + // make sure requires clause is not broken + DLIB_ASSERT(is_col_vector(prior) && prior.size() == problem.get_num_dimensions(), + "\t scalar_type oca::operator()" + << "\n\t The prior vector does not have the correct dimensions." + << "\n\t is_col_vector(prior): " << is_col_vector(prior) + << "\n\t prior.size(): " << prior.size() + << "\n\t problem.get_num_dimensions(): " << problem.get_num_dimensions() + << "\n\t this: " << this + ); + // disable the force weight to 1 option for this mode. We also disable the + // non-negative constraints. + unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max(); + return oca_impl(problem, w, prior, true, 0, force_weight_to_1, 0); + } + + private: + + template < + typename matrix_type + > + typename matrix_type::type oca_impl ( + const oca_problem<matrix_type>& problem, + matrix_type& w, + const matrix_type& prior, + bool have_prior, + unsigned long num_nonnegative, + unsigned long force_weight_to_1, + const double lasso_lambda + ) const + { + const unsigned long num_dims = problem.get_num_dimensions(); + + // make sure requires clause is not broken + DLIB_ASSERT(problem.get_c() > 0 && + problem.get_num_dimensions() > 0 && + 0 <= lasso_lambda && lasso_lambda < 1, + "\t scalar_type oca::operator()" + << "\n\t The oca_problem is invalid" + << "\n\t problem.get_c(): " << problem.get_c() + << "\n\t problem.get_num_dimensions(): " << num_dims + << "\n\t lasso_lambda: " << lasso_lambda + << "\n\t this: " << this + ); + if (have_prior) + { + DLIB_ASSERT(lasso_lambda == 0, "Solver doesn't support using a prior with lasso."); + DLIB_ASSERT(num_nonnegative == 0, "Solver doesn't support using a prior with non-negative constraints."); + } + else if (lasso_lambda != 0) + { + DLIB_ASSERT(num_nonnegative == 0, "Solver doesn't support using lasso with non-negative constraints."); + } + + const double ridge_lambda = 1-lasso_lambda; + + if (num_nonnegative > num_dims) + num_nonnegative = num_dims; + + typedef typename matrix_type::type scalar_type; + typedef typename matrix_type::layout_type layout_type; + typedef typename matrix_type::mem_manager_type mem_manager_type; + typedef matrix_type vect_type; + + const scalar_type C = problem.get_c(); + + typename sequence<vect_type>::kernel_2a planes; + std::vector<scalar_type> bs, miss_count; + + vect_type new_plane, alpha, btemp; + + w.set_size(num_dims, 1); + w = 0; + + // The current objective value. Note also that w always contains + // the current solution. + scalar_type cur_obj = std::numeric_limits<scalar_type>::max(); + + // This will hold the cutting plane objective value. This value is + // a lower bound on the true optimal objective value. + scalar_type cp_obj = 0; + + matrix<scalar_type,0,0,mem_manager_type, layout_type> K, Ktmp; + matrix<scalar_type,0,1,mem_manager_type, layout_type> lambda, d; + if (lasso_lambda != 0) + d.set_size(num_dims); + else + d.set_size(num_nonnegative); + d = lasso_lambda*ones_matrix(d); + + scalar_type R_lower_bound; + if (problem.risk_has_lower_bound(R_lower_bound)) + { + // The flat lower bounding plane is always good to have if we know + // what it is. + bs.push_back(R_lower_bound); + new_plane = zeros_matrix(w); + planes.add(0, new_plane); + alpha = uniform_matrix<scalar_type>(1,1, C); + miss_count.push_back(0); + + K.set_size(1,1); + K(0,0) = 0; + } + + const double prior_norm = have_prior ? 0.5*dot(prior,prior) : 0; + + unsigned long counter = 0; + while (true) + { + + // add the next cutting plane + scalar_type cur_risk; + if (force_weight_to_1 < (unsigned long)w.size()) + w(force_weight_to_1) = 1; + + problem.get_risk(w, cur_risk, new_plane); + + if (force_weight_to_1 < (unsigned long)w.size()) + { + // We basically arrange for the w(force_weight_to_1) element and all + // subsequent elements of w to not be involved in the optimization at + // all. An easy way to do this is to just make sure the elements of w + // corresponding elements in the subgradient are always set to zero + // while we run the cutting plane algorithm. The only time + // w(force_weight_to_1) is 1 is when we pass it to the oca_problem. + set_rowm(w, range(force_weight_to_1, w.size()-1)) = 0; + set_rowm(new_plane, range(force_weight_to_1, new_plane.size()-1)) = 0; + } + + if (have_prior) + bs.push_back(cur_risk - dot(w,new_plane) + dot(prior,new_plane)); + else + bs.push_back(cur_risk - dot(w,new_plane)); + planes.add(planes.size(), new_plane); + miss_count.push_back(0); + + // If alpha is empty then initialize it (we must always have sum(alpha) == C). + // But otherwise, just append a zero. + if (alpha.size() == 0) + alpha = uniform_matrix<scalar_type>(1,1, C); + else + alpha = join_cols(alpha,zeros_matrix<scalar_type>(1,1)); + + const scalar_type wnorm = 0.5*ridge_lambda*trans(w)*w + lasso_lambda*sum(abs(w)); + const double prior_part = have_prior? dot(w,prior) : 0; + cur_obj = wnorm + C*cur_risk + prior_norm-prior_part; + + // report current status + const scalar_type risk_gap = cur_risk - (cp_obj-wnorm+prior_part-prior_norm)/C; + if (counter > 0 && problem.optimization_status(cur_obj, cur_obj - cp_obj, + cur_risk, risk_gap, planes.size(), counter)) + { + break; + } + + // compute kernel matrix for all the planes + K.swap(Ktmp); + K.set_size(planes.size(), planes.size()); + // copy over the old K matrix + set_subm(K, 0,0, Ktmp.nr(), Ktmp.nc()) = Ktmp; + + // now add the new row and column to K + for (unsigned long c = 0; c < planes.size(); ++c) + { + K(c, Ktmp.nc()) = dot(planes[c], planes[planes.size()-1]); + K(Ktmp.nc(), c) = K(c,Ktmp.nc()); + } + + + // solve the cutting plane subproblem for the next w. We solve it to an + // accuracy that is related to how big the error gap is. Also, we multiply + // by ridge_lambda because the objective function for the QP we solve was + // implicitly scaled by ridge_lambda. That is, we want to ask the QP + // solver to solve the problem until the duality gap is 0.1 times smaller + // than what it is now. So the factor of ridge_lambda is necessary to make + // this happen. + scalar_type eps = std::min<scalar_type>(sub_eps, 0.1*ridge_lambda*(cur_obj-cp_obj)); + // just a sanity check + if (eps < 1e-16) + eps = 1e-16; + // Note that we warm start this optimization by using the alpha from the last + // iteration as the starting point. + if (lasso_lambda != 0) + { + // copy planes into a matrix so we can call solve_qp4_using_smo() + matrix<scalar_type,0,0,mem_manager_type, layout_type> planes_mat(num_dims,planes.size()); + for (unsigned long i = 0; i < planes.size(); ++i) + set_colm(planes_mat,i) = planes[i]; + + btemp = ridge_lambda*mat(bs) - trans(planes_mat)*d; + solve_qp4_using_smo(planes_mat, K, btemp, d, alpha, lambda, eps, sub_max_iter, (scalar_type)(2*lasso_lambda)); + } + else if (num_nonnegative != 0) + { + // copy planes into a matrix so we can call solve_qp4_using_smo() + matrix<scalar_type,0,0,mem_manager_type, layout_type> planes_mat(num_nonnegative,planes.size()); + for (unsigned long i = 0; i < planes.size(); ++i) + set_colm(planes_mat,i) = colm(planes[i],0,num_nonnegative); + + solve_qp4_using_smo(planes_mat, K, mat(bs), d, alpha, lambda, eps, sub_max_iter); + } + else + { + solve_qp_using_smo(K, mat(bs), alpha, eps, sub_max_iter); + } + + // construct the w that minimized the subproblem. + w = -alpha(0)*planes[0]; + for (unsigned long i = 1; i < planes.size(); ++i) + w -= alpha(i)*planes[i]; + if (lasso_lambda != 0) + w = (lambda-d+w)/ridge_lambda; + else if (num_nonnegative != 0) // threshold the first num_nonnegative w elements if necessary. + set_rowm(w,range(0,num_nonnegative-1)) = lowerbound(rowm(w,range(0,num_nonnegative-1)),0); + + for (long i = 0; i < alpha.size(); ++i) + { + if (alpha(i) != 0) + miss_count[i] = 0; + else + miss_count[i] += 1; + } + + // Compute the lower bound on the true objective given to us by the cutting + // plane subproblem. + cp_obj = -0.5*ridge_lambda*trans(w)*w + trans(alpha)*mat(bs); + if (have_prior) + w += prior; + + // If it has been a while since a cutting plane was an active constraint then + // we should throw it away. + while (max(mat(miss_count)) >= inactive_thresh) + { + const long idx = index_of_max(mat(miss_count)); + bs.erase(bs.begin()+idx); + miss_count.erase(miss_count.begin()+idx); + K = removerc(K, idx, idx); + alpha = remove_row(alpha,idx); + planes.remove(idx, new_plane); + } + + ++counter; + } + + if (force_weight_to_1 < (unsigned long)w.size()) + w(force_weight_to_1) = 1; + + return cur_obj; + } + + double sub_eps; + + unsigned long sub_max_iter; + + unsigned long inactive_thresh; + }; +} + +// ---------------------------------------------------------------------------------------- + +#endif // DLIB_OPTIMIZATIoN_OCA_Hh_ + diff --git a/ml/dlib/dlib/optimization/optimization_oca_abstract.h b/ml/dlib/dlib/optimization/optimization_oca_abstract.h new file mode 100644 index 000000000..859dbdcfc --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_oca_abstract.h @@ -0,0 +1,334 @@ +// Copyright (C) 2010 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_OPTIMIZATION_OCA_ABsTRACT_Hh_ +#ifdef DLIB_OPTIMIZATION_OCA_ABsTRACT_Hh_ + +// ---------------------------------------------------------------------------------------- + +namespace dlib +{ + template <typename matrix_type> + class oca_problem + { + /*! + REQUIREMENTS ON matrix_type + - matrix_type == a dlib::matrix capable of storing column vectors + + WHAT THIS OBJECT REPRESENTS + This object is the interface used to define the optimization + problems solved by the oca optimizer defined later in this file. + + OCA solves optimization problems with the following form: + Minimize: f(w) == 0.5*length_squared(w) + C*R(w) + + Where R(w) is a user-supplied convex function and C > 0. Optionally, + there can also be non-negativity constraints on some or all of the + elements of w. + + Or it can alternatively solve: + Minimize: f(w) == 0.5*length_squared(w-prior) + C*R(w) + + Where prior is a user supplied vector and R(w) has the same + interpretation as above. + + Or it can use the elastic net regularizer: + Minimize: f(w) == 0.5*(1-lasso_lambda)*length_squared(w) + lasso_lambda*sum(abs(w)) + C*R(w) + + Where lasso_lambda is a number in the range [0, 1) and controls + trade-off between doing L1 and L2 regularization. R(w) has the same + interpretation as above. + + + Note that the stopping condition must be provided by the user + in the form of the optimization_status() function. + !*/ + + public: + + typedef typename matrix_type::type scalar_type; + + virtual ~oca_problem() {} + + virtual bool risk_has_lower_bound ( + scalar_type& lower_bound + ) const { return false; } + /*! + ensures + - if (R(w) >= a constant for all values of w) then + - returns true + - #lower_bound == the constant that lower bounds R(w) + - else + - returns false + !*/ + + virtual bool optimization_status ( + scalar_type current_objective_value, + scalar_type current_error_gap, + scalar_type current_risk_value, + scalar_type current_risk_gap, + unsigned long num_cutting_planes, + unsigned long num_iterations + ) const = 0; + /*! + requires + - This function is called by the OCA optimizer each iteration. + - current_objective_value == the current value of the objective function f(w) + - current_error_gap == The bound on how much lower the objective function + can drop before we reach the optimal point. At the optimal solution the + error gap is equal to 0. + - current_risk_value == the current value of the R(w) term of the objective function. + - current_risk_gap == the bound on how much lower the risk term can go. At the optimal + solution the risk gap is zero. + - num_cutting_planes == the number of cutting planes the algorithm is currently using. + - num_iterations == A count of the total number of iterations that have executed + since we started running the optimization. + ensures + - If it is appropriate to terminate the optimization then this function returns true + and false otherwise. + !*/ + + virtual scalar_type get_c ( + ) const = 0; + /*! + ensures + - returns the C parameter + !*/ + + virtual long get_num_dimensions ( + ) const = 0; + /*! + ensures + - returns the number of free variables in this optimization problem + !*/ + + virtual void get_risk ( + matrix_type& current_solution, + scalar_type& risk_value, + matrix_type& risk_subgradient + ) const = 0; + /*! + requires + - is_col_vector(current_solution) == true + - current_solution.size() == get_num_dimensions() + ensures + - #current_solution will be set to one of the following: + - current_solution (i.e. it won't be modified at all) + - The result of a line search passing through current_solution. + - #risk_value == R(#current_solution) + - #risk_subgradient == an element of the subgradient of R() at the + point #current_solution + - Note that #risk_value and #risk_subgradient are NOT multiplied by get_c() + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + + class oca + { + /*! + INITIAL VALUE + - get_subproblem_epsilon() == 1e-2 + - get_subproblem_max_iterations() == 50000 + - get_inactive_plane_threshold() == 20 + + WHAT THIS OBJECT REPRESENTS + This object is a tool for solving the optimization problem defined above + by the oca_problem abstract class. + + For reference, OCA solves optimization problems with the following form: + Minimize: f(w) == 0.5*length_squared(w) + C*R(w) + + Where R(w) is a user-supplied convex function and C > 0. Optionally, + this object can also add non-negativity constraints to some or all + of the elements of w. + + Or it can alternatively solve: + Minimize: f(w) == 0.5*length_squared(w-prior) + C*R(w) + + Where prior is a user supplied vector and R(w) has the same + interpretation as above. + + Or it can use the elastic net regularizer: + Minimize: f(w) == 0.5*(1-lasso_lambda)*length_squared(w) + lasso_lambda*sum(abs(w)) + C*R(w) + + Where lasso_lambda is a number in the range [0, 1) and controls + trade-off between doing L1 and L2 regularization. R(w) has the same + interpretation as above. + + + For a detailed discussion you should consult the following papers + from the Journal of Machine Learning Research: + Optimized Cutting Plane Algorithm for Large-Scale Risk Minimization + Vojtech Franc, Soren Sonnenburg; 10(Oct):2157--2192, 2009. + + Bundle Methods for Regularized Risk Minimization + Choon Hui Teo, S.V.N. Vishwanthan, Alex J. Smola, Quoc V. Le; 11(Jan):311-365, 2010. + !*/ + public: + + oca ( + ); + /*! + ensures + - this object is properly initialized + !*/ + + template < + typename matrix_type + > + typename matrix_type::type operator() ( + const oca_problem<matrix_type>& problem, + matrix_type& w, + unsigned long num_nonnegative = 0, + unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max() + ) const; + /*! + requires + - problem.get_c() > 0 + - problem.get_num_dimensions() > 0 + ensures + - solves the given oca problem and stores the solution in #w. In particular, + this function solves: + Minimize: f(w) == 0.5*length_squared(w) + C*R(w) + - The optimization algorithm runs until problem.optimization_status() + indicates it is time to stop. + - returns the objective value at the solution #w + - if (num_nonnegative != 0) then + - Adds the constraint that #w(i) >= 0 for all i < num_nonnegative. + That is, the first num_nonnegative elements of #w will always be + non-negative. This includes the copies of w passed to get_risk() + in the form of the current_solution vector as well as the final + output of this function. + - if (force_weight_to_1 < problem.get_num_dimensions()) then + - The optimizer enforces the following constraints: + - #w(force_weight_to_1) == 1 + - for all i > force_weight_to_1: + - #w(i) == 0 + - That is, the element in the weight vector at the index indicated + by force_weight_to_1 will have a value of 1 upon completion of + this function, while all subsequent elements of w will have + values of 0. + !*/ + + template < + typename matrix_type + > + typename matrix_type::type operator() ( + const oca_problem<matrix_type>& problem, + matrix_type& w, + const matrix_type& prior + ) const; + /*! + requires + - problem.get_c() > 0 + - problem.get_num_dimensions() > 0 + - is_col_vector(prior) == true + - prior.size() == problem.get_num_dimensions() + ensures + - solves the given oca problem and stores the solution in #w. + - In this mode, we solve a version of the problem with a different + regularizer. In particular, this function solves: + Minimize: f(w) == 0.5*length_squared(w-prior) + C*R(w) + - The optimization algorithm runs until problem.optimization_status() + indicates it is time to stop. + - returns the objective value at the solution #w + !*/ + + template < + typename matrix_type + > + typename matrix_type::type solve_with_elastic_net ( + const oca_problem<matrix_type>& problem, + matrix_type& w, + scalar_type lasso_lambda, + unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max() + ) const; + /*! + requires + - problem.get_c() > 0 + - problem.get_num_dimensions() > 0 + - 0 <= lasso_lambda < 1 + ensures + - Solves the given oca problem and stores the solution in #w, but uses an + elastic net regularizer instead of the normal L2 regularizer. In + particular, this function solves: + Minimize: f(w) == 0.5*(1-lasso_lambda)*length_squared(w) + lasso_lambda*sum(abs(w)) + C*R(w) + - The optimization algorithm runs until problem.optimization_status() + indicates it is time to stop. + - returns the objective value at the solution #w + - if (force_weight_to_1 < problem.get_num_dimensions()) then + - The optimizer enforces the following constraints: + - #w(force_weight_to_1) == 1 + - for all i > force_weight_to_1: + - #w(i) == 0 + - That is, the element in the weight vector at the index indicated + by force_weight_to_1 will have a value of 1 upon completion of + this function, while all subsequent elements of w will have + values of 0. + !*/ + + void set_subproblem_epsilon ( + double eps + ); + /*! + requires + - eps > 0 + ensures + - #get_subproblem_epsilon() == eps + !*/ + + double get_subproblem_epsilon ( + ) const; + /*! + ensures + - returns the accuracy used in solving the quadratic programming + subproblem that is part of the overall OCA algorithm. + !*/ + + void set_subproblem_max_iterations ( + unsigned long sub_max_iter + ); + /*! + requires + - sub_max_iter > 0 + ensures + - #get_subproblem_max_iterations() == sub_max_iter + !*/ + + unsigned long get_subproblem_max_iterations ( + ) const; + /*! + ensures + - returns the maximum number of iterations this object will perform + while attempting to solve each quadratic programming subproblem. + !*/ + + void set_inactive_plane_threshold ( + unsigned long inactive_thresh + ); + /*! + requires + - inactive_thresh > 0 + ensures + - #get_inactive_plane_threshold() == inactive_thresh + !*/ + + unsigned long get_inactive_plane_threshold ( + ) const; + /*! + ensures + - As OCA runs it builds up a set of cutting planes. Typically + cutting planes become inactive after a certain point and can then + be removed. This function returns the number of iterations of + inactivity required before a cutting plane is removed. + !*/ + + }; +} + +// ---------------------------------------------------------------------------------------- + +#endif // DLIB_OPTIMIZATION_OCA_ABsTRACT_Hh_ + + diff --git a/ml/dlib/dlib/optimization/optimization_search_strategies.h b/ml/dlib/dlib/optimization/optimization_search_strategies.h new file mode 100644 index 000000000..eda637fa1 --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_search_strategies.h @@ -0,0 +1,324 @@ +// Copyright (C) 2008 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_ +#define DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_ + +#include <cmath> +#include <limits> +#include "../matrix.h" +#include "../algs.h" +#include "optimization_search_strategies_abstract.h" +#include "../sequence.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class cg_search_strategy + { + public: + cg_search_strategy() : been_used(false) {} + + double get_wolfe_rho ( + ) const { return 0.001; } + + double get_wolfe_sigma ( + ) const { return 0.01; } + + unsigned long get_max_line_search_iterations ( + ) const { return 100; } + + template <typename T> + const matrix<double,0,1>& get_next_direction ( + const T& , + const double , + const T& funct_derivative + ) + { + if (been_used == false) + { + been_used = true; + prev_direction = -funct_derivative; + } + else + { + // Use the Polak-Ribiere (4.1.12) conjugate gradient described by Fletcher on page 83 + const double temp = trans(prev_derivative)*prev_derivative; + // If this value hits zero then just use the direction of steepest descent. + if (std::abs(temp) < std::numeric_limits<double>::epsilon()) + { + prev_derivative = funct_derivative; + prev_direction = -funct_derivative; + return prev_direction; + } + + double b = trans(funct_derivative-prev_derivative)*funct_derivative/(temp); + prev_direction = -funct_derivative + b*prev_direction; + + } + + prev_derivative = funct_derivative; + return prev_direction; + } + + private: + bool been_used; + matrix<double,0,1> prev_derivative; + matrix<double,0,1> prev_direction; + }; + +// ---------------------------------------------------------------------------------------- + + class bfgs_search_strategy + { + public: + bfgs_search_strategy() : been_used(false), been_used_twice(false) {} + + double get_wolfe_rho ( + ) const { return 0.01; } + + double get_wolfe_sigma ( + ) const { return 0.9; } + + unsigned long get_max_line_search_iterations ( + ) const { return 100; } + + template <typename T> + const matrix<double,0,1>& get_next_direction ( + const T& x, + const double , + const T& funct_derivative + ) + { + if (been_used == false) + { + been_used = true; + H = identity_matrix<double>(x.size()); + } + else + { + // update H with the BFGS formula from (3.2.12) on page 55 of Fletcher + delta = (x-prev_x); + gamma = funct_derivative-prev_derivative; + + double dg = dot(delta,gamma); + + // Try to set the initial value of the H matrix to something reasonable if we are still + // in the early stages of figuring out what it is. This formula below is what is suggested + // in the book Numerical Optimization by Nocedal and Wright in the chapter on Quasi-Newton methods. + if (been_used_twice == false) + { + double gg = trans(gamma)*gamma; + if (std::abs(gg) > std::numeric_limits<double>::epsilon()) + { + const double temp = put_in_range(0.01, 100, dg/gg); + H = diagm(uniform_matrix<double>(x.size(),1, temp)); + been_used_twice = true; + } + } + + Hg = H*gamma; + gH = trans(trans(gamma)*H); + double gHg = trans(gamma)*H*gamma; + if (gHg < std::numeric_limits<double>::infinity() && dg < std::numeric_limits<double>::infinity() && + dg != 0) + { + H += (1 + gHg/dg)*delta*trans(delta)/(dg) - (delta*trans(gH) + Hg*trans(delta))/(dg); + } + else + { + H = identity_matrix<double>(H.nr()); + been_used_twice = false; + } + } + + prev_x = x; + prev_direction = -H*funct_derivative; + prev_derivative = funct_derivative; + return prev_direction; + } + + private: + bool been_used; + bool been_used_twice; + matrix<double,0,1> prev_x; + matrix<double,0,1> prev_derivative; + matrix<double,0,1> prev_direction; + matrix<double> H; + matrix<double,0,1> delta, gamma, Hg, gH; + }; + +// ---------------------------------------------------------------------------------------- + + class lbfgs_search_strategy + { + public: + explicit lbfgs_search_strategy(unsigned long max_size_) : max_size(max_size_), been_used(false) + { + DLIB_ASSERT ( + max_size > 0, + "\t lbfgs_search_strategy(max_size)" + << "\n\t max_size can't be zero" + ); + } + + lbfgs_search_strategy(const lbfgs_search_strategy& item) + { + max_size = item.max_size; + been_used = item.been_used; + prev_x = item.prev_x; + prev_derivative = item.prev_derivative; + prev_direction = item.prev_direction; + alpha = item.alpha; + dh_temp = item.dh_temp; + } + + double get_wolfe_rho ( + ) const { return 0.01; } + + double get_wolfe_sigma ( + ) const { return 0.9; } + + unsigned long get_max_line_search_iterations ( + ) const { return 100; } + + template <typename T> + const matrix<double,0,1>& get_next_direction ( + const T& x, + const double , + const T& funct_derivative + ) + { + prev_direction = -funct_derivative; + + if (been_used == false) + { + been_used = true; + } + else + { + // add an element into the stored data sequence + dh_temp.s = x - prev_x; + dh_temp.y = funct_derivative - prev_derivative; + double temp = dot(dh_temp.s, dh_temp.y); + // only accept this bit of data if temp isn't zero + if (std::abs(temp) > std::numeric_limits<double>::epsilon()) + { + dh_temp.rho = 1/temp; + data.add(data.size(), dh_temp); + } + else + { + data.clear(); + } + + if (data.size() > 0) + { + // This block of code is from algorithm 7.4 in the Nocedal book. + + alpha.resize(data.size()); + for (unsigned long i = data.size()-1; i < data.size(); --i) + { + alpha[i] = data[i].rho*dot(data[i].s, prev_direction); + prev_direction -= alpha[i]*data[i].y; + } + + // Take a guess at what the first H matrix should be. This formula below is what is suggested + // in the book Numerical Optimization by Nocedal and Wright in the chapter on Large Scale + // Unconstrained Optimization (in the L-BFGS section). + double H_0 = 1.0/data[data.size()-1].rho/dot(data[data.size()-1].y, data[data.size()-1].y); + H_0 = put_in_range(0.001, 1000.0, H_0); + prev_direction *= H_0; + + for (unsigned long i = 0; i < data.size(); ++i) + { + double beta = data[i].rho*dot(data[i].y, prev_direction); + prev_direction += data[i].s * (alpha[i] - beta); + } + } + + } + + if (data.size() > max_size) + { + // remove the oldest element in the data sequence + data.remove(0, dh_temp); + } + + prev_x = x; + prev_derivative = funct_derivative; + return prev_direction; + } + + private: + + struct data_helper + { + matrix<double,0,1> s; + matrix<double,0,1> y; + double rho; + + friend void swap(data_helper& a, data_helper& b) + { + a.s.swap(b.s); + a.y.swap(b.y); + std::swap(a.rho, b.rho); + } + }; + sequence<data_helper>::kernel_2a data; + + unsigned long max_size; + bool been_used; + matrix<double,0,1> prev_x; + matrix<double,0,1> prev_derivative; + matrix<double,0,1> prev_direction; + std::vector<double> alpha; + + data_helper dh_temp; + }; + +// ---------------------------------------------------------------------------------------- + + template <typename hessian_funct> + class newton_search_strategy_obj + { + public: + explicit newton_search_strategy_obj( + const hessian_funct& hess + ) : hessian(hess) {} + + double get_wolfe_rho ( + ) const { return 0.01; } + + double get_wolfe_sigma ( + ) const { return 0.9; } + + unsigned long get_max_line_search_iterations ( + ) const { return 100; } + + template <typename T> + const matrix<double,0,1> get_next_direction ( + const T& x, + const double , + const T& funct_derivative + ) + { + return -inv(hessian(x))*funct_derivative; + } + + private: + hessian_funct hessian; + }; + + template <typename hessian_funct> + newton_search_strategy_obj<hessian_funct> newton_search_strategy ( + hessian_funct hessian + ) { return newton_search_strategy_obj<hessian_funct>(hessian); } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_ + diff --git a/ml/dlib/dlib/optimization/optimization_search_strategies_abstract.h b/ml/dlib/dlib/optimization/optimization_search_strategies_abstract.h new file mode 100644 index 000000000..44b894bfc --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_search_strategies_abstract.h @@ -0,0 +1,330 @@ +// Copyright (C) 2008 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_ABSTRACT_ +#ifdef DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_ABSTRACT_ + +#include <cmath> +#include <limits> +#include "../matrix/matrix_abstract.h" +#include "../algs.h" + + +namespace dlib +{ + /* + A good discussion of the search strategies in this file can be found in the + following book: Numerical Optimization by Nocedal and Wright. + */ + +// ---------------------------------------------------------------------------------------- + + class cg_search_strategy + { + /*! + WHAT THIS OBJECT REPRESENTS + This object represents a strategy for determining which direction + a line search should be carried out along. This particular object + is an implementation of the Polak-Ribiere conjugate gradient method + for determining this direction. + + This method uses an amount of memory that is linear in the number + of variables to be optimized. So it is capable of handling problems + with a very large number of variables. However, it is generally + not as good as the L-BFGS algorithm (which is defined below in + the lbfgs_search_strategy class). + !*/ + + public: + cg_search_strategy( + ); + /*! + ensures + - This object is properly initialized and ready to generate + search directions. + !*/ + + double get_wolfe_rho ( + ) const; + /*! + ensures + - returns the value of the Wolfe rho parameter that should be used when + this search strategy is used with the line_search() function. + !*/ + + double get_wolfe_sigma ( + ) const; + /*! + ensures + - returns the value of the Wolfe sigma parameter that should be used when + this search strategy is used with the line_search() function. + !*/ + + unsigned long get_max_line_search_iterations ( + ) const; + /*! + ensures + - returns the value of the max iterations parameter that should be used when + this search strategy is used with the line_search() function. + !*/ + + template <typename T> + const matrix<double,0,1>& get_next_direction ( + const T& x, + const double funct_value, + const T& funct_derivative + ); + /*! + requires + - this function is only called once per search iteration + - for some objective function f(): + - x == the search point for the current iteration + - funct_value == f(x) + - funct_derivative == derivative(f)(x) + ensures + - Assuming that a line search is going to be conducted starting from the point x, + this function returns the direction in which the search should proceed. + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + + class bfgs_search_strategy + { + /*! + WHAT THIS OBJECT REPRESENTS + This object represents a strategy for determining which direction + a line search should be carried out along. This particular object + is an implementation of the BFGS quasi-newton method for determining + this direction. + + This method uses an amount of memory that is quadratic in the number + of variables to be optimized. It is generally very effective but + if your problem has a very large number of variables then it isn't + appropriate. Instead You should try the lbfgs_search_strategy. + !*/ + + public: + bfgs_search_strategy( + ); + /*! + ensures + - This object is properly initialized and ready to generate + search directions. + !*/ + + double get_wolfe_rho ( + ) const; + /*! + ensures + - returns the value of the Wolfe rho parameter that should be used when + this search strategy is used with the line_search() function. + !*/ + + double get_wolfe_sigma ( + ) const; + /*! + ensures + - returns the value of the Wolfe sigma parameter that should be used when + this search strategy is used with the line_search() function. + !*/ + + unsigned long get_max_line_search_iterations ( + ) const; + /*! + ensures + - returns the value of the max iterations parameter that should be used when + this search strategy is used with the line_search() function. + !*/ + + template <typename T> + const matrix<double,0,1>& get_next_direction ( + const T& x, + const double funct_value, + const T& funct_derivative + ); + /*! + requires + - this function is only called once per search iteration + - for some objective function f(): + - x == the search point for the current iteration + - funct_value == f(x) + - funct_derivative == derivative(f)(x) + ensures + - Assuming that a line search is going to be conducted starting from the point x, + this function returns the direction in which the search should proceed. + !*/ + }; + +// ---------------------------------------------------------------------------------------- + + class lbfgs_search_strategy + { + /*! + WHAT THIS OBJECT REPRESENTS + This object represents a strategy for determining which direction + a line search should be carried out along. This particular object + is an implementation of the L-BFGS quasi-newton method for determining + this direction. + + This method uses an amount of memory that is linear in the number + of variables to be optimized. This makes it an excellent method + to use when an optimization problem has a large number of variables. + !*/ + public: + explicit lbfgs_search_strategy( + unsigned long max_size + ); + /*! + requires + - max_size > 0 + ensures + - This object is properly initialized and ready to generate + search directions. + - L-BFGS works by remembering a certain number of position and gradient + pairs. It uses this remembered information to compute search directions. + The max_size argument determines how many of these pairs will be remembered. + Typically, using between 3 and 30 pairs performs well for many problems. + !*/ + + double get_wolfe_rho ( + ) const; + /*! + ensures + - returns the value of the Wolfe rho parameter that should be used when + this search strategy is used with the line_search() function. + !*/ + + double get_wolfe_sigma ( + ) const; + /*! + ensures + - returns the value of the Wolfe sigma parameter that should be used when + this search strategy is used with the line_search() function. + !*/ + + unsigned long get_max_line_search_iterations ( + ) const; + /*! + ensures + - returns the value of the max iterations parameter that should be used when + this search strategy is used with the line_search() function. + !*/ + + template <typename T> + const matrix<double,0,1>& get_next_direction ( + const T& x, + const double funct_value, + const T& funct_derivative + ); + /*! + requires + - this function is only called once per search iteration + - for some objective function f(): + - x == the search point for the current iteration + - funct_value == f(x) + - funct_derivative == derivative(f)(x) + ensures + - Assuming that a line search is going to be conducted starting from the point x, + this function returns the direction in which the search should proceed. + !*/ + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename hessian_funct + > + class newton_search_strategy_obj + { + /*! + REQUIREMENTS ON hessian_funct + Objects of hessian_funct type must be function objects which + take a single argument and return a dlib::matrix of doubles. The + single argument must be a dlib::matrix capable of representing + column vectors of doubles. hessian_funct must also be copy + constructable. + + WHAT THIS OBJECT REPRESENTS + This object represents a strategy for determining which direction + a line search should be carried out along. This particular object + is an implementation of the newton method for determining this + direction. That is, it uses the following formula to determine + the direction: + search_direction = -inv(hessian(x))*derivative + !*/ + public: + explicit newton_search_strategy_obj( + const hessian_funct& hess + ); + /*! + ensures + - This object is properly initialized and ready to generate + search directions. + - hess will be used by this object to generate the needed hessian + matrices every time get_next_direction() is called. + !*/ + + double get_wolfe_rho ( + ) const; + /*! + ensures + - returns the value of the Wolfe rho parameter that should be used when + this search strategy is used with the line_search() function. + !*/ + + double get_wolfe_sigma ( + ) const; + /*! + ensures + - returns the value of the Wolfe sigma parameter that should be used when + this search strategy is used with the line_search() function. + !*/ + + unsigned long get_max_line_search_iterations ( + ) const; + /*! + ensures + - returns the value of the max iterations parameter that should be used when + this search strategy is used with the line_search() function. + !*/ + + template <typename T> + const matrix<double,0,1> get_next_direction ( + const T& x, + const double funct_value, + const T& funct_derivative + ); + /*! + requires + - for some objective function f(): + - x == the search point for the current iteration + - funct_value == f(x) + - funct_derivative == derivative(f)(x) + ensures + - Assuming that a line search is going to be conducted starting from the + point x, this function returns the direction in which the search should + proceed. + - In particular, the search direction will be given by: + - search_direction = -inv(hessian(x))*funct_derivative + !*/ + + }; + + template <typename hessian_funct> + newton_search_strategy_obj<hessian_funct> newton_search_strategy ( + hessian_funct hessian + ) { return newton_search_strategy_obj<hessian_funct>(hessian); } + /*! + ensures + - constructs and returns a newton_search_strategy_obj. + This function is just a helper to make the syntax for creating + these objects a little simpler. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_ABSTRACT_ + diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo.h b/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo.h new file mode 100644 index 000000000..88cad0cf3 --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo.h @@ -0,0 +1,468 @@ +// Copyright (C) 2007 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_SOLVE_QP2_USING_SMo_Hh_ +#define DLIB_SOLVE_QP2_USING_SMo_Hh_ + +#include "optimization_solve_qp2_using_smo_abstract.h" +#include <cmath> +#include <limits> +#include <sstream> +#include "../matrix.h" +#include "../algs.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class invalid_nu_error : public dlib::error + { + public: + invalid_nu_error(const std::string& msg, double nu_) : dlib::error(msg), nu(nu_) {}; + const double nu; + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename T + > + typename T::type maximum_nu_impl ( + const T& y + ) + { + typedef typename T::type scalar_type; + // make sure requires clause is not broken + DLIB_ASSERT(y.size() > 1 && is_col_vector(y), + "\ttypedef T::type maximum_nu(y)" + << "\n\ty should be a column vector with more than one entry" + << "\n\ty.nr(): " << y.nr() + << "\n\ty.nc(): " << y.nc() + ); + + long pos_count = 0; + long neg_count = 0; + for (long r = 0; r < y.nr(); ++r) + { + if (y(r) == 1.0) + { + ++pos_count; + } + else if (y(r) == -1.0) + { + ++neg_count; + } + else + { + // make sure requires clause is not broken + DLIB_ASSERT(y(r) == -1.0 || y(r) == 1.0, + "\ttypedef T::type maximum_nu(y)" + << "\n\ty should contain only 1 and 0 entries" + << "\n\tr: " << r + << "\n\ty(r): " << y(r) + ); + } + } + return static_cast<scalar_type>(2.0*(scalar_type)std::min(pos_count,neg_count)/(scalar_type)y.nr()); + } + + template < + typename T + > + typename T::type maximum_nu ( + const T& y + ) + { + return maximum_nu_impl(mat(y)); + } + + template < + typename T + > + typename T::value_type maximum_nu ( + const T& y + ) + { + return maximum_nu_impl(mat(y)); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename matrix_type + > + class solve_qp2_using_smo + { + public: + typedef typename matrix_type::mem_manager_type mem_manager_type; + typedef typename matrix_type::type scalar_type; + typedef typename matrix_type::layout_type layout_type; + typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix; + typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix; + + + template < + typename EXP1, + typename EXP2, + long NR + > + unsigned long operator() ( + const matrix_exp<EXP1>& Q, + const matrix_exp<EXP2>& y, + const scalar_type nu, + matrix<scalar_type,NR,1,mem_manager_type, layout_type>& alpha, + scalar_type eps + ) + { + DLIB_ASSERT(Q.nr() == Q.nc() && y.size() == Q.nr() && y.size() > 1 && is_col_vector(y) && + sum((y == +1) + (y == -1)) == y.size() && + 0 < nu && nu <= 1 && + eps > 0, + "\t void solve_qp2_using_smo::operator()" + << "\n\t invalid arguments were given to this function" + << "\n\t Q.nr(): " << Q.nr() + << "\n\t Q.nc(): " << Q.nc() + << "\n\t is_col_vector(y): " << is_col_vector(y) + << "\n\t y.size(): " << y.size() + << "\n\t sum((y == +1) + (y == -1)): " << sum((y == +1) + (y == -1)) + << "\n\t nu: " << nu + << "\n\t eps: " << eps + ); + + alpha.set_size(Q.nr(),1); + df.set_size(Q.nr()); + + // now initialize alpha + set_initial_alpha(y, nu, alpha); + + const scalar_type tau = 1e-12; + + typedef typename colm_exp<EXP1>::type col_type; + + set_all_elements(df, 0); + // initialize df. Compute df = Q*alpha + for (long r = 0; r < df.nr(); ++r) + { + if (alpha(r) != 0) + { + df += alpha(r)*matrix_cast<scalar_type>(colm(Q,r)); + } + } + + unsigned long count = 0; + + // now perform the actual optimization of alpha + long i=0, j=0; + while (find_working_group(y,alpha,Q,df,tau,eps,i,j)) + { + ++count; + const scalar_type old_alpha_i = alpha(i); + const scalar_type old_alpha_j = alpha(j); + + optimize_working_pair(alpha,Q,df,tau,i,j); + + // update the df vector now that we have modified alpha(i) and alpha(j) + scalar_type delta_alpha_i = alpha(i) - old_alpha_i; + scalar_type delta_alpha_j = alpha(j) - old_alpha_j; + + col_type Q_i = colm(Q,i); + col_type Q_j = colm(Q,j); + for(long k = 0; k < df.nr(); ++k) + df(k) += Q_i(k)*delta_alpha_i + Q_j(k)*delta_alpha_j; + } + + return count; + } + + const column_matrix& get_gradient ( + ) const { return df; } + + private: + + // ------------------------------------------------------------------------------------- + + template < + typename scalar_type, + typename scalar_vector_type, + typename scalar_vector_type2 + > + inline void set_initial_alpha ( + const scalar_vector_type& y, + const scalar_type nu, + scalar_vector_type2& alpha + ) const + { + set_all_elements(alpha,0); + const scalar_type l = y.nr(); + scalar_type temp = nu*l/2; + long num = (long)std::floor(temp); + long num_total = (long)std::ceil(temp); + + bool has_slack = false; + int count = 0; + for (int i = 0; i < alpha.nr(); ++i) + { + if (y(i) == 1) + { + if (count < num) + { + ++count; + alpha(i) = 1; + } + else + { + has_slack = true; + if (num_total > num) + { + ++count; + alpha(i) = temp - std::floor(temp); + } + break; + } + } + } + + if (count != num_total || has_slack == false) + { + std::ostringstream sout; + sout << "Invalid nu of " << nu << ". It is required that: 0 < nu < " << 2*(scalar_type)count/y.nr(); + throw invalid_nu_error(sout.str(),nu); + } + + has_slack = false; + count = 0; + for (int i = 0; i < alpha.nr(); ++i) + { + if (y(i) == -1) + { + if (count < num) + { + ++count; + alpha(i) = 1; + } + else + { + has_slack = true; + if (num_total > num) + { + ++count; + alpha(i) = temp - std::floor(temp); + } + break; + } + } + } + + if (count != num_total || has_slack == false) + { + std::ostringstream sout; + sout << "Invalid nu of " << nu << ". It is required that: 0 < nu < " << 2*(scalar_type)count/y.nr(); + throw invalid_nu_error(sout.str(),nu); + } + } + + // ------------------------------------------------------------------------------------ + + template < + typename scalar_vector_type, + typename scalar_type, + typename EXP, + typename U, typename V + > + inline bool find_working_group ( + const V& y, + const U& alpha, + const matrix_exp<EXP>& Q, + const scalar_vector_type& df, + const scalar_type tau, + const scalar_type eps, + long& i_out, + long& j_out + ) const + { + using namespace std; + long ip = 0; + long jp = 0; + long in = 0; + long jn = 0; + + + typedef typename colm_exp<EXP>::type col_type; + typedef typename diag_exp<EXP>::type diag_type; + + scalar_type ip_val = -numeric_limits<scalar_type>::infinity(); + scalar_type jp_val = numeric_limits<scalar_type>::infinity(); + scalar_type in_val = -numeric_limits<scalar_type>::infinity(); + scalar_type jn_val = numeric_limits<scalar_type>::infinity(); + + // loop over the alphas and find the maximum ip and in indices. + for (long i = 0; i < alpha.nr(); ++i) + { + if (y(i) == 1) + { + if (alpha(i) < 1.0) + { + if (-df(i) > ip_val) + { + ip_val = -df(i); + ip = i; + } + } + } + else + { + if (alpha(i) > 0.0) + { + if (df(i) > in_val) + { + in_val = df(i); + in = i; + } + } + } + } + + scalar_type Mp = numeric_limits<scalar_type>::infinity(); + scalar_type Mn = numeric_limits<scalar_type>::infinity(); + + // Pick out the columns and diagonal of Q we need below. Doing + // it this way is faster if Q is actually a symmetric_matrix_cache + // object. + col_type Q_ip = colm(Q,ip); + col_type Q_in = colm(Q,in); + diag_type Q_diag = diag(Q); + + + + // now we need to find the minimum jp and jn indices + for (long j = 0; j < alpha.nr(); ++j) + { + if (y(j) == 1) + { + if (alpha(j) > 0.0) + { + scalar_type b = ip_val + df(j); + if (-df(j) < Mp) + Mp = -df(j); + + if (b > 0) + { + scalar_type a = Q_ip(ip) + Q_diag(j) - 2*Q_ip(j); + if (a <= 0) + a = tau; + scalar_type temp = -b*b/a; + if (temp < jp_val) + { + jp_val = temp; + jp = j; + } + } + } + } + else + { + if (alpha(j) < 1.0) + { + scalar_type b = in_val - df(j); + if (df(j) < Mn) + Mn = df(j); + + if (b > 0) + { + scalar_type a = Q_in(in) + Q_diag(j) - 2*Q_in(j); + if (a <= 0) + a = tau; + scalar_type temp = -b*b/a; + if (temp < jn_val) + { + jn_val = temp; + jn = j; + } + } + } + } + } + + // if we are at the optimal point then return false so the caller knows + // to stop optimizing + if (std::max(ip_val - Mp, in_val - Mn) < eps) + return false; + + if (jp_val < jn_val) + { + i_out = ip; + j_out = jp; + } + else + { + i_out = in; + j_out = jn; + } + + return true; + } + + // ------------------------------------------------------------------------------------ + + template < + typename EXP, + typename T, typename U + > + inline void optimize_working_pair ( + T& alpha, + const matrix_exp<EXP>& Q, + const U& df, + const scalar_type tau, + const long i, + const long j + ) const + { + scalar_type quad_coef = Q(i,i)+Q(j,j)-2*Q(j,i); + if (quad_coef <= 0) + quad_coef = tau; + scalar_type delta = (df(i)-df(j))/quad_coef; + scalar_type sum = alpha(i) + alpha(j); + alpha(i) -= delta; + alpha(j) += delta; + + if(sum > 1) + { + if(alpha(i) > 1) + { + alpha(i) = 1; + alpha(j) = sum - 1; + } + else if(alpha(j) > 1) + { + alpha(j) = 1; + alpha(i) = sum - 1; + } + } + else + { + if(alpha(j) < 0) + { + alpha(j) = 0; + alpha(i) = sum; + } + else if(alpha(i) < 0) + { + alpha(i) = 0; + alpha(j) = sum; + } + } + } + + // ------------------------------------------------------------------------------------ + + column_matrix df; // gradient of f(alpha) + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_SOLVE_QP2_USING_SMo_Hh_ + + diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo_abstract.h b/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo_abstract.h new file mode 100644 index 000000000..962541c25 --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo_abstract.h @@ -0,0 +1,150 @@ +// Copyright (C) 2007 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_OPTIMIZATION_SOLVE_QP2_USING_SMO_ABSTRACT_H_ +#ifdef DLIB_OPTIMIZATION_SOLVE_QP2_USING_SMO_ABSTRACT_H_ + +#include "../matrix/matrix_abstract.h" +#include "../algs.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class invalid_nu_error : public dlib::error + { + /*! + WHAT THIS OBJECT REPRESENTS + This object is an exception class used to indicate that a + value of nu given to the solve_qp2_using_smo object is incompatible + with the constraints of the quadratic program. + + this->nu will be set to the invalid value of nu used. + !*/ + + public: + invalid_nu_error(const std::string& msg, double nu_) : dlib::error(msg), nu(nu_) {}; + const double nu; + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename T + > + typename T::type maximum_nu ( + const T& y + ); + /*! + requires + - T == a matrix object or an object convertible to a matrix via mat() + - is_col_vector(y) == true + - y.size() > 1 + - sum((y == +1) + (y == -1)) == y.size() + (i.e. all elements of y must be equal to +1 or -1) + ensures + - returns the maximum valid nu that can be used with solve_qp2_using_smo and + the given y vector. + (i.e. 2.0*min(sum(y == +1), sum(y == -1))/y.size()) + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename matrix_type + > + class solve_qp2_using_smo + { + /*! + REQUIREMENTS ON matrix_type + Must be some type of dlib::matrix. + + WHAT THIS OBJECT REPRESENTS + This object is a tool for solving the following quadratic programming + problem using the sequential minimal optimization algorithm: + + Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha + subject to the following constraints: + - sum(alpha) == nu*y.size() + - 0 <= min(alpha) && max(alpha) <= 1 + - trans(y)*alpha == 0 + + Where all elements of y must be equal to +1 or -1 and f is convex. + This means that Q should be symmetric and positive-semidefinite. + + + This object implements the strategy used by the LIBSVM tool. The following papers + can be consulted for additional details: + - Chang and Lin, Training {nu}-Support Vector Classifiers: Theory and Algorithms + - Chih-Chung Chang and Chih-Jen Lin, LIBSVM : a library for support vector + machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm + !*/ + + public: + + typedef typename matrix_type::mem_manager_type mem_manager_type; + typedef typename matrix_type::type scalar_type; + typedef typename matrix_type::layout_type layout_type; + typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix; + typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix; + + template < + typename EXP1, + typename EXP2, + long NR + > + unsigned long operator() ( + const matrix_exp<EXP1>& Q, + const matrix_exp<EXP2>& y, + const scalar_type nu, + matrix<scalar_type,NR,1,mem_manager_type, layout_type>& alpha, + scalar_type eps + ); + /*! + requires + - Q.nr() == Q.nc() + - is_col_vector(y) == true + - y.size() == Q.nr() + - y.size() > 1 + - sum((y == +1) + (y == -1)) == y.size() + (i.e. all elements of y must be equal to +1 or -1) + - alpha must be capable of representing a vector of size y.size() elements + - 0 < nu <= 1 + - eps > 0 + ensures + - This function solves the quadratic program defined in this class's main comment. + - The solution to the quadratic program will be stored in #alpha. + - #alpha.size() == y.size() + - This function uses an implementation of the sequential minimal optimization + algorithm. It runs until the KKT violation is less than eps. So eps controls + how accurate the solution is and smaller values result in better solutions. + (a reasonable eps is usually about 1e-3) + - #get_gradient() == Q*(#alpha) + (i.e. stores the gradient of f() at #alpha in get_gradient()) + - returns the number of iterations performed. + throws + - invalid_nu_error + This exception is thrown if nu >= maximum_nu(y). + (some values of nu cause the constraints to become impossible to satisfy. + If this is detected then an exception is thrown). + !*/ + + const column_matrix& get_gradient ( + ) const; + /*! + ensures + - returns the gradient vector at the solution of the last problem solved + by this object. If no problem has been solved then returns an empty + vector. + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATION_SOLVE_QP2_USING_SMO_ABSTRACT_H_ + + + diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo.h b/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo.h new file mode 100644 index 000000000..617ecc408 --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo.h @@ -0,0 +1,455 @@ +// Copyright (C) 2007 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_SOLVE_QP3_USING_SMo_Hh_ +#define DLIB_SOLVE_QP3_USING_SMo_Hh_ + +#include "optimization_solve_qp3_using_smo_abstract.h" +#include <cmath> +#include <limits> +#include <sstream> +#include "../matrix.h" +#include "../algs.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class invalid_qp3_error : public dlib::error + { + + public: + invalid_qp3_error( + const std::string& msg, + double B_, + double Cp_, + double Cn_ + ) : + dlib::error(msg), + B(B_), + Cp(Cp_), + Cn(Cn_) + {}; + + const double B; + const double Cp; + const double Cn; + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename matrix_type + > + class solve_qp3_using_smo + { + public: + typedef typename matrix_type::mem_manager_type mem_manager_type; + typedef typename matrix_type::type scalar_type; + typedef typename matrix_type::layout_type layout_type; + typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix; + typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix; + + + template < + typename EXP1, + typename EXP2, + typename EXP3, + long NR + > + unsigned long operator() ( + const matrix_exp<EXP1>& Q, + const matrix_exp<EXP2>& p, + const matrix_exp<EXP3>& y, + const scalar_type B, + const scalar_type Cp, + const scalar_type Cn, + matrix<scalar_type,NR,1,mem_manager_type, layout_type>& alpha, + scalar_type eps + ) + { + DLIB_ASSERT(Q.nr() == Q.nc() && y.size() == Q.nr() && p.size() == y.size() && + y.size() > 0 && is_col_vector(y) && is_col_vector(p) && + sum((y == +1) + (y == -1)) == y.size() && + Cp > 0 && Cn > 0 && + eps > 0, + "\t void solve_qp3_using_smo::operator()" + << "\n\t invalid arguments were given to this function" + << "\n\t Q.nr(): " << Q.nr() + << "\n\t Q.nc(): " << Q.nc() + << "\n\t is_col_vector(p): " << is_col_vector(p) + << "\n\t p.size(): " << p.size() + << "\n\t is_col_vector(y): " << is_col_vector(y) + << "\n\t y.size(): " << y.size() + << "\n\t sum((y == +1) + (y == -1)): " << sum((y == +1) + (y == -1)) + << "\n\t Cp: " << Cp + << "\n\t Cn: " << Cn + << "\n\t eps: " << eps + ); + + + + set_initial_alpha(y, B, Cp, Cn, alpha); + + + const scalar_type tau = 1e-12; + + typedef typename colm_exp<EXP1>::type col_type; + + // initialize df. Compute df = Q*alpha + p + df = p; + for (long r = 0; r < df.nr(); ++r) + { + if (alpha(r) != 0) + { + df += alpha(r)*matrix_cast<scalar_type>(colm(Q,r)); + } + } + + unsigned long count = 0; + // now perform the actual optimization of alpha + long i=0, j=0; + while (find_working_group(y,alpha,Q,df,Cp,Cn,tau,eps,i,j)) + { + ++count; + const scalar_type old_alpha_i = alpha(i); + const scalar_type old_alpha_j = alpha(j); + + optimize_working_pair(alpha,Q,y,df,tau,i,j, Cp, Cn ); + + // update the df vector now that we have modified alpha(i) and alpha(j) + scalar_type delta_alpha_i = alpha(i) - old_alpha_i; + scalar_type delta_alpha_j = alpha(j) - old_alpha_j; + + col_type Q_i = colm(Q,i); + col_type Q_j = colm(Q,j); + for(long k = 0; k < df.nr(); ++k) + df(k) += Q_i(k)*delta_alpha_i + Q_j(k)*delta_alpha_j; + } + + return count; + } + + const column_matrix& get_gradient ( + ) const { return df; } + + private: + + // ------------------------------------------------------------------------------------- + + template < + typename scalar_type, + typename scalar_vector_type, + typename scalar_vector_type2 + > + inline void set_initial_alpha ( + const scalar_vector_type& y, + const scalar_type B, + const scalar_type Cp, + const scalar_type Cn, + scalar_vector_type2& alpha + ) const + { + alpha.set_size(y.size()); + + set_all_elements(alpha,0); + + // It's easy in the B == 0 case + if (B == 0) + return; + + const scalar_type C = (B > 0)? Cp : Cn; + + scalar_type temp = std::abs(B)/C; + long num = (long)std::floor(temp); + long num_total = (long)std::ceil(temp); + + const scalar_type B_sign = (B > 0)? 1 : -1; + + long count = 0; + for (long i = 0; i < alpha.nr(); ++i) + { + if (y(i) == B_sign) + { + if (count < num) + { + ++count; + alpha(i) = C; + } + else + { + if (count < num_total) + { + ++count; + alpha(i) = C*(temp - std::floor(temp)); + } + break; + } + } + } + + if (count != num_total) + { + std::ostringstream sout; + sout << "Invalid QP3 constraint parameters of B: " << B << ", Cp: " << Cp << ", Cn: "<< Cn; + throw invalid_qp3_error(sout.str(),B,Cp,Cn); + } + } + + // ------------------------------------------------------------------------------------ + + template < + typename scalar_vector_type, + typename scalar_type, + typename EXP, + typename U, typename V + > + inline bool find_working_group ( + const V& y, + const U& alpha, + const matrix_exp<EXP>& Q, + const scalar_vector_type& df, + const scalar_type Cp, + const scalar_type Cn, + const scalar_type tau, + const scalar_type eps, + long& i_out, + long& j_out + ) const + { + using namespace std; + + long ip = 0; + long jp = 0; + + + typedef typename colm_exp<EXP>::type col_type; + typedef typename diag_exp<EXP>::type diag_type; + + scalar_type ip_val = -numeric_limits<scalar_type>::infinity(); + scalar_type jp_val = numeric_limits<scalar_type>::infinity(); + + // loop over the alphas and find the maximum ip and in indices. + for (long i = 0; i < alpha.nr(); ++i) + { + if (y(i) == 1) + { + if (alpha(i) < Cp) + { + if (-df(i) > ip_val) + { + ip_val = -df(i); + ip = i; + } + } + } + else + { + if (alpha(i) > 0.0) + { + if (df(i) > ip_val) + { + ip_val = df(i); + ip = i; + } + } + } + } + + scalar_type Mp = -numeric_limits<scalar_type>::infinity(); + + // Pick out the column and diagonal of Q we need below. Doing + // it this way is faster if Q is actually a symmetric_matrix_cache + // object. + col_type Q_ip = colm(Q,ip); + diag_type Q_diag = diag(Q); + + + + // now we need to find the minimum jp indices + for (long j = 0; j < alpha.nr(); ++j) + { + if (y(j) == 1) + { + if (alpha(j) > 0.0) + { + scalar_type b = ip_val + df(j); + if (df(j) > Mp) + Mp = df(j); + + if (b > 0) + { + scalar_type a = Q_ip(ip) + Q_diag(j) - 2*y(ip)*Q_ip(j); + if (a <= 0) + a = tau; + scalar_type temp = -b*b/a; + if (temp < jp_val) + { + jp_val = temp; + jp = j; + } + } + } + } + else + { + if (alpha(j) < Cn) + { + scalar_type b = ip_val - df(j); + if (-df(j) > Mp) + Mp = -df(j); + + if (b > 0) + { + scalar_type a = Q_ip(ip) + Q_diag(j) + 2*y(ip)*Q_ip(j); + if (a <= 0) + a = tau; + scalar_type temp = -b*b/a; + if (temp < jp_val) + { + jp_val = temp; + jp = j; + } + } + } + } + } + + // if we are at the optimal point then return false so the caller knows + // to stop optimizing + if (Mp + ip_val < eps) + return false; + + + i_out = ip; + j_out = jp; + + return true; + } + + // ------------------------------------------------------------------------------------ + + template < + typename EXP, + typename EXP2, + typename T, typename U + > + inline void optimize_working_pair ( + T& alpha, + const matrix_exp<EXP>& Q, + const matrix_exp<EXP2>& y, + const U& df, + const scalar_type& tau, + const long i, + const long j, + const scalar_type& Cp, + const scalar_type& Cn + ) const + { + const scalar_type Ci = (y(i) > 0 )? Cp : Cn; + const scalar_type Cj = (y(j) > 0 )? Cp : Cn; + + if (y(i) != y(j)) + { + scalar_type quad_coef = Q(i,i)+Q(j,j)+2*Q(j,i); + if (quad_coef <= 0) + quad_coef = tau; + scalar_type delta = (-df(i)-df(j))/quad_coef; + scalar_type diff = alpha(i) - alpha(j); + alpha(i) += delta; + alpha(j) += delta; + + if (diff > 0) + { + if (alpha(j) < 0) + { + alpha(j) = 0; + alpha(i) = diff; + } + } + else + { + if (alpha(i) < 0) + { + alpha(i) = 0; + alpha(j) = -diff; + } + } + + if (diff > Ci - Cj) + { + if (alpha(i) > Ci) + { + alpha(i) = Ci; + alpha(j) = Ci - diff; + } + } + else + { + if (alpha(j) > Cj) + { + alpha(j) = Cj; + alpha(i) = Cj + diff; + } + } + } + else + { + scalar_type quad_coef = Q(i,i)+Q(j,j)-2*Q(j,i); + if (quad_coef <= 0) + quad_coef = tau; + scalar_type delta = (df(i)-df(j))/quad_coef; + scalar_type sum = alpha(i) + alpha(j); + alpha(i) -= delta; + alpha(j) += delta; + + if(sum > Ci) + { + if(alpha(i) > Ci) + { + alpha(i) = Ci; + alpha(j) = sum - Ci; + } + } + else + { + if(alpha(j) < 0) + { + alpha(j) = 0; + alpha(i) = sum; + } + } + + if(sum > Cj) + { + if(alpha(j) > Cj) + { + alpha(j) = Cj; + alpha(i) = sum - Cj; + } + } + else + { + if(alpha(i) < 0) + { + alpha(i) = 0; + alpha(j) = sum; + } + } + + } + } + + // ------------------------------------------------------------------------------------ + + column_matrix df; // gradient of f(alpha) + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_SOLVE_QP3_USING_SMo_Hh_ + + diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo_abstract.h b/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo_abstract.h new file mode 100644 index 000000000..8efd7215b --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo_abstract.h @@ -0,0 +1,139 @@ +// Copyright (C) 2010 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_OPTIMIZATION_SOLVE_QP3_USING_SMO_ABSTRACT_H_ +#ifdef DLIB_OPTIMIZATION_SOLVE_QP3_USING_SMO_ABSTRACT_H_ + +#include "../matrix/matrix_abstract.h" +#include "../algs.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class invalid_qp3_error : public dlib::error + { + /*! + WHAT THIS OBJECT REPRESENTS + This object is an exception class used to indicate that the values + of B, Cp, and Cn given to the solve_qp3_using_smo object are incompatible + with the constraints of the quadratic program. + + this->B, this->Cp, and this->Cn will be set to the invalid values used. + !*/ + + public: + invalid_qp3_error( const std::string& msg, double B_, double Cp_, double Cn_) : + dlib::error(msg), B(B_), Cp(Cp_), Cn(Cn_) {}; + + const double B; + const double Cp; + const double Cn; + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename matrix_type + > + class solve_qp3_using_smo + { + /*! + REQUIREMENTS ON matrix_type + Must be some type of dlib::matrix. + + WHAT THIS OBJECT REPRESENTS + This object is a tool for solving the following quadratic programming + problem using the sequential minimal optimization algorithm: + + Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha + trans(p)*alpha + subject to the following constraints: + - for all i such that y(i) == +1: 0 <= alpha(i) <= Cp + - for all i such that y(i) == -1: 0 <= alpha(i) <= Cn + - trans(y)*alpha == B + + Where all elements of y must be equal to +1 or -1 and f is convex. + This means that Q should be symmetric and positive-semidefinite. + + + This object implements the strategy used by the LIBSVM tool. The following papers + can be consulted for additional details: + - Chih-Chung Chang and Chih-Jen Lin, LIBSVM : a library for support vector + machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm + - Working Set Selection Using Second Order Information for Training Support Vector Machines by + Fan, Chen, and Lin. In the Journal of Machine Learning Research 2005. + !*/ + + public: + + typedef typename matrix_type::mem_manager_type mem_manager_type; + typedef typename matrix_type::type scalar_type; + typedef typename matrix_type::layout_type layout_type; + typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix; + typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix; + + template < + typename EXP1, + typename EXP2, + typename EXP3, + long NR + > + unsigned long operator() ( + const matrix_exp<EXP1>& Q, + const matrix_exp<EXP2>& p, + const matrix_exp<EXP3>& y, + const scalar_type B, + const scalar_type Cp, + const scalar_type Cn, + matrix<scalar_type,NR,1,mem_manager_type, layout_type>& alpha, + scalar_type eps + ); + /*! + requires + - Q.nr() == Q.nc() + - is_col_vector(y) == true + - is_col_vector(p) == true + - p.size() == y.size() == Q.nr() + - y.size() > 0 + - sum((y == +1) + (y == -1)) == y.size() + (i.e. all elements of y must be equal to +1 or -1) + - alpha must be capable of representing a vector of size y.size() elements + - Cp > 0 + - Cn > 0 + - eps > 0 + ensures + - This function solves the quadratic program defined in this class's main comment. + - The solution to the quadratic program will be stored in #alpha. + - #alpha.size() == y.size() + - This function uses an implementation of the sequential minimal optimization + algorithm. It runs until the KKT violation is less than eps. So eps controls + how accurate the solution is and smaller values result in better solutions. + (a reasonable eps is usually about 1e-3) + - #get_gradient() == Q*(#alpha) + (i.e. stores the gradient of f() at #alpha in get_gradient()) + - returns the number of iterations performed. + throws + - invalid_qp3_error + This exception is thrown if the given parameters cause the constraints + of the quadratic programming problem to be impossible to satisfy. + !*/ + + const column_matrix& get_gradient ( + ) const; + /*! + ensures + - returns the gradient vector at the solution of the last problem solved + by this object. If no problem has been solved then returns an empty + vector. + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATION_SOLVE_QP3_USING_SMO_ABSTRACT_H_ + + + diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo.h b/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo.h new file mode 100644 index 000000000..b9cc74df3 --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo.h @@ -0,0 +1,937 @@ +// Copyright (C) 2010 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_Hh_ +#define DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_Hh_ + +#include "optimization_solve_qp_using_smo_abstract.h" +#include "../matrix.h" +#include <map> +#include "../unordered_pair.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + /* + The algorithm defined in the solve_qp_using_smo() function below can be + derived by using an important theorem from the theory of constrained optimization. + This theorem tells us that any optimal point of a constrained function must + satisfy what are called the KKT conditions (also sometimes called just the KT + conditions, especially in older literature). A very good book to consult + regarding this topic is Practical Methods of Optimization (second edition) by + R. Fletcher. Below I will try to explain the general idea of how this is + applied. + + Let e == ones_matrix(alpha.size(),1) + + First, note that the function below solves the following quadratic program. + Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha - trans(alpha)*b + subject to the following constraints: + - trans(e)*alpha == C (i.e. the sum of alpha values doesn't change) + - min(alpha) >= 0 (i.e. all alpha values are nonnegative) + Where f is convex. This means that Q should be positive-semidefinite. + + + To get from this problem formulation to the algorithm below we have to + consider the KKT conditions. They tell us that any solution to the above + problem must satisfy the following 5 conditions: + 1. trans(e)*alpha == C + 2. min(alpha) >= 0 + + 3. Let L(alpha, x, y) == f(alpha) - trans(x)*alpha - y*(trans(e)*alpha - C) + Where x is a vector of length alpha.size() and y is a single scalar. + Then the derivative of L with respect to alpha must == 0 + So we get the following as our 3rd condition: + f'(alpha) - x - y*e == 0 + + 4. min(x) >= 0 (i.e. all x values are nonnegative) + 5. pointwise_multiply(x, alpha) == 0 + (i.e. only one member of each x(i) and alpha(i) pair can be non-zero) + + + From 3 we can easily obtain this rule: + for all i: f'(alpha)(i) - x(i) == y + + If we then consider 4 and 5 we see that we can infer that the following + must also be the case: + - if (alpha(i) > 0) then + - x(i) == 0 + - f'(alpha)(i) == y + - else + - x(i) == some nonnegative number + - f'(alpha)(i) >= y + + + The important thing to take away is the final rule. It tells us that at the + optimal solution all elements of the gradient of f have the same value if + their corresponding alpha is non-zero. It also tells us that all the other + gradient values are bigger than y. We can use this information to help us + pick which alpha variables to optimize at each iteration. + */ + +// ---------------------------------------------------------------------------------------- + + template < + typename EXP1, + typename EXP2, + typename T, long NR, long NC, typename MM, typename L + > + unsigned long solve_qp_using_smo ( + const matrix_exp<EXP1>& Q, + const matrix_exp<EXP2>& b, + matrix<T,NR,NC,MM,L>& alpha, + T eps, + unsigned long max_iter + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(Q.nr() == Q.nc() && + is_col_vector(b) && + is_col_vector(alpha) && + b.size() == alpha.size() && + b.size() == Q.nr() && + alpha.size() > 0 && + min(alpha) >= 0 && + eps > 0 && + max_iter > 0, + "\t unsigned long solve_qp_using_smo()" + << "\n\t Invalid arguments were given to this function" + << "\n\t Q.nr(): " << Q.nr() + << "\n\t Q.nc(): " << Q.nc() + << "\n\t is_col_vector(b): " << is_col_vector(b) + << "\n\t is_col_vector(alpha): " << is_col_vector(alpha) + << "\n\t b.size(): " << b.size() + << "\n\t alpha.size(): " << alpha.size() + << "\n\t Q.nr(): " << Q.nr() + << "\n\t min(alpha): " << min(alpha) + << "\n\t eps: " << eps + << "\n\t max_iter: " << max_iter + ); + + const T C = sum(alpha); + + // Compute f'(alpha) (i.e. the gradient of f(alpha)) for the current alpha. + matrix<T,NR,NC,MM,L> df = Q*alpha - b; + + const T tau = 1000*std::numeric_limits<T>::epsilon(); + + T big, little; + unsigned long iter = 0; + for (; iter < max_iter; ++iter) + { + // Find the two elements of df that satisfy the following: + // - little_idx == index_of_min(df) + // - big_idx == the index of the largest element in df such that alpha(big_idx) > 0 + // These two indices will tell us which two alpha values are most in violation of the KKT + // optimality conditions. + big = -std::numeric_limits<T>::max(); + long big_idx = 0; + little = std::numeric_limits<T>::max(); + long little_idx = 0; + for (long i = 0; i < df.nr(); ++i) + { + if (df(i) > big && alpha(i) > 0) + { + big = df(i); + big_idx = i; + } + if (df(i) < little) + { + little = df(i); + little_idx = i; + } + } + + // Check if the KKT conditions are still violated and stop if so. + //if (alpha(little_idx) > 0 && (big - little) < eps) + // break; + + // Check how big the duality gap is and stop when it goes below eps. + // The duality gap is the gap between the objective value of the function + // we are optimizing and the value of its primal form. This value is always + // greater than or equal to the distance to the optimum solution so it is a + // good way to decide if we should stop. See the book referenced above for + // more information. In particular, see the part about the Wolfe Dual. + if (trans(alpha)*df - C*little < eps) + break; + + + // Save these values, we will need them later. + const T old_alpha_big = alpha(big_idx); + const T old_alpha_little = alpha(little_idx); + + + // Now optimize the two variables we just picked. + T quad_coef = Q(big_idx,big_idx) + Q(little_idx,little_idx) - 2*Q(big_idx, little_idx); + if (quad_coef <= tau) + quad_coef = tau; + const T delta = (big - little)/quad_coef; + alpha(big_idx) -= delta; + alpha(little_idx) += delta; + + // Make sure alpha stays feasible. That is, make sure the updated alpha doesn't + // violate the non-negativity constraint. + if (alpha(big_idx) < 0) + { + // Since an alpha can't be negative we will just set it to 0 and shift all the + // weight to the other alpha. + alpha(big_idx) = 0; + alpha(little_idx) = old_alpha_big + old_alpha_little; + } + + // Every 300 iterations + if ((iter%300) == 299) + { + // Perform this form of the update every so often because doing so can help + // avoid the buildup of numerical errors you get with the alternate update + // below. + df = Q*alpha - b; + } + else + { + // Now update the gradient. We will perform the equivalent of: df = Q*alpha - b; + const T delta_alpha_big = alpha(big_idx) - old_alpha_big; + const T delta_alpha_little = alpha(little_idx) - old_alpha_little; + + for(long k = 0; k < df.nr(); ++k) + df(k) += Q(big_idx,k)*delta_alpha_big + Q(little_idx,k)*delta_alpha_little;; + } + } + + /* + using namespace std; + cout << "SMO: " << endl; + cout << " duality gap: "<< trans(alpha)*df - C*min(df) << endl; + cout << " KKT gap: "<< big-little << endl; + cout << " iter: "<< iter+1 << endl; + cout << " eps: "<< eps << endl; + */ + + return iter+1; + } + +// ---------------------------------------------------------------------------------------- + + template < + typename EXP1, + typename EXP2, + typename EXP3, + typename EXP4, + typename T, long NR, long NC, typename MM, typename L, + long NR2, long NC2 + > + unsigned long solve_qp4_using_smo ( + const matrix_exp<EXP1>& A, + const matrix_exp<EXP2>& Q, + const matrix_exp<EXP3>& b, + const matrix_exp<EXP4>& d, + matrix<T,NR,NC,MM,L>& alpha, + matrix<T,NR2,NC2,MM,L>& lambda, + T eps, + unsigned long max_iter, + T max_lambda = std::numeric_limits<T>::infinity() + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(A.nc() == alpha.size() && + Q.nr() == Q.nc() && + is_col_vector(b) && + is_col_vector(alpha) && + b.size() == alpha.size() && + b.size() == Q.nr() && + alpha.size() > 0 && + min(alpha) >= 0 && + eps > 0 && + max_iter > 0, + "\t void solve_qp4_using_smo()" + << "\n\t Invalid arguments were given to this function" + << "\n\t A.nc(): " << A.nc() + << "\n\t Q.nr(): " << Q.nr() + << "\n\t Q.nc(): " << Q.nc() + << "\n\t is_col_vector(b): " << is_col_vector(b) + << "\n\t is_col_vector(alpha): " << is_col_vector(alpha) + << "\n\t b.size(): " << b.size() + << "\n\t alpha.size(): " << alpha.size() + << "\n\t Q.nr(): " << Q.nr() + << "\n\t min(alpha): " << min(alpha) + << "\n\t eps: " << eps + << "\n\t max_iter: " << max_iter + ); + DLIB_ASSERT(is_col_vector(d) == true && + max_lambda >= 0 && + d.size() == A.nr(), + "\t void solve_qp4_using_smo()" + << "\n\t Invalid arguments were given to this function" + << "\n\t A.nr(): " << A.nr() + << "\n\t d.size(): " << d.size() + << "\n\t max_lambda: " << max_lambda + ); + + const T C = sum(alpha); + + /* + For this optimization problem, it is the case that the optimal + value of lambda is given by a simple closed form expression if we + know the optimal alpha. So what we will do is to just optimize + alpha and every now and then we will update lambda with its optimal + value. Therefore, we use essentially the same method as the + solve_qp_using_smo() routine. + */ + + const bool d_is_zero = d==zeros_matrix(d); + + // compute optimal lambda for current alpha + if (d_is_zero) + lambda = A*alpha; + else + lambda = A*alpha + d; + lambda = dlib::clamp(lambda, 0, max_lambda); + + // Compute f'(alpha) (i.e. the gradient of f(alpha) with respect to alpha) for the current alpha. + matrix<T,NR,NC,MM,L> df = Q*alpha - b - trans(A)*lambda; + + const T tau = 1000*std::numeric_limits<T>::epsilon(); + + T big, little; + unsigned long iter = 0; + for (; iter < max_iter; ++iter) + { + // Find the two elements of df that satisfy the following: + // - little_idx == index_of_min(df) + // - big_idx == the index of the largest element in df such that alpha(big_idx) > 0 + // These two indices will tell us which two alpha values are most in violation of the KKT + // optimality conditions. + big = -std::numeric_limits<T>::max(); + long big_idx = 0; + little = std::numeric_limits<T>::max(); + long little_idx = 0; + for (long i = 0; i < df.nr(); ++i) + { + if (df(i) > big && alpha(i) > 0) + { + big = df(i); + big_idx = i; + } + if (df(i) < little) + { + little = df(i); + little_idx = i; + } + } + + // Check how big the duality gap is and stop when it goes below eps. + // The duality gap is the gap between the objective value of the function + // we are optimizing and the value of its primal form. This value is always + // greater than or equal to the distance to the optimum solution so it is a + // good way to decide if we should stop. + if (trans(alpha)*df - C*little < eps) + { + // compute optimal lambda and recheck the duality gap to make + // sure we have really converged. + if (d_is_zero) + lambda = A*alpha; + else + lambda = A*alpha + d; + lambda = dlib::clamp(lambda, 0, max_lambda); + df = Q*alpha - b - trans(A)*lambda; + + if (trans(alpha)*df - C*min(df) < eps) + break; + else + continue; + } + + + // Save these values, we will need them later. + const T old_alpha_big = alpha(big_idx); + const T old_alpha_little = alpha(little_idx); + + + // Now optimize the two variables we just picked. + T quad_coef = Q(big_idx,big_idx) + Q(little_idx,little_idx) - 2*Q(big_idx, little_idx); + if (quad_coef <= tau) + quad_coef = tau; + const T delta = (big - little)/quad_coef; + alpha(big_idx) -= delta; + alpha(little_idx) += delta; + + // Make sure alpha stays feasible. That is, make sure the updated alpha doesn't + // violate the non-negativity constraint. + if (alpha(big_idx) < 0) + { + // Since an alpha can't be negative we will just set it to 0 and shift all the + // weight to the other alpha. + alpha(big_idx) = 0; + alpha(little_idx) = old_alpha_big + old_alpha_little; + } + + + // Every 300 iterations + if ((iter%300) == 299) + { + // compute the optimal lambda for the current alpha + if (d_is_zero) + lambda = A*alpha; + else + lambda = A*alpha + d; + lambda = dlib::clamp(lambda, 0, max_lambda); + + // Perform this form of the update every so often because doing so can help + // avoid the buildup of numerical errors you get with the alternate update + // below. + df = Q*alpha - b - trans(A)*lambda; + } + else + { + // Now update the gradient. We will perform the equivalent of: df = Q*alpha - b; + const T delta_alpha_big = alpha(big_idx) - old_alpha_big; + const T delta_alpha_little = alpha(little_idx) - old_alpha_little; + + for(long k = 0; k < df.nr(); ++k) + df(k) += Q(big_idx,k)*delta_alpha_big + Q(little_idx,k)*delta_alpha_little;; + } + } + + /* + using namespace std; + cout << "SMO: " << endl; + cout << " duality gap: "<< trans(alpha)*df - C*min(df) << endl; + cout << " KKT gap: "<< big-little << endl; + cout << " iter: "<< iter+1 << endl; + cout << " eps: "<< eps << endl; + */ + + + return iter+1; + } + +// ---------------------------------------------------------------------------------------- + + template < + typename EXP1, + typename EXP2, + typename T, long NR, long NC, typename MM, typename L + > + unsigned long solve_qp_box_constrained ( + const matrix_exp<EXP1>& Q, + const matrix_exp<EXP2>& b, + matrix<T,NR,NC,MM,L>& alpha, + const matrix<T,NR,NC,MM,L>& lower, + const matrix<T,NR,NC,MM,L>& upper, + T eps, + unsigned long max_iter + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(Q.nr() == Q.nc() && + alpha.size() == lower.size() && + alpha.size() == upper.size() && + is_col_vector(b) && + is_col_vector(alpha) && + is_col_vector(lower) && + is_col_vector(upper) && + b.size() == alpha.size() && + b.size() == Q.nr() && + alpha.size() > 0 && + 0 <= min(alpha-lower) && + 0 <= max(upper-alpha) && + eps > 0 && + max_iter > 0, + "\t unsigned long solve_qp_box_constrained()" + << "\n\t Invalid arguments were given to this function" + << "\n\t Q.nr(): " << Q.nr() + << "\n\t Q.nc(): " << Q.nc() + << "\n\t is_col_vector(b): " << is_col_vector(b) + << "\n\t is_col_vector(alpha): " << is_col_vector(alpha) + << "\n\t is_col_vector(lower): " << is_col_vector(lower) + << "\n\t is_col_vector(upper): " << is_col_vector(upper) + << "\n\t b.size(): " << b.size() + << "\n\t alpha.size(): " << alpha.size() + << "\n\t lower.size(): " << lower.size() + << "\n\t upper.size(): " << upper.size() + << "\n\t Q.nr(): " << Q.nr() + << "\n\t min(alpha-lower): " << min(alpha-lower) + << "\n\t max(upper-alpha): " << max(upper-alpha) + << "\n\t eps: " << eps + << "\n\t max_iter: " << max_iter + ); + + + // Compute f'(alpha) (i.e. the gradient of f(alpha)) for the current alpha. + matrix<T,NR,NC,MM,L> df = Q*alpha + b; + matrix<T,NR,NC,MM,L> QQ = reciprocal_max(diag(Q)); + + // First we use a coordinate descent method to initialize alpha. + double max_df = 0; + for (long iter = 0; iter < alpha.size()*2; ++iter) + { + max_df = 0; + long best_r =0; + // find the best alpha to optimize. + for (long r = 0; r < Q.nr(); ++r) + { + if (alpha(r) <= lower(r) && df(r) > 0) + ;//alpha(r) = lower(r); + else if (alpha(r) >= upper(r) && df(r) < 0) + ;//alpha(r) = upper(r); + else if (std::abs(df(r)) > max_df) + { + best_r = r; + max_df = std::abs(df(r)); + } + } + + // now optimize alpha(best_r) + const long r = best_r; + const T old_alpha = alpha(r); + alpha(r) = -(df(r)-Q(r,r)*alpha(r))*QQ(r); + if (alpha(r) < lower(r)) + alpha(r) = lower(r); + else if (alpha(r) > upper(r)) + alpha(r) = upper(r); + + const T delta = old_alpha-alpha(r); + + // Now update the gradient. We will perform the equivalent of: df = Q*alpha + b; + for(long k = 0; k < df.nr(); ++k) + df(k) -= Q(r,k)*delta; + } + //cout << "max_df: " << max_df << endl; + //cout << "objective value: " << 0.5*trans(alpha)*Q*alpha + trans(b)*alpha << endl; + + + + // Now do the main iteration block of this solver. The coordinate descent method + // we used above can improve the objective rapidly in the beginning. However, + // Nesterov's method has more rapid convergence once it gets going so this is what + // we use for the main iteration. + matrix<T,NR,NC,MM,L> v, v_old; + v = alpha; + // We need to get an upper bound on the Lipschitz constant for this QP. Since that + // is just the max eigenvalue of Q we can do it using Gershgorin disks. + const T lipschitz_bound = max(diag(Q) + (sum_cols(abs(Q)) - abs(diag(Q)))); + double lambda = 0; + unsigned long iter; + for (iter = 0; iter < max_iter; ++iter) + { + const double next_lambda = (1 + std::sqrt(1+4*lambda*lambda))/2; + const double gamma = (1-lambda)/next_lambda; + lambda = next_lambda; + + v_old = v; + + df = Q*alpha + b; + // now take a projected gradient step using Nesterov's method. + v = clamp(alpha - 1.0/lipschitz_bound * df, lower, upper); + alpha = dlib::clamp((1-gamma)*v + gamma*v_old, lower, upper); + + + // check for convergence every 10 iterations + if (iter%10 == 0) + { + max_df = 0; + for (long r = 0; r < Q.nr(); ++r) + { + if (alpha(r) <= lower(r) && df(r) > 0) + ;//alpha(r) = lower(r); + else if (alpha(r) >= upper(r) && df(r) < 0) + ;//alpha(r) = upper(r); + else if (std::abs(df(r)) > max_df) + max_df = std::abs(df(r)); + } + if (max_df < eps) + break; + } + } + + //cout << "max_df: " << max_df << endl; + //cout << "objective value: " << 0.5*trans(alpha)*Q*alpha + trans(b)*alpha << endl; + return iter+1; + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + namespace impl + { + // Check if each vector in Q_offdiag is actually a constant times the 1s vector. + template < + typename T, long NR, long NC, typename MM, typename L + > + bool has_uniform_offdiag_vectors( + const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag + ) + { + for (auto& x : Q_offdiag) + { + auto ref = x.second(0); + for (auto& y : x.second) + if (ref != y) + return false; + } + return true; + } + + template < + typename T, long NR, long NC, typename MM, typename L + > + matrix<T,0,0,MM,L> compact_offdiag( + const size_t& num_blocks, + const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag + ) + { + matrix<T,0,0,MM,L> temp; + // we can only compact the offdiag information if they are uniform vectors + if (!has_uniform_offdiag_vectors(Q_offdiag)) + return temp; + + temp.set_size(num_blocks, num_blocks); + temp = 0; + + for (auto& x : Q_offdiag) + { + long r = x.first.first; + long c = x.first.second; + temp(r,c) = x.second(0); + temp(c,r) = x.second(0); + } + + return temp; + } + } + +// ---------------------------------------------------------------------------------------- + + template < + typename T, long NR, long NC, typename MM, typename L + > + unsigned long solve_qp_box_constrained_blockdiag ( + const std::vector<matrix<T,NR,NR,MM,L>>& Q_blocks, + const std::vector<matrix<T,NR,NC,MM,L>>& bs, + const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag, + std::vector<matrix<T,NR,NC,MM,L>>& alphas, + const std::vector<matrix<T,NR,NC,MM,L>>& lowers, + const std::vector<matrix<T,NR,NC,MM,L>>& uppers, + T eps, + unsigned long max_iter + ) + { + // make sure requires clause is not broken + DLIB_CASSERT(Q_blocks.size() > 0); + DLIB_CASSERT(Q_blocks.size() == bs.size() && + Q_blocks.size() == alphas.size() && + Q_blocks.size() == lowers.size() && + Q_blocks.size() == uppers.size(), + "Q_blocks.size(): "<< Q_blocks.size() << "\n" << + "bs.size(): "<< bs.size() << "\n" << + "alphas.size(): "<< alphas.size() << "\n" << + "lowers.size(): "<< lowers.size() << "\n" << + "uppers.size(): "<< uppers.size() << "\n" + ); + for (auto& Q : Q_blocks) + { + DLIB_CASSERT(Q.nr() == Q.nc(), "All the matrices in Q_blocks have the same dimensions."); + DLIB_CASSERT(Q.size() > 0, "All the matrices in Q_blocks must be non-empty and have the same dimensions."); + DLIB_CASSERT(Q.nr() == Q_blocks[0].nr() && Q.nc() == Q_blocks[0].nc(), "All the matrices in Q_blocks have the same dimensions."); + } +#ifdef ENABLE_ASSERTS + for (size_t i = 0; i < alphas.size(); ++i) + { + DLIB_CASSERT(is_col_vector(bs[i]) && bs[i].size() == Q_blocks[0].nr(), + "is_col_vector(bs["<<i<<"]): " << is_col_vector(bs[i]) << "\n" << + "bs["<<i<<"].size(): " << bs[i].size() << "\n" << + "Q_blocks[0].nr(): " << Q_blocks[0].nr()); + + for (auto& Qoffdiag : Q_offdiag) + { + auto& Q_offdiag_element = Qoffdiag.second; + long r = Qoffdiag.first.first; + long c = Qoffdiag.first.second; + DLIB_CASSERT(is_col_vector(Q_offdiag_element) && Q_offdiag_element.size() == Q_blocks[0].nr(), + "is_col_vector(Q_offdiag["<<r<<","<<c<<"]): " << is_col_vector(Q_offdiag_element) << "\n" << + "Q_offdiag["<<r<<","<<c<<"].size(): " << Q_offdiag_element.size() << "\n" << + "Q_blocks[0].nr(): " << Q_blocks[0].nr()); + } + + DLIB_CASSERT(is_col_vector(alphas[i]) && alphas[i].size() == Q_blocks[0].nr(), + "is_col_vector(alphas["<<i<<"]): " << is_col_vector(alphas[i]) << "\n" << + "alphas["<<i<<"].size(): " << alphas[i].size() << "\n" << + "Q_blocks[0].nr(): " << Q_blocks[0].nr()); + + DLIB_CASSERT(is_col_vector(lowers[i]) && lowers[i].size() == Q_blocks[0].nr(), + "is_col_vector(lowers["<<i<<"]): " << is_col_vector(lowers[i]) << "\n" << + "lowers["<<i<<"].size(): " << lowers[i].size() << "\n" << + "Q_blocks[0].nr(): " << Q_blocks[0].nr()); + + DLIB_CASSERT(is_col_vector(uppers[i]) && uppers[i].size() == Q_blocks[0].nr(), + "is_col_vector(uppers["<<i<<"]): " << is_col_vector(uppers[i]) << "\n" << + "uppers["<<i<<"].size(): " << uppers[i].size() << "\n" << + "Q_blocks[0].nr(): " << Q_blocks[0].nr()); + + DLIB_CASSERT(0 <= min(alphas[i]-lowers[i]), "min(alphas["<<i<<"]-lowers["<<i<<"]): " << min(alphas[i]-lowers[i])); + DLIB_CASSERT(0 <= max(uppers[i]-alphas[i]), "max(uppers["<<i<<"]-alphas["<<i<<"]): " << max(uppers[i]-alphas[i])); + } + DLIB_CASSERT(eps > 0 && max_iter > 0, "eps: " << eps << "\nmax_iter: "<< max_iter); +#endif // ENABLE_ASSERTS + + + const auto offdiag_compact = impl::compact_offdiag(Q_blocks.size(), Q_offdiag); + matrix<T,0,0,MM,L> temp, alphas_compact; + + // Compute f'(alpha) (i.e. the gradient of f(alpha)) for the current alpha. + std::vector<matrix<T,NR,NC,MM,L>> df;// = Q*alpha + b; + auto compute_df = [&]() + { + df.resize(Q_blocks.size()); + for (size_t i = 0; i < df.size(); ++i) + df[i] = Q_blocks[i]*alphas[i] + bs[i]; + + + // Don't forget to include the Q_offdiag terms in the computation. Note that + // we have two options for how we can compute this part. If Q_offdiag is + // uniform and can be compacted into a simple matrix and there are a lot of off + // diagonal entries then it's faster to do it as a matrix multiply. Otherwise + // we do the more general computation. + if (offdiag_compact.size() != 0 && Q_offdiag.size() > Q_blocks.size()*5) + { + // Do it as a matrix multiply (with a bit of data shuffling) + alphas_compact.set_size(alphas[0].size(), offdiag_compact.nr()); + for (long c = 0; c < alphas_compact.nc(); ++c) + set_colm(alphas_compact,c) = alphas[c]; + temp = alphas_compact*offdiag_compact; + for (size_t i = 0; i < df.size(); ++i) + df[i] += colm(temp,i); + } + else + { + // Do the fully general computation that allows for non-uniform values in + // the off diagonal vectors. + for (auto& p : Q_offdiag) + { + long r = p.first.first; + long c = p.first.second; + df[r] += pointwise_multiply(p.second, alphas[c]); + if (r != c) + df[c] += pointwise_multiply(p.second, alphas[r]); + } + } + }; + compute_df(); + + + + std::vector<matrix<T,NR,NC,MM,L>> Q_diag, Q_ggd; + std::vector<matrix<T,NR,NC,MM,L>> QQ;// = reciprocal_max(diag(Q)); + QQ.resize(Q_blocks.size()); + Q_diag.resize(Q_blocks.size()); + Q_ggd.resize(Q_blocks.size()); + + // We need to get an upper bound on the Lipschitz constant for this QP. Since that + // is just the max eigenvalue of Q we can do it using Gershgorin disks. + //const T lipschitz_bound = max(diag(Q) + (sum_cols(abs(Q)) - abs(diag(Q)))); + for (size_t i = 0; i < QQ.size(); ++i) + { + auto f = Q_offdiag.find(make_unordered_pair(i,i)); + if (f != Q_offdiag.end()) + Q_diag[i] = diag(Q_blocks[i]) + f->second; + else + Q_diag[i] = diag(Q_blocks[i]); + QQ[i] = reciprocal_max(Q_diag[i]); + + Q_ggd[i] = Q_diag[i] + (sum_cols(abs(Q_blocks[i]))-abs(diag(Q_blocks[i]))); + } + for (auto& p : Q_offdiag) + { + long r = p.first.first; + long c = p.first.second; + if (r != c) + { + Q_ggd[r] += abs(p.second); + Q_ggd[c] += abs(p.second); + } + } + T lipschitz_bound = -std::numeric_limits<T>::infinity(); + for (auto& x : Q_ggd) + lipschitz_bound = std::max(lipschitz_bound, max(x)); + + + const long num_variables = alphas.size()*alphas[0].size(); + + // First we use a coordinate descent method to initialize alpha. + double max_df = 0; + for (long iter = 0; iter < num_variables*2; ++iter) + { + max_df = 0; + long best_r =0; + size_t best_r2 =0; + // find the best alpha to optimize. + for (size_t r2 = 0; r2 < alphas.size(); ++r2) + { + auto& alpha = alphas[r2]; + auto& df_ = df[r2]; + auto& lower = lowers[r2]; + auto& upper = uppers[r2]; + for (long r = 0; r < alpha.nr(); ++r) + { + if (alpha(r) <= lower(r) && df_(r) > 0) + ;//alpha(r) = lower(r); + else if (alpha(r) >= upper(r) && df_(r) < 0) + ;//alpha(r) = upper(r); + else if (std::abs(df_(r)) > max_df) + { + best_r = r; + best_r2 = r2; + max_df = std::abs(df_(r)); + } + } + } + + // now optimize alphas[best_r2](best_r) + const long r = best_r; + auto& alpha = alphas[best_r2]; + auto& lower = lowers[best_r2]; + auto& upper = uppers[best_r2]; + auto& df_ = df[best_r2]; + const T old_alpha = alpha(r); + alpha(r) = -(df_(r)-Q_diag[best_r2](r)*alpha(r))*QQ[best_r2](r); + if (alpha(r) < lower(r)) + alpha(r) = lower(r); + else if (alpha(r) > upper(r)) + alpha(r) = upper(r); + + const T delta = old_alpha-alpha(r); + + // Now update the gradient. We will perform the equivalent of: df = Q*alpha + + // b; except we only need to compute one column of the matrix multiply because + // only one element of alpha changed. + auto& Q = Q_blocks[best_r2]; + for(long k = 0; k < df_.nr(); ++k) + df_(k) -= Q(r,k)*delta; + for(size_t j = 0; j < Q_blocks.size(); ++j) + { + auto f = Q_offdiag.find(make_unordered_pair(best_r2, j)); + if (f != Q_offdiag.end()) + df[j](r) -= f->second(r)*delta; + } + } + + + + + std::vector<matrix<T,NR,NC,MM,L>> v(alphas), v_old(alphas.size()); + double lambda = 0; + unsigned long iter; + // Now do the main iteration block of this solver. The coordinate descent method + // we used above can improve the objective rapidly in the beginning. However, + // Nesterov's method has more rapid convergence once it gets going so this is what + // we use for the main iteration. + for (iter = 0; iter < max_iter; ++iter) + { + const double next_lambda = (1 + std::sqrt(1+4*lambda*lambda))/2; + const double gamma = (1-lambda)/next_lambda; + lambda = next_lambda; + + v_old.swap(v); + + //df = Q*alpha + b; + compute_df(); + + // now take a projected gradient step using Nesterov's method. + for (size_t j = 0; j < alphas.size(); ++j) + { + v[j] = clamp(alphas[j] - 1.0/lipschitz_bound * df[j], lowers[j], uppers[j]); + alphas[j] = clamp((1-gamma)*v[j] + gamma*v_old[j], lowers[j], uppers[j]); + } + + + // check for convergence every 10 iterations + if (iter%10 == 0) + { + max_df = 0; + for (size_t r2 = 0; r2 < alphas.size(); ++r2) + { + auto& alpha = alphas[r2]; + auto& df_ = df[r2]; + auto& lower = lowers[r2]; + auto& upper = uppers[r2]; + for (long r = 0; r < alpha.nr(); ++r) + { + if (alpha(r) <= lower(r) && df_(r) > 0) + ;//alpha(r) = lower(r); + else if (alpha(r) >= upper(r) && df_(r) < 0) + ;//alpha(r) = upper(r); + else if (std::abs(df_(r)) > max_df) + max_df = std::abs(df_(r)); + } + } + if (max_df < eps) + break; + } + } + + return iter+1; + } + +// ---------------------------------------------------------------------------------------- + + template < + typename EXP1, + typename EXP2, + typename T, long NRa, long NRb + > + unsigned long find_gap_between_convex_hulls ( + const matrix_exp<EXP1>& A, + const matrix_exp<EXP2>& B, + matrix<T,NRa,1>& cA, + matrix<T,NRb,1>& cB, + const double eps, + const unsigned long max_iter = 1000 + ) + { + DLIB_CASSERT(A.size() != 0); + DLIB_CASSERT(B.size() != 0); + DLIB_CASSERT(A.nr() == B.nr(), "The dimensionality of the points in both convex hull sets must match"); + DLIB_CASSERT(eps > 0); + DLIB_CASSERT(max_iter > 0); + + cA.set_size(A.nc()); + cB.set_size(B.nc()); + + // initialize to the centroids of A and B respectively. + cA = 1.0/cA.size(); + cB = 1.0/cB.size(); + + + matrix<T> AA, BB, AB, ABb, ABa; + + AA = trans(A)*A; + BB = trans(B)*B; + AB = trans(A)*B; + + unsigned long iter = 0; + for (iter = 0; iter < max_iter; ++iter) + { + // find the convex combination of A that is nearest to B*cB + ABb = AB*cB; + const auto smo_iter1 = solve_qp_using_smo(AA, ABb, cA, eps, cA.size()); + + // now find the convex combination of B that is nearest to A*cA + ABa = trans(AB)*cA; + const auto smo_iter2 = solve_qp_using_smo(BB, ABa, cB, eps, cB.size()); + + // stop if the QP solvers failed to improve + if (smo_iter1 == 1 && smo_iter2 == 1) + break; + } + + + return iter+1; + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_Hh_ + diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo_abstract.h b/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo_abstract.h new file mode 100644 index 000000000..5e7d5ec3f --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo_abstract.h @@ -0,0 +1,282 @@ +// Copyright (C) 2010 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_ABSTRACT_Hh_ +#ifdef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_ABSTRACT_Hh_ + +#include "../matrix.h" +#include <map> +#include "../unordered_pair.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename EXP1, + typename EXP2, + typename T, long NR, long NC, typename MM, typename L + > + unsigned long solve_qp_using_smo ( + const matrix_exp<EXP1>& Q, + const matrix_exp<EXP2>& b, + matrix<T,NR,NC,MM,L>& alpha, + T eps, + unsigned long max_iter + ); + /*! + requires + - Q.nr() == Q.nc() + - is_col_vector(b) == true + - is_col_vector(alpha) == true + - b.size() == alpha.size() == Q.nr() + - alpha.size() > 0 + - min(alpha) >= 0 + - eps > 0 + - max_iter > 0 + ensures + - Let C == sum(alpha) (i.e. C is the sum of the alpha values you + supply to this function) + - This function solves the following quadratic program: + Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha - trans(alpha)*b + subject to the following constraints: + - sum(alpha) == C (i.e. the sum of alpha values doesn't change) + - min(alpha) >= 0 (i.e. all alpha values are nonnegative) + Where f is convex. This means that Q should be positive-semidefinite. + - The solution to the above QP will be stored in #alpha. + - This function uses a simple implementation of the sequential minimal + optimization algorithm. It starts the algorithm with the given alpha + and it works on the problem until the duality gap (i.e. how far away + we are from the optimum solution) is less than eps. So eps controls + how accurate the solution is and smaller values result in better solutions. + - At most max_iter iterations of optimization will be performed. + - returns the number of iterations performed. If this method fails to + converge to eps accuracy then the number returned will be max_iter+1. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename EXP1, + typename EXP2, + typename EXP3, + typename T, long NR, long NC, typename MM, typename L, + long NR2, long NC2 + > + unsigned long solve_qp4_using_smo ( + const matrix_exp<EXP1>& A, + const matrix_exp<EXP2>& Q, + const matrix_exp<EXP3>& b, + const matrix_exp<EXP4>& d, + matrix<T,NR,NC,MM,L>& alpha, + matrix<T,NR2,NC2,MM,L>& lambda, + T eps, + unsigned long max_iter, + T max_lambda = std::numeric_limits<T>::infinity() + ); + /*! + requires + - A.nc() == alpha.size() + - Q.nr() == Q.nc() + - is_col_vector(b) == true + - is_col_vector(d) == true + - is_col_vector(alpha) == true + - b.size() == alpha.size() == Q.nr() + - d.size() == A.nr() + - alpha.size() > 0 + - min(alpha) >= 0 + - eps > 0 + - max_iter > 0 + - max_lambda >= 0 + ensures + - Let C == sum(alpha) (i.e. C is the sum of the alpha values you + supply to this function) + - This function solves the following quadratic program: + Minimize: f(alpha,lambda) == 0.5*trans(alpha)*Q*alpha - trans(alpha)*b + + 0.5*trans(lambda)*lambda - trans(lambda)*A*alpha - trans(lambda)*d + subject to the following constraints: + - sum(alpha) == C (i.e. the sum of alpha values doesn't change) + - min(alpha) >= 0 (i.e. all alpha values are nonnegative) + - min(lambda) >= 0 (i.e. all lambda values are nonnegative) + - max(lambda) <= max_lambda (i.e. all lambda values are less than max_lambda) + Where f is convex. This means that Q should be positive-semidefinite. + - If you don't want an upper limit on lambda then max_lambda can be set to + infinity. + - The solution to the above QP will be stored in #alpha and #lambda. + - This function uses a simple implementation of the sequential minimal + optimization algorithm. It starts the algorithm with the given alpha + and it works on the problem until the duality gap (i.e. how far away + we are from the optimum solution) is less than eps. So eps controls + how accurate the solution is and smaller values result in better solutions. + The initial value of lambda is ignored since the optimal lambda can be + obtained via a simple closed form expression given alpha. + - At most max_iter iterations of optimization will be performed. + - returns the number of iterations performed. If this method fails to + converge to eps accuracy then the number returned will be max_iter+1. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename EXP1, + typename EXP2, + typename T, long NR, long NC, typename MM, typename L + > + unsigned long solve_qp_box_constrained ( + const matrix_exp<EXP1>& Q, + const matrix_exp<EXP2>& b, + matrix<T,NR,NC,MM,L>& alpha, + const matrix<T,NR,NC,MM,L>& lower, + const matrix<T,NR,NC,MM,L>& upper, + T eps, + unsigned long max_iter + ); + /*! + requires + - Q.nr() == Q.nc() + - alpha.size() == lower.size() == upper.size() + - is_col_vector(b) == true + - is_col_vector(alpha) == true + - is_col_vector(lower) == true + - is_col_vector(upper) == true + - b.size() == alpha.size() == Q.nr() + - alpha.size() > 0 + - 0 <= min(alpha-lower) + - 0 <= max(upper-alpha) + - eps > 0 + - max_iter > 0 + ensures + - This function solves the following quadratic program: + Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha + trans(b)*alpha + subject to the following box constraints on alpha: + - 0 <= min(alpha-lower) + - 0 <= max(upper-alpha) + Where f is convex. This means that Q should be positive-semidefinite. + - The solution to the above QP will be stored in #alpha. + - This function uses a combination of a SMO algorithm along with Nesterov's + method as the main iteration of the solver. It starts the algorithm with the + given alpha and it works on the problem until the derivative of f(alpha) is + smaller than eps for each element of alpha or the alpha value is at a box + constraint. So eps controls how accurate the solution is and smaller values + result in better solutions. + - At most max_iter iterations of optimization will be performed. + - returns the number of iterations performed. If this method fails to + converge to eps accuracy then the number returned will be max_iter+1. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename T, long NR, long NC, typename MM, typename L + > + unsigned long solve_qp_box_constrained_blockdiag ( + const std::vector<matrix<T,NR,NR,MM,L>>& Q_blocks, + const std::vector<matrix<T,NR,NC,MM,L>>& bs, + const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag, + std::vector<matrix<T,NR,NC,MM,L>>& alphas, + const std::vector<matrix<T,NR,NC,MM,L>>& lowers, + const std::vector<matrix<T,NR,NC,MM,L>>& uppers, + T eps, + unsigned long max_iter + ); + /*! + requires + - Q_blocks.size() > 0 + - Q_blocks.size() == bs.size() == alphas.size() == lowers.size() == uppers.size() + - All the matrices in Q_blocks have the same dimensions. Moreover, they are + non-empty square matrices. + - All the matrices in bs, Q_offdiag, alphas, lowers, and uppers have the same + dimensions. Moreover, they are all column vectors. + - Q_blocks[0].nr() == alphas[0].size() + (i.e. the dimensionality of the column vectors in alphas must match the + dimensionality of the square matrices in Q_blocks.) + - for all valid i: + - 0 <= min(alphas[i]-lowers[i]) + - 0 <= max(uppers[i]-alphas[i]) + - eps > 0 + - max_iter > 0 + ensures + - This function solves the same QP as solve_qp_box_constrained(), except it is + optimized for the case where the Q matrix has a certain sparsity structure. + To be precise: + - Let Q1 be a block diagonal matrix with the elements of Q_blocks placed + along its diagonal, and in the order contained in Q_blocks. + - Let Q2 be a matrix with the same size as Q1, except instead of being block diagonal, it + is block structured into Q_blocks.nr() by Q_blocks.nc() blocks. If we let (r,c) be the + coordinate of each block then each block contains the matrix + diagm(Q_offdiag[make_unordered_pair(r,c)]) or the zero matrix if Q_offdiag has no entry + for the coordinate (r,c). + - Let Q == Q1+Q2 + - Let b == the concatenation of all the vectors in bs into one big vector. + - Let alpha == the concatenation of all the vectors in alphas into one big vector. + - Let lower == the concatenation of all the vectors in lowers into one big vector. + - Let upper == the concatenation of all the vectors in uppers into one big vector. + - Then this function solves the following quadratic program: + Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha + trans(b)*alpha + subject to the following box constraints on alpha: + - 0 <= min(alpha-lower) + - 0 <= max(upper-alpha) + Where f is convex. This means that Q should be positive-semidefinite. + - More specifically, this function is identical to + solve_qp_box_constrained(Q, b, alpha, lower, upper, eps, max_iter), + except that it runs faster since it avoids unnecessary computation by + taking advantage of the sparsity structure in the QP. + - The solution to the above QP will be stored in #alphas. + - This function uses a combination of a SMO algorithm along with Nesterov's + method as the main iteration of the solver. It starts the algorithm with the + given alpha and it works on the problem until the derivative of f(alpha) is + smaller than eps for each element of alpha or the alpha value is at a box + constraint. So eps controls how accurate the solution is and smaller values + result in better solutions. + - At most max_iter iterations of optimization will be performed. + - returns the number of iterations performed. If this method fails to + converge to eps accuracy then the number returned will be max_iter+1. + !*/ +// ---------------------------------------------------------------------------------------- + + template < + typename EXP1, + typename EXP2, + typename T, long NRa, long NRb + > + unsigned long find_gap_between_convex_hulls ( + const matrix_exp<EXP1>& A, + const matrix_exp<EXP2>& B, + matrix<T,NRa,1>& cA, + matrix<T,NRb,1>& cB, + const double eps, + const unsigned long max_iter = 1000 + ); + /*! + requires + - A.nr() == B.nr() + - A.size() != 0 + - B.size() != 0 + - eps > 0 + - max_iter > 0 + ensures + - If you think of A and B as sets of column vectors, then we can identify the + convex sets hullA and hullB, which are the convex hulls of A and B + respectively. This function finds the pair of points in hullA and hullB that + are nearest to each other. To be precise, this function solves the following + quadratic program: + Minimize: f(cA,cB) == length_squared(A*cA - B*cB) + subject to the following constraints on cA and cB: + - is_col_vector(cA) == true && cA.size() == A.nc() + - is_col_vector(cB) == true && cB.size() == B.nc() + - sum(cA) == 1 && min(cA) >= 0 + - sum(cB) == 1 && min(cB) >= 0 + - This function uses an iterative block coordinate descent algorithm to solve + the QP. It runs until either max_iter iterations have been performed or the + QP is solved to at least eps accuracy. + - returns the number of iterations performed. If this method fails to + converge to eps accuracy then the number returned will be max_iter+1. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_ABSTRACT_Hh_ + + diff --git a/ml/dlib/dlib/optimization/optimization_stop_strategies.h b/ml/dlib/dlib/optimization/optimization_stop_strategies.h new file mode 100644 index 000000000..a0243cacf --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_stop_strategies.h @@ -0,0 +1,173 @@ +// Copyright (C) 2008 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_OPTIMIZATIOn_STOP_STRATEGIES_H_ +#define DLIB_OPTIMIZATIOn_STOP_STRATEGIES_H_ + +#include <cmath> +#include <limits> +#include "../matrix.h" +#include "../algs.h" +#include "optimization_stop_strategies_abstract.h" +#include <iostream> + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class objective_delta_stop_strategy + { + public: + explicit objective_delta_stop_strategy ( + double min_delta = 1e-7 + ) : _verbose(false), _been_used(false), _min_delta(min_delta), _max_iter(0), _cur_iter(0), _prev_funct_value(0) + { + DLIB_ASSERT ( + min_delta >= 0, + "\t objective_delta_stop_strategy(min_delta)" + << "\n\t min_delta can't be negative" + << "\n\t min_delta: " << min_delta + ); + } + + objective_delta_stop_strategy ( + double min_delta, + unsigned long max_iter + ) : _verbose(false), _been_used(false), _min_delta(min_delta), _max_iter(max_iter), _cur_iter(0), _prev_funct_value(0) + { + DLIB_ASSERT ( + min_delta >= 0 && max_iter > 0, + "\t objective_delta_stop_strategy(min_delta, max_iter)" + << "\n\t min_delta can't be negative and max_iter can't be 0" + << "\n\t min_delta: " << min_delta + << "\n\t max_iter: " << max_iter + ); + } + + objective_delta_stop_strategy& be_verbose( + ) + { + _verbose = true; + return *this; + } + + template <typename T> + bool should_continue_search ( + const T& , + const double funct_value, + const T& + ) + { + if (_verbose) + { + using namespace std; + cout << "iteration: " << _cur_iter << " objective: " << funct_value << endl; + } + + ++_cur_iter; + if (_been_used) + { + // Check if we have hit the max allowable number of iterations. (but only + // check if _max_iter is enabled (i.e. not 0)). + if (_max_iter != 0 && _cur_iter > _max_iter) + return false; + + // check if the function change was too small + if (std::abs(funct_value - _prev_funct_value) < _min_delta) + return false; + } + + _been_used = true; + _prev_funct_value = funct_value; + return true; + } + + private: + bool _verbose; + + bool _been_used; + double _min_delta; + unsigned long _max_iter; + unsigned long _cur_iter; + double _prev_funct_value; + }; + +// ---------------------------------------------------------------------------------------- + + class gradient_norm_stop_strategy + { + public: + explicit gradient_norm_stop_strategy ( + double min_norm = 1e-7 + ) : _verbose(false), _min_norm(min_norm), _max_iter(0), _cur_iter(0) + { + DLIB_ASSERT ( + min_norm >= 0, + "\t gradient_norm_stop_strategy(min_norm)" + << "\n\t min_norm can't be negative" + << "\n\t min_norm: " << min_norm + ); + } + + gradient_norm_stop_strategy ( + double min_norm, + unsigned long max_iter + ) : _verbose(false), _min_norm(min_norm), _max_iter(max_iter), _cur_iter(0) + { + DLIB_ASSERT ( + min_norm >= 0 && max_iter > 0, + "\t gradient_norm_stop_strategy(min_norm, max_iter)" + << "\n\t min_norm can't be negative and max_iter can't be 0" + << "\n\t min_norm: " << min_norm + << "\n\t max_iter: " << max_iter + ); + } + + gradient_norm_stop_strategy& be_verbose( + ) + { + _verbose = true; + return *this; + } + + template <typename T> + bool should_continue_search ( + const T& , + const double funct_value, + const T& funct_derivative + ) + { + if (_verbose) + { + using namespace std; + cout << "iteration: " << _cur_iter << " objective: " << funct_value << " gradient norm: " << length(funct_derivative) << endl; + } + + ++_cur_iter; + + // Check if we have hit the max allowable number of iterations. (but only + // check if _max_iter is enabled (i.e. not 0)). + if (_max_iter != 0 && _cur_iter > _max_iter) + return false; + + // check if the gradient norm is too small + if (length(funct_derivative) < _min_norm) + return false; + + return true; + } + + private: + bool _verbose; + + double _min_norm; + unsigned long _max_iter; + unsigned long _cur_iter; + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATIOn_STOP_STRATEGIES_H_ + diff --git a/ml/dlib/dlib/optimization/optimization_stop_strategies_abstract.h b/ml/dlib/dlib/optimization/optimization_stop_strategies_abstract.h new file mode 100644 index 000000000..6a999f8d9 --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_stop_strategies_abstract.h @@ -0,0 +1,157 @@ +// Copyright (C) 2008 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_OPTIMIZATIOn_STOP_STRATEGIES_ABSTRACT_ +#ifdef DLIB_OPTIMIZATIOn_STOP_STRATEGIES_ABSTRACT_ + +#include <cmath> +#include <limits> +#include "../matrix/matrix_abstract.h" +#include "../algs.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class objective_delta_stop_strategy + { + /*! + WHAT THIS OBJECT REPRESENTS + This object represents a strategy for deciding if an optimization + algorithm should terminate. This particular object looks at the + change in the objective function from one iteration to the next and + bases its decision on how large this change is. If the change + is below a user given threshold then the search stops. + !*/ + + public: + explicit objective_delta_stop_strategy ( + double min_delta = 1e-7 + ); + /*! + requires + - min_delta >= 0 + ensures + - This stop strategy object will only consider a search to be complete + if a change in an objective function from one iteration to the next + is less than min_delta. + !*/ + + objective_delta_stop_strategy ( + double min_delta, + unsigned long max_iter + ); + /*! + requires + - min_delta >= 0 + - max_iter > 0 + ensures + - This stop strategy object will only consider a search to be complete + if a change in an objective function from one iteration to the next + is less than min_delta or more than max_iter iterations has been + executed. + !*/ + + objective_delta_stop_strategy& be_verbose( + ); + /*! + ensures + - causes this object to print status messages to standard out + every time should_continue_search() is called. + - returns *this + !*/ + + template <typename T> + bool should_continue_search ( + const T& x, + const double funct_value, + const T& funct_derivative + ); + /*! + requires + - this function is only called once per search iteration + - for some objective function f(): + - x == the search point for the current iteration + - funct_value == f(x) + - funct_derivative == derivative(f)(x) + ensures + - returns true if the point x doest not satisfy the stopping condition and + false otherwise. + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + + class gradient_norm_stop_strategy + { + /*! + WHAT THIS OBJECT REPRESENTS + This object represents a strategy for deciding if an optimization + algorithm should terminate. This particular object looks at the + norm (i.e. the length) of the current gradient vector and stops + if it is smaller than a user given threshold. + !*/ + + public: + explicit gradient_norm_stop_strategy ( + double min_norm = 1e-7 + ); + /*! + requires + - min_norm >= 0 + ensures + - This stop strategy object will only consider a search to be complete + if the current gradient norm is less than min_norm + !*/ + + gradient_norm_stop_strategy ( + double min_norm, + unsigned long max_iter + ); + /*! + requires + - min_norm >= 0 + - max_iter > 0 + ensures + - This stop strategy object will only consider a search to be complete + if the current gradient norm is less than min_norm or more than + max_iter iterations has been executed. + !*/ + + gradient_norm_stop_strategy& be_verbose( + ); + /*! + ensures + - causes this object to print status messages to standard out + every time should_continue_search() is called. + - returns *this + !*/ + + template <typename T> + bool should_continue_search ( + const T& x, + const double funct_value, + const T& funct_derivative + ); + /*! + requires + - this function is only called once per search iteration + - for some objective function f(): + - x == the search point for the current iteration + - funct_value == f(x) + - funct_derivative == derivative(f)(x) + ensures + - returns true if the point x doest not satisfy the stopping condition and + false otherwise. + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATIOn_STOP_STRATEGIES_ABSTRACT_ + diff --git a/ml/dlib/dlib/optimization/optimization_trust_region.h b/ml/dlib/dlib/optimization/optimization_trust_region.h new file mode 100644 index 000000000..5f0ad897f --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_trust_region.h @@ -0,0 +1,564 @@ +// Copyright (C) 2010 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_OPTIMIZATION_TRUST_REGIoN_Hh_ +#define DLIB_OPTIMIZATION_TRUST_REGIoN_Hh_ + +#include "../matrix.h" +#include "optimization_trust_region_abstract.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename EXP1, + typename EXP2, + typename T, long NR, long NC, typename MM, typename L + > + unsigned long solve_trust_region_subproblem ( + const matrix_exp<EXP1>& B, + const matrix_exp<EXP2>& g, + const typename EXP1::type radius, + matrix<T,NR,NC,MM,L>& p, + double eps, + unsigned long max_iter + ) + { + /* + This is an implementation of algorithm 4.3(Trust Region Subproblem) + from the book Numerical Optimization by Nocedal and Wright. Some of + the details are also from Practical Methods of Optimization by Fletcher. + */ + + // make sure requires clause is not broken + DLIB_ASSERT(B.nr() == B.nc() && is_col_vector(g) && g.size() == B.nr(), + "\t unsigned long solve_trust_region_subproblem()" + << "\n\t invalid arguments were given to this function" + << "\n\t B.nr(): " << B.nr() + << "\n\t B.nc(): " << B.nc() + << "\n\t is_col_vector(g): " << is_col_vector(g) + << "\n\t g.size(): " << g.size() + ); + DLIB_ASSERT(radius > 0 && eps > 0 && max_iter > 0, + "\t unsigned long solve_trust_region_subproblem()" + << "\n\t invalid arguments were given to this function" + << "\n\t radius: " << radius + << "\n\t eps: " << eps + << "\n\t max_iter: " << max_iter + ); + + + const_temp_matrix<EXP1> BB(B); + const_temp_matrix<EXP2> gg(g); + + p.set_size(g.nr(),g.nc()); + p = 0; + + + const T numeric_eps = max(diag(abs(BB)))*std::numeric_limits<T>::epsilon(); + + matrix<T,EXP1::NR,EXP2::NR,MM,L> R; + + T lambda = 0; + + // We need to put a bracket around lambda. It can't go below 0. We + // can get an upper bound using Gershgorin disks. + // This number is a lower bound on the eigenvalues in BB + const T BB_min_eigenvalue = min(diag(BB) - (sum_cols(abs(BB)) - abs(diag(BB)))); + + const T g_norm = length(gg); + + T lambda_min = 0; + T lambda_max = put_in_range(0, + std::numeric_limits<T>::max(), + g_norm/radius - BB_min_eigenvalue); + + + // If we can tell that the minimum is at 0 then don't do anything. Just return the answer. + if (g_norm < numeric_eps && BB_min_eigenvalue > numeric_eps) + { + return 0; + } + + + // how much lambda has changed recently + T lambda_delta = 0; + + for (unsigned long i = 0; i < max_iter; ++i) + { + R = chol(BB + lambda*identity_matrix<T>(BB.nr())); + + // if the cholesky decomposition doesn't exist. + if (R(R.nr()-1, R.nc()-1) <= 0) + { + // If B is indefinite and g is equal to 0 then we should + // quit this loop and go right to the eigenvalue decomposition method. + if (g_norm <= numeric_eps) + break; + + // narrow the bracket on lambda. Obviously the current lambda is + // too small. + lambda_min = lambda; + + // jump towards the max value. Eventually there will + // be a lambda that results in a cholesky decomposition. + const T alpha = 0.10; + lambda = (1-alpha)*lambda + alpha*lambda_max; + continue; + } + + using namespace blas_bindings; + + p = -gg; + // Solve RR'*p = -g for p. + // Solve R*q = -g for q where q = R'*p. + if (R.nr() == 2) + { + p(0) = p(0)/R(0,0); + p(1) = (p(1)-R(1,0)*p(0))/R(1,1); + } + else + { + triangular_solver(CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, R, p); + } + const T q_norm = length(p); + + // Solve R'*p = q for p. + if (R.nr() == 2) + { + p(1) = p(1)/R(1,1); + p(0) = (p(0)-R(1,0)*p(1))/R(0,0); + } + else + { + triangular_solver(CblasLeft, CblasLower, CblasTrans, CblasNonUnit, R, p); + } + const T p_norm = length(p); + + // check if we are done. + if (lambda == 0) + { + if (p_norm < radius) + { + // i will always be 0 in this case. So we return 1. + return i+1; + } + } + else + { + // if we are close enough to the solution then terminate + if (std::abs(p_norm - radius)/radius < eps) + return i+1; + } + + // shrink our bracket on lambda + if (p_norm < radius) + lambda_max = lambda; + else + lambda_min = lambda; + + + if (p_norm <= radius*std::numeric_limits<T>::epsilon()) + { + const T alpha = 0.01; + lambda = (1-alpha)*lambda_min + alpha*lambda_max; + continue; + } + + const T old_lambda = lambda; + + // figure out which lambda to try next + lambda = lambda + std::pow(q_norm/p_norm,2)*(p_norm - radius)/radius; + + // make sure the chosen lambda is within our bracket (but not exactly at either end). + const T gap = (lambda_max-lambda_min)*0.01; + lambda = put_in_range(lambda_min+gap, lambda_max-gap, lambda); + + // Keep track of how much lambda is thrashing around inside the search bracket. If it + // keeps moving around a whole lot then cut the search bracket in half. + lambda_delta += std::abs(lambda - old_lambda); + if (lambda_delta > 3*(lambda_max-lambda_min)) + { + lambda = (lambda_min+lambda_max)/2; + lambda_delta = 0; + } + } // end for loop + + + // We are probably in the "hard case". Use an eigenvalue decomposition to sort things out. + // Either that or the eps was just set too tight and really we are already done. + eigenvalue_decomposition<EXP1> ed(make_symmetric(BB)); + + matrix<T,NR,NC,MM,L> ev = ed.get_real_eigenvalues(); + const long min_eig_idx = index_of_min(ev); + + + ev -= min(ev); + // zero out any values which are basically zero + ev = pointwise_multiply(ev, ev > max(abs(ev))*std::numeric_limits<T>::epsilon()); + ev = reciprocal(ev); + + + // figure out part of what p should be assuming we are in the hard case. + matrix<T,NR,NC,MM,L> p_hard; + p_hard = trans(ed.get_pseudo_v())*gg; + p_hard = diagm(ev)*p_hard; + p_hard = ed.get_pseudo_v()*p_hard; + + + // If we really are in the hard case then this if will be true. Otherwise, the p + // we found in the "easy case" loop up top is the best answer. + if (length(p_hard) < radius && length(p_hard) >= length(p)) + { + // adjust the length of p_hard by adding a component along the eigenvector associated with + // the smallest eigenvalue. We want to make it the case that length(p) == radius. + const T tau = std::sqrt(radius*radius - length_squared(p_hard)); + p = p_hard + tau*colm(ed.get_pseudo_v(),min_eig_idx); + + + // if we have to do an eigenvalue decomposition then say we did all the iterations + return max_iter; + } + + // if we get this far it means we didn't converge to eps accuracy. + return max_iter+1; + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template < + typename EXP1, + typename EXP2, + typename EXP3 + > + bool bounds_violated ( + const matrix_exp<EXP1>& v, + const matrix_exp<EXP2>& l, + const matrix_exp<EXP3>& u + ) + { + DLIB_ASSERT(v.nr() == l.nr() && v.nr() == u.nr()); + DLIB_ASSERT(v.nc() == l.nc() && v.nc() == u.nc()); + for (long r = 0; r < v.nr(); ++r) + { + for (long c = 0; c < v.nc(); c++) + { + if (!(l(r,c) <= v(r,c) && v(r,c) <= u(r,c))) + return true; + } + } + return false; + } + } + +// ---------------------------------------------------------------------------------------- + + template < + typename EXP1, + typename EXP2, + typename T, long NR, long NC, typename MM, typename L, + typename EXP3 + > + void solve_trust_region_subproblem_bounded ( + const matrix_exp<EXP1>& B_, + const matrix_exp<EXP2>& g_, + const typename EXP1::type radius_, + matrix<T,NR,NC,MM,L>& p_, + double eps, + unsigned long max_iter, + const matrix_exp<EXP3>& lower_, + const matrix_exp<EXP3>& upper_ + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(B_.nr() == B_.nc() && is_col_vector(g_) && g_.size() == B_.nr(), + "\t unsigned long solve_trust_region_subproblem_bounded()" + << "\n\t invalid arguments were given to this function" + << "\n\t B_.nr(): " << B_.nr() + << "\n\t B_.nc(): " << B_.nc() + << "\n\t is_col_vector(g_): " << is_col_vector(g_) + << "\n\t g_.size(): " << g_.size() + ); + DLIB_ASSERT(radius_ > 0 && eps > 0 && max_iter > 0, + "\t unsigned long solve_trust_region_subproblem_bounded()" + << "\n\t invalid arguments were given to this function" + << "\n\t radius_: " << radius_ + << "\n\t eps: " << eps + << "\n\t max_iter: " << max_iter + ); + DLIB_ASSERT(is_col_vector(lower_) && lower_.size() == g_.size()); + DLIB_ASSERT(is_col_vector(upper_) && upper_.size() == g_.size()); + DLIB_ASSERT(max(upper_-lower_) >= 0); + // make sure the problem is feasible. That is, there should be a point inside the + // lower and upper bounds that has a norm <= radius_ + DLIB_ASSERT(length(clamp(zeros_matrix(lower_),lower_,upper_)) <= radius_, + "The lower and upper bounds are incompatible with the radius since there is no point within the bounds with a norm less than the radius."); + + // We are going to solve this by greedily finding the most violated bound constraint, + // locking that variable to its constrained value, removing it from the problem, + // and then resolving. We do that until no more constraint violations are present. + + solve_trust_region_subproblem(B_,g_,radius_,p_,eps,max_iter); + + + // just stop here if all the bounds are satisfied. + if (!impl::bounds_violated(p_, lower_, upper_)) + return; + + matrix<double> B = matrix_cast<double>(B_); + matrix<double,0,1> g = matrix_cast<double>(g_); + double radius = radius_; + matrix<double,0,1> p = matrix_cast<double>(p_); + matrix<double,0,1> lower = matrix_cast<double>(lower_); + matrix<double,0,1> upper = matrix_cast<double>(upper_); + + // keep a table that tells us how to map any reduced QP back to the original QP + std::vector<long> idxs(g.size()); + for (size_t i = 0; i < idxs.size(); ++i) + idxs[i] = i; + + + // while we haven't found a p that satisfies the bounds constraints + while(impl::bounds_violated(p, lower, upper) ) + { + // Find the most violated variable and fix its value to a constant (the bound + // value). + long most_violated_idx = 0; + double max_violation = 0; + double bounded_value = 0; + for (long i = 0; i < lower.size(); ++i) + { + if (!(lower(i) <= p(i) && p(i) <= upper(i))) + { + if (lower(i)-p(i) > max_violation) + { + max_violation = lower(i)-p(i); + most_violated_idx = i; + bounded_value = lower(i); + } + else if (p(i)-upper(i) > max_violation) + { + max_violation = p(i)-upper(i); + most_violated_idx = i; + bounded_value = upper(i); + } + } + } + + // assign this variable to its final value. + p_(idxs[most_violated_idx]) = bounded_value; + + + // now reduce the QP by removing the variable p_(idxs[most_violated_idx]). + idxs.erase(idxs.begin()+most_violated_idx); + // we are out of variables to remove since everything is at bounds. + if (idxs.size() == 0) + break; + + lower = remove_row(lower,most_violated_idx); + upper = remove_row(upper,most_violated_idx); + g += colm(B,most_violated_idx)*bounded_value; + g = remove_row(g,most_violated_idx); + p = remove_row(p,most_violated_idx); + B = removerc(B,most_violated_idx, most_violated_idx); + + // Removing a variable changes the radius, so we have to subtract the bounded + // value from the radius so as to not change the effective radius for the whole + // problem. + double squared_radius = radius*radius - bounded_value*bounded_value; + if (squared_radius <= 0) + { + p = 0; + break; + } + radius = std::sqrt(squared_radius); + + + solve_trust_region_subproblem(B,g,radius,p,eps,max_iter); + } + + // assign the non-bound-constrained variables to their final values + for (size_t i = 0; i < idxs.size(); ++i) + p_(idxs[i]) = p(i); + + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + template < + typename stop_strategy_type, + typename funct_model + > + double find_min_trust_region ( + stop_strategy_type stop_strategy, + const funct_model& model, + typename funct_model::column_vector& x, + double radius = 1 + ) + { + /* + This is an implementation of algorithm 4.1(Trust Region) + from the book Numerical Optimization by Nocedal and Wright. + */ + + // make sure requires clause is not broken + DLIB_ASSERT(is_col_vector(x) && radius > 0, + "\t double find_min_trust_region()" + << "\n\t invalid arguments were given to this function" + << "\n\t is_col_vector(x): " << is_col_vector(x) + << "\n\t radius: " << radius + ); + + const double initial_radius = radius; + + typedef typename funct_model::column_vector T; + typedef typename T::type type; + + typename funct_model::general_matrix h; + typename funct_model::column_vector g, p, d; + type f_value = model(x); + + model.get_derivative_and_hessian(x,g,h); + + DLIB_ASSERT(is_finite(x), "The objective function generated non-finite outputs"); + DLIB_ASSERT(is_finite(g), "The objective function generated non-finite outputs"); + DLIB_ASSERT(is_finite(h), "The objective function generated non-finite outputs"); + + // Sometimes the loop below won't modify x because the trust region step failed. + // This bool tells us when we are in that case. + bool stale_x = false; + + while(stale_x || stop_strategy.should_continue_search(x, f_value, g)) + { + const unsigned long iter = solve_trust_region_subproblem(h, + g, + radius, + p, + 0.1, + 20); + + + const type new_f_value = model(x+p); + const type predicted_improvement = -0.5*trans(p)*h*p - trans(g)*p; + const type measured_improvement = (f_value - new_f_value); + + // If the sub-problem can't find a way to improve then stop. This only happens when p is essentially 0. + if (std::abs(predicted_improvement) <= std::abs(measured_improvement)*std::numeric_limits<type>::epsilon()) + break; + + // predicted_improvement shouldn't be negative but it might be if something went + // wrong in the trust region solver. So put abs() here to guard against that. This + // way the sign of rho is determined only by the sign of measured_improvement. + const type rho = measured_improvement/std::abs(predicted_improvement); + + + if (!is_finite(rho)) + break; + + if (rho < 0.25) + { + radius *= 0.25; + + // something has gone horribly wrong if the radius has shrunk to zero. So just + // give up if that happens. + if (radius <= initial_radius*std::numeric_limits<double>::epsilon()) + break; + } + else + { + // if rho > 0.75 and we are being checked by the radius + if (rho > 0.75 && iter > 1) + { + radius = std::min<type>(1000, 2*radius); + } + } + + if (rho > 0) + { + x = x + p; + f_value = new_f_value; + model.get_derivative_and_hessian(x,g,h); + stale_x = false; + } + else + { + stale_x = true; + } + + DLIB_ASSERT(is_finite(x), "The objective function generated non-finite outputs"); + DLIB_ASSERT(is_finite(g), "The objective function generated non-finite outputs"); + DLIB_ASSERT(is_finite(h), "The objective function generated non-finite outputs"); + } + + return f_value; + } + +// ---------------------------------------------------------------------------------------- + + template <typename funct_model> + struct negate_tr_model + { + negate_tr_model( const funct_model& m) : model(m) {} + + const funct_model& model; + + typedef typename funct_model::column_vector column_vector; + typedef typename funct_model::general_matrix general_matrix; + + template <typename T> + typename T::type operator() (const T& x) const + { + return -model(x); + } + + template <typename T, typename U> + void get_derivative_and_hessian ( + const T& x, + T& d, + U& h + ) const + { + model.get_derivative_and_hessian(x,d,h); + d = -d; + h = -h; + } + + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename stop_strategy_type, + typename funct_model + > + double find_max_trust_region ( + stop_strategy_type stop_strategy, + const funct_model& model, + typename funct_model::column_vector& x, + double radius = 1 + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(is_col_vector(x) && radius > 0, + "\t double find_max_trust_region()" + << "\n\t invalid arguments were given to this function" + << "\n\t is_col_vector(x): " << is_col_vector(x) + << "\n\t radius: " << radius + ); + + return -find_min_trust_region(stop_strategy, + negate_tr_model<funct_model>(model), + x, + radius); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATION_TRUST_REGIoN_Hh_ + diff --git a/ml/dlib/dlib/optimization/optimization_trust_region_abstract.h b/ml/dlib/dlib/optimization/optimization_trust_region_abstract.h new file mode 100644 index 000000000..303ee746d --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_trust_region_abstract.h @@ -0,0 +1,233 @@ +// Copyright (C) 2010 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_OPTIMIZATION_TRUST_REGIoN_H_ABSTRACTh_ +#ifdef DLIB_OPTIMIZATION_TRUST_REGIoN_H_ABSTRACTh_ + +#include "../matrix/matrix_abstract.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename EXP1, + typename EXP2, + typename T, long NR, long NC, typename MM, typename L + > + unsigned long solve_trust_region_subproblem ( + const matrix_exp<EXP1>& B, + const matrix_exp<EXP2>& g, + const typename EXP1::type radius, + matrix<T,NR,NC,MM,L>& p, + double eps, + unsigned long max_iter + ); + /*! + requires + - B == trans(B) + (i.e. B should be a symmetric matrix) + - B.nr() == B.nc() + - is_col_vector(g) == true + - g.size() == B.nr() + - p is capable of containing a column vector the size of g + (i.e. p = g; should be a legal expression) + - radius > 0 + - eps > 0 + - max_iter > 0 + ensures + - This function solves the following optimization problem: + Minimize: f(p) == 0.5*trans(p)*B*p + trans(g)*p + subject to the following constraint: + - length(p) <= radius + - returns the number of iterations performed. If this method fails to converge + to eps accuracy then the number returned will be max_iter+1. + - if (this function didn't terminate due to hitting the max_iter iteration limit) then + - if this function returns 0 or 1 then we are not hitting the radius bound Otherwise, + the radius constraint is active and std::abs(length(#p)-radius)/radius <= eps. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename EXP1, + typename EXP2, + typename T, long NR, long NC, typename MM, typename L, + typename EXP3 + > + void solve_trust_region_subproblem_bounded ( + const matrix_exp<EXP1>& B, + const matrix_exp<EXP2>& g, + const typename EXP1::type radius, + matrix<T,NR,NC,MM,L>& p, + double eps, + unsigned long max_iter, + const matrix_exp<EXP3>& lower, + const matrix_exp<EXP3>& upper + ); + /*! + requires + - B == trans(B) + (i.e. B should be a symmetric matrix) + - B.nr() == B.nc() + - is_col_vector(g) == true + - is_col_vector(lower) == true + - is_col_vector(upper) == true + - g.size() == B.nr() + - lower.size() == B.nr() + - upper.size() == B.nr() + - p is capable of containing a column vector the size of g + (i.e. p = g; should be a legal expression) + - radius > 0 + - eps > 0 + - max_iter > 0 + - min(upper-lower) >= 0 + - length(clamp(zeros_matrix(lower),lower,upper)) <= radius + (i.e. the lower and upper bounds can't exclude all points with the desired radius.) + ensures + - This function solves the following optimization problem: + Minimize: f(p) == 0.5*trans(p)*B*p + trans(g)*p + subject to the following constraints: + - length(p) <= radius + - lower(i) <= p(i) <= upper(i), for all i + - Solves the problem to eps accuracy. We do this by greedily finding the most + violated bound constraint, locking that variable to its constrained value, removing + it from the problem, and then resolving. We do that until no more constraint + violations are present. Each time we just call solve_trust_region_subproblem() + to get the solution and pass eps and max_iter directly to these calls to + solve_trust_region_subproblem(). + !*/ + +// ---------------------------------------------------------------------------------------- + + class function_model + { + /*! + WHAT THIS OBJECT REPRESENTS + This object defines the interface for a function model + used by the trust-region optimizers defined below. + + In particular, this object represents a function f() and + its associated derivative and hessian. + + !*/ + + public: + + // Define the type used to represent column vectors + typedef matrix<double,0,1> column_vector; + // Define the type used to represent the hessian matrix + typedef matrix<double> general_matrix; + + double operator() ( + const column_vector& x + ) const; + /*! + ensures + - returns f(x) + (i.e. evaluates this model at the given point and returns the value) + !*/ + + void get_derivative_and_hessian ( + const column_vector& x, + column_vector& d, + general_matrix& h + ) const; + /*! + ensures + - #d == the derivative of f() at x + - #h == the hessian matrix of f() at x + - is_col_vector(#d) == true + - #d.size() == x.size() + - #h.nr() == #h.nc() == x.size() + - #h == trans(#h) + !*/ + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename stop_strategy_type, + typename funct_model + > + double find_min_trust_region ( + stop_strategy_type stop_strategy, + const funct_model& model, + typename funct_model::column_vector& x, + double radius = 1 + ); + /*! + requires + - stop_strategy == an object that defines a stop strategy such as one of + the objects from dlib/optimization/optimization_stop_strategies_abstract.h + - is_col_vector(x) == true + - radius > 0 + - model must be an object with an interface as defined by the function_model + example object shown above. + ensures + - Performs an unconstrained minimization of the function defined by model + starting from the initial point x. This function uses a trust region + algorithm to perform the minimization. The radius parameter defines + the initial size of the trust region. + - The function is optimized until stop_strategy decides that an acceptable + point has been found or the trust region subproblem fails to make progress. + - #x == the value of x that was found to minimize model() + - returns model(#x). + - When this function makes calls to model.get_derivative_and_hessian() it always + does so by first calling model() and then calling model.get_derivative_and_hessian(). + That is, any call to model.get_derivative_and_hessian(val) will always be + preceded by a call to model(val) with the same value. This way you can reuse + any redundant computations performed by model() and model.get_derivative_and_hessian() + as appropriate. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename stop_strategy_type, + typename funct_model + > + double find_max_trust_region ( + stop_strategy_type stop_strategy, + const funct_model& model, + typename funct_model::column_vector& x, + double radius = 1 + ); + /*! + requires + - stop_strategy == an object that defines a stop strategy such as one of + the objects from dlib/optimization/optimization_stop_strategies_abstract.h + - is_col_vector(x) == true + - radius > 0 + - model must be an object with an interface as defined by the function_model + example object shown above. + ensures + - Performs an unconstrained maximization of the function defined by model + starting from the initial point x. This function uses a trust region + algorithm to perform the maximization. The radius parameter defines + the initial size of the trust region. + - The function is optimized until stop_strategy decides that an acceptable + point has been found or the trust region subproblem fails to make progress. + - #x == the value of x that was found to maximize model() + - returns model(#x). + - When this function makes calls to model.get_derivative_and_hessian() it always + does so by first calling model() and then calling model.get_derivative_and_hessian(). + That is, any call to model.get_derivative_and_hessian(val) will always be + preceded by a call to model(val) with the same value. This way you can reuse + any redundant computations performed by model() and model.get_derivative_and_hessian() + as appropriate. + - Note that this function solves the maximization problem by converting it + into a minimization problem. Therefore, the values of model() and its derivative + reported to the stopping strategy will be negated. That is, stop_strategy + will see -model() and -derivative. All this really means is that the status + messages from a stopping strategy in verbose mode will display a negated objective + value. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATION_TRUST_REGIoN_H_ABSTRACTh_ + + |