Diffstat (limited to 'ml/dlib/dlib/optimization')
-rw-r--r-- ml/dlib/dlib/optimization/elastic_net.h | 389
-rw-r--r-- ml/dlib/dlib/optimization/elastic_net_abstract.h | 190
-rw-r--r-- ml/dlib/dlib/optimization/find_max_factor_graph_nmplp.h | 337
-rw-r--r-- ml/dlib/dlib/optimization/find_max_factor_graph_nmplp_abstract.h | 365
-rw-r--r-- ml/dlib/dlib/optimization/find_max_factor_graph_viterbi.h | 232
-rw-r--r-- ml/dlib/dlib/optimization/find_max_factor_graph_viterbi_abstract.h | 131
-rw-r--r-- ml/dlib/dlib/optimization/find_max_parse_cky.h | 414
-rw-r--r-- ml/dlib/dlib/optimization/find_max_parse_cky_abstract.h | 388
-rw-r--r-- ml/dlib/dlib/optimization/find_optimal_parameters.h | 117
-rw-r--r-- ml/dlib/dlib/optimization/find_optimal_parameters_abstract.h | 58
-rw-r--r-- ml/dlib/dlib/optimization/isotonic_regression.h | 169
-rw-r--r-- ml/dlib/dlib/optimization/isotonic_regression_abstract.h | 128
-rw-r--r-- ml/dlib/dlib/optimization/max_cost_assignment.h | 288
-rw-r--r-- ml/dlib/dlib/optimization/max_cost_assignment_abstract.h | 63
-rw-r--r-- ml/dlib/dlib/optimization/max_sum_submatrix.h | 285
-rw-r--r-- ml/dlib/dlib/optimization/max_sum_submatrix_abstract.h | 49
-rw-r--r-- ml/dlib/dlib/optimization/optimization.h | 714
-rw-r--r-- ml/dlib/dlib/optimization/optimization_abstract.h | 468
-rw-r--r-- ml/dlib/dlib/optimization/optimization_bobyqa.h | 3423
-rw-r--r-- ml/dlib/dlib/optimization/optimization_bobyqa_abstract.h | 120
-rw-r--r-- ml/dlib/dlib/optimization/optimization_least_squares.h | 345
-rw-r--r-- ml/dlib/dlib/optimization/optimization_least_squares_abstract.h | 112
-rw-r--r-- ml/dlib/dlib/optimization/optimization_line_search.h | 888
-rw-r--r-- ml/dlib/dlib/optimization/optimization_line_search_abstract.h | 361
-rw-r--r-- ml/dlib/dlib/optimization/optimization_oca.h | 407
-rw-r--r-- ml/dlib/dlib/optimization/optimization_oca_abstract.h | 334
-rw-r--r-- ml/dlib/dlib/optimization/optimization_search_strategies.h | 324
-rw-r--r-- ml/dlib/dlib/optimization/optimization_search_strategies_abstract.h | 330
-rw-r--r-- ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo.h | 468
-rw-r--r-- ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo_abstract.h | 150
-rw-r--r-- ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo.h | 455
-rw-r--r-- ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo_abstract.h | 139
-rw-r--r-- ml/dlib/dlib/optimization/optimization_solve_qp_using_smo.h | 937
-rw-r--r-- ml/dlib/dlib/optimization/optimization_solve_qp_using_smo_abstract.h | 282
-rw-r--r-- ml/dlib/dlib/optimization/optimization_stop_strategies.h | 173
-rw-r--r-- ml/dlib/dlib/optimization/optimization_stop_strategies_abstract.h | 157
-rw-r--r-- ml/dlib/dlib/optimization/optimization_trust_region.h | 564
-rw-r--r-- ml/dlib/dlib/optimization/optimization_trust_region_abstract.h | 233
38 files changed, 14987 insertions(+), 0 deletions(-)
diff --git a/ml/dlib/dlib/optimization/elastic_net.h b/ml/dlib/dlib/optimization/elastic_net.h
new file mode 100644
index 000000000..6c4b6d0b4
--- /dev/null
+++ b/ml/dlib/dlib/optimization/elastic_net.h
@@ -0,0 +1,389 @@
+// Copyright (C) 2016 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_ElASTIC_NET_Hh_
+#define DLIB_ElASTIC_NET_Hh_
+
+#include "../matrix.h"
+#include "elastic_net_abstract.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class elastic_net
+ {
+ public:
+
+ template <typename EXP>
+ explicit elastic_net(
+ const matrix_exp<EXP>& XX
+ ) : eps(1e-5), max_iterations(50000), verbose(false)
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(XX.size() > 0 &&
+ XX.nr() == XX.nc(),
+ "\t elastic_net::elastic_net(XX)"
+ << " \n\t XX must be a non-empty square matrix."
+ << " \n\t XX.nr(): " << XX.nr()
+ << " \n\t XX.nc(): " << XX.nc()
+ << " \n\t this: " << this
+ );
+
+
+ // If the number of columns in X is big, and in particular bigger than the number
+ // of rows, then we can get rid of them by doing some SVD magic. Doing this
+ // doesn't change the final results of anything, but it makes all the matrices
+ // have dimensions that are X.nr() in size, which can be much smaller.
+ matrix<double,0,1> s;
+ svd3(XX,u,eig_vals,eig_vects);
+ s = sqrt(eig_vals);
+ X = eig_vects*diagm(s);
+ u = eig_vects*inv(diagm(s));
+
+
+
+ samples.resize(X.nr()*2);
+
+ for (size_t i = 0; i < samples.size(); ++i)
+ index.push_back(i);
+ active_size = index.size();
+
+
+ // setup the training samples used in the SVM optimizer below
+ for (size_t i = 0; i < samples.size(); ++i)
+ {
+ auto& x = samples[i];
+ const long idx = i/2;
+ if (i%2 == 0)
+ x.label = +1;
+ else
+ x.label = -1;
+
+ x.r = idx%X.nr();
+ }
+ }
+
+ template <typename EXP1, typename EXP2>
+ elastic_net(
+ const matrix_exp<EXP1>& XX,
+ const matrix_exp<EXP2>& XY
+ ) : elastic_net(XX)
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(XX.size() > 0 &&
+ XX.nr() == XX.nc() &&
+ is_col_vector(XY) &&
+ XX.nc() == XY.size() ,
+ "\t elastic_net::elastic_net(XX,XY)"
+ << " \n\t Invalid inputs were given to this function."
+ << " \n\t XX.size(): " << XX.size()
+ << " \n\t is_col_vector(XY): " << is_col_vector(XY)
+ << " \n\t XX.nr(): " << XX.nr()
+ << " \n\t XX.nc(): " << XX.nc()
+ << " \n\t XY.size(): " << XY.size()
+ << " \n\t this: " << this
+ );
+
+ set_xy(XY);
+ }
+
+ long size (
+ ) const { return u.nr(); }
+
+ template <typename EXP>
+ void set_xy(
+ const matrix_exp<EXP>& XY
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(is_col_vector(XY) &&
+ XY.size() == size(),
+ "\t void elastic_net::set_y(Y)"
+ << " \n\t Invalid inputs were given to this function."
+ << " \n\t is_col_vector(XY): " << is_col_vector(XY)
+ << " \n\t size(): " << size()
+ << " \n\t XY.size(): " << XY.size()
+ << " \n\t this: " << this
+ );
+
+ Y = trans(u)*XY;
+ // We can use the ynorm after it has been projected because the only place Y
+ // appears in the algorithm is in terms of dot products with w and x vectors.
+ // But those vectors are always in the span of X and therefore we only see the
+ // part of the norm of Y that is in the span of X (and hence u since u and X
+ // have the same span by construction)
+ ynorm = length_squared(Y);
+ xdoty = X*Y;
+ eig_vects_xdoty = trans(eig_vects)*xdoty;
+
+ w.set_size(Y.size());
+ // zero out any memory of previous solutions
+ alpha.assign(X.nr()*2, 0);
+ }
+
+ bool have_target_values (
+ ) const { return Y.size() != 0; }
+
+ void set_epsilon(
+ double eps_
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(eps_ > 0,
+ "\t void elastic_net::set_epsilon()"
+ << " \n\t eps_ must be greater than 0"
+ << " \n\t eps_: " << eps_
+ << " \n\t this: " << this
+ );
+
+ eps = eps_;
+ }
+
+ unsigned long get_max_iterations (
+ ) const { return max_iterations; }
+
+ void set_max_iterations (
+ unsigned long max_iter
+ )
+ {
+ max_iterations = max_iter;
+ }
+
+ void be_verbose (
+ )
+ {
+ verbose = true;
+ }
+
+ void be_quiet (
+ )
+ {
+ verbose = false;
+ }
+
+ double get_epsilon (
+ ) const { return eps; }
+
+ matrix<double,0,1> operator() (
+ double ridge_lambda,
+ double lasso_budget = std::numeric_limits<double>::infinity()
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(have_target_values() &&
+ ridge_lambda > 0 &&
+ lasso_budget > 0 ,
+ "\t matrix<double,0,1> elastic_net::operator()()"
+ << " \n\t Invalid inputs were given to this function."
+ << " \n\t have_target_values(): " << have_target_values()
+ << " \n\t ridge_lambda: " << ridge_lambda
+ << " \n\t lasso_budget: " << lasso_budget
+ << " \n\t this: " << this
+ );
+
+
+ // First check if lasso_budget is so big that it isn't even active. We do this
+ // by doing just ridge regression and checking the result.
+ matrix<double,0,1> betas = eig_vects*tmp(inv(diagm(eig_vals + ridge_lambda))*eig_vects_xdoty);
+ if (sum(abs(betas)) <= lasso_budget)
+ return betas;
+
+
+ // Set w back to 0. We will compute the w corresponding to what is currently
+ // in alpha later on. This way w and alpha are always in sync.
+ w = 0;
+ wy_mult = 0;
+ wdoty = 0;
+
+
+ // return dot(w,x)
+ auto dot = [&](const matrix<double,0,1>& w, const en_sample2& x)
+ {
+ const double xmul = -x.label*(1/lasso_budget);
+ // Do the base dot product, but don't forget to add in the -(1/t)*y part from the SVM reduction paper.
+ double val = rowm(X,x.r)*w + xmul*wdoty + wy_mult*xdoty(x.r) + xmul*wy_mult*ynorm;
+
+ return val;
+ };
+
+
+ // perform w += scale*x;
+ auto add_to = [&](matrix<double,0,1>& w, double scale, const en_sample2& x)
+ {
+ const double xmul = -x.label*(1/lasso_budget);
+ wy_mult += scale*xmul;
+ wdoty += scale*xdoty(x.r);
+ w += scale*trans(rowm(X,x.r));
+
+ };
+
+ const double Dii = ridge_lambda;
+
+ // Finish setting up the training samples: compute each sample's xdotx and sync w with alpha.
+ for (size_t i = 0; i < samples.size(); ++i)
+ {
+ auto& x = samples[i];
+
+ const double xmul = -x.label*(1/lasso_budget);
+ x.xdotx = xmul*xmul*ynorm;
+ for (long c = 0; c < X.nc(); ++c)
+ x.xdotx += std::pow(X(x.r,c)+xmul*Y(c), 2.0) - std::pow(xmul*Y(c),2.0);
+
+ // compute the correct w given whatever might be in alpha.
+ if (alpha[i] != 0)
+ add_to(w, x.label*alpha[i], samples[i]);
+ }
+
+
+ // Now run the optimizer
+ double PG_max_prev = std::numeric_limits<double>::infinity();
+ double PG_min_prev = -std::numeric_limits<double>::infinity();
+
+
+ unsigned int iter;
+ for (iter = 0; iter < max_iterations; ++iter)
+ {
+ // randomly shuffle the indices
+ for (unsigned long i = 0; i < active_size; ++i)
+ {
+ // pick a random index >= i
+ const long j = i + rnd.get_random_32bit_number()%(active_size-i);
+ std::swap(index[i], index[j]);
+ }
+
+ double PG_max = -std::numeric_limits<double>::infinity();
+ double PG_min = std::numeric_limits<double>::infinity();
+ for (size_t ii = 0; ii < active_size; ++ii)
+ {
+ const auto i = index[ii];
+ const auto& x = samples[i];
+ double G = x.label*dot(w, x) - 1 + Dii*alpha[i];
+
+ double PG = 0;
+ if (alpha[i] == 0)
+ {
+ if (G > PG_max_prev)
+ {
+ // shrink the active set of training examples
+ --active_size;
+ std::swap(index[ii], index[active_size]);
+ --ii;
+ continue;
+ }
+
+ if (G < 0)
+ PG = G;
+ }
+ else
+ {
+ PG = G;
+ }
+
+ if (PG > PG_max)
+ PG_max = PG;
+ if (PG < PG_min)
+ PG_min = PG;
+
+ // if PG != 0
+ if (std::abs(PG) > 1e-12)
+ {
+ const double alpha_old = alpha[i];
+ alpha[i] = std::max(alpha[i] - G/(x.xdotx+Dii), (double)0.0);
+ const double delta = (alpha[i]-alpha_old)*x.label;
+ add_to(w, delta, x);
+ }
+ }
+
+ if (verbose)
+ {
+ using namespace std;
+ cout << "gap: " << PG_max - PG_min << endl;
+ cout << "active_size: " << active_size << endl;
+ cout << "iter: " << iter << endl;
+ cout << endl;
+ }
+
+ if (PG_max - PG_min <= eps)
+ {
+ // stop if we are within eps tolerance and the last iteration
+ // was over all the samples
+ if (active_size == index.size())
+ break;
+
+ // Turn off shrinking on the next iteration. We will stop if the
+ // tolerance is still <= eps when shrinking is off.
+ active_size = index.size();
+ PG_max_prev = std::numeric_limits<double>::infinity();
+ PG_min_prev = -std::numeric_limits<double>::infinity();
+ }
+ else
+ {
+ PG_max_prev = PG_max;
+ PG_min_prev = PG_min;
+ if (PG_max_prev <= 0)
+ PG_max_prev = std::numeric_limits<double>::infinity();
+ if (PG_min_prev >= 0)
+ PG_min_prev = -std::numeric_limits<double>::infinity();
+ }
+
+
+ // recalculate wdoty every so often to avoid drift.
+ if (iter%100 == 0)
+ wdoty = dlib::dot(Y, w);
+ }
+
+
+ betas.set_size(alpha.size()/2);
+ for (long i = 0; i < betas.size(); ++i)
+ betas(i) = lasso_budget*(alpha[2*i] - alpha[2*i+1]);
+ betas /= sum(mat(alpha));
+ return betas;
+ }
+
+
+ private:
+
+ struct en_sample2
+ {
+ // X location
+ long r;
+
+
+ double label;
+
+ double xdotx;
+ };
+
+ std::vector<en_sample2> samples;
+ std::vector<double> alpha;
+ double ynorm;
+ matrix<double> X;
+ matrix<double,0,1> Y;
+ matrix<double,0,1> xdoty;
+ double wdoty;
+ double wy_mult; // logically, the real w is what is in the w vector + wy_mult*Y
+ matrix<double,0,1> w;
+ std::vector<long> index;
+ unsigned long active_size;
+
+ matrix<double,0,1> eig_vects_xdoty;
+ matrix<double,0,1> eig_vals;
+ matrix<double> eig_vects;
+ matrix<double> u;
+
+ dlib::rand rnd;
+
+
+ double eps;
+ unsigned long max_iterations;
+ bool verbose;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_ElASTIC_NET_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/elastic_net_abstract.h b/ml/dlib/dlib/optimization/elastic_net_abstract.h
new file mode 100644
index 000000000..8ae69e37e
--- /dev/null
+++ b/ml/dlib/dlib/optimization/elastic_net_abstract.h
@@ -0,0 +1,190 @@
+// Copyright (C) 2016 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_ElASTIC_NET_ABSTRACT_Hh_
+#ifdef DLIB_ElASTIC_NET_ABSTRACT_Hh_
+
+#include "../matrix.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class elastic_net
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object is a tool for solving the following optimization problem:
+
+ min_w: length_squared(X*w - Y) + ridge_lambda*length_squared(w)
+ such that: sum(abs(w)) <= lasso_budget
+
+ That is, it solves the elastic net optimization problem. This object also
+ has the special property that you can quickly obtain different solutions
+ for different settings of ridge_lambda, lasso_budget, and target Y values.
+
+ This is because a large amount of work is precomputed in the constructor.
+ The solver will also remember the previous solution and will use that to
+ warm start subsequent invocations. Therefore, you can efficiently get
+ solutions for a wide range of regularization parameters.
+
+
+ The particular algorithm used to solve it is described in the paper:
+ Zhou, Quan, et al. "A reduction of the elastic net to support vector
+ machines with an application to gpu computing." arXiv preprint
+ arXiv:1409.1976 (2014).
+
+ And for the SVM solver sub-component we use the algorithm from:
+ Hsieh, Cho-Jui, et al. "A dual coordinate descent method for large-scale
+ linear SVM." Proceedings of the 25th international conference on Machine
+ learning. ACM, 2008.
+ !*/
+
+ public:
+
+ template <typename EXP>
+ explicit elastic_net(
+ const matrix_exp<EXP>& XX
+ );
+ /*!
+ requires
+ - XX.size() != 0
+ - XX.nr() == XX.nc()
+ ensures
+ - #get_epsilon() == 1e-5
+ - #get_max_iterations() == 50000
+ - This object will not be verbose unless be_verbose() is called.
+ - #size() == XX.nc()
+ - #have_target_values() == false
+ - We interpret XX as trans(X)*X where X is as defined in the objective
+ function discussed above in WHAT THIS OBJECT REPRESENTS.
+ !*/
+
+ template <typename EXP1, typename EXP2>
+ elastic_net(
+ const matrix_exp<EXP1>& XX,
+ const matrix_exp<EXP2>& XY
+ );
+ /*!
+ requires
+ - XX.size() != 0
+ - XX.nr() == XX.nc()
+ - is_col_vector(XY)
+ - XX.nc() == XY.size()
+ ensures
+ - constructs this object by calling the elastic_net(XX) constructor and
+ then calling this->set_xy(XY).
+ - #have_target_values() == true
+ - We interpret XX as trans(X)*X where X is as defined in the objective
+ function discussed above in WHAT THIS OBJECT REPRESENTS. Similarly, XY
+ should be trans(X)*Y.
+ !*/
+
+ long size (
+ ) const;
+ /*!
+ ensures
+ - returns the dimensionality of the data loaded into this object. That is,
+ how many elements are in the optimal w vector? This function returns
+ that number.
+ !*/
+
+ bool have_target_values (
+ ) const;
+ /*!
+ ensures
+ - returns true if set_xy() has been called and false otherwise.
+ !*/
+
+ template <typename EXP>
+ void set_xy(
+ const matrix_exp<EXP>& XY
+ );
+ /*!
+ requires
+ - is_col_vector(XY)
+ - XY.size() == size()
+ ensures
+ - #have_target_values() == true
+ - Sets the target values of the regression. Note that we expect the given
+ matrix, XY, to be equal to trans(X)*Y, where X and Y have the definitions
+ discussed above in WHAT THIS OBJECT REPRESENTS.
+ !*/
+
+ void set_epsilon(
+ double eps
+ );
+ /*!
+ requires
+ - eps > 0
+ ensures
+ - #get_epsilon() == eps
+ !*/
+
+ double get_epsilon (
+ ) const;
+ /*!
+ ensures
+ - returns the error epsilon that determines when the solver should stop.
+ Smaller values may result in a more accurate solution but take longer to
+ execute.
+ !*/
+
+ unsigned long get_max_iterations (
+ ) const;
+ /*!
+ ensures
+ - returns the maximum number of iterations the optimizer is allowed to run
+ before it is required to stop and return a result.
+ !*/
+
+ void set_max_iterations (
+ unsigned long max_iter
+ );
+ /*!
+ ensures
+ - #get_max_iterations() == max_iter
+ !*/
+
+ void be_verbose (
+ );
+ /*!
+ ensures
+ - This object will print status messages to standard out so that a
+ user can observe the progress of the algorithm.
+ !*/
+
+ void be_quiet (
+ );
+ /*!
+ ensures
+ - this object will not print anything to standard out.
+ !*/
+
+
+ matrix<double,0,1> operator() (
+ double ridge_lambda,
+ double lasso_budget = std::numeric_limits<double>::infinity()
+ );
+ /*!
+ requires
+ - have_target_values() == true
+ - ridge_lambda > 0
+ - lasso_budget > 0
+ ensures
+ - Solves the optimization problem described in the WHAT THIS OBJECT
+ REPRESENTS section above and returns the optimal w.
+ - The returned vector has size() elements.
+ - if (lasso_budget == infinity) then
+ - The lasso constraint is ignored
+ !*/
+
+ };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_ElASTIC_NET_ABSTRACT_Hh_
+
+
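For illustration, here is a minimal usage sketch of the elastic_net API documented above. This example is not part of the dlib sources: the toy data, the parameter values, and the direct include of the header from this diff are assumptions chosen for the example.

#include <iostream>
#include <dlib/matrix.h>
#include <dlib/optimization/elastic_net.h>

int main()
{
    using namespace dlib;

    // Toy design matrix X (3 samples, 2 features) and target vector Y.
    matrix<double> X(3,2);
    X = 1, 2,
        3, 4,
        5, 6;
    matrix<double,0,1> Y(3);
    Y = 1, 2, 3;

    // The solver takes trans(X)*X and trans(X)*Y rather than X and Y directly,
    // per the contract in elastic_net_abstract.h.
    elastic_net solver(trans(X)*X, trans(X)*Y);

    // The constructor did the expensive precomputation, and each call warm
    // starts from the previous solution, so sweeping the regularizer is cheap.
    for (double ridge_lambda : {0.1, 1.0, 10.0})
    {
        matrix<double,0,1> w = solver(ridge_lambda, /*lasso_budget=*/2.0);
        std::cout << "lambda = " << ridge_lambda << "\nw = " << trans(w);
    }
    return 0;
}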
diff --git a/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp.h b/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp.h
new file mode 100644
index 000000000..3dd7fd56f
--- /dev/null
+++ b/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp.h
@@ -0,0 +1,337 @@
+// Copyright (C) 2011 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_Hh_
+#define DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_Hh_
+
+#include "find_max_factor_graph_nmplp_abstract.h"
+#include <vector>
+#include <map>
+#include "../matrix.h"
+#include "../hash.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ class simple_hash_map
+ {
+ public:
+
+ simple_hash_map(
+ ) :
+ scan_dist(6)
+ {
+ data.resize(5000);
+ }
+
+ void insert (
+ const unsigned long a,
+ const unsigned long b,
+ const unsigned long value
+ )
+ /*!
+ requires
+ - a != std::numeric_limits<unsigned long>::max()
+ ensures
+ - #(*this)(a,b) == value
+ !*/
+ {
+ const uint32 h = murmur_hash3_2(a,b)%(data.size()-scan_dist);
+
+ const unsigned long empty_bucket = std::numeric_limits<unsigned long>::max();
+
+ for (uint32 i = 0; i < scan_dist; ++i)
+ {
+ if (data[i+h].key1 == empty_bucket)
+ {
+ data[i+h].key1 = a;
+ data[i+h].key2 = b;
+ data[i+h].value = value;
+ return;
+ }
+ }
+
+ // if we get this far it means the hash table is filling up. So double its size.
+ std::vector<bucket> new_data;
+ new_data.resize(data.size()*2);
+ new_data.swap(data);
+ for (uint32 i = 0; i < new_data.size(); ++i)
+ {
+ if (new_data[i].key1 != empty_bucket)
+ {
+ insert(new_data[i].key1, new_data[i].key2, new_data[i].value);
+ }
+ }
+
+ insert(a,b,value);
+ }
+
+ unsigned long operator() (
+ const unsigned long a,
+ const unsigned long b
+ ) const
+ /*!
+ requires
+ - this->insert(a,b,some_value) has been called
+ ensures
+ - returns the value stored at key (a,b)
+ !*/
+ {
+ DLIB_ASSERT(a != b, "An invalid map_problem was given to find_max_factor_graph_nmplp()."
+ << "\nNode " << a << " is listed as being a neighbor with itself, which is illegal.");
+
+ uint32 h = murmur_hash3_2(a,b)%(data.size()-scan_dist);
+
+
+ for (unsigned long i = 0; i < scan_dist; ++i)
+ {
+ if (data[h].key1 == a && data[h].key2 == b)
+ {
+ return data[h].value;
+ }
+ ++h;
+ }
+
+
+ // this should never happen (since this function requires (a,b) to be in the hash table)
+ DLIB_ASSERT(false, "An invalid map_problem was given to find_max_factor_graph_nmplp()."
+ << "\nThe nodes in the map_problem are inconsistent because node "<<a<<" is in the neighbor list"
+ << "\nof node "<<b<< " but node "<<b<<" isn't in the neighbor list of node "<<a<<". The neighbor relationship"
+ << "\nis supposed to be symmetric."
+ );
+ return 0;
+ }
+
+ private:
+
+ struct bucket
+ {
+ // having max() in key1 indicates that the bucket isn't used.
+ bucket() : key1(std::numeric_limits<unsigned long>::max()) {}
+ unsigned long key1;
+ unsigned long key2;
+ unsigned long value;
+ };
+
+ std::vector<bucket> data;
+ const unsigned int scan_dist;
+ };
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename map_problem
+ >
+ void find_max_factor_graph_nmplp (
+ const map_problem& prob,
+ std::vector<unsigned long>& map_assignment,
+ unsigned long max_iter,
+ double eps
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT( eps > 0,
+ "\t void find_max_factor_graph_nmplp()"
+ << "\n\t eps must be greater than zero"
+ << "\n\t eps: " << eps
+ );
+
+ /*
+ This function is an implementation of the NMPLP algorithm introduced in the
+ following papers:
+ Fixing Max-Product: Convergent Message Passing Algorithms for MAP LP-Relaxations (2008)
+ by Amir Globerson and Tommi Jaakkola
+
+ Introduction to dual decomposition for inference (2011)
+ by David Sontag, Amir Globerson, and Tommi Jaakkola
+
+ In particular, this function implements the star MPLP update equations shown as
+ equation 1.20 from the paper Introduction to dual decomposition for inference
+ (the method was called NMPLP in the first paper). It should also be noted that
+ the original description of the NMPLP in the first paper had an error in the
+ equations and the second paper contains corrected equations, which is what this
+ function uses.
+ */
+
+ typedef typename map_problem::node_iterator node_iterator;
+ typedef typename map_problem::neighbor_iterator neighbor_iterator;
+
+ map_assignment.resize(prob.number_of_nodes());
+
+
+ if (prob.number_of_nodes() == 0)
+ return;
+
+
+ std::vector<double> delta_elements;
+ delta_elements.reserve(prob.number_of_nodes()*prob.num_states(prob.begin())*3);
+
+ impl::simple_hash_map delta_idx;
+
+
+
+ // Initialize delta to zero and fill up the hash table with the appropriate values
+ // so we can index into delta later on.
+ for (node_iterator i = prob.begin(); i != prob.end(); ++i)
+ {
+ const unsigned long id_i = prob.node_id(i);
+
+ for (neighbor_iterator j = prob.begin(i); j != prob.end(i); ++j)
+ {
+ const unsigned long id_j = prob.node_id(j);
+ delta_idx.insert(id_i, id_j, delta_elements.size());
+
+ const unsigned long num_states_xj = prob.num_states(j);
+ for (unsigned long xj = 0; xj < num_states_xj; ++xj)
+ delta_elements.push_back(0);
+ }
+ }
+
+
+ std::vector<double> gamma_i;
+ std::vector<std::vector<double> > gamma_ji;
+ std::vector<std::vector<double> > delta_to_j_no_i;
+ // These arrays will end up with a length equal to the maximum number of neighbors
+ // of any node in the graph. So reserve a biggish number of slots so that we are
+ // very unlikely to need to perform an expensive reallocation during the
+ // optimization.
+ gamma_ji.reserve(10000);
+ delta_to_j_no_i.reserve(10000);
+
+
+ double max_change = eps + 1;
+ // Now do the main body of the optimization.
+ unsigned long iter;
+ for (iter = 0; iter < max_iter && max_change > eps; ++iter)
+ {
+ max_change = -std::numeric_limits<double>::infinity();
+
+ for (node_iterator i = prob.begin(); i != prob.end(); ++i)
+ {
+ const unsigned long id_i = prob.node_id(i);
+ const unsigned long num_states_xi = prob.num_states(i);
+ gamma_i.assign(num_states_xi, 0);
+
+ double num_neighbors = 0;
+
+ unsigned int jcnt = 0;
+ // first we fill in the gamma vectors
+ for (neighbor_iterator j = prob.begin(i); j != prob.end(i); ++j)
+ {
+ // Make sure these arrays are big enough to hold all the neighbor
+ // information.
+ if (jcnt >= gamma_ji.size())
+ {
+ gamma_ji.resize(gamma_ji.size()+1);
+ delta_to_j_no_i.resize(delta_to_j_no_i.size()+1);
+ }
+
+ ++num_neighbors;
+ const unsigned long id_j = prob.node_id(j);
+ const unsigned long num_states_xj = prob.num_states(j);
+
+ gamma_ji[jcnt].assign(num_states_xi, -std::numeric_limits<double>::infinity());
+ delta_to_j_no_i[jcnt].assign(num_states_xj, 0);
+
+ // compute delta_j^{-i} and store it in delta_to_j_no_i[jcnt]
+ for (neighbor_iterator k = prob.begin(j); k != prob.end(j); ++k)
+ {
+ const unsigned long id_k = prob.node_id(k);
+ if (id_k==id_i)
+ continue;
+ const double* const delta_kj = &delta_elements[delta_idx(id_k,id_j)];
+ for (unsigned long xj = 0; xj < num_states_xj; ++xj)
+ {
+ delta_to_j_no_i[jcnt][xj] += delta_kj[xj];
+ }
+ }
+
+ // now compute gamma values
+ for (unsigned long xi = 0; xi < num_states_xi; ++xi)
+ {
+ for (unsigned long xj = 0; xj < num_states_xj; ++xj)
+ {
+ gamma_ji[jcnt][xi] = std::max(gamma_ji[jcnt][xi], prob.factor_value(i,j,xi,xj) + delta_to_j_no_i[jcnt][xj]);
+ }
+ gamma_i[xi] += gamma_ji[jcnt][xi];
+ }
+ ++jcnt;
+ }
+
+ // now update the delta values
+ jcnt = 0;
+ for (neighbor_iterator j = prob.begin(i); j != prob.end(i); ++j)
+ {
+ const unsigned long id_j = prob.node_id(j);
+ const unsigned long num_states_xj = prob.num_states(j);
+
+ // messages from j to i
+ double* const delta_ji = &delta_elements[delta_idx(id_j,id_i)];
+
+ // messages from i to j
+ double* const delta_ij = &delta_elements[delta_idx(id_i,id_j)];
+
+ for (unsigned long xj = 0; xj < num_states_xj; ++xj)
+ {
+ double best_val = -std::numeric_limits<double>::infinity();
+
+ for (unsigned long xi = 0; xi < num_states_xi; ++xi)
+ {
+ double val = prob.factor_value(i,j,xi,xj) + 2/(num_neighbors+1)*gamma_i[xi] -gamma_ji[jcnt][xi];
+ if (val > best_val)
+ best_val = val;
+ }
+ best_val = -0.5*delta_to_j_no_i[jcnt][xj] + 0.5*best_val;
+
+ if (std::abs(delta_ij[xj] - best_val) > max_change)
+ max_change = std::abs(delta_ij[xj] - best_val);
+
+ delta_ij[xj] = best_val;
+ }
+
+ for (unsigned long xi = 0; xi < num_states_xi; ++xi)
+ {
+ double new_val = -1/(num_neighbors+1)*gamma_i[xi] + gamma_ji[jcnt][xi];
+ if (std::abs(delta_ji[xi] - new_val) > max_change)
+ max_change = std::abs(delta_ji[xi] - new_val);
+ delta_ji[xi] = new_val;
+ }
+ ++jcnt;
+ }
+ }
+ }
+
+
+ // now decode the "beliefs"
+ std::vector<double> b;
+ for (node_iterator i = prob.begin(); i != prob.end(); ++i)
+ {
+ const unsigned long id_i = prob.node_id(i);
+ b.assign(prob.num_states(i), 0);
+
+ for (neighbor_iterator k = prob.begin(i); k != prob.end(i); ++k)
+ {
+ const unsigned long id_k = prob.node_id(k);
+
+ for (unsigned long xi = 0; xi < b.size(); ++xi)
+ {
+ const double* const delta_ki = &delta_elements[delta_idx(id_k,id_i)];
+ b[xi] += delta_ki[xi];
+ }
+ }
+
+ map_assignment[id_i] = index_of_max(mat(b));
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_Hh_
+
diff --git a/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp_abstract.h b/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp_abstract.h
new file mode 100644
index 000000000..3dd9aead0
--- /dev/null
+++ b/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp_abstract.h
@@ -0,0 +1,365 @@
+// Copyright (C) 2011 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_ABSTRACT_Hh_
+#ifdef DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_ABSTRACT_Hh_
+
+#include <vector>
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class map_problem
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object represents a factor graph or graphical model. In
+ particular, this object defines the interface a MAP problem on
+ a factor graph must implement if it is to be solved using the
+ find_max_factor_graph_nmplp() routine defined at the bottom of this file.
+
+ Note that there is no dlib::map_problem object. What you are
+ looking at here is simply the interface definition for a map problem.
+ You must implement your own version of this object for the problem
+ you wish to solve and then pass it to the find_max_factor_graph_nmplp() routine.
+
+
+ Note also that a factor graph should not have any nodes which are
+ neighbors with themselves. Additionally, the graph is undirected. This
+ means that if A is a neighbor of B then B must be a neighbor of A for
+ the map problem to be valid.
+
+
+ Finally, note that the "neighbor" relationship between nodes means the
+ following: Two nodes are neighbors if and only if there is a potential
+ function (implemented by the factor_value() method) which operates on
+ the nodes.
+ !*/
+
+ public:
+
+ class node_iterator
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is a simple forward iterator for iterating over
+ the nodes/variables in this factor graph.
+
+ Note that you can't dereference the iterator and
+ obtain a value. That is, the iterator is opaque to
+ the user. It is used only as an argument to the other
+ methods defined in this interface.
+ !*/
+
+ public:
+ node_iterator(
+ );
+ /*!
+ ensures
+ - constructs an iterator in an undefined state
+ !*/
+
+ node_iterator(
+ const node_iterator& item
+ );
+ /*!
+ ensures
+ - #*this is a copy of item
+ !*/
+
+ node_iterator& operator= (
+ const node_iterator& item
+ );
+ /*!
+ ensures
+ - #*this is a copy of item
+ - returns #*this
+ !*/
+
+ bool operator== (
+ const node_iterator& item
+ ) const;
+ /*!
+ ensures
+ - returns true if *this and item both reference
+ the same node in the factor graph and false
+ otherwise.
+ !*/
+
+ bool operator!= (
+ const node_iterator& item
+ ) const;
+ /*!
+ ensures
+ - returns false if *this and item both reference
+ the same node in the factor graph and true
+ otherwise.
+ !*/
+
+ node_iterator& operator++(
+ );
+ /*!
+ ensures
+ - advances *this to the next node in the factor graph.
+ - returns a reference to the updated *this
+ (i.e. this is the ++object form of the increment operator)
+ !*/
+ };
+
+ class neighbor_iterator
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is a simple forward iterator for iterating over
+ the nodes/variables in this factor graph. This version
+ of the iterator is used for iterating over the neighbors
+ of another node in the graph.
+ !*/
+
+ public:
+ neighbor_iterator(
+ );
+ /*!
+ ensures
+ - constructs an iterator in an undefined state
+ !*/
+
+ neighbor_iterator(
+ const neighbor_iterator& item
+ );
+ /*!
+ ensures
+ - #*this is a copy of item
+ !*/
+
+ neighbor_iterator& operator= (
+ const neighbor_iterator& item
+ );
+ /*!
+ ensures
+ - #*this is a copy of item
+ - returns #*this
+ !*/
+
+ bool operator== (
+ const neighbor_iterator& item
+ ) const;
+ /*!
+ ensures
+ - returns true if *this and item both reference
+ the same node in the factor graph and false
+ otherwise.
+ !*/
+
+ bool operator!= (
+ const neighbor_iterator& item
+ ) const;
+ /*!
+ ensures
+ - returns false if *this and item both reference
+ the same node in the factor graph and true
+ otherwise.
+ !*/
+
+ neighbor_iterator& operator++(
+ );
+ /*!
+ ensures
+ - advances *this to the next node in the factor graph.
+ - returns a reference to the updated *this
+ (i.e. this is the ++object form of the increment operator)
+ !*/
+ };
+
+ unsigned long number_of_nodes (
+ ) const;
+ /*!
+ ensures
+ - returns the number of nodes in the factor graph. Or in other words,
+ returns the number of variables in the MAP problem.
+ !*/
+
+ node_iterator begin(
+ ) const;
+ /*!
+ ensures
+ - returns an iterator to the first node in the graph. If no such
+ node exists then returns end().
+ !*/
+
+ node_iterator end(
+ ) const;
+ /*!
+ ensures
+ - returns an iterator to one past the last node in the graph.
+ !*/
+
+ neighbor_iterator begin(
+ const node_iterator& it
+ ) const;
+ /*!
+ requires
+ - it == a valid iterator (i.e. it must be in the range [begin(), end()))
+ ensures
+ - returns an iterator to the first neighboring node of the node
+ referenced by it. If no such node exists then returns end(it).
+ !*/
+
+ neighbor_iterator begin(
+ const neighbor_iterator& it
+ ) const;
+ /*!
+ requires
+ - it == a valid iterator. (i.e. it must be in the range
+ [begin(i), end(i)) where i is some valid iterator. )
+ ensures
+ - returns an iterator to the first neighboring node of the node
+ referenced by it. If no such node exists then returns end(it).
+ !*/
+
+ neighbor_iterator end(
+ const node_iterator& it
+ ) const;
+ /*!
+ requires
+ - it == a valid iterator (i.e. it must be in the range [begin(), end()))
+ ensures
+ - returns an iterator to one past the last neighboring node of the node
+ referenced by it.
+ !*/
+
+ neighbor_iterator end(
+ const neighbor_iterator& it
+ ) const;
+ /*!
+ requires
+ - it == a valid iterator. (i.e. it must be in the range
+ [begin(i), end(i)) where i is some valid iterator. )
+ ensures
+ - returns an iterator to one past the last neighboring node of the node
+ referenced by it.
+ !*/
+
+ unsigned long node_id (
+ const node_iterator& it
+ ) const;
+ /*!
+ requires
+ - it == a valid iterator (i.e. it must be in the range [begin(), end()))
+ ensures
+ - returns a number ID such that:
+ - 0 <= ID < number_of_nodes()
+ - ID == a number which uniquely identifies the node pointed to by it.
+ !*/
+
+ unsigned long node_id (
+ const neighbor_iterator& it
+ ) const;
+ /*!
+ requires
+ - it == a valid iterator. (i.e. it must be in the range
+ [begin(i), end(i)) where i is some valid iterator. )
+ ensures
+ - returns a number ID such that:
+ - 0 <= ID < number_of_nodes()
+ - ID == a number which uniquely identifies the node pointed to by it.
+ !*/
+
+ unsigned long num_states (
+ const node_iterator& it
+ ) const;
+ /*!
+ requires
+ - it == a valid iterator (i.e. it must be in the range [begin(), end()))
+ ensures
+ - returns the number of states attainable by the node/variable referenced by it.
+ !*/
+
+ unsigned long num_states (
+ const neighbor_iterator& it
+ ) const;
+ /*!
+ requires
+ - it == a valid iterator. (i.e. it must be in the range
+ [begin(i), end(i)) where i is some valid iterator. )
+ ensures
+ - returns the number of states attainable by the node/variable referenced by it.
+ !*/
+
+ // The next four functions all have the same contract.
+ double factor_value (const node_iterator& it1, const node_iterator& it2, unsigned long s1, unsigned long s2) const;
+ double factor_value (const neighbor_iterator& it1, const node_iterator& it2, unsigned long s1, unsigned long s2) const;
+ double factor_value (const node_iterator& it1, const neighbor_iterator& it2, unsigned long s1, unsigned long s2) const;
+ double factor_value (const neighbor_iterator& it1, const neighbor_iterator& it2, unsigned long s1, unsigned long s2) const;
+ /*!
+ requires
+ - it1 == a valid iterator
+ - it2 == a valid iterator
+ - 0 <= s1 < num_states(it1)
+ - 0 <= s2 < num_states(it2)
+ - it1 and it2 reference nodes which are neighbors in the factor graph
+ ensures
+ - returns the value of the factor/potential function for the given pair of
+ nodes, defined by it1 and it2, for the case where they take on the values
+ s1 and s2 respectively.
+ !*/
+
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename map_problem
+ >
+ void find_max_factor_graph_nmplp (
+ const map_problem& prob,
+ std::vector<unsigned long>& map_assignment,
+ unsigned long max_iter,
+ double eps
+ );
+ /*!
+ requires
+ - for all valid i: prob.num_states(i) >= 2
+ - map_problem == an object with an interface compatible with the map_problem
+ object defined at the top of this file.
+ - eps > 0
+ ensures
+ - This function is a tool for approximately solving the given MAP problem in a graphical
+ model or factor graph with pairwise potential functions. That is, it attempts
+ to solve a certain kind of optimization problem which can be defined as follows:
+ maximize: f(X)
+ where X is a set of integer valued variables and f(X) can be written as the
+ sum of functions which each involve only two variables from X. In reference
+ to the prob object, the nodes in prob represent the variables in X and the
+ functions which are summed are represented by prob.factor_value().
+ - #map_assignment == the result of the optimization.
+ - #map_assignment.size() == prob.number_of_nodes()
+ - for all valid i:
+ - #map_assignment[prob.node_id(i)] < prob.num_states(i)
+ - #map_assignment[prob.node_id(i)] == The approximate MAP assignment for node/variable i.
+ - eps controls the stopping condition; smaller values of eps lead to more accurate
+ solutions of the relaxed linear program but may take more iterations. Note that
+ the algorithm will never execute more than max_iter iterations regardless of
+ the setting of eps.
+ - If the graph is tree-structured then this routine always gives the exact solution
+ to the MAP problem. However, for graphs with cycles, the solution may be approximate.
+
+
+ - This function is an implementation of the NMPLP algorithm introduced in the
+ following papers:
+ Fixing Max-Product: Convergent Message Passing Algorithms for MAP LP-Relaxations (2008)
+ by Amir Globerson and Tommi Jaakkola
+
+ Introduction to dual decomposition for inference (2011)
+ by David Sontag, Amir Globerson, and Tommi Jaakkola
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_ABSTRACT_Hh_
+
+
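Since dlib ships no map_problem class, the caller must supply one. The following sketch, which is not part of the dlib sources, implements the interface above for a tiny 3-node chain (0-1-2) with two states per node; every name in it (toy_map_problem, neighbor_id, num_neighbors) is our own invention.

#include <iostream>
#include <vector>
#include <dlib/optimization/find_max_factor_graph_nmplp.h>

struct toy_map_problem
{
    // Both iterator types simply wrap indices; the solver never dereferences them.
    struct node_iterator
    {
        unsigned long idx;
        bool operator==(const node_iterator& rhs) const { return idx == rhs.idx; }
        bool operator!=(const node_iterator& rhs) const { return idx != rhs.idx; }
        node_iterator& operator++() { ++idx; return *this; }
    };
    struct neighbor_iterator
    {
        unsigned long node;  // the node whose neighbor list we are walking
        unsigned long pos;   // position within that list
        bool operator==(const neighbor_iterator& rhs) const { return node == rhs.node && pos == rhs.pos; }
        bool operator!=(const neighbor_iterator& rhs) const { return !(*this == rhs); }
        neighbor_iterator& operator++() { ++pos; return *this; }
    };

    unsigned long number_of_nodes() const { return 3; }
    node_iterator begin() const { return node_iterator{0}; }
    node_iterator end()   const { return node_iterator{3}; }

    // Symmetric neighbor lists: 0-{1}, 1-{0,2}, 2-{1}.
    unsigned long num_neighbors(unsigned long n) const { return n == 1 ? 2 : 1; }
    unsigned long neighbor_id(unsigned long n, unsigned long pos) const
    {
        const unsigned long nbrs[3][2] = {{1,0},{0,2},{1,0}};
        return nbrs[n][pos];
    }

    neighbor_iterator begin(const node_iterator& it) const { return neighbor_iterator{it.idx, 0}; }
    neighbor_iterator begin(const neighbor_iterator& it) const { return neighbor_iterator{node_id(it), 0}; }
    neighbor_iterator end(const node_iterator& it) const { return neighbor_iterator{it.idx, num_neighbors(it.idx)}; }
    neighbor_iterator end(const neighbor_iterator& it) const { const unsigned long n = node_id(it); return neighbor_iterator{n, num_neighbors(n)}; }

    unsigned long node_id(const node_iterator& it) const { return it.idx; }
    unsigned long node_id(const neighbor_iterator& it) const { return neighbor_id(it.node, it.pos); }

    unsigned long num_states(const node_iterator&) const { return 2; }
    unsigned long num_states(const neighbor_iterator&) const { return 2; }

    // Pairwise potential: +1 when neighboring nodes agree, -1 otherwise.
    template <typename IT1, typename IT2>
    double factor_value(const IT1&, const IT2&, unsigned long s1, unsigned long s2) const
    {
        return s1 == s2 ? 1.0 : -1.0;
    }
};

int main()
{
    toy_map_problem prob;
    std::vector<unsigned long> assignment;
    dlib::find_max_factor_graph_nmplp(prob, assignment, 1000, 1e-8);
    for (unsigned long s : assignment)  // any constant assignment maximizes this toy objective
        std::cout << s << ' ';
    std::cout << '\n';
    return 0;
}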
diff --git a/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi.h b/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi.h
new file mode 100644
index 000000000..f7cbcb8d9
--- /dev/null
+++ b/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi.h
@@ -0,0 +1,232 @@
+// Copyright (C) 2011 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_Hh_
+#define DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_Hh_
+
+#include "find_max_factor_graph_viterbi_abstract.h"
+#include <vector>
+#include "../matrix.h"
+#include "../array2d.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ struct viterbi_data
+ {
+ viterbi_data() :val(-std::numeric_limits<double>::infinity()), back_index(0) {}
+ double val;
+ unsigned long back_index;
+ };
+
+ template <long NC>
+ inline bool advance_state(
+ matrix<unsigned long,1,NC>& node_states,
+ unsigned long num_states
+ )
+ /*!
+ ensures
+ - advances node_states to the next state by adding 1
+ to node_states(node_states.size()-1) and carrying any
+ rollover (modulo num_states). Stores the result into #node_states.
+ - if (#node_states is all zeros) then
+ - returns false
+ - else
+ - returns true
+ !*/
+ {
+ for (long i = node_states.size()-1; i >= 0; --i)
+ {
+ node_states(i) += 1;
+ if (node_states(i) < num_states)
+ return true;
+
+ node_states(i) = 0;
+ }
+ return false;
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename map_problem
+ >
+ void find_max_factor_graph_viterbi (
+ const map_problem& prob,
+ std::vector<unsigned long>& map_assignment
+ )
+ {
+ using namespace dlib::impl;
+ const unsigned long order = prob.order();
+ const unsigned long num_states = prob.num_states();
+
+
+ DLIB_ASSERT(prob.num_states() > 0,
+ "\t void find_max_factor_graph_viterbi()"
+ << "\n\t The nodes in a factor graph have to be able to take on more than 0 states."
+ );
+ DLIB_ASSERT(std::pow(num_states,(double)order) < std::numeric_limits<unsigned long>::max(),
+ "\t void find_max_factor_graph_viterbi()"
+ << "\n\t The order is way too large for this algorithm to handle."
+ << "\n\t order: " << order
+ << "\n\t num_states: " << num_states
+ << "\n\t std::pow(num_states,order): " << std::pow(num_states,(double)order)
+ << "\n\t std::numeric_limits<unsigned long>::max(): " << std::numeric_limits<unsigned long>::max()
+ );
+
+ if (prob.number_of_nodes() == 0)
+ {
+ map_assignment.clear();
+ return;
+ }
+
+ if (order == 0)
+ {
+ map_assignment.resize(prob.number_of_nodes());
+ for (unsigned long i = 0; i < map_assignment.size(); ++i)
+ {
+ matrix<unsigned long,1,1> node_state;
+ unsigned long best_state = 0;
+ double best_val = -std::numeric_limits<double>::infinity();
+ for (unsigned long s = 0; s < num_states; ++s)
+ {
+ node_state(0) = s;
+ const double temp = prob.factor_value(i,node_state);
+ if (temp > best_val)
+ {
+ best_val = temp;
+ best_state = s;
+ }
+ }
+ map_assignment[i] = best_state;
+ }
+ return;
+ }
+
+
+ const unsigned long trellis_size = static_cast<unsigned long>(std::pow(num_states,(double)order));
+ unsigned long init_ring_size = 1;
+
+ array2d<impl::viterbi_data> trellis;
+ trellis.set_size(prob.number_of_nodes(), trellis_size);
+
+
+ for (unsigned long node = 0; node < prob.number_of_nodes(); ++node)
+ {
+
+ if (node < order)
+ {
+ matrix<unsigned long,1,0> node_states;
+ node_states.set_size(std::min<int>(node, order) + 1);
+ node_states = 0;
+
+ unsigned long idx = 0;
+ if (node == 0)
+ {
+ do
+ {
+ trellis[node][idx].val = prob.factor_value(node,node_states);
+ ++idx;
+ } while(advance_state(node_states,num_states));
+ }
+ else
+ {
+ init_ring_size *= num_states;
+ do
+ {
+ const unsigned long back_index = idx%init_ring_size;
+ trellis[node][idx].val = prob.factor_value(node,node_states) + trellis[node-1][back_index].val;
+ trellis[node][idx].back_index = back_index;
+ ++idx;
+ } while(advance_state(node_states,num_states));
+
+ }
+ }
+ else if (order == 1)
+ {
+ /*
+ WHAT'S THE DEAL WITH THIS PREPROCESSOR MACRO?
+ Well, if we can declare the dimensions of node_states as a compile
+ time constant then this function runs significantly faster. So this macro
+ is here to let us do that. It just lets us avoid replicating this code
+ block in the following if statements for different order sizes.
+ */
+#define DLIB_FMFGV_WORK \
+ node_states = 0; \
+ unsigned long count = 0; \
+ for (unsigned long i = 0; i < trellis_size; ++i) \
+ { \
+ unsigned long back_index = 0; \
+ double best_score = -std::numeric_limits<double>::infinity(); \
+ for (unsigned long s = 0; s < num_states; ++s) \
+ { \
+ const double temp = prob.factor_value(node,node_states) + trellis[node-1][count%trellis_size].val; \
+ if (temp > best_score) \
+ { \
+ best_score = temp; \
+ back_index = count%trellis_size; \
+ } \
+ advance_state(node_states,num_states); \
+ ++count; \
+ } \
+ trellis[node][i].val = best_score; \
+ trellis[node][i].back_index = back_index; \
+ }
+
+ matrix<unsigned long,1,2> node_states;
+ DLIB_FMFGV_WORK
+ }
+ else if (order == 2)
+ {
+ matrix<unsigned long,1,3> node_states;
+ DLIB_FMFGV_WORK
+ }
+ else if (order == 3)
+ {
+ matrix<unsigned long,1,4> node_states;
+ DLIB_FMFGV_WORK
+ }
+ else
+ {
+ // The general case; here we don't define the size of node_states at compile time.
+ matrix<unsigned long,1,0> node_states(order+1);
+ DLIB_FMFGV_WORK
+ }
+ }
+
+
+ map_assignment.resize(prob.number_of_nodes());
+ // Figure out which state of the last node has the biggest value.
+ unsigned long back_index = 0;
+ double best_val = -std::numeric_limits<double>::infinity();
+ for (long i = 0; i < trellis.nc(); ++i)
+ {
+ if (trellis[trellis.nr()-1][i].val > best_val)
+ {
+ best_val = trellis[trellis.nr()-1][i].val;
+ back_index = i;
+ }
+ }
+ // Follow the back links to find the decoding.
+ for (long node = map_assignment.size()-1; node >= 0; --node)
+ {
+ map_assignment[node] = back_index/init_ring_size;
+ back_index = trellis[node][back_index].back_index;
+ if (node < (long)order)
+ init_ring_size /= num_states;
+ }
+
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi_abstract.h b/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi_abstract.h
new file mode 100644
index 000000000..c19e4c7eb
--- /dev/null
+++ b/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi_abstract.h
@@ -0,0 +1,131 @@
+// Copyright (C) 2011 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_ABSTRACT_Hh_
+#ifdef DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_ABSTRACT_Hh_
+
+#include <vector>
+#include "../matrix.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class map_problem
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object represents a chain-structured factor graph or graphical
+ model. In particular, this object defines the interface a MAP problem
+ on a factor graph must implement if it is to be solved using the
+ find_max_factor_graph_viterbi() routine defined at the bottom of this file.
+
+ Note that there is no dlib::map_problem object. What you are looking
+ at here is simply the interface definition for a map problem. You must
+ implement your own version of this object for the problem you wish to
+ solve and then pass it to the find_max_factor_graph_viterbi() routine.
+ !*/
+
+ public:
+
+ unsigned long order (
+ ) const;
+ /*!
+ ensures
+ - returns the order of this model. The order has the following interpretation:
+ This model can represent a high order Markov chain. If order()==1 then map_problem
+ represents a basic chain-structured graph where nodes only depend on their immediate
+ neighbors. However, high order Markov models can also be used by setting order() > 1.
+ !*/
+
+ unsigned long num_states (
+ ) const;
+ /*!
+ ensures
+ - returns the number of states attainable by each variable/node in the graph.
+ !*/
+
+ unsigned long number_of_nodes (
+ ) const;
+ /*!
+ ensures
+ - returns the number of nodes in the factor graph. Or in other words,
+ returns the number of variables in the MAP problem.
+ !*/
+
+ template <
+ typename EXP
+ >
+ double factor_value (
+ unsigned long node_id,
+ const matrix_exp<EXP>& node_states
+ ) const;
+ /*!
+ requires
+ - EXP::type == unsigned long
+ (i.e. node_states contains unsigned longs)
+ - node_id < number_of_nodes()
+ - node_states.size() == min(node_id, order()) + 1
+ - is_vector(node_states) == true
+ - max(node_states) < num_states()
+ ensures
+ - In a chain-structured graph, each node has a potential function associated with
+ it. The potential function operates on the variable given by the node as well
+ as the order() previous variables. Therefore, factor_value() returns the value
+ of the factor/potential function associated with node node_id where the following
+ nodes take on the values defined below:
+ - node_states(0) == the value of the node with ID node_id
+ - node_states(i) == the value of the node with ID node_id-i
+ - It is ok for this function to return a value of -std::numeric_limits<double>::infinity().
+ !*/
+
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename map_problem
+ >
+ void find_max_factor_graph_viterbi (
+ const map_problem& prob,
+ std::vector<unsigned long>& map_assignment
+ );
+ /*!
+ requires
+ - prob.num_states() > 0
+ - std::pow(prob.num_states(), prob.order()) < std::numeric_limits<unsigned long>::max()
+ (i.e. The Viterbi algorithm is exponential in the order of the map problem. So don't
+ make order too large.)
+ - map_problem == an object with an interface compatible with the map_problem
+ object defined at the top of this file.
+ ensures
+ - This function is a tool for exactly solving the MAP problem in a chain-structured
+ graphical model or factor graph. That is, it attempts to solve a certain kind of
+ optimization problem which can be defined as follows:
+ - Let X denote a set of prob.number_of_nodes() integer valued variables, each taking
+ a value in the range [0, prob.num_states()).
+ - Let X(i) = the ith variable in X.
+ - Let F(i) = factor_value_i(X(i), X(i-1), ..., X(i-prob.order()))
+ (This is the value returned by prob.factor_value(i, node_states). Note that
+ each factor's value function operates on at most prob.order()+1 variables.
+ Moreover, the variables are adjacent and hence the graph is "chain-structured".)
+
+ Then this function finds the assignments to the X variables which
+ maximizes: sum over all valid i: F(i)
+
+ - #map_assignment == the result of the optimization.
+ - #map_assignment.size() == prob.number_of_nodes()
+ - for all valid i:
+ - #map_assignment[i] < prob.num_states()
+ - #map_assignment[i] == The MAP assignment for node/variable i.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_ABSTRACT_Hh_
+
+
+
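A hypothetical chain model illustrating the interface above: 4 nodes, 2 states, order 1, with potentials that reward changing state between adjacent nodes. The class name and the scores are invented for this sketch; only the interface comes from the abstract header.

#include <iostream>
#include <vector>
#include <dlib/optimization/find_max_factor_graph_viterbi.h>

struct toy_chain_problem
{
    unsigned long order() const { return 1; }
    unsigned long num_states() const { return 2; }
    unsigned long number_of_nodes() const { return 4; }

    template <typename EXP>
    double factor_value(unsigned long node_id, const dlib::matrix_exp<EXP>& node_states) const
    {
        // node_states(0) is this node's state; node_states(1), present when
        // node_id > 0, is the previous node's state.
        if (node_id == 0)
            return node_states(0) == 1 ? 0.5 : 0.0;  // mild preference for starting in state 1
        return node_states(0) != node_states(1) ? 1.0 : 0.0;
    }
};

int main()
{
    toy_chain_problem prob;
    std::vector<unsigned long> assignment;
    dlib::find_max_factor_graph_viterbi(prob, assignment);
    for (unsigned long s : assignment)
        std::cout << s << ' ';   // prints: 1 0 1 0
    std::cout << '\n';
    return 0;
}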
diff --git a/ml/dlib/dlib/optimization/find_max_parse_cky.h b/ml/dlib/dlib/optimization/find_max_parse_cky.h
new file mode 100644
index 000000000..79614792a
--- /dev/null
+++ b/ml/dlib/dlib/optimization/find_max_parse_cky.h
@@ -0,0 +1,414 @@
+// Copyright (C) 2012 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_FIND_MAX_PaRSE_CKY_Hh_
+#define DLIB_FIND_MAX_PaRSE_CKY_Hh_
+
+#include "find_max_parse_cky_abstract.h"
+#include <vector>
+#include <string>
+#include <sstream>
+#include "../serialize.h"
+#include "../array2d.h"
+
+namespace dlib
+{
+
+// -----------------------------------------------------------------------------------------
+
+ template <typename T>
+ struct constituent
+ {
+ unsigned long begin, end, k;
+ T left_tag;
+ T right_tag;
+ };
+
+ template <typename T>
+ void serialize(
+ const constituent<T>& item,
+ std::ostream& out
+ )
+ {
+ serialize(item.begin, out);
+ serialize(item.end, out);
+ serialize(item.k, out);
+ serialize(item.left_tag, out);
+ serialize(item.right_tag, out);
+ }
+
+ template <typename T>
+ void deserialize(
+ constituent<T>& item,
+ std::istream& in
+ )
+ {
+ deserialize(item.begin, in);
+ deserialize(item.end, in);
+ deserialize(item.k, in);
+ deserialize(item.left_tag, in);
+ deserialize(item.right_tag, in);
+ }
+
+// -----------------------------------------------------------------------------------------
+
+ const unsigned long END_OF_TREE = 0xFFFFFFFF;
+
+// -----------------------------------------------------------------------------------------
+
+ template <typename T>
+ struct parse_tree_element
+ {
+ constituent<T> c;
+ T tag; // id for the constituent corresponding to this level of the tree
+
+ unsigned long left;
+ unsigned long right;
+ double score;
+ };
+
+ template <typename T>
+ void serialize (
+ const parse_tree_element<T>& item,
+ std::ostream& out
+ )
+ {
+ serialize(item.c, out);
+ serialize(item.tag, out);
+ serialize(item.left, out);
+ serialize(item.right, out);
+ serialize(item.score, out);
+ }
+
+ template <typename T>
+ void deserialize (
+ parse_tree_element<T>& item,
+ std::istream& in
+ )
+ {
+ deserialize(item.c, in);
+ deserialize(item.tag, in);
+ deserialize(item.left, in);
+ deserialize(item.right, in);
+ deserialize(item.score, in);
+ }
+
+// -----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ template <typename T>
+ unsigned long fill_parse_tree(
+ std::vector<parse_tree_element<T> >& parse_tree,
+ const T& tag,
+ const array2d<std::map<T, parse_tree_element<T> > >& back,
+ long r, long c
+ )
+ /*!
+ requires
+ - back[r][c].size() == 0 || back[r][c].count(tag) != 0
+ !*/
+ {
+ // base case of the recursion
+ if (back[r][c].size() == 0)
+ {
+ return END_OF_TREE;
+ }
+
+ const unsigned long idx = parse_tree.size();
+ const parse_tree_element<T>& item = back[r][c].find(tag)->second;
+ parse_tree.push_back(item);
+
+ const long k = item.c.k;
+ const unsigned long idx_left = fill_parse_tree(parse_tree, item.c.left_tag, back, r, k-1);
+ const unsigned long idx_right = fill_parse_tree(parse_tree, item.c.right_tag, back, k, c);
+ parse_tree[idx].left = idx_left;
+ parse_tree[idx].right = idx_right;
+ return idx;
+ }
+ }
+
+ template <typename T, typename production_rule_function>
+ void find_max_parse_cky (
+ const std::vector<T>& sequence,
+ const production_rule_function& production_rules,
+ std::vector<parse_tree_element<T> >& parse_tree
+ )
+ {
+ parse_tree.clear();
+ if (sequence.size() == 0)
+ return;
+
+ array2d<std::map<T,double> > table(sequence.size(), sequence.size());
+ array2d<std::map<T,parse_tree_element<T> > > back(sequence.size(), sequence.size());
+ typedef typename std::map<T,double>::iterator itr;
+ typedef typename std::map<T,parse_tree_element<T> >::iterator itr_b;
+
+ for (long r = 0; r < table.nr(); ++r)
+ table[r][r][sequence[r]] = 0;
+
+ std::vector<std::pair<T,double> > possible_tags;
+
+ for (long r = table.nr()-2; r >= 0; --r)
+ {
+ for (long c = r+1; c < table.nc(); ++c)
+ {
+ for (long k = r; k < c; ++k)
+ {
+ for (itr i = table[k+1][c].begin(); i != table[k+1][c].end(); ++i)
+ {
+ for (itr j = table[r][k].begin(); j != table[r][k].end(); ++j)
+ {
+ constituent<T> con;
+ con.begin = r;
+ con.end = c+1;
+ con.k = k+1;
+ con.left_tag = j->first;
+ con.right_tag = i->first;
+ possible_tags.clear();
+ production_rules(sequence, con, possible_tags);
+ for (unsigned long m = 0; m < possible_tags.size(); ++m)
+ {
+ const double score = possible_tags[m].second + i->second + j->second;
+ itr match = table[r][c].find(possible_tags[m].first);
+ if (match == table[r][c].end() || score > match->second)
+ {
+ table[r][c][possible_tags[m].first] = score;
+ parse_tree_element<T> item;
+ item.c = con;
+ item.score = score;
+ item.tag = possible_tags[m].first;
+ item.left = END_OF_TREE;
+ item.right = END_OF_TREE;
+ back[r][c][possible_tags[m].first] = item;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+
+ // now use back pointers to build the parse trees
+ const long r = 0;
+ const long c = back.nc()-1;
+ if (back[r][c].size() != 0)
+ {
+
+ // find the max scoring element in back[r][c]
+ itr_b max_i = back[r][c].begin();
+ itr_b i = max_i;
+ ++i;
+ for (; i != back[r][c].end(); ++i)
+ {
+ if (i->second.score > max_i->second.score)
+ max_i = i;
+ }
+
+ parse_tree.reserve(c);
+ impl::fill_parse_tree(parse_tree, max_i->second.tag, back, r, c);
+ }
+ }
+
+// -----------------------------------------------------------------------------------------
+
+ class parse_tree_to_string_error : public error
+ {
+ public:
+ parse_tree_to_string_error(const std::string& str): error(str) {}
+ };
+
+ namespace impl
+ {
+ template <bool enabled, typename T>
+ typename enable_if_c<enabled>::type conditional_print(
+ const T& item,
+ std::ostream& out
+ ) { out << item << " "; }
+
+ template <bool enabled, typename T>
+ typename disable_if_c<enabled>::type conditional_print(
+ const T& ,
+ std::ostream&
+ ) { }
+
+ template <bool print_tag, bool skip_tag, typename T, typename U >
+ void print_parse_tree_helper (
+ const std::vector<parse_tree_element<T> >& tree,
+ const std::vector<U>& words,
+ unsigned long i,
+ const T& tag_to_skip,
+ std::ostream& out
+ )
+ {
+ if (!skip_tag || tree[i].tag != tag_to_skip)
+ out << "[";
+
+ bool left_recurse = false;
+
+ // Only print if we are supposed to. Doing it this funny way avoids compiler
+ // errors in parse_tree_to_string() for the case where tag isn't
+ // printable.
+ if (!skip_tag || tree[i].tag != tag_to_skip)
+ conditional_print<print_tag>(tree[i].tag, out);
+
+ if (tree[i].left < tree.size())
+ {
+ left_recurse = true;
+ print_parse_tree_helper<print_tag,skip_tag>(tree, words, tree[i].left, tag_to_skip, out);
+ }
+ else
+ {
+ if ((tree[i].c.begin) < words.size())
+ {
+ out << words[tree[i].c.begin] << " ";
+ }
+ else
+ {
+ std::ostringstream sout;
+ sout << "Parse tree refers to element " << tree[i].c.begin
+ << " of sequence which is only of size " << words.size() << ".";
+ throw parse_tree_to_string_error(sout.str());
+ }
+ }
+
+ if (left_recurse == true)
+ out << " ";
+
+ if (tree[i].right < tree.size())
+ {
+ print_parse_tree_helper<print_tag,skip_tag>(tree, words, tree[i].right, tag_to_skip, out);
+ }
+ else
+ {
+ if (tree[i].c.k < words.size())
+ {
+ out << words[tree[i].c.k];
+ }
+ else
+ {
+ std::ostringstream sout;
+ sout << "Parse tree refers to element " << tree[i].c.k
+ << " of sequence which is only of size " << words.size() << ".";
+ throw parse_tree_to_string_error(sout.str());
+ }
+ }
+
+
+ if (!skip_tag || tree[i].tag != tag_to_skip)
+ out << "]";
+ }
+ }
+
+// -----------------------------------------------------------------------------------------
+
+ template <typename T, typename U>
+ std::string parse_tree_to_string (
+ const std::vector<parse_tree_element<T> >& tree,
+ const std::vector<U>& words,
+ const unsigned long root_idx = 0
+ )
+ {
+ if (root_idx >= tree.size())
+ return "";
+
+ std::ostringstream sout;
+ impl::print_parse_tree_helper<false,false>(tree, words, root_idx, tree[root_idx].tag, sout);
+ return sout.str();
+ }
+
+// -----------------------------------------------------------------------------------------
+
+ template <typename T, typename U>
+ std::string parse_tree_to_string_tagged (
+ const std::vector<parse_tree_element<T> >& tree,
+ const std::vector<U>& words,
+ const unsigned long root_idx = 0
+ )
+ {
+ if (root_idx >= tree.size())
+ return "";
+
+ std::ostringstream sout;
+ impl::print_parse_tree_helper<true,false>(tree, words, root_idx, tree[root_idx].tag, sout);
+ return sout.str();
+ }
+
+// -----------------------------------------------------------------------------------------
+
+ template <typename T, typename U>
+ std::string parse_trees_to_string (
+ const std::vector<parse_tree_element<T> >& tree,
+ const std::vector<U>& words,
+ const T& tag_to_skip
+ )
+ {
+ if (tree.size() == 0)
+ return "";
+
+ std::ostringstream sout;
+ impl::print_parse_tree_helper<false,true>(tree, words, 0, tag_to_skip, sout);
+ return sout.str();
+ }
+
+// -----------------------------------------------------------------------------------------
+
+ template <typename T, typename U>
+ std::string parse_trees_to_string_tagged (
+ const std::vector<parse_tree_element<T> >& tree,
+ const std::vector<U>& words,
+ const T& tag_to_skip
+ )
+ {
+ if (tree.size() == 0)
+ return "";
+
+ std::ostringstream sout;
+ impl::print_parse_tree_helper<true,true>(tree, words, 0, tag_to_skip, sout);
+ return sout.str();
+ }
+
+// -----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ template <typename T>
+ void helper_find_trees_without_tag (
+ const std::vector<parse_tree_element<T> >& tree,
+ const T& tag,
+ std::vector<unsigned long>& tree_roots,
+ unsigned long idx
+ )
+ {
+ if (idx < tree.size())
+ {
+ if (tree[idx].tag != tag)
+ {
+ tree_roots.push_back(idx);
+ }
+ else
+ {
+ helper_find_trees_without_tag(tree, tag, tree_roots, tree[idx].left);
+ helper_find_trees_without_tag(tree, tag, tree_roots, tree[idx].right);
+ }
+ }
+ }
+ }
+
+ template <typename T>
+ void find_trees_not_rooted_with_tag (
+ const std::vector<parse_tree_element<T> >& tree,
+ const T& tag,
+ std::vector<unsigned long>& tree_roots
+ )
+ {
+ tree_roots.clear();
+ impl::helper_find_trees_without_tag(tree, tag, tree_roots, 0);
+ }
+
+// -----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_FIND_MAX_PaRSE_CKY_Hh_
+
diff --git a/ml/dlib/dlib/optimization/find_max_parse_cky_abstract.h b/ml/dlib/dlib/optimization/find_max_parse_cky_abstract.h
new file mode 100644
index 000000000..52ffc787b
--- /dev/null
+++ b/ml/dlib/dlib/optimization/find_max_parse_cky_abstract.h
@@ -0,0 +1,388 @@
+// Copyright (C) 2012 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_
+#ifdef DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_
+
+#include <vector>
+#include <string>
+#include "../algs.h"
+#include "../serialize.h"
+
+namespace dlib
+{
+
+// -----------------------------------------------------------------------------------------
+
+ template <
+ typename T
+ >
+ struct constituent
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object represents the linguistic idea of a constituent, that is, a
+ group of words that functions as a single unit. In particular, it
+ represents a combination of two constituents into a new constituent.
+
+ Additionally, a constituent object represents a range of words relative to
+ some std::vector of words. The range is from [begin, end) (i.e. including
+ begin but not including end, so using the normal C++ iterator notation).
+ Moreover, a constituent is always composed of two parts, each having a tag.
+ Therefore, the left part is composed of the words in the range [begin,k)
+ and has tag left_tag while the right part of the constituent contains the
+ words in the range [k,end) and has the tag right_tag.
+
+ The tags are user defined objects of type T. In general, they are used to
+ represent syntactic categories such as noun phrase, verb phrase, etc.
+ !*/
+
+ unsigned long begin, end, k;
+ T left_tag;
+ T right_tag;
+ };
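To make these fields concrete, here is a minimal sketch (not part of the patch; the tag type std::string and all values are illustrative):

    #include <string>
    #include <vector>
    #include <dlib/optimization.h>

    // A constituent covering words [1,4) of the sentence below, split at
    // k == 2: the left part is words [1,2) == {"big"} and the right part
    // is words [2,4) == {"red", "dog"}.
    std::vector<std::string> words = {"the", "big", "red", "dog"};

    dlib::constituent<std::string> c;
    c.begin = 1;  c.k = 2;  c.end = 4;
    c.left_tag  = "ADJ";
    c.right_tag = "NP";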
+
+ template <
+ typename T
+ >
+ void serialize(
+ const constituent<T>& item,
+ std::ostream& out
+ );
+ /*!
+ provides serialization support
+ !*/
+
+ template <
+ typename T
+ >
+ void deserialize(
+ constituent<T>& item,
+ std::istream& in
+ );
+ /*!
+ provides deserialization support
+ !*/
+
+// -----------------------------------------------------------------------------------------
+
+ /*!A END_OF_TREE is used to indicate that parse_tree_element::left or
+ parse_tree_element::right doesn't point to another subtree.
+ !*/
+ const unsigned long END_OF_TREE = 0xFFFFFFFF;
+
+// -----------------------------------------------------------------------------------------
+
+ template <
+ typename T
+ >
+ struct parse_tree_element
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object is used to represent a node in a binary parse tree. An entire
+ parse tree is represented by a std::vector of parse_tree_element objects.
+ We follow the convention that the first element of this vector is always
+ the root of the entire tree.
+
+ The fields of this object have the following interpretations:
+ - c == the constituent spanned by this node in the parse tree.
+ Therefore, the node spans the words in the range [c.begin, c.end).
+ - tag == the syntactic category of this node in the parse tree.
+ - score == the score or log likelihood for this parse tree. In
+ general, this is the sum of scores of all the production rules used
+ to build the tree rooted at the current node.
+ - let PT denote the vector of parse_tree_elements that defines an
+ entire parse tree. Then we have:
+ - if (left != END_OF_TREE) then
+ - PT[left] == the left sub-tree of the current node.
+ - PT[left] spans the words [c.begin, c.k)
+ - PT[left].tag == c.left_tag
+ - else
+ - there is no left sub-tree
+
+ - if (right != END_OF_TREE) then
+ - PT[right] == the right sub-tree of the current node.
+ - PT[right] spans the words [c.k, c.end)
+ - PT[right].tag == c.right_tag
+ - else
+ - there is no right sub-tree
+ !*/
+
+ constituent<T> c;
+ T tag;
+ double score;
+
+ unsigned long left;
+ unsigned long right;
+ };
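As a sketch of this representation (same assumptions as the constituent example above), the bracketing [[the dog] ran] over {"the", "dog", "ran"} can be written out by hand:

    std::vector<dlib::parse_tree_element<std::string>> tree(2);

    // Root: spans words [0,3), split at k == 2.
    tree[0].c.begin = 0;  tree[0].c.k = 2;  tree[0].c.end = 3;
    tree[0].tag   = "S";
    tree[0].score = 0;
    tree[0].left  = 1;                   // index of the sub-tree over [0,2)
    tree[0].right = dlib::END_OF_TREE;   // "ran" is a bare word, not a sub-tree

    // Left sub-tree: spans words [0,2), split at k == 1.
    tree[1].c.begin = 0;  tree[1].c.k = 1;  tree[1].c.end = 2;
    tree[1].tag   = "NP";
    tree[1].score = 0;
    tree[1].left  = dlib::END_OF_TREE;   // "the"
    tree[1].right = dlib::END_OF_TREE;   // "dog"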
+
+ template <
+ typename T
+ >
+ void serialize (
+ const parse_tree_element<T>& item,
+ std::ostream& out
+ );
+ /*!
+ provides serialization support
+ !*/
+
+ template <
+ typename T
+ >
+ void deserialize (
+ parse_tree_element<T>& item,
+ std::istream& in
+ );
+ /*!
+ provides deserialization support
+ !*/
+
+// -----------------------------------------------------------------------------------------
+// -----------------------------------------------------------------------------------------
+
+ void example_production_rule_function (
+ const std::vector<T>& words,
+ const constituent<T>& c,
+ std::vector<std::pair<T,double> >& possible_tags
+ )
+ /*!
+ requires
+ - 0 <= c.begin < c.k < c.end <= words.size()
+ - possible_tags.size() == 0
+ ensures
+ - Finds all the syntactic categories that can be used to label c and puts those
+ categories, along with their scores, into possible_tags. Or in other words,
+ this function determines which production rules can be used to turn the left
+ and right sub-constituents in c into a single constituent. The contents of c
+ have the following interpretations:
+ - The left sub-constituent has syntactic category c.left_tag
+ - for all i such that c.begin <= i < c.k:
+ - words[i] is part of the left sub-constituent.
+ - The right sub-constituent has syntactic category c.right_tag
+ - for all i such that c.k <= i < c.end:
+ - words[i] is part of the right sub-constituent.
+
+ - Note that example_production_rule_function() is not a real function. It is
+ here just to show you how to define production rule producing functions for
+ use with the find_max_parse_cky() routine defined below.
+ !*/
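A production rule function for a toy grammar might look like the sketch below. Since words and tags share the type T, the left_tag/right_tag of a single-word span is the word itself. The name toy_grammar and the rule scores are assumptions made for illustration:

    struct toy_grammar
    {
        void operator() (
            const std::vector<std::string>& /*words*/,
            const dlib::constituent<std::string>& c,
            std::vector<std::pair<std::string,double>>& possible_tags
        ) const
        {
            // "the" followed by "dog" merges into a noun phrase.
            if (c.left_tag == "the" && c.right_tag == "dog")
                possible_tags.push_back(std::make_pair("NP", 1.0));
            // A noun phrase followed by "ran" forms a sentence.
            if (c.left_tag == "NP" && c.right_tag == "ran")
                possible_tags.push_back(std::make_pair("S", 1.0));
        }
    };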
+
+ template <
+ typename T,
+ typename production_rule_function
+ >
+ void find_max_parse_cky (
+ const std::vector<T>& words,
+ const production_rule_function& production_rules,
+ std::vector<parse_tree_element<T> >& parse_tree
+ );
+ /*!
+ requires
+ - production_rule_function == a function or function object with the same
+ interface as example_production_rule_function defined above.
+ - It must be possible to store T objects in a std::map.
+ ensures
+ - Uses the CKY algorithm to find the most probable/highest scoring binary parse
+ tree of the given vector of words.
+ - if (#parse_tree.size() == 0) then
+ - There is no parse tree, using the given production_rules, that can cover
+ the given word sequence.
+ - else
+ - #parse_tree == the highest scoring parse tree that covers all the
+ elements of words.
+ - #parse_tree[0] == the root node of the parse tree.
+ - #parse_tree[0].score == the score of the parse tree. This is the sum of
+ the scores of all production rules used to construct the tree.
+ - #parse_tree[0].c.begin == 0
+ - #parse_tree[0].c.end == words.size()
+ - This function uses production_rules() to find out what the allowed production
+ rules are. That is, production_rules() defines all properties of the grammar
+ used by find_max_parse_cky().
+ !*/
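Continuing the toy_grammar sketch above:

    std::vector<std::string> words = {"the", "dog", "ran"};
    std::vector<dlib::parse_tree_element<std::string>> parse_tree;
    dlib::find_max_parse_cky(words, toy_grammar(), parse_tree);
    // parse_tree[0] should now be an "S" node spanning [0,3) with a
    // score of 2.0, the sum of the two rule scores used.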
+
+// -----------------------------------------------------------------------------------------
+// -----------------------------------------------------------------------------------------
+
+ class parse_tree_to_string_error : public error
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is the exception thrown by parse_tree_to_string() and
+ parse_tree_to_string_tagged() if the inputs are discovered to be invalid.
+ !*/
+ };
+
+// -----------------------------------------------------------------------------------------
+
+ template <
+ typename T,
+ typename U
+ >
+ std::string parse_tree_to_string (
+ const std::vector<parse_tree_element<T> >& tree,
+ const std::vector<U>& words,
+ const unsigned long root_idx = 0
+ );
+ /*!
+ requires
+ - It must be possible to print U objects to an ostream using operator<<
+ (typically, U would be something like std::string)
+ ensures
+ - Interprets tree as a parse tree defined over the given sequence of words.
+ - returns a bracketed string that represents the parse tree over the words.
+ For example, suppose the following parse tree is input:
+
+               /\
+              /  \
+             /\    \
+            /  \    \
+          the  dog  ran
+
+ Then the output would be the string "[[the dog] ran]"
+ - Only the sub-tree rooted at tree[root_idx] will be output. If root_idx >=
+ tree.size() then the empty string is returned.
+ throws
+ - parse_tree_to_string_error
+ This exception is thrown if an invalid tree is detected. This might happen
+ if the tree refers to elements of words that don't exist because words is
+ shorter than it is supposed to be.
+ !*/
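With the tree from the toy_grammar sketch above (and <iostream> included):

    std::cout << dlib::parse_tree_to_string(parse_tree, words) << std::endl;
    // prints: [[the dog] ran]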
+
+// -----------------------------------------------------------------------------------------
+
+ template <
+ typename T,
+ typename U
+ >
+ std::string parse_tree_to_string_tagged (
+ const std::vector<parse_tree_element<T> >& tree,
+ const std::vector<U>& words,
+ const unsigned long root_idx = 0
+ );
+ /*!
+ requires
+ - It must be possible to print T objects to an ostream using operator<<
+ - It must be possible to print U objects to an ostream using operator<<
+ (typically, U would be something like std::string)
+ ensures
+ - This function does the same thing as parse_tree_to_string() except that it
+ also includes the parse_tree_element::tag object in the output. Therefore,
+ the tag of each bracket will be included as the first token inside the
+ bracket. For example, suppose the following parse tree is input (where tags
+ are shown at the vertices):
+
+                S
+               /\
+             NP   \
+             /\    \
+            /  \    \
+          the  dog  ran
+
+ Then the output would be the string "[S [NP the dog] ran]"
+ - Only the sub-tree rooted at tree[root_idx] will be output. If root_idx >=
+ tree.size() then the empty string is returned.
+ throws
+ - parse_tree_to_string_error
+ This exception is thrown if an invalid tree is detected. This might happen
+ if the tree refers to elements of words that don't exist because words is
+ shorter than it is supposed to be.
+ !*/
+
+// -----------------------------------------------------------------------------------------
+
+ template <
+ typename T,
+ typename U
+ >
+ std::string parse_trees_to_string (
+ const std::vector<parse_tree_element<T> >& tree,
+ const std::vector<U>& words,
+ const T& tag_to_skip
+ );
+ /*!
+ requires
+ - It must be possible to print U objects to an ostream using operator<<
+ (typically, U would be something like std::string)
+ ensures
+ - This function behaves just like parse_tree_to_string() except that it will
+ not print the brackets (i.e. []) for the top most parts of the tree which
+ have tags equal to tag_to_skip. It will however print all the words.
+ Therefore, this function only includes brackets on the subtrees which begin
+ with a tag other than tag_to_skip.
+ throws
+ - parse_tree_to_string_error
+ This exception is thrown if an invalid tree is detected. This might happen
+ if the tree refers to elements of words that don't exist because words is
+ shorter than it is supposed to be.
+ !*/
+
+// -----------------------------------------------------------------------------------------
+
+ template <
+ typename T,
+ typename U
+ >
+ std::string parse_trees_to_string_tagged (
+ const std::vector<parse_tree_element<T> >& tree,
+ const std::vector<U>& words,
+ const T& tag_to_skip
+ );
+ /*!
+ requires
+ - It must be possible to print T objects to an ostream using operator<<
+ - It must be possible to print U objects to an ostream using operator<<
+ (typically, U would be something like std::string)
+ ensures
+ - This function behaves just like parse_tree_to_string_tagged() except that it
+ will not print the brackets (i.e. []) for the top most parts of the tree
+ which have tags equal to tag_to_skip. It will however print all the words.
+ Therefore, this function only includes brackets on the subtrees which begin
+ with a tag other than tag_to_skip.
+ throws
+ - parse_tree_to_string_error
+ This exception is thrown if an invalid tree is detected. This might happen
+ if the tree refers to elements of words that don't exist because words is
+ shorter than it is supposed to be.
+ !*/
+
+// -----------------------------------------------------------------------------------------
+
+ template <
+ typename T
+ >
+ void find_trees_not_rooted_with_tag (
+ const std::vector<parse_tree_element<T> >& tree,
+ const T& tag,
+ std::vector<unsigned long>& tree_roots
+ );
+ /*!
+ requires
+ - objects of type T must be comparable using operator==
+ ensures
+ - Finds all the largest non-overlapping trees in tree that are not rooted with
+ the given tag.
+ - find_trees_not_rooted_with_tag() is useful when you want to cut a parse tree
+ into a bunch of sub-trees and you know that the top level of the tree is all
+ composed of the same kind of tag. So if you want to just "slice off" the top
+ of the tree where this tag lives then this function is useful for doing that.
+ - #tree_roots.size() == the number of sub-trees found.
+ - for all valid i:
+ - tree[#tree_roots[i]].tag != tag
+ - To make the operation of this function clearer, here are a few examples of
+ what it will do:
+ - if (tree[0].tag != tag) then
+ - #tree_roots.size() == 1
+ - #tree_roots[0] == 0
+ - else if (tree[0].tag == tag but its immediate children's tags are not equal to tag) then
+ - #tree_roots.size() == 2
+ - #tree_roots[0] == tree[0].left
+ - #tree_roots[1] == tree[0].right
+ !*/
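A short sketch, reusing the parse_tree from the earlier example:

    std::vector<unsigned long> roots;
    dlib::find_trees_not_rooted_with_tag(parse_tree, std::string("S"), roots);
    // The root is tagged "S" here, so roots ends up holding {1}, the index
    // of the "NP" sub-tree. If the root were not tagged "S", roots would
    // simply be {0}.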
+
+// -----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_
+
diff --git a/ml/dlib/dlib/optimization/find_optimal_parameters.h b/ml/dlib/dlib/optimization/find_optimal_parameters.h
new file mode 100644
index 000000000..0884778c9
--- /dev/null
+++ b/ml/dlib/dlib/optimization/find_optimal_parameters.h
@@ -0,0 +1,117 @@
+// Copyright (C) 2016 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_fIND_OPTIMAL_PARAMETERS_Hh_
+#define DLIB_fIND_OPTIMAL_PARAMETERS_Hh_
+
+#include "../matrix.h"
+#include "find_optimal_parameters_abstract.h"
+#include "optimization_bobyqa.h"
+#include "optimization_line_search.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename funct
+ >
+ double find_optimal_parameters (
+ double initial_search_radius,
+ double eps,
+ const unsigned int max_f_evals,
+ matrix<double,0,1>& x,
+ const matrix<double,0,1>& x_lower,
+ const matrix<double,0,1>& x_upper,
+ const funct& f
+ )
+ {
+ DLIB_CASSERT(x.size() == x_lower.size() && x_lower.size() == x_upper.size() && x.size() > 0,
+ "\t double find_optimal_parameters()"
+ << "\n\t x.size(): " << x.size()
+ << "\n\t x_lower.size(): " << x_lower.size()
+ << "\n\t x_upper.size(): " << x_upper.size()
+ );
+
+ // check the requirements. Also split the assert up so that the error message isn't huge.
+ DLIB_CASSERT(max_f_evals > 1 && eps > 0 && initial_search_radius > eps,
+ "\t double find_optimal_parameters()"
+ << "\n\t Invalid arguments have been given to this function"
+ << "\n\t initial_search_radius: " << initial_search_radius
+ << "\n\t eps: " << eps
+ << "\n\t max_f_evals: " << max_f_evals
+ );
+
+ DLIB_CASSERT( min(x_upper - x_lower) > 0 &&
+ min(x - x_lower) >= 0 && min(x_upper - x) >= 0,
+ "\t double find_optimal_parameters()"
+ << "\n\t The bounds constraints have to make sense and also contain the starting point."
+ << "\n\t min(x_upper - x_lower): " << min(x_upper - x_lower)
+ << "\n\t min(x - x_lower) >= 0 && min(x_upper - x) >= 0: " << (min(x - x_lower) >= 0 && min(x_upper - x) >= 0)
+ );
+
+ // if the search radius is too big then shrink it so it fits inside the bounds.
+ if (initial_search_radius*2 >= min(x_upper-x_lower))
+ initial_search_radius = 0.5*min(x_upper-x_lower)*0.99;
+
+
+ double objective_val = std::numeric_limits<double>::infinity();
+ size_t num_iter_used = 0;
+ if (x.size() == 1)
+ {
+ // BOBYQA requires x to have at least 2 variables in it. So we can't call it in
+ // this case. Instead we call find_min_single_variable().
+ matrix<double,0,1> temp(1);
+ auto ff = [&](const double& xx)
+ {
+ temp = xx;
+ double obj = f(temp);
+ ++num_iter_used;
+ // keep track of the best x.
+ if (obj < objective_val)
+ {
+ objective_val = obj;
+ x = temp;
+ }
+ return obj;
+ };
+ try
+ {
+ double dx = x(0);
+ find_min_single_variable(ff, dx, x_lower(0), x_upper(0), eps, max_f_evals, initial_search_radius);
+ } catch (optimize_single_variable_failure& )
+ {
+ }
+ }
+ else
+ {
+ auto ff = [&](const matrix<double,0,1>& xx)
+ {
+ double obj = f(xx);
+ ++num_iter_used;
+ // keep track of the best x.
+ if (obj < objective_val)
+ {
+ objective_val = obj;
+ x = xx;
+ }
+ return obj;
+ };
+ try
+ {
+ matrix<double,0,1> start_x = x;
+ find_min_bobyqa(ff, start_x, 2*x.size()+1, x_lower, x_upper, initial_search_radius, eps, max_f_evals);
+ } catch (bobyqa_failure& )
+ {
+ }
+ }
+
+ return objective_val;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_fIND_OPTIMAL_PARAMETERS_Hh_
+
diff --git a/ml/dlib/dlib/optimization/find_optimal_parameters_abstract.h b/ml/dlib/dlib/optimization/find_optimal_parameters_abstract.h
new file mode 100644
index 000000000..96dcee89b
--- /dev/null
+++ b/ml/dlib/dlib/optimization/find_optimal_parameters_abstract.h
@@ -0,0 +1,58 @@
+// Copyright (C) 2016 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_fIND_OPTIMAL_PARAMETERS_ABSTRACT_Hh_
+#ifdef DLIB_fIND_OPTIMAL_PARAMETERS_ABSTRACT_Hh_
+
+#include "../matrix.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename funct
+ >
+ double find_optimal_parameters (
+ double initial_search_radius,
+ double eps,
+ const unsigned int max_f_evals,
+ matrix<double,0,1>& x,
+ const matrix<double,0,1>& x_lower,
+ const matrix<double,0,1>& x_upper,
+ const funct& f
+ );
+ /*!
+ requires
+ - f(x) must be a valid expression that evaluates to a double
+ - x.size() == x_lower.size() == x_upper.size()
+ - x.size() > 0
+ - 0 < eps < initial_search_radius
+ - max_f_evals > 1
+ - min(x_upper - x_lower) > 0
+ - min(x - x_lower) >= 0 && min(x_upper - x) >= 0
+ (i.e. the given x should be within the bounds defined by x_lower and x_upper)
+ ensures
+ - Performs a constrained minimization of the function f() starting from
+ the initial point x.
+ - This function does not require derivatives of f(). Instead, it uses
+ derivative free methods to find the best setting of x. In particular, it
+ will begin by searching within a sphere of radius initial_search_radius
+ around x and will continue searching until either f() has been called
+ max_f_evals times or the search area has been shrunk to less than eps radius.
+ - #x == the value of x (within the bounds defined by x_lower and x_upper) that
+ was found to minimize f(). More precisely, it will always be true that:
+ - min(#x - x_lower) >= 0 && min(x_upper - #x) >= 0
+ - returns f(#x).
+ throws
+ - Reaching max_f_evals calls to f() does not cause an exception to be thrown.
+ If it runs out of evaluations, this function simply returns the best x it has seen.
+ !*/
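A usage sketch, minimizing a hand-picked quadratic inside a box (the objective and all constants are illustrative):

    #include <dlib/optimization/find_optimal_parameters.h>

    dlib::matrix<double,0,1> x(2), x_lower(2), x_upper(2);
    x = 0.5, 0.5;
    x_lower = -4, -4;
    x_upper =  4,  4;

    auto f = [](const dlib::matrix<double,0,1>& p)
    {
        const double a = p(0) - 2, b = p(1) + 1;
        return a*a + b*b;   // minimum at (2, -1)
    };

    // Search a radius-1 ball around x; stop below radius 1e-6 or after 200 calls.
    const double best = dlib::find_optimal_parameters(1.0, 1e-6, 200, x, x_lower, x_upper, f);
    // best should be near 0, with x near (2, -1), up to the eps tolerance.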
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_fIND_OPTIMAL_PARAMETERS_ABSTRACT_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/isotonic_regression.h b/ml/dlib/dlib/optimization/isotonic_regression.h
new file mode 100644
index 000000000..e89e70b48
--- /dev/null
+++ b/ml/dlib/dlib/optimization/isotonic_regression.h
@@ -0,0 +1,169 @@
+// Copyright (C) 2018 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_ISOTONIC_ReGRESSION_H_
+#define DLIB_ISOTONIC_ReGRESSION_H_
+
+#include "isotonic_regression_abstract.h"
+#include <vector>
+#include <utility>
+
+namespace dlib
+{
+ class isotonic_regression
+ {
+ public:
+
+ template <
+ typename const_iterator,
+ typename iterator
+ >
+ void operator() (
+ const_iterator begin,
+ const_iterator end,
+ iterator obegin
+ )
+ {
+ do_isotonic_regression(begin, end);
+
+ // unpack blocks to output
+ for (auto& block : blocks)
+ {
+ for (size_t k = 0; k < block.num; ++k)
+ set_val(*obegin++, block.avg);
+ }
+
+ blocks.clear();
+ }
+
+ void operator() (
+ std::vector<double>& vect
+ ) { (*this)(vect.begin(), vect.end(), vect.begin()); }
+
+ template <typename T, typename U>
+ void operator() (
+ std::vector<std::pair<T,U>>& vect
+ ) { (*this)(vect.begin(), vect.end(), vect.begin()); }
+
+
+ template <
+ typename const_iterator,
+ typename iterator
+ >
+ void fit_with_linear_output_interpolation (
+ const_iterator begin,
+ const_iterator end,
+ iterator obegin
+ )
+ {
+ do_isotonic_regression(begin, end);
+
+ // Unpack blocks to output, but here instead of producing the step function
+ // output we linearly interpolate. Note that this actually fits the data less
+ // closely than the step function, but in many applications it might be closer
+ // to what you really want when using isotonic_regression.
+ for (size_t i = 0; i < blocks.size(); ++i)
+ {
+ auto& block = blocks[i];
+
+ double prev = (blocks.front().avg + block.avg)/2;
+ if (i > 0)
+ prev = (blocks[i-1].avg+block.avg)/2;
+
+ double next = (blocks.back().avg + block.avg)/2;
+ if (i+1 < blocks.size())
+ next = (blocks[i+1].avg+block.avg)/2;
+
+ for (size_t k = 0; k < block.num; ++k)
+ {
+ const auto mid = block.num/2.0;
+ if (k < mid)
+ {
+ const double alpha = k/mid;
+ set_val(*obegin++, (1-alpha)*prev + alpha*block.avg);
+ }
+ else
+ {
+ const double alpha = k/mid-1;
+ set_val(*obegin++, alpha*next + (1-alpha)*block.avg);
+ }
+ }
+ }
+
+ blocks.clear();
+ }
+
+ void fit_with_linear_output_interpolation (
+ std::vector<double>& vect
+ ) { fit_with_linear_output_interpolation(vect.begin(), vect.end(), vect.begin()); }
+
+ template <typename T, typename U>
+ void fit_with_linear_output_interpolation (
+ std::vector<std::pair<T,U>>& vect
+ ) { fit_with_linear_output_interpolation(vect.begin(), vect.end(), vect.begin()); }
+
+ private:
+
+ template <
+ typename const_iterator
+ >
+ void do_isotonic_regression (
+ const_iterator begin,
+ const_iterator end
+ )
+ {
+ blocks.clear();
+
+ // Do the actual isotonic regression. The output is a step-function and is
+ // stored in the vector of blocks.
+ for (auto i = begin; i != end; ++i)
+ {
+ blocks.emplace_back(get_val(*i));
+ while (blocks.size() > 1 && prev_block().avg > current_block().avg)
+ {
+ // merge the last two blocks.
+ prev_block() = prev_block() + current_block();
+ blocks.pop_back();
+ }
+ }
+ }
+
+
+ template <typename T>
+ static double get_val(const T& v) { return v;}
+
+ template <typename T, typename U>
+ static double get_val(const std::pair<T,U>& v) { return v.second;}
+
+ template <typename T>
+ static void set_val(T& v, double val) { v = val;}
+
+ template <typename T, typename U>
+ static void set_val(std::pair<T,U>& v, double val) { v.second = val;}
+
+
+
+ struct block_t
+ {
+ block_t(double val) : num(1), avg(val) {}
+ block_t(size_t n, double val) : num(n), avg(val) {}
+
+ size_t num;
+ double avg;
+
+ inline block_t operator+(const block_t& rhs) const
+ {
+ return block_t(num+rhs.num,
+ (num*avg + rhs.num*rhs.avg)/(num+rhs.num));
+ }
+ };
+
+ inline block_t& prev_block() { return blocks[blocks.size()-2]; }
+ inline block_t& current_block() { return blocks.back(); }
+
+ std::vector<block_t> blocks;
+ };
+}
+
+#endif // DLIB_ISOTONIC_ReGRESSION_H_
+
+
diff --git a/ml/dlib/dlib/optimization/isotonic_regression_abstract.h b/ml/dlib/dlib/optimization/isotonic_regression_abstract.h
new file mode 100644
index 000000000..b00334bef
--- /dev/null
+++ b/ml/dlib/dlib/optimization/isotonic_regression_abstract.h
@@ -0,0 +1,128 @@
+// Copyright (C) 2018 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_ISOTONIC_ReGRESSION_ABSTRACT_H_
+#ifdef DLIB_ISOTONIC_ReGRESSION_ABSTRACT_H_
+
+#include <vector>
+#include <utility>
+
+namespace dlib
+{
+ class isotonic_regression
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object is a tool for performing 1-D isotonic regression. That is, it
+ finds the least squares fit of a non-parametric curve to some user supplied
+ data, subject to the constraint that the fitted curve is non-decreasing.
+
+ This is done using the fast O(n) pool adjacent violators algorithm.
+ !*/
+
+ public:
+
+ template <
+ typename const_iterator,
+ typename iterator
+ >
+ void operator() (
+ const_iterator begin,
+ const_iterator end,
+ iterator obegin
+ );
+ /*!
+ requires
+ - [begin,end) is an iterator range of floats or doubles, or a range of
+ std::pair<T,double> or std::pair<T,float> where T can be anything.
+ - obegin points to an output range with room for at least std::distance(begin,end) elements.
+ - obegin points to a range of objects of type float, double, std::pair<T,float>, or std::pair<T,double>.
+ ensures
+ - Given the range of real values stored in [begin,end), this method performs isotonic regression
+ on this data and writes the results to obegin. To be specific:
+ - let IN refer to the input values stored in the iterator range [begin,end).
+ - let OUT refer to the output values stored in the iterator range [obegin, obegin+std::distance(begin,end)).
+ - This function populates OUT with values such that the sum_i of
+ (IN[i]-OUT[i])^2 is minimized, subject to the constraint that
+ OUT[i] <= OUT[i+1], i.e. that OUT is monotonic.
+ - It is OK for [begin,end) to overlap with the range pointed to by obegin.
+ That is, this function can run in-place.
+ - Note that when the inputs or outputs are std::pairs this algorithm only
+ looks at the .second field of the pair. It therefore still treats these
+ iterator ranges as ranges of reals since it only looks at the .second
+ field, which is a real number. The .first field is entirely ignored.
+ !*/
+
+ void operator() (
+ std::vector<double>& vect
+ ) { (*this)(vect.begin(), vect.end(), vect.begin()); }
+ /*!
+ ensures
+ - performs in-place isotonic regression. Therefore, #vect will contain the
+ isotonic regression of vect.
+ - #vect.size() == vect.size()
+ !*/
+
+ template <typename T, typename U>
+ void operator() (
+ std::vector<std::pair<T,U>>& vect
+ ) { (*this)(vect.begin(), vect.end(), vect.begin()); }
+ /*!
+ ensures
+ - performs in-place isotonic regression. Therefore, #vect will contain the
+ isotonic regression of vect.
+ - #vect.size() == vect.size()
+ !*/
+
+
+ template <
+ typename const_iterator,
+ typename iterator
+ >
+ void fit_with_linear_output_interpolation (
+ const_iterator begin,
+ const_iterator end,
+ iterator obegin
+ );
+ /*!
+ requires
+ - [begin,end) is an iterator range of floats or doubles, or a range of
+ std::pair<T,double> or std::pair<T,float> where T can be anything.
+ - obegin points to an output range with room for at least std::distance(begin,end) elements.
+ - obegin points to a range of objects of type float, double, std::pair<T,float>, or std::pair<T,double>.
+ ensures
+ - This function behaves just like (*this)(begin,end,obegin) except that the
+ output is interpolated. To explain, note that the optimal output of
+ isotonic regression is a step function. However, in many applications
+ that isn't really what you want. You want something smoother. So
+ fit_with_linear_output_interpolation() does isotonic regression and then
+ linearly interpolates the step function into a piecewise linear function.
+ !*/
+
+ void fit_with_linear_output_interpolation (
+ std::vector<double>& vect
+ ) { fit_with_linear_output_interpolation(vect.begin(), vect.end(), vect.begin()); }
+ /*!
+ ensures
+ - performs in-place isotonic regression. Therefore, #vect will contain the
+ isotonic regression of vect.
+ - #vect.size() == vect.size()
+ !*/
+
+ template <typename T, typename U>
+ void fit_with_linear_output_interpolation (
+ std::vector<std::pair<T,U>>& vect
+ ) { fit_with_linear_output_interpolation(vect.begin(), vect.end(), vect.begin()); }
+ /*!
+ ensures
+ - performs in-place isotonic regression. Therefore, #vect will contain the
+ isotonic regression of vect.
+ - #vect.size() == vect.size()
+ !*/
+
+ };
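A minimal in-place sketch; the expected output can be checked by hand against the pool adjacent violators merges:

    #include <dlib/optimization.h>
    #include <vector>

    std::vector<double> v = {1, 3, 2, 2, 5};
    dlib::isotonic_regression iso;
    iso(v);
    // v is now the least squares non-decreasing fit of the input:
    // {1, 2.333..., 2.333..., 2.333..., 5}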
+}
+
+#endif // DLIB_ISOTONIC_ReGRESSION_ABSTRACT_H_
+
+
+
diff --git a/ml/dlib/dlib/optimization/max_cost_assignment.h b/ml/dlib/dlib/optimization/max_cost_assignment.h
new file mode 100644
index 000000000..db6c6f0d7
--- /dev/null
+++ b/ml/dlib/dlib/optimization/max_cost_assignment.h
@@ -0,0 +1,288 @@
+// Copyright (C) 2011 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_MAX_COST_ASSIgNMENT_Hh_
+#define DLIB_MAX_COST_ASSIgNMENT_Hh_
+
+#include "max_cost_assignment_abstract.h"
+#include "../matrix.h"
+#include <vector>
+#include <deque>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename EXP>
+ typename EXP::type assignment_cost (
+ const matrix_exp<EXP>& cost,
+ const std::vector<long>& assignment
+ )
+ {
+ DLIB_ASSERT(cost.nr() == cost.nc(),
+ "\t type assignment_cost(cost,assignment)"
+ << "\n\t cost.nr(): " << cost.nr()
+ << "\n\t cost.nc(): " << cost.nc()
+ );
+#ifdef ENABLE_ASSERTS
+ // can't call max on an empty vector. So put an if here to guard against it.
+ if (assignment.size() > 0)
+ {
+ DLIB_ASSERT(0 <= min(mat(assignment)) && max(mat(assignment)) < cost.nr(),
+ "\t type assignment_cost(cost,assignment)"
+ << "\n\t cost.nr(): " << cost.nr()
+ << "\n\t cost.nc(): " << cost.nc()
+ << "\n\t min(assignment): " << min(mat(assignment))
+ << "\n\t max(assignment): " << max(mat(assignment))
+ );
+ }
+#endif
+
+ typename EXP::type temp = 0;
+ for (unsigned long i = 0; i < assignment.size(); ++i)
+ {
+ temp += cost(i, assignment[i]);
+ }
+ return temp;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ template <typename EXP>
+ inline void compute_slack(
+ const long x,
+ std::vector<typename EXP::type>& slack,
+ std::vector<long>& slackx,
+ const matrix_exp<EXP>& cost,
+ const std::vector<typename EXP::type>& lx,
+ const std::vector<typename EXP::type>& ly
+ )
+ {
+ for (long y = 0; y < cost.nc(); ++y)
+ {
+ if (lx[x] + ly[y] - cost(x,y) < slack[y])
+ {
+ slack[y] = lx[x] + ly[y] - cost(x,y);
+ slackx[y] = x;
+ }
+ }
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename EXP>
+ std::vector<long> max_cost_assignment (
+ const matrix_exp<EXP>& cost_
+ )
+ {
+ const_temp_matrix<EXP> cost(cost_);
+ typedef typename EXP::type type;
+ // This algorithm only works if the elements of the cost matrix can be reliably
+ // compared using operator==. However, comparing for equality with floating point
+ // numbers is not a stable operation. So you need to use an integer cost matrix.
+ COMPILE_TIME_ASSERT(std::numeric_limits<type>::is_integer);
+ DLIB_ASSERT(cost.nr() == cost.nc(),
+ "\t std::vector<long> max_cost_assignment(cost)"
+ << "\n\t cost.nr(): " << cost.nr()
+ << "\n\t cost.nc(): " << cost.nc()
+ );
+
+ using namespace dlib::impl;
+ /*
+ I based the implementation of this algorithm on the description of the
+ Hungarian algorithm on the following websites:
+ http://www.math.uwo.ca/~mdawes/courses/344/kuhn-munkres.pdf
+ http://www.topcoder.com/tc?module=Static&d1=tutorials&d2=hungarianAlgorithm
+
+ Note that this is the fast O(n^3) version of the algorithm.
+ */
+
+ if (cost.size() == 0)
+ return std::vector<long>();
+
+ std::vector<type> lx, ly;
+ std::vector<long> xy;
+ std::vector<long> yx;
+ std::vector<char> S, T;
+ std::vector<type> slack;
+ std::vector<long> slackx;
+ std::vector<long> aug_path;
+
+
+
+
+ // Initially, nothing is matched.
+ xy.assign(cost.nc(), -1);
+ yx.assign(cost.nc(), -1);
+ /*
+ We maintain the following invariant:
+ Vertex x is matched to vertex xy[x] and
+ vertex y is matched to vertex yx[y].
+
+ A value of -1 means a vertex isn't matched to anything. Moreover,
+ x corresponds to rows of the cost matrix and y corresponds to the
+ columns of the cost matrix. So we are matching X to Y.
+ */
+
+
+ // Create an initial feasible labeling. Moreover, in the following
+ // code we will always have:
+ // for all valid x and y: lx[x] + ly[y] >= cost(x,y)
+ lx.resize(cost.nc());
+ ly.assign(cost.nc(),0);
+ for (long x = 0; x < cost.nr(); ++x)
+ lx[x] = max(rowm(cost,x));
+
+ // Now grow the match set by picking edges from the equality subgraph until
+ // we have a complete matching.
+ for (long match_size = 0; match_size < cost.nc(); ++match_size)
+ {
+ std::deque<long> q;
+
+ // Empty out the S and T sets
+ S.assign(cost.nc(), false);
+ T.assign(cost.nc(), false);
+
+ // clear out old slack values
+ slack.assign(cost.nc(), std::numeric_limits<type>::max());
+ slackx.resize(cost.nc());
+ /*
+ slack and slackx are maintained such that we always
+ have the following (once they get initialized by compute_slack() below):
+ - for all y:
+ - let x == slackx[y]
+ - slack[y] == lx[x] + ly[y] - cost(x,y)
+ */
+
+ aug_path.assign(cost.nc(), -1);
+
+ for (long x = 0; x < cost.nc(); ++x)
+ {
+ // If x is not matched to anything
+ if (xy[x] == -1)
+ {
+ q.push_back(x);
+ S[x] = true;
+
+ compute_slack(x, slack, slackx, cost, lx, ly);
+ break;
+ }
+ }
+
+
+ long x_start = 0;
+ long y_start = 0;
+
+ // Find an augmenting path.
+ bool found_augmenting_path = false;
+ while (!found_augmenting_path)
+ {
+ while (q.size() > 0 && !found_augmenting_path)
+ {
+ const long x = q.front();
+ q.pop_front();
+ for (long y = 0; y < cost.nc(); ++y)
+ {
+ if (cost(x,y) == lx[x] + ly[y] && !T[y])
+ {
+ // if vertex y isn't matched with anything
+ if (yx[y] == -1)
+ {
+ y_start = y;
+ x_start = x;
+ found_augmenting_path = true;
+ break;
+ }
+
+ T[y] = true;
+ q.push_back(yx[y]);
+
+ aug_path[yx[y]] = x;
+ S[yx[y]] = true;
+ compute_slack(yx[y], slack, slackx, cost, lx, ly);
+ }
+ }
+ }
+
+ if (found_augmenting_path)
+ break;
+
+
+ // Since we didn't find an augmenting path we need to improve the
+ // feasible labeling stored in lx and ly. We also need to keep the
+ // slack updated accordingly.
+ type delta = std::numeric_limits<type>::max();
+ for (unsigned long i = 0; i < T.size(); ++i)
+ {
+ if (!T[i])
+ delta = std::min(delta, slack[i]);
+ }
+ for (unsigned long i = 0; i < T.size(); ++i)
+ {
+ if (S[i])
+ lx[i] -= delta;
+
+ if (T[i])
+ ly[i] += delta;
+ else
+ slack[i] -= delta;
+ }
+
+
+
+ q.clear();
+ for (long y = 0; y < cost.nc(); ++y)
+ {
+ if (!T[y] && slack[y] == 0)
+ {
+ // if vertex y isn't matched with anything
+ if (yx[y] == -1)
+ {
+ x_start = slackx[y];
+ y_start = y;
+ found_augmenting_path = true;
+ break;
+ }
+ else
+ {
+ T[y] = true;
+ if (!S[yx[y]])
+ {
+ q.push_back(yx[y]);
+
+ aug_path[yx[y]] = slackx[y];
+ S[yx[y]] = true;
+ compute_slack(yx[y], slack, slackx, cost, lx, ly);
+ }
+ }
+ }
+ }
+ } // end while (!found_augmenting_path)
+
+ // Flip the edges along the augmenting path. This means we will add one more
+ // item to our matching.
+ for (long cx = x_start, cy = y_start, ty;
+ cx != -1;
+ cx = aug_path[cx], cy = ty)
+ {
+ ty = xy[cx];
+ yx[cy] = cx;
+ xy[cx] = cy;
+ }
+
+ }
+
+
+ return xy;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_MAX_COST_ASSIgNMENT_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/max_cost_assignment_abstract.h b/ml/dlib/dlib/optimization/max_cost_assignment_abstract.h
new file mode 100644
index 000000000..bbdb0abfb
--- /dev/null
+++ b/ml/dlib/dlib/optimization/max_cost_assignment_abstract.h
@@ -0,0 +1,63 @@
+// Copyright (C) 2011 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_MAX_COST_ASSIgNMENT_ABSTRACT_Hh_
+#ifdef DLIB_MAX_COST_ASSIgNMENT_ABSTRACT_Hh_
+
+#include "../matrix.h"
+#include <vector>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename EXP>
+ typename EXP::type assignment_cost (
+ const matrix_exp<EXP>& cost,
+ const std::vector<long>& assignment
+ );
+ /*!
+ requires
+ - cost.nr() == cost.nc()
+ - for all valid i:
+ - 0 <= assignment[i] < cost.nr()
+ ensures
+ - Interprets cost as a cost assignment matrix. That is, cost(i,j)
+ represents the cost of assigning i to j.
+ - Interprets assignment as a particular set of assignments. That is,
+ i is assigned to assignment[i].
+ - returns the cost of the given assignment. That is, returns
+ a number which is:
+ sum over i: cost(i,assignment[i])
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename EXP>
+ std::vector<long> max_cost_assignment (
+ const matrix_exp<EXP>& cost
+ );
+ /*!
+ requires
+ - EXP::type == some integer type (e.g. int)
+ (i.e. cost must contain integers rather than floats or doubles)
+ - cost.nr() == cost.nc()
+ ensures
+ - Finds and returns the solution to the following optimization problem:
+
+ Maximize: f(A) == assignment_cost(cost, A)
+ Subject to the following constraints:
+ - The elements of A are unique. That is, there aren't any
+ elements of A which are equal.
+ - A.size() == cost.nr()
+
+ - This function implements the O(N^3) version of the Hungarian algorithm
+ where N is the number of rows in the cost matrix.
+ !*/
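A usage sketch with a small integer cost matrix (values chosen so the optimum is easy to verify by hand):

    #include <dlib/optimization.h>

    dlib::matrix<int> cost(3,3);
    cost = 1, 2, 6,
           5, 3, 6,
           4, 5, 0;

    // One column per row, no column used twice, total value maximal.
    std::vector<long> assignment = dlib::max_cost_assignment(cost);
    // Here assignment == {2, 0, 1}, and
    // dlib::assignment_cost(cost, assignment) == 6 + 5 + 5 == 16.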
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_MAX_COST_ASSIgNMENT_ABSTRACT_Hh_
+
diff --git a/ml/dlib/dlib/optimization/max_sum_submatrix.h b/ml/dlib/dlib/optimization/max_sum_submatrix.h
new file mode 100644
index 000000000..1986cc26b
--- /dev/null
+++ b/ml/dlib/dlib/optimization/max_sum_submatrix.h
@@ -0,0 +1,285 @@
+// Copyright (C) 2011 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_MAX_SUM_SUBMaTRIX_Hh_
+#define DLIB_MAX_SUM_SUBMaTRIX_Hh_
+
+#include "max_sum_submatrix_abstract.h"
+#include "../matrix.h"
+#include <vector>
+#include <queue>
+#include "../geometry.h"
+
+namespace dlib
+{
+ namespace impl
+ {
+
+ // ------------------------------------------------------------------------------------
+
+ template <typename T>
+ struct range_set
+ {
+ int top_min;
+ int top_max;
+ int bottom_min;
+ int bottom_max;
+ T weight;
+
+ bool operator<(const range_set& item) const { return weight < item.weight; }
+ };
+
+ // ------------------------------------------------------------------------------------
+
+ template <typename T>
+ bool is_terminal_set (
+ const range_set<T>& item
+ )
+ {
+ return (item.top_min >= item.top_max &&
+ item.bottom_min >= item.bottom_max);
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ template <typename T>
+ void split (
+ const range_set<T>& rset,
+ range_set<T>& a,
+ range_set<T>& b
+ )
+ {
+ if (rset.top_max - rset.top_min > rset.bottom_max - rset.bottom_min)
+ {
+ // split top
+ const int middle = (rset.top_max + rset.top_min)/2;
+ a.top_min = rset.top_min;
+ a.top_max = middle;
+ b.top_min = middle+1;
+ b.top_max = rset.top_max;
+
+ a.bottom_min = rset.bottom_min;
+ a.bottom_max = rset.bottom_max;
+ b.bottom_min = rset.bottom_min;
+ b.bottom_max = rset.bottom_max;
+ }
+ else
+ {
+ // split bottom
+ const int middle = (rset.bottom_max + rset.bottom_min)/2;
+ a.bottom_min = rset.bottom_min;
+ a.bottom_max = middle;
+ b.bottom_min = middle+1;
+ b.bottom_max = rset.bottom_max;
+
+ a.top_min = rset.top_min;
+ a.top_max = rset.top_max;
+ b.top_min = rset.top_min;
+ b.top_max = rset.top_max;
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ template <typename EXP, typename T>
+ void find_best_column_range (
+ const matrix_exp<EXP>& sum_pos,
+ const matrix_exp<EXP>& sum_neg,
+ const range_set<T>& row_range,
+ T& weight,
+ int& left,
+ int& right
+ )
+ {
+ left = 0;
+ right = -1;
+ weight = 0;
+ T cur_sum = 0;
+ int cur_pos = 0;
+ for (long c = 0; c < sum_pos.nc(); ++c)
+ {
+ // compute the value for the current column
+ T temp = sum_pos(row_range.bottom_max+1,c) - sum_pos(row_range.top_min,c);
+ if (row_range.top_max <= row_range.bottom_min)
+ temp += sum_neg(row_range.bottom_min+1,c) - sum_neg(row_range.top_max,c);
+
+
+ cur_sum += temp;
+ if (cur_sum > weight)
+ {
+ left = cur_pos;
+ right = c;
+ weight = cur_sum;
+ }
+
+ if (cur_sum <= 0)
+ {
+ cur_sum = 0;
+ cur_pos = c+1;
+ }
+
+ }
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename EXP>
+ std::vector<rectangle> max_sum_submatrix(
+ const matrix_exp<EXP>& mat,
+ unsigned long max_rects,
+ double thresh_ = 0
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(thresh_ >= 0 && mat.size() > 0,
+ "\t std::vector<rectangle> max_sum_submatrix()"
+ << "\n\t Invalid arguments were given to this function."
+ << "\n\t mat.size(): " << mat.size()
+ << "\n\t thresh_: " << thresh_
+ );
+
+ /*
+ This function is basically an implementation of the efficient subwindow search (I-ESS)
+ algorithm presented in the following paper:
+ Efficient Algorithms for Subwindow Search in Object Detection and Localization
+ by Senjian An, Patrick Peursum, Wanquan Liu and Svetha Venkatesh
+ In CVPR 2009
+
+ */
+
+
+ if (max_rects == 0)
+ return std::vector<rectangle>();
+
+ using namespace dlib::impl;
+ typedef typename EXP::type element_type;
+ typedef typename promote<element_type>::type scalar_type;
+
+ const scalar_type thresh = static_cast<scalar_type>(thresh_);
+
+
+ matrix<scalar_type> sum_pos;
+ matrix<scalar_type> sum_neg;
+ sum_pos.set_size(mat.nr()+1, mat.nc());
+ sum_neg.set_size(mat.nr()+1, mat.nc());
+ // integrate over the rows.
+ for (long c = 0; c < mat.nc(); ++c)
+ {
+ sum_pos(0,c) = 0;
+ sum_neg(0,c) = 0;
+ }
+ for (long r = 0; r < mat.nr(); ++r)
+ {
+ for (long c = 0; c < mat.nc(); ++c)
+ {
+ if (mat(r,c) > 0)
+ {
+ sum_pos(r+1,c) = mat(r,c) + sum_pos(r,c);
+ sum_neg(r+1,c) = sum_neg(r,c);
+ }
+ else
+ {
+ sum_pos(r+1,c) = sum_pos(r,c);
+ sum_neg(r+1,c) = mat(r,c) + sum_neg(r,c);
+ }
+ }
+ }
+
+ std::priority_queue<range_set<scalar_type> > q;
+
+ // the range_sets will represent ranges of rows
+ range_set<scalar_type> universe_set;
+ universe_set.bottom_min = 0;
+ universe_set.top_min = 0;
+ universe_set.bottom_max = mat.nr()-1;
+ universe_set.top_max = mat.nr()-1;
+ universe_set.weight = sum(rowm(dlib::mat(sum_pos),mat.nr()));
+
+ q.push(universe_set);
+
+ std::vector<rectangle> results;
+ std::vector<scalar_type> temp_pos(mat.nc());
+ std::vector<scalar_type> temp_neg(mat.nc());
+
+ while (q.size() > 0)
+ {
+ if (is_terminal_set(q.top()))
+ {
+ int left, right;
+ scalar_type weight;
+ find_best_column_range(sum_pos, sum_neg, q.top(), weight, left, right);
+
+ rectangle rect(left, q.top().top_min,
+ right, q.top().bottom_min);
+
+ if (weight <= thresh)
+ break;
+
+ results.push_back(rect);
+
+ if (results.size() >= max_rects)
+ break;
+
+ q = std::priority_queue<range_set<scalar_type> >();
+ // We are going to blank out the weights we just used. So adjust the sum images appropriately.
+ for (long c = rect.left(); c <= rect.right(); ++c)
+ {
+ temp_pos[c] = sum_pos(rect.bottom()+1,c) - sum_pos(rect.top(),c);
+ temp_neg[c] = sum_neg(rect.bottom()+1,c) - sum_neg(rect.top(),c);
+ }
+ // blank out the area inside the rectangle
+ for (long r = rect.top(); r <= rect.bottom(); ++r)
+ {
+ for (long c = rect.left(); c <= rect.right(); ++c)
+ {
+ sum_pos(r+1,c) = sum_pos(r,c);
+ sum_neg(r+1,c) = sum_neg(r,c);
+ }
+ }
+ // account for the area below the rectangle
+ for (long r = rect.bottom()+2; r < sum_pos.nr(); ++r)
+ {
+ for (long c = rect.left(); c <= rect.right(); ++c)
+ {
+ sum_pos(r,c) -= temp_pos[c];
+ sum_neg(r,c) -= temp_neg[c];
+ }
+ }
+
+
+ universe_set.weight = sum(rowm(dlib::mat(sum_pos),mat.nr()));
+ if (universe_set.weight <= thresh)
+ break;
+
+ q.push(universe_set);
+ continue;
+ }
+
+ range_set<scalar_type> a, b;
+ split(q.top(), a,b);
+ q.pop();
+
+ // these variables are not used at this point in the algorithm.
+ int a_left, a_right;
+ int b_left, b_right;
+
+ find_best_column_range(sum_pos, sum_neg, a, a.weight, a_left, a_right);
+ find_best_column_range(sum_pos, sum_neg, b, b.weight, b_left, b_right);
+
+ if (a.weight > thresh)
+ q.push(a);
+ if (b.weight > thresh)
+ q.push(b);
+
+ }
+
+
+ return results;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_MAX_SUM_SUBMaTRIX_Hh_
+
diff --git a/ml/dlib/dlib/optimization/max_sum_submatrix_abstract.h b/ml/dlib/dlib/optimization/max_sum_submatrix_abstract.h
new file mode 100644
index 000000000..6714dd7fe
--- /dev/null
+++ b/ml/dlib/dlib/optimization/max_sum_submatrix_abstract.h
@@ -0,0 +1,49 @@
+// Copyright (C) 2011 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_MAX_SUM_SUBMaTRIX_ABSTRACT_Hh_
+#ifdef DLIB_MAX_SUM_SUBMaTRIX_ABSTRACT_Hh_
+
+#include "../matrix.h"
+#include <vector>
+#include "../geometry.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename EXP
+ >
+ std::vector<rectangle> max_sum_submatrix(
+ const matrix_exp<EXP>& mat,
+ unsigned long max_rects,
+ double thresh = 0
+ );
+ /*!
+ requires
+ - thresh >= 0
+ - mat.size() != 0
+ ensures
+ - This function finds the submatrix within mat which has the largest sum. It then
+ zeros out that submatrix and repeats the process until no more maximal submatrices can
+ be found. The std::vector returned will be ordered so that the rectangles with the
+ largest sum come first.
+ - Each submatrix must have a sum greater than thresh. If no such submatrix exists then
+ the algorithm terminates and returns an empty std::vector.
+ - At most max_rects rectangles are returned.
+
+ - This function is basically an implementation of the efficient subwindow search (I-ESS)
+ algorithm presented in the following paper:
+ Efficient Algorithms for Subwindow Search in Object Detection and Localization
+ by Senjian An, Patrick Peursum, Wanquan Liu and Svetha Venkatesh
+ In CVPR 2009
+ !*/
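A small sketch (the matrix is hand-picked so the best block is obvious):

    dlib::matrix<int> m(3,3);
    m = -1, -1, -1,
        -1,  9,  2,
        -1,  3, -5;

    std::vector<dlib::rectangle> rects = dlib::max_sum_submatrix(m, 10);
    // rects[0] should be dlib::rectangle(1,1,1,2), the column holding 9 and 3:
    // its sum of 12 beats every other submatrix.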
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_MAX_SUM_SUBMaTRIX_ABSTRACT_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/optimization.h b/ml/dlib/dlib/optimization/optimization.h
new file mode 100644
index 000000000..561d64376
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization.h
@@ -0,0 +1,714 @@
+// Copyright (C) 2008 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATIOn_H_
+#define DLIB_OPTIMIZATIOn_H_
+
+#include <cmath>
+#include <limits>
+#include "optimization_abstract.h"
+#include "optimization_search_strategies.h"
+#include "optimization_stop_strategies.h"
+#include "optimization_line_search.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// Functions that transform other functions
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ template <typename funct>
+ class central_differences
+ {
+ public:
+ central_differences(const funct& f_, double eps_ = 1e-7) : f(f_), eps(eps_){}
+
+ template <typename T>
+ typename T::matrix_type operator()(const T& x) const
+ {
+ // T must be some sort of dlib matrix
+ COMPILE_TIME_ASSERT(is_matrix<T>::value);
+
+ typename T::matrix_type der(x.size());
+ typename T::matrix_type e(x);
+ for (long i = 0; i < x.size(); ++i)
+ {
+ const double old_val = e(i);
+
+ e(i) += eps;
+ const double delta_plus = f(e);
+ e(i) = old_val - eps;
+ const double delta_minus = f(e);
+
+ der(i) = (delta_plus - delta_minus)/((old_val+eps)-(old_val-eps));
+
+ // and finally restore the old value of this element
+ e(i) = old_val;
+ }
+
+ return der;
+ }
+
+ template <typename T, typename U>
+ typename U::matrix_type operator()(const T& item, const U& x) const
+ {
+ // U must be some sort of dlib matrix
+ COMPILE_TIME_ASSERT(is_matrix<U>::value);
+
+ typename U::matrix_type der(x.size());
+ typename U::matrix_type e(x);
+ for (long i = 0; i < x.size(); ++i)
+ {
+ const double old_val = e(i);
+
+ e(i) += eps;
+ const double delta_plus = f(item,e);
+ e(i) = old_val - eps;
+ const double delta_minus = f(item,e);
+
+ der(i) = (delta_plus - delta_minus)/((old_val+eps)-(old_val-eps));
+
+ // and finally restore the old value of this element
+ e(i) = old_val;
+ }
+
+ return der;
+ }
+
+
+ double operator()(const double& x) const
+ {
+ return (f(x+eps)-f(x-eps))/((x+eps)-(x-eps));
+ }
+
+ private:
+ const funct& f;
+ const double eps;
+ };
+
+ template <typename funct>
+ const central_differences<funct> derivative(const funct& f) { return central_differences<funct>(f); }
+ template <typename funct>
+ const central_differences<funct> derivative(const funct& f, double eps)
+ {
+ DLIB_ASSERT (
+ eps > 0,
+ "\tcentral_differences derivative(f,eps)"
+ << "\n\tYou must give an epsilon > 0"
+ << "\n\teps: " << eps
+ );
+ return central_differences<funct>(f,eps);
+ }
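A quick sketch of the wrapper in use, differentiating f(v) = v(0)^2 at v(0) == 3:

    auto sq = [](const dlib::matrix<double,0,1>& v) { return v(0)*v(0); };

    dlib::matrix<double,0,1> p(1);
    p = 3.0;

    const dlib::matrix<double,0,1> g = dlib::derivative(sq)(p);
    // g(0) is approximately 6, up to the default eps of 1e-7.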
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename funct, typename EXP1, typename EXP2>
+ struct clamped_function_object
+ {
+ clamped_function_object(
+ const funct& f_,
+ const matrix_exp<EXP1>& x_lower_,
+ const matrix_exp<EXP2>& x_upper_
+ ) : f(f_), x_lower(x_lower_), x_upper(x_upper_)
+ {
+ }
+
+ template <typename T>
+ double operator() (
+ const T& x
+ ) const
+ {
+ return f(clamp(x,x_lower,x_upper));
+ }
+
+ const funct& f;
+ const matrix_exp<EXP1>& x_lower;
+ const matrix_exp<EXP2>& x_upper;
+ };
+
+ template <typename funct, typename EXP1, typename EXP2>
+ clamped_function_object<funct,EXP1,EXP2> clamp_function(
+ const funct& f,
+ const matrix_exp<EXP1>& x_lower,
+ const matrix_exp<EXP2>& x_upper
+ ) { return clamped_function_object<funct,EXP1,EXP2>(f,x_lower,x_upper); }
+
+// ----------------------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// Functions that perform unconstrained optimization
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename search_strategy_type,
+ typename stop_strategy_type,
+ typename funct,
+ typename funct_der,
+ typename T
+ >
+ double find_min (
+ search_strategy_type search_strategy,
+ stop_strategy_type stop_strategy,
+ const funct& f,
+ const funct_der& der,
+ T& x,
+ double min_f
+ )
+ {
+ COMPILE_TIME_ASSERT(is_matrix<T>::value);
+ // The starting point (i.e. x) must be a column vector.
+ COMPILE_TIME_ASSERT(T::NC <= 1);
+
+ DLIB_CASSERT (
+ is_col_vector(x),
+ "\tdouble find_min()"
+ << "\n\tYou have to supply column vectors to this function"
+ << "\n\tx.nc(): " << x.nc()
+ );
+
+
+ T g, s;
+
+ double f_value = f(x);
+ g = der(x);
+
+ if (!is_finite(f_value))
+ throw error("The objective function generated non-finite outputs");
+ if (!is_finite(g))
+ throw error("The objective function generated non-finite outputs");
+
+ while(stop_strategy.should_continue_search(x, f_value, g) && f_value > min_f)
+ {
+ s = search_strategy.get_next_direction(x, f_value, g);
+
+ double alpha = line_search(
+ make_line_search_function(f,x,s, f_value),
+ f_value,
+ make_line_search_function(der,x,s, g),
+ dot(g,s), // compute initial gradient for the line search
+ search_strategy.get_wolfe_rho(), search_strategy.get_wolfe_sigma(), min_f,
+ search_strategy.get_max_line_search_iterations());
+
+ // Take the search step indicated by the above line search
+ x += alpha*s;
+
+ if (!is_finite(f_value))
+ throw error("The objective function generated non-finite outputs");
+ if (!is_finite(g))
+ throw error("The objective function generated non-finite outputs");
+ }
+
+ return f_value;
+ }
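The usual calling pattern, sketched with the Rosenbrock function and its analytic gradient (essentially the setup from dlib's optimization example program):

    #include <dlib/optimization.h>
    #include <cmath>

    typedef dlib::matrix<double,0,1> column_vector;

    double rosen(const column_vector& m)
    {
        const double x = m(0), y = m(1);
        return 100.0*std::pow(y - x*x, 2) + std::pow(1 - x, 2);
    }

    column_vector rosen_derivative(const column_vector& m)
    {
        const double x = m(0), y = m(1);
        column_vector res(2);
        res(0) = -400.0*x*(y - x*x) - 2.0*(1 - x);
        res(1) = 200.0*(y - x*x);
        return res;
    }

    int main()
    {
        column_vector start(2);
        start = 4, 8;
        // BFGS search directions; stop when the objective change drops below 1e-7.
        dlib::find_min(dlib::bfgs_search_strategy(),
                       dlib::objective_delta_stop_strategy(1e-7),
                       rosen, rosen_derivative, start, -1);
        // start is now very close to the global minimum at (1,1).
        return 0;
    }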
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename search_strategy_type,
+ typename stop_strategy_type,
+ typename funct,
+ typename funct_der,
+ typename T
+ >
+ double find_max (
+ search_strategy_type search_strategy,
+ stop_strategy_type stop_strategy,
+ const funct& f,
+ const funct_der& der,
+ T& x,
+ double max_f
+ )
+ {
+ COMPILE_TIME_ASSERT(is_matrix<T>::value);
+ // The starting point (i.e. x) must be a column vector.
+ COMPILE_TIME_ASSERT(T::NC <= 1);
+
+ DLIB_CASSERT (
+ is_col_vector(x),
+ "\tdouble find_max()"
+ << "\n\tYou have to supply column vectors to this function"
+ << "\n\tx.nc(): " << x.nc()
+ );
+
+ T g, s;
+
+ // This function is basically just a copy of find_min() but with - put in the right places
+ // to flip things around so that it ends up looking for the max rather than the min.
+
+ double f_value = -f(x);
+ g = -der(x);
+
+ if (!is_finite(f_value))
+ throw error("The objective function generated non-finite outputs");
+ if (!is_finite(g))
+ throw error("The objective function generated non-finite outputs");
+
+ while(stop_strategy.should_continue_search(x, f_value, g) && f_value > -max_f)
+ {
+ s = search_strategy.get_next_direction(x, f_value, g);
+
+ double alpha = line_search(
+ negate_function(make_line_search_function(f,x,s, f_value)),
+ f_value,
+ negate_function(make_line_search_function(der,x,s, g)),
+ dot(g,s), // compute initial gradient for the line search
+ search_strategy.get_wolfe_rho(), search_strategy.get_wolfe_sigma(), -max_f,
+ search_strategy.get_max_line_search_iterations()
+ );
+
+ // Take the search step indicated by the above line search
+ x += alpha*s;
+
+ // Don't forget to negate these outputs from the line search since they are
+ // from the unnegated versions of f() and der()
+ g *= -1;
+ f_value *= -1;
+
+ if (!is_finite(f_value))
+ throw error("The objective function generated non-finite outputs");
+ if (!is_finite(g))
+ throw error("The objective function generated non-finite outputs");
+ }
+
+ return -f_value;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename search_strategy_type,
+ typename stop_strategy_type,
+ typename funct,
+ typename T
+ >
+ double find_min_using_approximate_derivatives (
+ search_strategy_type search_strategy,
+ stop_strategy_type stop_strategy,
+ const funct& f,
+ T& x,
+ double min_f,
+ double derivative_eps = 1e-7
+ )
+ {
+ COMPILE_TIME_ASSERT(is_matrix<T>::value);
+ // The starting point (i.e. x) must be a column vector.
+ COMPILE_TIME_ASSERT(T::NC <= 1);
+
+ DLIB_CASSERT (
+ is_col_vector(x) && derivative_eps > 0,
+ "\tdouble find_min_using_approximate_derivatives()"
+ << "\n\tYou have to supply column vectors to this function"
+ << "\n\tx.nc(): " << x.nc()
+ << "\n\tderivative_eps: " << derivative_eps
+ );
+
+ T g, s;
+
+ double f_value = f(x);
+ g = derivative(f,derivative_eps)(x);
+
+ if (!is_finite(f_value))
+ throw error("The objective function generated non-finite outputs");
+ if (!is_finite(g))
+ throw error("The objective function generated non-finite outputs");
+
+ while(stop_strategy.should_continue_search(x, f_value, g) && f_value > min_f)
+ {
+ s = search_strategy.get_next_direction(x, f_value, g);
+
+ double alpha = line_search(
+ make_line_search_function(f,x,s,f_value),
+ f_value,
+ derivative(make_line_search_function(f,x,s),derivative_eps),
+                dot(g,s), // initial slope along s. Sometimes the commented-out line below is a better estimate:
+ //derivative(make_line_search_function(f,x,s),derivative_eps)(0),
+ search_strategy.get_wolfe_rho(), search_strategy.get_wolfe_sigma(), min_f,
+ search_strategy.get_max_line_search_iterations()
+ );
+
+ // Take the search step indicated by the above line search
+ x += alpha*s;
+
+ g = derivative(f,derivative_eps)(x);
+
+ if (!is_finite(f_value))
+ throw error("The objective function generated non-finite outputs");
+ if (!is_finite(g))
+ throw error("The objective function generated non-finite outputs");
+ }
+
+ return f_value;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename search_strategy_type,
+ typename stop_strategy_type,
+ typename funct,
+ typename T
+ >
+ double find_max_using_approximate_derivatives (
+ search_strategy_type search_strategy,
+ stop_strategy_type stop_strategy,
+ const funct& f,
+ T& x,
+ double max_f,
+ double derivative_eps = 1e-7
+ )
+ {
+ COMPILE_TIME_ASSERT(is_matrix<T>::value);
+ // The starting point (i.e. x) must be a column vector.
+ COMPILE_TIME_ASSERT(T::NC <= 1);
+
+ DLIB_CASSERT (
+ is_col_vector(x) && derivative_eps > 0,
+ "\tdouble find_max_using_approximate_derivatives()"
+ << "\n\tYou have to supply column vectors to this function"
+ << "\n\tx.nc(): " << x.nc()
+ << "\n\tderivative_eps: " << derivative_eps
+ );
+
+ // Just negate the necessary things and call the find_min version of this function.
+ return -find_min_using_approximate_derivatives(
+ search_strategy,
+ stop_strategy,
+ negate_function(f),
+ x,
+ -max_f,
+ derivative_eps
+ );
+ }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// Functions for box constrained optimization
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ template <typename T, typename U, typename V>
+ T zero_bounded_variables (
+ const double eps,
+ T vect,
+ const T& x,
+ const T& gradient,
+ const U& x_lower,
+ const V& x_upper
+ )
+ {
+ for (long i = 0; i < gradient.size(); ++i)
+ {
+ const double tol = eps*std::abs(x(i));
+ // if x(i) is an active bound constraint
+ if (x_lower(i)+tol >= x(i) && gradient(i) > 0)
+ vect(i) = 0;
+ else if (x_upper(i)-tol <= x(i) && gradient(i) < 0)
+ vect(i) = 0;
+ }
+ return vect;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename T, typename U, typename V>
+ T gap_step_assign_bounded_variables (
+ const double eps,
+ T vect,
+ const T& x,
+ const T& gradient,
+ const U& x_lower,
+ const V& x_upper
+ )
+ {
+ for (long i = 0; i < gradient.size(); ++i)
+ {
+ const double tol = eps*std::abs(x(i));
+ // If x(i) is an active bound constraint then we should set its search
+ // direction such that a single step along the direction either does nothing or
+ // closes the gap of size tol before hitting the bound exactly.
+ if (x_lower(i)+tol >= x(i) && gradient(i) > 0)
+ vect(i) = x_lower(i)-x(i);
+ else if (x_upper(i)-tol <= x(i) && gradient(i) < 0)
+ vect(i) = x_upper(i)-x(i);
+ }
+ return vect;
+ }
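+
+    // A worked illustration of the two helpers above (explanatory sketch, not part
+    // of the optimization itself): suppose x(i) == x_lower(i) and gradient(i) > 0,
+    // i.e. descending the objective would push x(i) below its lower bound. Then
+    // zero_bounded_variables() returns a copy of vect with vect(i) = 0, which hides
+    // that coordinate from the search strategy, while
+    // gap_step_assign_bounded_variables() sets vect(i) = x_lower(i)-x(i) so that a
+    // unit step along vect lands exactly on the bound. If x(i) instead sits within
+    // tol = eps*|x(i)| above the bound, the assigned step closes that remaining gap.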
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename search_strategy_type,
+ typename stop_strategy_type,
+ typename funct,
+ typename funct_der,
+ typename T,
+ typename EXP1,
+ typename EXP2
+ >
+ double find_min_box_constrained (
+ search_strategy_type search_strategy,
+ stop_strategy_type stop_strategy,
+ const funct& f,
+ const funct_der& der,
+ T& x,
+ const matrix_exp<EXP1>& x_lower,
+ const matrix_exp<EXP2>& x_upper
+ )
+ {
+ /*
+ The implementation of this function is more or less based on the discussion in
+ the paper Projected Newton-type Methods in Machine Learning by Mark Schmidt, et al.
+ */
+
+ // make sure the requires clause is not violated
+ COMPILE_TIME_ASSERT(is_matrix<T>::value);
+ // The starting point (i.e. x) must be a column vector.
+ COMPILE_TIME_ASSERT(T::NC <= 1);
+
+ DLIB_CASSERT (
+ is_col_vector(x) && is_col_vector(x_lower) && is_col_vector(x_upper) &&
+ x.size() == x_lower.size() && x.size() == x_upper.size(),
+ "\tdouble find_min_box_constrained()"
+ << "\n\t The inputs to this function must be equal length column vectors."
+ << "\n\t is_col_vector(x): " << is_col_vector(x)
+            << "\n\t is_col_vector(x_lower): " << is_col_vector(x_lower)
+ << "\n\t is_col_vector(x_upper): " << is_col_vector(x_upper)
+ << "\n\t x.size(): " << x.size()
+ << "\n\t x_lower.size(): " << x_lower.size()
+ << "\n\t x_upper.size(): " << x_upper.size()
+ );
+ DLIB_ASSERT (
+ min(x_upper-x_lower) >= 0,
+ "\tdouble find_min_box_constrained()"
+ << "\n\t You have to supply proper box constraints to this function."
+            << "\n\t min(x_upper-x_lower): " << min(x_upper-x_lower)
+ );
+
+
+ T g, s;
+ double f_value = f(x);
+ g = der(x);
+
+ if (!is_finite(f_value))
+ throw error("The objective function generated non-finite outputs");
+ if (!is_finite(g))
+ throw error("The objective function generated non-finite outputs");
+
+ // gap_eps determines how close we have to get to a bound constraint before we
+ // start basically dropping it from the optimization and consider it to be an
+ // active constraint.
+ const double gap_eps = 1e-8;
+
+ double last_alpha = 1;
+ while(stop_strategy.should_continue_search(x, f_value, g))
+ {
+ s = search_strategy.get_next_direction(x, f_value, zero_bounded_variables(gap_eps, g, x, g, x_lower, x_upper));
+ s = gap_step_assign_bounded_variables(gap_eps, s, x, g, x_lower, x_upper);
+
+ double alpha = backtracking_line_search(
+ make_line_search_function(clamp_function(f,x_lower,x_upper), x, s, f_value),
+ f_value,
+                dot(g,s), // slope of f along the search direction
+ last_alpha,
+ search_strategy.get_wolfe_rho(),
+ search_strategy.get_max_line_search_iterations());
+
+            // Adapt alpha in a trust region style manner. The idea is that if we
+            // take a small step then we are likely to take another small step, so we
+            // reuse the alpha from the last iteration. But if the line search didn't
+            // shrink alpha at all then we start with a bigger alpha next time.
+ if (alpha == last_alpha)
+ last_alpha = std::min(last_alpha*10,1.0);
+ else
+ last_alpha = alpha;
+
+ // Take the search step indicated by the above line search
+ x = dlib::clamp(x + alpha*s, x_lower, x_upper);
+ g = der(x);
+
+ if (!is_finite(f_value))
+ throw error("The objective function generated non-finite outputs");
+ if (!is_finite(g))
+ throw error("The objective function generated non-finite outputs");
+ }
+
+ return f_value;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename search_strategy_type,
+ typename stop_strategy_type,
+ typename funct,
+ typename funct_der,
+ typename T
+ >
+ double find_min_box_constrained (
+ search_strategy_type search_strategy,
+ stop_strategy_type stop_strategy,
+ const funct& f,
+ const funct_der& der,
+ T& x,
+ double x_lower,
+ double x_upper
+ )
+ {
+ // The starting point (i.e. x) must be a column vector.
+ COMPILE_TIME_ASSERT(T::NC <= 1);
+
+ typedef typename T::type scalar_type;
+ return find_min_box_constrained(search_strategy,
+ stop_strategy,
+ f,
+ der,
+ x,
+ uniform_matrix<scalar_type>(x.size(),1,x_lower),
+ uniform_matrix<scalar_type>(x.size(),1,x_upper) );
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename search_strategy_type,
+ typename stop_strategy_type,
+ typename funct,
+ typename funct_der,
+ typename T,
+ typename EXP1,
+ typename EXP2
+ >
+ double find_max_box_constrained (
+ search_strategy_type search_strategy,
+ stop_strategy_type stop_strategy,
+ const funct& f,
+ const funct_der& der,
+ T& x,
+ const matrix_exp<EXP1>& x_lower,
+ const matrix_exp<EXP2>& x_upper
+ )
+ {
+ // make sure the requires clause is not violated
+ COMPILE_TIME_ASSERT(is_matrix<T>::value);
+ // The starting point (i.e. x) must be a column vector.
+ COMPILE_TIME_ASSERT(T::NC <= 1);
+
+ DLIB_CASSERT (
+ is_col_vector(x) && is_col_vector(x_lower) && is_col_vector(x_upper) &&
+ x.size() == x_lower.size() && x.size() == x_upper.size(),
+ "\tdouble find_max_box_constrained()"
+ << "\n\t The inputs to this function must be equal length column vectors."
+ << "\n\t is_col_vector(x): " << is_col_vector(x)
+            << "\n\t is_col_vector(x_lower): " << is_col_vector(x_lower)
+ << "\n\t is_col_vector(x_upper): " << is_col_vector(x_upper)
+ << "\n\t x.size(): " << x.size()
+ << "\n\t x_lower.size(): " << x_lower.size()
+ << "\n\t x_upper.size(): " << x_upper.size()
+ );
+ DLIB_ASSERT (
+ min(x_upper-x_lower) >= 0,
+ "\tdouble find_max_box_constrained()"
+ << "\n\t You have to supply proper box constraints to this function."
+            << "\n\t min(x_upper-x_lower): " << min(x_upper-x_lower)
+ );
+
+        // This function is essentially a copy of find_min_box_constrained(), but with
+        // negations inserted in the right places to flip things around so that it
+        // searches for the max rather than the min.
+
+ T g, s;
+ double f_value = -f(x);
+ g = -der(x);
+
+ if (!is_finite(f_value))
+ throw error("The objective function generated non-finite outputs");
+ if (!is_finite(g))
+ throw error("The objective function generated non-finite outputs");
+
+ // gap_eps determines how close we have to get to a bound constraint before we
+ // start basically dropping it from the optimization and consider it to be an
+ // active constraint.
+ const double gap_eps = 1e-8;
+
+ double last_alpha = 1;
+ while(stop_strategy.should_continue_search(x, f_value, g))
+ {
+ s = search_strategy.get_next_direction(x, f_value, zero_bounded_variables(gap_eps, g, x, g, x_lower, x_upper));
+ s = gap_step_assign_bounded_variables(gap_eps, s, x, g, x_lower, x_upper);
+
+ double alpha = backtracking_line_search(
+ negate_function(make_line_search_function(clamp_function(f,x_lower,x_upper), x, s, f_value)),
+ f_value,
+                dot(g,s), // slope of the negated objective along the search direction
+ last_alpha,
+ search_strategy.get_wolfe_rho(),
+ search_strategy.get_max_line_search_iterations());
+
+            // Adapt alpha in a trust region style manner. The idea is that if we
+            // take a small step then we are likely to take another small step, so we
+            // reuse the alpha from the last iteration. But if the line search didn't
+            // shrink alpha at all then we start with a bigger alpha next time.
+ if (alpha == last_alpha)
+ last_alpha = std::min(last_alpha*10,1.0);
+ else
+ last_alpha = alpha;
+
+ // Take the search step indicated by the above line search
+ x = dlib::clamp(x + alpha*s, x_lower, x_upper);
+ g = -der(x);
+
+ // Don't forget to negate the output from the line search since it is from the
+ // unnegated version of f()
+ f_value *= -1;
+
+ if (!is_finite(f_value))
+ throw error("The objective function generated non-finite outputs");
+ if (!is_finite(g))
+ throw error("The objective function generated non-finite outputs");
+ }
+
+ return -f_value;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename search_strategy_type,
+ typename stop_strategy_type,
+ typename funct,
+ typename funct_der,
+ typename T
+ >
+ double find_max_box_constrained (
+ search_strategy_type search_strategy,
+ stop_strategy_type stop_strategy,
+ const funct& f,
+ const funct_der& der,
+ T& x,
+ double x_lower,
+ double x_upper
+ )
+ {
+ // The starting point (i.e. x) must be a column vector.
+ COMPILE_TIME_ASSERT(T::NC <= 1);
+
+ typedef typename T::type scalar_type;
+ return find_max_box_constrained(search_strategy,
+ stop_strategy,
+ f,
+ der,
+ x,
+ uniform_matrix<scalar_type>(x.size(),1,x_lower),
+ uniform_matrix<scalar_type>(x.size(),1,x_upper) );
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_H_
+
diff --git a/ml/dlib/dlib/optimization/optimization_abstract.h b/ml/dlib/dlib/optimization/optimization_abstract.h
new file mode 100644
index 000000000..f3c42c2b4
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_abstract.h
@@ -0,0 +1,468 @@
+// Copyright (C) 2008 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATIOn_ABSTRACT_
+#ifdef DLIB_OPTIMIZATIOn_ABSTRACT_
+
+#include <cmath>
+#include <limits>
+#include "../matrix/matrix_abstract.h"
+#include "../algs.h"
+#include "optimization_search_strategies_abstract.h"
+#include "optimization_stop_strategies_abstract.h"
+#include "optimization_line_search_abstract.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// Functions that transform other functions
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename funct
+ >
+ class central_differences;
+ /*!
+ This is a function object that represents the derivative of some other
+ function.
+
+ Note that if funct is a function of a double then the derivative of
+        funct is just a double, but if funct is a function of a dlib::matrix (i.e. a
+ function of many variables) then its derivative is a gradient vector (a column
+ vector in particular).
+ !*/
+
+ template <
+ typename funct
+ >
+ const central_differences<funct> derivative(
+ const funct& f,
+ double eps
+ );
+ /*!
+ requires
+ - f == a function that returns a scalar
+ - f must have one of the following forms:
+ - double f(double)
+ - double f(dlib::matrix) (where the matrix is a column vector)
+ - double f(T, dlib::matrix) (where the matrix is a column vector. In
+ this case the derivative of f is taken with respect to the second argument.)
+ - eps > 0
+ ensures
+ - returns a function that represents the derivative of the function f. It
+ is approximated numerically by:
+ (f(x+eps)-f(x-eps))/(2*eps)
+ !*/
+
+ template <
+ typename funct
+ >
+ const central_differences<funct> derivative(
+ const funct& f
+ );
+ /*!
+ ensures
+ - returns derivative(f, 1e-7)
+ !*/
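+
+    // Usage sketch (illustrative only): derivative() is convenient for checking a
+    // hand written gradient against central differences. Here my_f and my_f_der
+    // are hypothetical user supplied functions:
+    //
+    //    matrix<double,0,1> x = {1, 2};
+    //    double err = length(derivative(my_f)(x) - my_f_der(x));
+    //    // err should be nearly 0 if my_f_der() really is the gradient of my_f().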
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename funct,
+ typename EXP1,
+ typename EXP2
+ >
+ clamped_function_object<funct,EXP1,EXP2> clamp_function (
+ const funct& f,
+ const matrix_exp<EXP1>& x_lower,
+ const matrix_exp<EXP2>& x_upper
+ );
+ /*!
+ requires
+ - f == a function that takes a matrix and returns a scalar value. Moreover, f
+ must be capable of taking in matrices with the same dimensions as x_lower and
+ x_upper. So f(x_lower) must be a valid expression that evaluates to a scalar
+ value.
+ - x_lower.nr() == x_upper.nr() && x_lower.nc() == x_upper.nc()
+ (i.e. x_lower and x_upper must have the same dimensions)
+ - x_lower and x_upper must contain the same type of elements.
+ ensures
+ - returns a function object that represents the function g(x) where
+ g(x) == f(clamp(x,x_lower,x_upper))
+ !*/
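+
+    // For example (sketch): clamp_function(f, x_lower, x_upper)(x) evaluates f at
+    // the projection of x onto the box [x_lower, x_upper], i.e. f(clamp(x,x_lower,x_upper)).
+    // This is what the box constrained solvers in this file use internally to keep
+    // line search probes feasible.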
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// Functions that perform unconstrained optimization
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename search_strategy_type,
+ typename stop_strategy_type,
+ typename funct,
+ typename funct_der,
+ typename T
+ >
+ double find_min (
+ search_strategy_type search_strategy,
+ stop_strategy_type stop_strategy,
+ const funct& f,
+ const funct_der& der,
+ T& x,
+ double min_f
+ );
+ /*!
+ requires
+ - search_strategy == an object that defines a search strategy such as one
+ of the objects from dlib/optimization/optimization_search_strategies_abstract.h
+ - stop_strategy == an object that defines a stop strategy such as one of
+ the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+ - f(x) must be a valid expression that evaluates to a double
+ - der(x) must be a valid expression that evaluates to the derivative of f() at x.
+ - is_col_vector(x) == true
+ ensures
+ - Performs an unconstrained minimization of the function f() using the given
+ search_strategy and starting from the initial point x.
+ - The function is optimized until stop_strategy decides that an acceptable
+ point has been found or f(#x) < min_f.
+ - #x == the value of x that was found to minimize f()
+ - returns f(#x).
+ - When this function makes calls to f() and der() it always does so by
+ first calling f() and then calling der(). That is, these two functions
+ are always called in pairs with f() being called first and then der()
+ being called second.
+ !*/
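+
+    // For example (an illustrative sketch, not part of this header): minimizing
+    // f(x) = (x(0)-1)^2 + (x(1)+2)^2 from the origin might look like:
+    //
+    //    auto f = [](const matrix<double,0,1>& x) { return std::pow(x(0)-1,2) + std::pow(x(1)+2,2); };
+    //    auto der = [](const matrix<double,0,1>& x) {
+    //        matrix<double,0,1> g = {2*(x(0)-1), 2*(x(1)+2)};
+    //        return g;
+    //    };
+    //    matrix<double,0,1> x = {0, 0};
+    //    find_min(bfgs_search_strategy(), objective_delta_stop_strategy(1e-7), f, der, x, -1);
+    //    // x is now approximately (1,-2); the returned objective value is near 0.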
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename search_strategy_type,
+ typename stop_strategy_type,
+ typename funct,
+ typename funct_der,
+ typename T
+ >
+ double find_max (
+ search_strategy_type search_strategy,
+ stop_strategy_type stop_strategy,
+ const funct& f,
+ const funct_der& der,
+ T& x,
+ double max_f
+ );
+ /*!
+ requires
+ - search_strategy == an object that defines a search strategy such as one
+ of the objects from dlib/optimization/optimization_search_strategies_abstract.h
+ - stop_strategy == an object that defines a stop strategy such as one of
+ the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+ - f(x) must be a valid expression that evaluates to a double
+ - der(x) must be a valid expression that evaluates to the derivative of f() at x.
+ - is_col_vector(x) == true
+ ensures
+ - Performs an unconstrained maximization of the function f() using the given
+ search_strategy and starting from the initial point x.
+ - The function is optimized until stop_strategy decides that an acceptable
+ point has been found or f(#x) > max_f.
+ - #x == the value of x that was found to maximize f()
+ - returns f(#x).
+ - When this function makes calls to f() and der() it always does so by
+ first calling f() and then calling der(). That is, these two functions
+ are always called in pairs with f() being called first and then der()
+ being called second.
+ - Note that this function solves the maximization problem by converting it
+ into a minimization problem. Therefore, the values of f and its derivative
+ reported to the stopping strategy will be negated. That is, stop_strategy
+ will see -f() and -der(). All this really means is that the status messages
+ from a stopping strategy in verbose mode will display a negated objective
+ value.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename search_strategy_type,
+ typename stop_strategy_type,
+ typename funct,
+ typename T
+ >
+ double find_min_using_approximate_derivatives (
+ search_strategy_type search_strategy,
+ stop_strategy_type stop_strategy,
+ const funct& f,
+ T& x,
+ double min_f,
+ double derivative_eps = 1e-7
+ );
+ /*!
+ requires
+ - search_strategy == an object that defines a search strategy such as one
+ of the objects from dlib/optimization/optimization_search_strategies_abstract.h
+ - stop_strategy == an object that defines a stop strategy such as one of
+ the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+ - f(x) must be a valid expression that evaluates to a double
+ - is_col_vector(x) == true
+ - derivative_eps > 0
+ ensures
+ - Performs an unconstrained minimization of the function f() using the given
+ search_strategy and starting from the initial point x.
+ - The function is optimized until stop_strategy decides that an acceptable
+ point has been found or f(#x) < min_f.
+ - #x == the value of x that was found to minimize f()
+ - returns f(#x).
+ - Uses the dlib::derivative(f,derivative_eps) function to compute gradient
+ information.
+ !*/
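+
+    // For example (sketch), reusing f and x from the find_min() example above but
+    // with no hand written derivative; the gradient is approximated numerically:
+    //
+    //    find_min_using_approximate_derivatives(bfgs_search_strategy(),
+    //                                           objective_delta_stop_strategy(1e-7),
+    //                                           f, x, -1);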
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename search_strategy_type,
+ typename stop_strategy_type,
+ typename funct,
+ typename T
+ >
+ double find_max_using_approximate_derivatives (
+ search_strategy_type search_strategy,
+ stop_strategy_type stop_strategy,
+ const funct& f,
+ T& x,
+ double max_f,
+ double derivative_eps = 1e-7
+ );
+ /*!
+ requires
+ - search_strategy == an object that defines a search strategy such as one
+ of the objects from dlib/optimization/optimization_search_strategies_abstract.h
+ - stop_strategy == an object that defines a stop strategy such as one of
+ the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+ - f(x) must be a valid expression that evaluates to a double
+ - is_col_vector(x) == true
+ - derivative_eps > 0
+ ensures
+ - Performs an unconstrained maximization of the function f() using the given
+ search_strategy and starting from the initial point x.
+ - The function is optimized until stop_strategy decides that an acceptable
+ point has been found or f(#x) > max_f.
+ - #x == the value of x that was found to maximize f()
+ - returns f(#x).
+ - Uses the dlib::derivative(f,derivative_eps) function to compute gradient
+ information.
+ - Note that this function solves the maximization problem by converting it
+ into a minimization problem. Therefore, the values of f and its derivative
+ reported to the stopping strategy will be negated. That is, stop_strategy
+ will see -f() and -der(). All this really means is that the status messages
+ from a stopping strategy in verbose mode will display a negated objective
+ value.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// Functions that perform box constrained optimization
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename search_strategy_type,
+ typename stop_strategy_type,
+ typename funct,
+ typename funct_der,
+ typename T,
+ typename EXP1,
+ typename EXP2
+ >
+ double find_min_box_constrained (
+ search_strategy_type search_strategy,
+ stop_strategy_type stop_strategy,
+ const funct& f,
+ const funct_der& der,
+ T& x,
+ const matrix_exp<EXP1>& x_lower,
+ const matrix_exp<EXP2>& x_upper
+ );
+ /*!
+ requires
+ - search_strategy == an object that defines a search strategy such as one
+ of the objects from dlib/optimization/optimization_search_strategies_abstract.h
+ - stop_strategy == an object that defines a stop strategy such as one of
+ the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+ - f(x) must be a valid expression that evaluates to a double
+ - der(x) must be a valid expression that evaluates to the derivative of f() at x.
+ - is_col_vector(x) == true
+ - is_col_vector(x_lower) == true
+ - is_col_vector(x_upper) == true
+ - x.size() == x_lower.size() == x_upper.size()
+ (i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality)
+ - min(x_upper-x_lower) >= 0
+ (i.e. x_upper must contain upper bounds relative to x_lower)
+ ensures
+ - Performs a box constrained minimization of the function f() using the given
+ search_strategy and starting from the initial point x. That is, we try to
+ find the x value that minimizes f(x) but is also within the box constraints
+            specified by x_lower and x_upper. In particular, we ensure that #x satisfies:
+ - min(#x - x_lower) >= 0 && min(x_upper - #x) >= 0
+ - This function uses a backtracking line search along with a gradient projection
+ step to handle the box constraints.
+ - The function is optimized until stop_strategy decides that an acceptable
+ point has been found.
+ - #x == the value of x that was found to minimize f() within the given box
+ constraints.
+ - returns f(#x).
+            - The last call to f() will be made with the argument #x.
+ - When calling f() and der(), the input passed to them will always be inside
+ the box constraints defined by x_lower and x_upper.
+ - When calling der(x), it will always be the case that the last call to f() was
+ made with the same x value. This means that you can reuse any intermediate
+ results from the previous call to f(x) inside der(x) rather than recomputing
+ them inside der(x).
+ !*/
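+
+    // For example (sketch), reusing f, der, and x from the find_min() example above
+    // but restricting every variable to the box [0,4]:
+    //
+    //    matrix<double,0,1> lo = {0, 0}, hi = {4, 4};
+    //    find_min_box_constrained(lbfgs_search_strategy(10),
+    //                             objective_delta_stop_strategy(1e-9),
+    //                             f, der, x, lo, hi);
+    //
+    // The unconstrained minimizer of f is (1,-2), so with these bounds #x converges
+    // to the projected optimum (1,0) on the boundary of the box.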
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename search_strategy_type,
+ typename stop_strategy_type,
+ typename funct,
+ typename funct_der,
+ typename T
+ >
+ double find_min_box_constrained (
+ search_strategy_type search_strategy,
+ stop_strategy_type stop_strategy,
+ const funct& f,
+ const funct_der& der,
+ T& x,
+ const double x_lower,
+ const double x_upper
+ );
+ /*!
+ requires
+ - search_strategy == an object that defines a search strategy such as one
+ of the objects from dlib/optimization/optimization_search_strategies_abstract.h
+ - stop_strategy == an object that defines a stop strategy such as one of
+ the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+ - f(x) must be a valid expression that evaluates to a double
+ - der(x) must be a valid expression that evaluates to the derivative of f() at x.
+ - is_col_vector(x) == true
+ - x_lower < x_upper
+ ensures
+ - This function is identical to find_min_box_constrained() as defined above
+ except that it takes x_lower and x_upper as doubles rather than column
+ vectors. In this case, all variables have the same lower bound of x_lower
+ and similarly have the same upper bound of x_upper. Therefore, this is just
+            a convenience function for calling find_min_box_constrained() when all
+ variables have the same bound constraints.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename search_strategy_type,
+ typename stop_strategy_type,
+ typename funct,
+ typename funct_der,
+ typename T,
+ typename EXP1,
+ typename EXP2
+ >
+ double find_max_box_constrained (
+ search_strategy_type search_strategy,
+ stop_strategy_type stop_strategy,
+ const funct& f,
+ const funct_der& der,
+ T& x,
+ const matrix_exp<EXP1>& x_lower,
+ const matrix_exp<EXP2>& x_upper
+ );
+ /*!
+ requires
+ - search_strategy == an object that defines a search strategy such as one
+ of the objects from dlib/optimization/optimization_search_strategies_abstract.h
+ - stop_strategy == an object that defines a stop strategy such as one of
+ the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+ - f(x) must be a valid expression that evaluates to a double
+ - der(x) must be a valid expression that evaluates to the derivative of f() at x.
+ - is_col_vector(x) == true
+ - is_col_vector(x_lower) == true
+ - is_col_vector(x_upper) == true
+ - x.size() == x_lower.size() == x_upper.size()
+ (i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality)
+ - min(x_upper-x_lower) >= 0
+ (i.e. x_upper must contain upper bounds relative to x_lower)
+ ensures
+ - Performs a box constrained maximization of the function f() using the given
+ search_strategy and starting from the initial point x. That is, we try to
+ find the x value that maximizes f(x) but is also within the box constraints
+            specified by x_lower and x_upper. In particular, we ensure that #x satisfies:
+ - min(#x - x_lower) >= 0 && min(x_upper - #x) >= 0
+ - This function uses a backtracking line search along with a gradient projection
+ step to handle the box constraints.
+ - The function is optimized until stop_strategy decides that an acceptable
+ point has been found.
+ - #x == the value of x that was found to maximize f() within the given box
+ constraints.
+ - returns f(#x).
+            - The last call to f() will be made with the argument #x.
+ - When calling f() and der(), the input passed to them will always be inside
+ the box constraints defined by x_lower and x_upper.
+ - When calling der(x), it will always be the case that the last call to f() was
+ made with the same x value. This means that you can reuse any intermediate
+ results from the previous call to f(x) inside der(x) rather than recomputing
+ them inside der(x).
+ - Note that this function solves the maximization problem by converting it
+ into a minimization problem. Therefore, the values of f and its derivative
+ reported to the stopping strategy will be negated. That is, stop_strategy
+ will see -f() and -der(). All this really means is that the status messages
+ from a stopping strategy in verbose mode will display a negated objective
+ value.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename search_strategy_type,
+ typename stop_strategy_type,
+ typename funct,
+ typename funct_der,
+ typename T
+ >
+ double find_max_box_constrained (
+ search_strategy_type search_strategy,
+ stop_strategy_type stop_strategy,
+ const funct& f,
+ const funct_der& der,
+ T& x,
+ const double x_lower,
+ const double x_upper
+ );
+ /*!
+ requires
+ - search_strategy == an object that defines a search strategy such as one
+ of the objects from dlib/optimization/optimization_search_strategies_abstract.h
+ - stop_strategy == an object that defines a stop strategy such as one of
+ the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+ - f(x) must be a valid expression that evaluates to a double
+ - der(x) must be a valid expression that evaluates to the derivative of f() at x.
+ - is_col_vector(x) == true
+ - x_lower < x_upper
+ ensures
+ - This function is identical to find_max_box_constrained() as defined above
+ except that it takes x_lower and x_upper as doubles rather than column
+ vectors. In this case, all variables have the same lower bound of x_lower
+ and similarly have the same upper bound of x_upper. Therefore, this is just
+ a convenience function for calling find_max_box_constrained() when all
+ variables have the same bound constraints.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_ABSTRACT_
+
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_bobyqa.h b/ml/dlib/dlib/optimization/optimization_bobyqa.h
new file mode 100644
index 000000000..6fbc40c06
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_bobyqa.h
@@ -0,0 +1,3423 @@
+// Copyright (C) 2009 M.J.D. Powell, Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATIOn_BOBYQA_Hh_
+#define DLIB_OPTIMIZATIOn_BOBYQA_Hh_
+
+/*
+ The code in this file is derived from Powell's BOBYQA Fortran code.
+ It was created by running f2c on the original Fortran code and then
+ massaging the resulting C code into what you can see below.
+
+
+ The following paper, published in 2009 by Powell, describes the
+ detailed workings of the BOBYQA algorithm.
+
+ The BOBYQA algorithm for bound constrained optimization
+ without derivatives by M.J.D. Powell
+*/
+
+#include <algorithm>
+#include <cmath>
+#include <memory>
+
+#include "../matrix.h"
+#include "optimization_bobyqa_abstract.h"
+#include "optimization.h"
+
+// ----------------------------------------------------------------------------------------
+
+namespace dlib
+{
+
+    class bobyqa_failure : public error
+    {
+    public:
+        bobyqa_failure(const std::string& s) : error(s) {}
+    };
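+
+    // The BOBYQA routines in this file report unrecoverable problems (an invalid
+    // npt, bounds closer together than 2*rho_begin, running out of function
+    // evaluations, etc.) by throwing bobyqa_failure. A typical call is therefore
+    // wrapped like this (sketch):
+    //
+    //    try { /* run the BOBYQA based optimization */ }
+    //    catch (bobyqa_failure& e) { cout << e.what() << endl; }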
+
+// ----------------------------------------------------------------------------------------
+
+ class bobyqa_implementation
+ {
+ typedef long integer;
+ typedef double doublereal;
+
+ public:
+
+ template <
+ typename funct,
+ typename T,
+ typename U
+ >
+ double find_min (
+ const funct& f,
+ T& x,
+ long npt,
+ const U& xl_,
+ const U& xu_,
+ const double rhobeg,
+ const double rhoend,
+ const long max_f_evals
+ ) const
+ {
+ const unsigned long n = x.size();
+ const unsigned long w_size = (npt+5)*(npt+n)+3*n*(n+5)/2;
+ std::unique_ptr<doublereal[]> w(new doublereal[w_size]);
+
+            // Make these temporary matrices because U might be some
+            // kind of matrix_exp that doesn't support taking the address
+            // of the first element.
+ matrix<double,0,1> xl(xl_);
+ matrix<double,0,1> xu(xu_);
+
+
+ return bobyqa_ (f,
+ x.size(),
+ npt,
+ &x(0),
+ &xl(0),
+ &xu(0),
+ rhobeg,
+ rhoend,
+ max_f_evals,
+ w.get() );
+ }
+
+ private:
+
+
+ template <typename funct>
+ doublereal bobyqa_(
+ const funct& calfun,
+ const integer n,
+ const integer npt,
+ doublereal *x,
+ const doublereal *xl,
+ const doublereal *xu,
+ const doublereal rhobeg,
+ const doublereal rhoend,
+ const integer maxfun,
+ doublereal *w
+ ) const
+ {
+
+ /* System generated locals */
+ integer i__1;
+ doublereal d__1, d__2;
+
+ /* Local variables */
+ integer j, id_, np, iw, igo, ihq, ixb, ixa, ifv, isl, jsl, ipq, ivl, ixn, ixo, ixp, isu, jsu, ndim;
+ doublereal temp, zero;
+ integer ibmat, izmat;
+
+
+ /* This subroutine seeks the least value of a function of many variables, */
+ /* by applying a trust region method that forms quadratic models by */
+ /* interpolation. There is usually some freedom in the interpolation */
+ /* conditions, which is taken up by minimizing the Frobenius norm of */
+ /* the change to the second derivative of the model, beginning with the */
+ /* zero matrix. The values of the variables are constrained by upper and */
+ /* lower bounds. The arguments of the subroutine are as follows. */
+
+ /* N must be set to the number of variables and must be at least two. */
+ /* NPT is the number of interpolation conditions. Its value must be in */
+ /* the interval [N+2,(N+1)(N+2)/2]. Choices that exceed 2*N+1 are not */
+ /* recommended. */
+ /* Initial values of the variables must be set in X(1),X(2),...,X(N). They */
+ /* will be changed to the values that give the least calculated F. */
+ /* For I=1,2,...,N, XL(I) and XU(I) must provide the lower and upper */
+ /* bounds, respectively, on X(I). The construction of quadratic models */
+ /* requires XL(I) to be strictly less than XU(I) for each I. Further, */
+ /* the contribution to a model from changes to the I-th variable is */
+ /* damaged severely by rounding errors if XU(I)-XL(I) is too small. */
+ /* RHOBEG and RHOEND must be set to the initial and final values of a trust */
+ /* region radius, so both must be positive with RHOEND no greater than */
+ /* RHOBEG. Typically, RHOBEG should be about one tenth of the greatest */
+ /* expected change to a variable, while RHOEND should indicate the */
+ /* accuracy that is required in the final values of the variables. An */
+ /* error return occurs if any of the differences XU(I)-XL(I), I=1,...,N, */
+ /* is less than 2*RHOBEG. */
+ /* MAXFUN must be set to an upper bound on the number of calls of CALFUN. */
+ /* The array W will be used for working space. Its length must be at least */
+ /* (NPT+5)*(NPT+N)+3*N*(N+5)/2. */
+
+ /* Parameter adjustments */
+ --w;
+ --xu;
+ --xl;
+ --x;
+
+ /* Function Body */
+ np = n + 1;
+
+ /* Return if the value of NPT is unacceptable. */
+ if (npt < n + 2 || npt > (n + 2) * np / 2) {
+ throw bobyqa_failure("Return from BOBYQA because NPT is not in the required interval");
+ //goto L40;
+ }
+
+ /* Partition the working space array, so that different parts of it can */
+ /* be treated separately during the calculation of BOBYQB. The partition */
+ /* requires the first (NPT+2)*(NPT+N)+3*N*(N+5)/2 elements of W plus the */
+ /* space that is taken by the last array in the argument list of BOBYQB. */
+
+ ndim = npt + n;
+ ixb = 1;
+ ixp = ixb + n;
+ ifv = ixp + n * npt;
+ ixo = ifv + npt;
+ igo = ixo + n;
+ ihq = igo + n;
+ ipq = ihq + n * np / 2;
+ ibmat = ipq + npt;
+ izmat = ibmat + ndim * n;
+ isl = izmat + npt * (npt - np);
+ isu = isl + n;
+ ixn = isu + n;
+ ixa = ixn + n;
+ id_ = ixa + n;
+ ivl = id_ + n;
+ iw = ivl + ndim;
+
+ /* Return if there is insufficient space between the bounds. Modify the */
+ /* initial X if necessary in order to avoid conflicts between the bounds */
+ /* and the construction of the first quadratic model. The lower and upper */
+ /* bounds on moves from the updated X are set now, in the ISL and ISU */
+ /* partitions of W, in order to provide useful and exact information about */
+ /* components of X that become within distance RHOBEG from their bounds. */
+
+ zero = 0.;
+ i__1 = n;
+ for (j = 1; j <= i__1; ++j) {
+ temp = xu[j] - xl[j];
+ if (temp < rhobeg + rhobeg) {
+ throw bobyqa_failure("Return from BOBYQA because one of the differences in x_lower and x_upper is less than 2*rho_begin");
+ //goto L40;
+ }
+ jsl = isl + j - 1;
+ jsu = jsl + n;
+ w[jsl] = xl[j] - x[j];
+ w[jsu] = xu[j] - x[j];
+ if (w[jsl] >= -(rhobeg)) {
+ if (w[jsl] >= zero) {
+ x[j] = xl[j];
+ w[jsl] = zero;
+ w[jsu] = temp;
+ } else {
+ x[j] = xl[j] + rhobeg;
+ w[jsl] = -(rhobeg);
+ /* Computing MAX */
+ d__1 = xu[j] - x[j];
+ w[jsu] = std::max(d__1,rhobeg);
+ }
+ } else if (w[jsu] <= rhobeg) {
+ if (w[jsu] <= zero) {
+ x[j] = xu[j];
+ w[jsl] = -temp;
+ w[jsu] = zero;
+ } else {
+ x[j] = xu[j] - rhobeg;
+ /* Computing MIN */
+ d__1 = xl[j] - x[j], d__2 = -(rhobeg);
+ w[jsl] = std::min(d__1,d__2);
+ w[jsu] = rhobeg;
+ }
+ }
+ /* L30: */
+ }
+
+ /* Make the call of BOBYQB. */
+
+ return bobyqb_(calfun, n, npt, &x[1], &xl[1], &xu[1], rhobeg, rhoend, maxfun, &w[
+ ixb], &w[ixp], &w[ifv], &w[ixo], &w[igo], &w[ihq], &w[ipq], &w[
+ ibmat], &w[izmat], ndim, &w[isl], &w[isu], &w[ixn], &w[ixa], &w[
+ id_], &w[ivl], &w[iw]);
+ //L40:
+ ;
+ } /* bobyqa_ */
+
+ // ----------------------------------------------------------------------------------------
+
+ template <typename funct>
+ doublereal bobyqb_(
+ const funct& calfun,
+ const integer n,
+ const integer npt,
+ doublereal *x,
+ const doublereal *xl,
+ const doublereal *xu,
+ const doublereal rhobeg,
+ const doublereal rhoend,
+ const integer maxfun,
+ doublereal *xbase,
+ doublereal *xpt,
+ doublereal *fval,
+ doublereal *xopt,
+ doublereal *gopt,
+ doublereal *hq,
+ doublereal *pq,
+ doublereal *bmat,
+ doublereal *zmat,
+ const integer ndim,
+ doublereal *sl,
+ doublereal *su,
+ doublereal *xnew,
+ doublereal *xalt,
+ doublereal *d__,
+ doublereal *vlag,
+ doublereal *w
+ ) const
+ {
+ /* System generated locals */
+ integer xpt_dim1, xpt_offset, bmat_dim1, bmat_offset, zmat_dim1,
+ zmat_offset, i__1, i__2, i__3;
+ doublereal d__1, d__2, d__3, d__4;
+
+ /* Local variables */
+ doublereal f = 0;
+ integer i__, j, k, ih, nf, jj, nh, ip, jp;
+ doublereal dx;
+ integer np;
+ doublereal den = 0, one = 0, ten = 0, dsq = 0, rho = 0, sum = 0, two = 0, diff = 0, half = 0, beta = 0, gisq = 0;
+ integer knew = 0;
+ doublereal temp, suma, sumb, bsum, fopt;
+ integer kopt = 0, nptm;
+ doublereal zero, curv;
+ integer ksav;
+ doublereal gqsq = 0, dist = 0, sumw = 0, sumz = 0, diffa = 0, diffb = 0, diffc = 0, hdiag = 0;
+ integer kbase;
+ doublereal alpha = 0, delta = 0, adelt = 0, denom = 0, fsave = 0, bdtol = 0, delsq = 0;
+ integer nresc, nfsav;
+ doublereal ratio = 0, dnorm = 0, vquad = 0, pqold = 0, tenth = 0;
+ integer itest;
+ doublereal sumpq, scaden;
+ doublereal errbig, cauchy, fracsq, biglsq, densav;
+ doublereal bdtest;
+ doublereal crvmin, frhosq;
+ doublereal distsq;
+ integer ntrits;
+ doublereal xoptsq;
+
+
+
+        /* The arguments N, NPT, X, XL, XU, RHOBEG, RHOEND and MAXFUN */
+ /* are identical to the corresponding arguments in SUBROUTINE BOBYQA. */
+ /* XBASE holds a shift of origin that should reduce the contributions */
+ /* from rounding errors to values of the model and Lagrange functions. */
+ /* XPT is a two-dimensional array that holds the coordinates of the */
+ /* interpolation points relative to XBASE. */
+ /* FVAL holds the values of F at the interpolation points. */
+ /* XOPT is set to the displacement from XBASE of the trust region centre. */
+ /* GOPT holds the gradient of the quadratic model at XBASE+XOPT. */
+ /* HQ holds the explicit second derivatives of the quadratic model. */
+ /* PQ contains the parameters of the implicit second derivatives of the */
+ /* quadratic model. */
+ /* BMAT holds the last N columns of H. */
+ /* ZMAT holds the factorization of the leading NPT by NPT submatrix of H, */
+ /* this factorization being ZMAT times ZMAT^T, which provides both the */
+ /* correct rank and positive semi-definiteness. */
+ /* NDIM is the first dimension of BMAT and has the value NPT+N. */
+ /* SL and SU hold the differences XL-XBASE and XU-XBASE, respectively. */
+ /* All the components of every XOPT are going to satisfy the bounds */
+ /* SL(I) .LEQ. XOPT(I) .LEQ. SU(I), with appropriate equalities when */
+ /* XOPT is on a constraint boundary. */
+ /* XNEW is chosen by SUBROUTINE TRSBOX or ALTMOV. Usually XBASE+XNEW is the */
+ /* vector of variables for the next call of CALFUN. XNEW also satisfies */
+ /* the SL and SU constraints in the way that has just been mentioned. */
+ /* XALT is an alternative to XNEW, chosen by ALTMOV, that may replace XNEW */
+ /* in order to increase the denominator in the updating of UPDATE. */
+ /* D is reserved for a trial step from XOPT, which is usually XNEW-XOPT. */
+ /* VLAG contains the values of the Lagrange functions at a new point X. */
+ /* They are part of a product that requires VLAG to be of length NDIM. */
+ /* W is a one-dimensional array that is used for working space. Its length */
+ /* must be at least 3*NDIM = 3*(NPT+N). */
+
+ /* Set some constants. */
+
+ /* Parameter adjustments */
+ zmat_dim1 = npt;
+ zmat_offset = 1 + zmat_dim1;
+ zmat -= zmat_offset;
+ xpt_dim1 = npt;
+ xpt_offset = 1 + xpt_dim1;
+ xpt -= xpt_offset;
+ --x;
+ --xl;
+ --xu;
+ --xbase;
+ --fval;
+ --xopt;
+ --gopt;
+ --hq;
+ --pq;
+ bmat_dim1 = ndim;
+ bmat_offset = 1 + bmat_dim1;
+ bmat -= bmat_offset;
+ --sl;
+ --su;
+ --xnew;
+ --xalt;
+ --d__;
+ --vlag;
+ --w;
+
+ /* Function Body */
+ half = .5;
+ one = 1.;
+ ten = 10.;
+ tenth = .1;
+ two = 2.;
+ zero = 0.;
+ np = n + 1;
+ nptm = npt - np;
+ nh = n * np / 2;
+
+ /* The call of PRELIM sets the elements of XBASE, XPT, FVAL, GOPT, HQ, PQ, */
+ /* BMAT and ZMAT for the first iteration, with the corresponding values of */
+        /* NF and KOPT, which are the number of calls of CALFUN so far and the */
+ /* index of the interpolation point at the trust region centre. Then the */
+ /* initial XOPT is set too. The branch to label 720 occurs if MAXFUN is */
+ /* less than NPT. GOPT will be updated if KOPT is different from KBASE. */
+
+ prelim_(calfun, n, npt, &x[1], &xl[1], &xu[1], rhobeg, maxfun, &xbase[1],
+ &xpt[xpt_offset], &fval[1], &gopt[1], &hq[1], &pq[1], &bmat[bmat_offset],
+ &zmat[zmat_offset], ndim, &sl[1], &su[1], nf, kopt);
+ xoptsq = zero;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ xopt[i__] = xpt[kopt + i__ * xpt_dim1];
+ /* L10: */
+ /* Computing 2nd power */
+ d__1 = xopt[i__];
+ xoptsq += d__1 * d__1;
+ }
+ fsave = fval[1];
+ if (nf < npt) {
+ throw bobyqa_failure("Return from BOBYQA because the objective function has been called max_f_evals times.");
+ //goto L720;
+ }
+ kbase = 1;
+
+ /* Complete the settings that are required for the iterative procedure. */
+
+ rho = rhobeg;
+ delta = rho;
+ nresc = nf;
+ ntrits = 0;
+ diffa = zero;
+ diffb = zero;
+ itest = 0;
+ nfsav = nf;
+
+ /* Update GOPT if necessary before the first iteration and after each */
+ /* call of RESCUE that makes a call of CALFUN. */
+
+L20:
+ if (kopt != kbase) {
+ ih = 0;
+ i__1 = n;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ ++ih;
+ if (i__ < j) {
+ gopt[j] += hq[ih] * xopt[i__];
+ }
+ /* L30: */
+ gopt[i__] += hq[ih] * xopt[j];
+ }
+ }
+ if (nf > npt) {
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ temp = zero;
+ i__1 = n;
+ for (j = 1; j <= i__1; ++j) {
+ /* L40: */
+ temp += xpt[k + j * xpt_dim1] * xopt[j];
+ }
+ temp = pq[k] * temp;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ /* L50: */
+ gopt[i__] += temp * xpt[k + i__ * xpt_dim1];
+ }
+ }
+ }
+ }
+
+ /* Generate the next point in the trust region that provides a small value */
+ /* of the quadratic model subject to the constraints on the variables. */
+ /* The integer NTRITS is set to the number "trust region" iterations that */
+ /* have occurred since the last "alternative" iteration. If the length */
+ /* of XNEW-XOPT is less than HALF*RHO, however, then there is a branch to */
+ /* label 650 or 680 with NTRITS=-1, instead of calculating F at XNEW. */
+
+L60:
+ trsbox_(n, npt, &xpt[xpt_offset], &xopt[1], &gopt[1], &hq[1], &pq[1], &sl[1],
+ &su[1], delta, &xnew[1], &d__[1], &w[1], &w[np], &w[np + n],
+ &w[np + (n << 1)], &w[np + n * 3], &dsq, &crvmin);
+ /* Computing MIN */
+ d__1 = delta, d__2 = std::sqrt(dsq);
+ dnorm = std::min(d__1,d__2);
+ if (dnorm < half * rho) {
+ ntrits = -1;
+ /* Computing 2nd power */
+ d__1 = ten * rho;
+ distsq = d__1 * d__1;
+ if (nf <= nfsav + 2) {
+ goto L650;
+ }
+
+ /* The following choice between labels 650 and 680 depends on whether or */
+ /* not our work with the current RHO seems to be complete. Either RHO is */
+ /* decreased or termination occurs if the errors in the quadratic model at */
+ /* the last three interpolation points compare favourably with predictions */
+ /* of likely improvements to the model within distance HALF*RHO of XOPT. */
+
+ /* Computing MAX */
+ d__1 = std::max(diffa,diffb);
+ errbig = std::max(d__1,diffc);
+ frhosq = rho * .125 * rho;
+ if (crvmin > zero && errbig > frhosq * crvmin) {
+ goto L650;
+ }
+ bdtol = errbig / rho;
+ i__1 = n;
+ for (j = 1; j <= i__1; ++j) {
+ bdtest = bdtol;
+ if (xnew[j] == sl[j]) {
+ bdtest = w[j];
+ }
+ if (xnew[j] == su[j]) {
+ bdtest = -w[j];
+ }
+ if (bdtest < bdtol) {
+ curv = hq[(j + j * j) / 2];
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ /* L70: */
+ /* Computing 2nd power */
+ d__1 = xpt[k + j * xpt_dim1];
+ curv += pq[k] * (d__1 * d__1);
+ }
+ bdtest += half * curv * rho;
+ if (bdtest < bdtol) {
+ goto L650;
+ }
+ }
+ /* L80: */
+ }
+ goto L680;
+ }
+ ++ntrits;
+
+ /* Severe cancellation is likely to occur if XOPT is too far from XBASE. */
+ /* If the following test holds, then XBASE is shifted so that XOPT becomes */
+ /* zero. The appropriate changes are made to BMAT and to the second */
+ /* derivatives of the current model, beginning with the changes to BMAT */
+ /* that do not depend on ZMAT. VLAG is used temporarily for working space. */
+
+L90:
+ if (dsq <= xoptsq * .001) {
+ fracsq = xoptsq * .25;
+ sumpq = zero;
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ sumpq += pq[k];
+ sum = -half * xoptsq;
+ i__2 = n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ /* L100: */
+ sum += xpt[k + i__ * xpt_dim1] * xopt[i__];
+ }
+ w[npt + k] = sum;
+ temp = fracsq - half * sum;
+ i__2 = n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ w[i__] = bmat[k + i__ * bmat_dim1];
+ vlag[i__] = sum * xpt[k + i__ * xpt_dim1] + temp * xopt[i__];
+ ip = npt + i__;
+ i__3 = i__;
+ for (j = 1; j <= i__3; ++j) {
+ /* L110: */
+ bmat[ip + j * bmat_dim1] = bmat[ip + j * bmat_dim1] + w[
+ i__] * vlag[j] + vlag[i__] * w[j];
+ }
+ }
+ }
+
+ /* Then the revisions of BMAT that depend on ZMAT are calculated. */
+
+ i__3 = nptm;
+ for (jj = 1; jj <= i__3; ++jj) {
+ sumz = zero;
+ sumw = zero;
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ sumz += zmat[k + jj * zmat_dim1];
+ vlag[k] = w[npt + k] * zmat[k + jj * zmat_dim1];
+ /* L120: */
+ sumw += vlag[k];
+ }
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ sum = (fracsq * sumz - half * sumw) * xopt[j];
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ /* L130: */
+ sum += vlag[k] * xpt[k + j * xpt_dim1];
+ }
+ w[j] = sum;
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ /* L140: */
+ bmat[k + j * bmat_dim1] += sum * zmat[k + jj * zmat_dim1];
+ }
+ }
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ ip = i__ + npt;
+ temp = w[i__];
+ i__2 = i__;
+ for (j = 1; j <= i__2; ++j) {
+ /* L150: */
+ bmat[ip + j * bmat_dim1] += temp * w[j];
+ }
+ }
+ }
+
+ /* The following instructions complete the shift, including the changes */
+ /* to the second derivative parameters of the quadratic model. */
+
+ ih = 0;
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ w[j] = -half * sumpq * xopt[j];
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ w[j] += pq[k] * xpt[k + j * xpt_dim1];
+ /* L160: */
+ xpt[k + j * xpt_dim1] -= xopt[j];
+ }
+ i__1 = j;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ ++ih;
+ hq[ih] = hq[ih] + w[i__] * xopt[j] + xopt[i__] * w[j];
+ /* L170: */
+ bmat[npt + i__ + j * bmat_dim1] = bmat[npt + j + i__ *
+ bmat_dim1];
+ }
+ }
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ xbase[i__] += xopt[i__];
+ xnew[i__] -= xopt[i__];
+ sl[i__] -= xopt[i__];
+ su[i__] -= xopt[i__];
+ /* L180: */
+ xopt[i__] = zero;
+ }
+ xoptsq = zero;
+ }
+ if (ntrits == 0) {
+ goto L210;
+ }
+ goto L230;
+
+ /* XBASE is also moved to XOPT by a call of RESCUE. This calculation is */
+ /* more expensive than the previous shift, because new matrices BMAT and */
+ /* ZMAT are generated from scratch, which may include the replacement of */
+ /* interpolation points whose positions seem to be causing near linear */
+ /* dependence in the interpolation conditions. Therefore RESCUE is called */
+ /* only if rounding errors have reduced by at least a factor of two the */
+ /* denominator of the formula for updating the H matrix. It provides a */
+ /* useful safeguard, but is not invoked in most applications of BOBYQA. */
+
+L190:
+ nfsav = nf;
+ kbase = kopt;
+ rescue_(calfun, n, npt, &xl[1], &xu[1], maxfun, &xbase[1], &xpt[
+ xpt_offset], &fval[1], &xopt[1], &gopt[1], &hq[1], &pq[1], &bmat[
+ bmat_offset], &zmat[zmat_offset], ndim, &sl[1], &su[1], nf, delta,
+ kopt, &vlag[1], &w[1], &w[n + np], &w[ndim + np]);
+
+ /* XOPT is updated now in case the branch below to label 720 is taken. */
+ /* Any updating of GOPT occurs after the branch below to label 20, which */
+ /* leads to a trust region iteration as does the branch to label 60. */
+
+ xoptsq = zero;
+ if (kopt != kbase) {
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ xopt[i__] = xpt[kopt + i__ * xpt_dim1];
+ /* L200: */
+ /* Computing 2nd power */
+ d__1 = xopt[i__];
+ xoptsq += d__1 * d__1;
+ }
+ }
+ if (nf < 0) {
+ nf = maxfun;
+ throw bobyqa_failure("Return from BOBYQA because the objective function has been called max_f_evals times.");
+ //goto L720;
+ }
+ nresc = nf;
+ if (nfsav < nf) {
+ nfsav = nf;
+ goto L20;
+ }
+ if (ntrits > 0) {
+ goto L60;
+ }
+
+ /* Pick two alternative vectors of variables, relative to XBASE, that */
+ /* are suitable as new positions of the KNEW-th interpolation point. */
+ /* Firstly, XNEW is set to the point on a line through XOPT and another */
+ /* interpolation point that minimizes the predicted value of the next */
+ /* denominator, subject to ||XNEW - XOPT|| .LEQ. ADELT and to the SL */
+ /* and SU bounds. Secondly, XALT is set to the best feasible point on */
+ /* a constrained version of the Cauchy step of the KNEW-th Lagrange */
+ /* function, the corresponding value of the square of this function */
+ /* being returned in CAUCHY. The choice between these alternatives is */
+ /* going to be made when the denominator is calculated. */
+
+L210:
+ altmov_(n, npt, &xpt[xpt_offset], &xopt[1], &bmat[bmat_offset], &zmat[zmat_offset],
+ ndim, &sl[1], &su[1], kopt, knew, adelt, &xnew[1],
+ &xalt[1], alpha, cauchy, &w[1], &w[np], &w[ndim + 1]);
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ /* L220: */
+ d__[i__] = xnew[i__] - xopt[i__];
+ }
+
+ /* Calculate VLAG and BETA for the current choice of D. The scalar */
+ /* product of D with XPT(K,.) is going to be held in W(NPT+K) for */
+ /* use when VQUAD is calculated. */
+
+L230:
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ suma = zero;
+ sumb = zero;
+ sum = zero;
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ suma += xpt[k + j * xpt_dim1] * d__[j];
+ sumb += xpt[k + j * xpt_dim1] * xopt[j];
+ /* L240: */
+ sum += bmat[k + j * bmat_dim1] * d__[j];
+ }
+ w[k] = suma * (half * suma + sumb);
+ vlag[k] = sum;
+ /* L250: */
+ w[npt + k] = suma;
+ }
+ beta = zero;
+ i__1 = nptm;
+ for (jj = 1; jj <= i__1; ++jj) {
+ sum = zero;
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ /* L260: */
+ sum += zmat[k + jj * zmat_dim1] * w[k];
+ }
+ beta -= sum * sum;
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ /* L270: */
+ vlag[k] += sum * zmat[k + jj * zmat_dim1];
+ }
+ }
+ dsq = zero;
+ bsum = zero;
+ dx = zero;
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ /* Computing 2nd power */
+ d__1 = d__[j];
+ dsq += d__1 * d__1;
+ sum = zero;
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ /* L280: */
+ sum += w[k] * bmat[k + j * bmat_dim1];
+ }
+ bsum += sum * d__[j];
+ jp = npt + j;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ /* L290: */
+ sum += bmat[jp + i__ * bmat_dim1] * d__[i__];
+ }
+ vlag[jp] = sum;
+ bsum += sum * d__[j];
+ /* L300: */
+ dx += d__[j] * xopt[j];
+ }
+ beta = dx * dx + dsq * (xoptsq + dx + dx + half * dsq) + beta - bsum;
+ vlag[kopt] += one;
+
+ /* If NTRITS is zero, the denominator may be increased by replacing */
+ /* the step D of ALTMOV by a Cauchy step. Then RESCUE may be called if */
+ /* rounding errors have damaged the chosen denominator. */
+
+ if (ntrits == 0) {
+ /* Computing 2nd power */
+ d__1 = vlag[knew];
+ denom = d__1 * d__1 + alpha * beta;
+ if (denom < cauchy && cauchy > zero) {
+ i__2 = n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ xnew[i__] = xalt[i__];
+ /* L310: */
+ d__[i__] = xnew[i__] - xopt[i__];
+ }
+ cauchy = zero;
+ goto L230;
+ }
+ /* Computing 2nd power */
+ d__1 = vlag[knew];
+ if (denom <= half * (d__1 * d__1)) {
+ if (nf > nresc) {
+ goto L190;
+ }
+ throw bobyqa_failure("Return from BOBYQA because of much cancellation in a denominator.");
+ //goto L720;
+ }
+
+ /* Alternatively, if NTRITS is positive, then set KNEW to the index of */
+ /* the next interpolation point to be deleted to make room for a trust */
+ /* region step. Again RESCUE may be called if rounding errors have damaged */
+ /* the chosen denominator, which is the reason for attempting to select */
+ /* KNEW before calculating the next value of the objective function. */
+
+ } else {
+ delsq = delta * delta;
+ scaden = zero;
+ biglsq = zero;
+ knew = 0;
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ if (k == kopt) {
+ goto L350;
+ }
+ hdiag = zero;
+ i__1 = nptm;
+ for (jj = 1; jj <= i__1; ++jj) {
+ /* L330: */
+ /* Computing 2nd power */
+ d__1 = zmat[k + jj * zmat_dim1];
+ hdiag += d__1 * d__1;
+ }
+ /* Computing 2nd power */
+ d__1 = vlag[k];
+ den = beta * hdiag + d__1 * d__1;
+ distsq = zero;
+ i__1 = n;
+ for (j = 1; j <= i__1; ++j) {
+ /* L340: */
+ /* Computing 2nd power */
+ d__1 = xpt[k + j * xpt_dim1] - xopt[j];
+ distsq += d__1 * d__1;
+ }
+ /* Computing MAX */
+ /* Computing 2nd power */
+ d__3 = distsq / delsq;
+ d__1 = one, d__2 = d__3 * d__3;
+ temp = std::max(d__1,d__2);
+ if (temp * den > scaden) {
+ scaden = temp * den;
+ knew = k;
+ denom = den;
+ }
+ /* Computing MAX */
+ /* Computing 2nd power */
+ d__3 = vlag[k];
+ d__1 = biglsq, d__2 = temp * (d__3 * d__3);
+ biglsq = std::max(d__1,d__2);
+L350:
+ ;
+ }
+ if (scaden <= half * biglsq) {
+ if (nf > nresc) {
+ goto L190;
+ }
+ throw bobyqa_failure("Return from BOBYQA because of much cancellation in a denominator.");
+ //goto L720;
+ }
+ }
+
+ /* Put the variables for the next calculation of the objective function */
+ /* in XNEW, with any adjustments for the bounds. */
+
+
+ /* Calculate the value of the objective function at XBASE+XNEW, unless */
+ /* the limit on the number of calculations of F has been reached. */
+
+L360:
+ i__2 = n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ /* Computing MIN */
+ /* Computing MAX */
+ d__3 = xl[i__], d__4 = xbase[i__] + xnew[i__];
+ d__1 = std::max(d__3,d__4), d__2 = xu[i__];
+ x[i__] = std::min(d__1,d__2);
+ if (xnew[i__] == sl[i__]) {
+ x[i__] = xl[i__];
+ }
+ if (xnew[i__] == su[i__]) {
+ x[i__] = xu[i__];
+ }
+ /* L380: */
+ }
+ if (nf >= maxfun) {
+ throw bobyqa_failure("Return from BOBYQA because the objective function has been called max_f_evals times.");
+ //goto L720;
+ }
+ ++nf;
+ f = calfun(mat(&x[1], n));
+ if (ntrits == -1) {
+ fsave = f;
+ goto L720;
+ }
+
+ /* Use the quadratic model to predict the change in F due to the step D, */
+ /* and set DIFF to the error of this prediction. */
+
+ fopt = fval[kopt];
+ vquad = zero;
+ ih = 0;
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ vquad += d__[j] * gopt[j];
+ i__1 = j;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ ++ih;
+ temp = d__[i__] * d__[j];
+ if (i__ == j) {
+ temp = half * temp;
+ }
+ /* L410: */
+ vquad += hq[ih] * temp;
+ }
+ }
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ /* L420: */
+ /* Computing 2nd power */
+ d__1 = w[npt + k];
+ vquad += half * pq[k] * (d__1 * d__1);
+ }
+ diff = f - fopt - vquad;
+ diffc = diffb;
+ diffb = diffa;
+ diffa = std::abs(diff);
+ if (dnorm > rho) {
+ nfsav = nf;
+ }
+
+ /* Pick the next value of DELTA after a trust region step. */
+
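+ /* This is the usual trust region radius control: RATIO compares the */
+ /* actual reduction F-FOPT with the predicted reduction VQUAD. The */
+ /* radius shrinks to min(DELTA/2,DNORM) when RATIO <= 0.1, stays near */
+ /* max(DELTA/2,DNORM) when RATIO <= 0.7, and otherwise grows to */
+ /* max(DELTA/2,2*DNORM); DELTA collapses onto RHO once it falls within */
+ /* a factor of 1.5 of it. */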
+ if (ntrits > 0) {
+ if (vquad >= zero) {
+ throw bobyqa_failure("Return from BOBYQA because a trust region step has failed to reduce Q.");
+ //goto L720;
+ }
+ ratio = (f - fopt) / vquad;
+ if (ratio <= tenth) {
+ /* Computing MIN */
+ d__1 = half * delta;
+ delta = std::min(d__1,dnorm);
+ } else if (ratio <= .7) {
+ /* Computing MAX */
+ d__1 = half * delta;
+ delta = std::max(d__1,dnorm);
+ } else {
+ /* Computing MAX */
+ d__1 = half * delta, d__2 = dnorm + dnorm;
+ delta = std::max(d__1,d__2);
+ }
+ if (delta <= rho * 1.5) {
+ delta = rho;
+ }
+
+ /* Recalculate KNEW and DENOM if the new F is less than FOPT. */
+
+ if (f < fopt) {
+ ksav = knew;
+ densav = denom;
+ delsq = delta * delta;
+ scaden = zero;
+ biglsq = zero;
+ knew = 0;
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ hdiag = zero;
+ i__2 = nptm;
+ for (jj = 1; jj <= i__2; ++jj) {
+ /* L440: */
+ /* Computing 2nd power */
+ d__1 = zmat[k + jj * zmat_dim1];
+ hdiag += d__1 * d__1;
+ }
+ /* Computing 2nd power */
+ d__1 = vlag[k];
+ den = beta * hdiag + d__1 * d__1;
+ distsq = zero;
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ /* L450: */
+ /* Computing 2nd power */
+ d__1 = xpt[k + j * xpt_dim1] - xnew[j];
+ distsq += d__1 * d__1;
+ }
+ /* Computing MAX */
+ /* Computing 2nd power */
+ d__3 = distsq / delsq;
+ d__1 = one, d__2 = d__3 * d__3;
+ temp = std::max(d__1,d__2);
+ if (temp * den > scaden) {
+ scaden = temp * den;
+ knew = k;
+ denom = den;
+ }
+ /* L460: */
+ /* Computing MAX */
+ /* Computing 2nd power */
+ d__3 = vlag[k];
+ d__1 = biglsq, d__2 = temp * (d__3 * d__3);
+ biglsq = std::max(d__1,d__2);
+ }
+ if (scaden <= half * biglsq) {
+ knew = ksav;
+ denom = densav;
+ }
+ }
+ }
+
+ /* Update BMAT and ZMAT, so that the KNEW-th interpolation point can be */
+ /* moved. Also update the second derivative terms of the model. */
+
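+ /* UPDATE applies Powell's rank-2 change to the BMAT/ZMAT factorization */
+ /* of H. The loops that follow move the implicit second derivative term */
+ /* PQ(KNEW) into the explicit matrix HQ, and add DIFF times the KNEW-th */
+ /* column of ZMAT*ZMAT' to PQ, so that the revised model interpolates F */
+ /* at the replaced point. */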
+ update_(n, npt, &bmat[bmat_offset], &zmat[zmat_offset], ndim, &vlag[1],
+ beta, denom, knew, &w[1]);
+ ih = 0;
+ pqold = pq[knew];
+ pq[knew] = zero;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = pqold * xpt[knew + i__ * xpt_dim1];
+ i__2 = i__;
+ for (j = 1; j <= i__2; ++j) {
+ ++ih;
+ /* L470: */
+ hq[ih] += temp * xpt[knew + j * xpt_dim1];
+ }
+ }
+ i__2 = nptm;
+ for (jj = 1; jj <= i__2; ++jj) {
+ temp = diff * zmat[knew + jj * zmat_dim1];
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ /* L480: */
+ pq[k] += temp * zmat[k + jj * zmat_dim1];
+ }
+ }
+
+ /* Include the new interpolation point, and make the changes to GOPT at */
+ /* the old XOPT that are caused by the updating of the quadratic model. */
+
+ fval[knew] = f;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ xpt[knew + i__ * xpt_dim1] = xnew[i__];
+ /* L490: */
+ w[i__] = bmat[knew + i__ * bmat_dim1];
+ }
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ suma = zero;
+ i__2 = nptm;
+ for (jj = 1; jj <= i__2; ++jj) {
+ /* L500: */
+ suma += zmat[knew + jj * zmat_dim1] * zmat[k + jj * zmat_dim1];
+ }
+ sumb = zero;
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ /* L510: */
+ sumb += xpt[k + j * xpt_dim1] * xopt[j];
+ }
+ temp = suma * sumb;
+ i__2 = n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ /* L520: */
+ w[i__] += temp * xpt[k + i__ * xpt_dim1];
+ }
+ }
+ i__2 = n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ /* L530: */
+ gopt[i__] += diff * w[i__];
+ }
+
+ /* Update XOPT, GOPT and KOPT if the new calculated F is less than FOPT. */
+
+ if (f < fopt) {
+ kopt = knew;
+ xoptsq = zero;
+ ih = 0;
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ xopt[j] = xnew[j];
+ /* Computing 2nd power */
+ d__1 = xopt[j];
+ xoptsq += d__1 * d__1;
+ i__1 = j;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ ++ih;
+ if (i__ < j) {
+ gopt[j] += hq[ih] * d__[i__];
+ }
+ /* L540: */
+ gopt[i__] += hq[ih] * d__[j];
+ }
+ }
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ temp = zero;
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ /* L550: */
+ temp += xpt[k + j * xpt_dim1] * d__[j];
+ }
+ temp = pq[k] * temp;
+ i__2 = n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ /* L560: */
+ gopt[i__] += temp * xpt[k + i__ * xpt_dim1];
+ }
+ }
+ }
+
+ /* Calculate the parameters of the least Frobenius norm interpolant to */
+ /* the current data, the gradient of this interpolant at XOPT being put */
+ /* into VLAG(NPT+I), I=1,2,...,N. */
+
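+ /* As a safeguard against a bad quadratic model, the code tracks the */
+ /* interpolant to the current data whose Hessian has least Frobenius */
+ /* norm. GQSQ and GISQ below are the squared norms of the bound- */
+ /* projected gradients at XOPT of the current model and of this */
+ /* interpolant; if the alternative gradient is much the smaller on */
+ /* three consecutive trust region steps (ITEST >= 3), the model is */
+ /* replaced. */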
+ if (ntrits > 0) {
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ vlag[k] = fval[k] - fval[kopt];
+ /* L570: */
+ w[k] = zero;
+ }
+ i__2 = nptm;
+ for (j = 1; j <= i__2; ++j) {
+ sum = zero;
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ /* L580: */
+ sum += zmat[k + j * zmat_dim1] * vlag[k];
+ }
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ /* L590: */
+ w[k] += sum * zmat[k + j * zmat_dim1];
+ }
+ }
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ sum = zero;
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ /* L600: */
+ sum += xpt[k + j * xpt_dim1] * xopt[j];
+ }
+ w[k + npt] = w[k];
+ /* L610: */
+ w[k] = sum * w[k];
+ }
+ gqsq = zero;
+ gisq = zero;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ sum = zero;
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ /* L620: */
+ sum = sum + bmat[k + i__ * bmat_dim1] * vlag[k] + xpt[k + i__
+ * xpt_dim1] * w[k];
+ }
+ if (xopt[i__] == sl[i__]) {
+ /* Computing MIN */
+ d__2 = zero, d__3 = gopt[i__];
+ /* Computing 2nd power */
+ d__1 = std::min(d__2,d__3);
+ gqsq += d__1 * d__1;
+ /* Computing 2nd power */
+ d__1 = std::min(zero,sum);
+ gisq += d__1 * d__1;
+ } else if (xopt[i__] == su[i__]) {
+ /* Computing MAX */
+ d__2 = zero, d__3 = gopt[i__];
+ /* Computing 2nd power */
+ d__1 = std::max(d__2,d__3);
+ gqsq += d__1 * d__1;
+ /* Computing 2nd power */
+ d__1 = std::max(zero,sum);
+ gisq += d__1 * d__1;
+ } else {
+ /* Computing 2nd power */
+ d__1 = gopt[i__];
+ gqsq += d__1 * d__1;
+ gisq += sum * sum;
+ }
+ /* L630: */
+ vlag[npt + i__] = sum;
+ }
+
+ /* Test whether to replace the new quadratic model by the least Frobenius */
+ /* norm interpolant, making the replacement if the test is satisfied. */
+
+ ++itest;
+ if (gqsq < ten * gisq) {
+ itest = 0;
+ }
+ if (itest >= 3) {
+ i__1 = std::max(npt,nh);
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (i__ <= n) {
+ gopt[i__] = vlag[npt + i__];
+ }
+ if (i__ <= npt) {
+ pq[i__] = w[npt + i__];
+ }
+ if (i__ <= nh) {
+ hq[i__] = zero;
+ }
+ itest = 0;
+ /* L640: */
+ }
+ }
+ }
+
+ /* If a trust region step has provided a sufficient decrease in F, then */
+ /* branch for another trust region calculation. The case NTRITS=0 occurs */
+ /* when the new interpolation point was reached by an alternative step. */
+
+ if (ntrits == 0) {
+ goto L60;
+ }
+ if (f <= fopt + tenth * vquad) {
+ goto L60;
+ }
+
+ /* Alternatively, find out if the interpolation points are close enough */
+ /* to the best point so far. */
+
+ /* Computing MAX */
+ /* Computing 2nd power */
+ d__3 = two * delta;
+ /* Computing 2nd power */
+ d__4 = ten * rho;
+ d__1 = d__3 * d__3, d__2 = d__4 * d__4;
+ distsq = std::max(d__1,d__2);
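+ /* A point is considered remote, and hence a candidate for the ALTMOV */
+ /* repositioning below, if its squared distance from XOPT exceeds */
+ /* DISTSQ = max((2*DELTA)^2, (10*RHO)^2). */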
+L650:
+ knew = 0;
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ sum = zero;
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ /* L660: */
+ /* Computing 2nd power */
+ d__1 = xpt[k + j * xpt_dim1] - xopt[j];
+ sum += d__1 * d__1;
+ }
+ if (sum > distsq) {
+ knew = k;
+ distsq = sum;
+ }
+ /* L670: */
+ }
+
+ /* If KNEW is positive, then ALTMOV finds alternative new positions for */
+ /* the KNEW-th interpolation point within distance ADELT of XOPT. It is */
+ /* reached via label 90. Otherwise, there is a branch to label 60 for */
+ /* another trust region iteration, unless the calculations with the */
+ /* current RHO are complete. */
+
+ if (knew > 0) {
+ dist = std::sqrt(distsq);
+ if (ntrits == -1) {
+ /* Computing MIN */
+ d__1 = tenth * delta, d__2 = half * dist;
+ delta = std::min(d__1,d__2);
+ if (delta <= rho * 1.5) {
+ delta = rho;
+ }
+ }
+ ntrits = 0;
+ /* Computing MAX */
+ /* Computing MIN */
+ d__2 = tenth * dist;
+ d__1 = std::min(d__2,delta);
+ adelt = std::max(d__1,rho);
+ dsq = adelt * adelt;
+ goto L90;
+ }
+ if (ntrits == -1) {
+ goto L680;
+ }
+ if (ratio > zero) {
+ goto L60;
+ }
+ if (std::max(delta,dnorm) > rho) {
+ goto L60;
+ }
+
+ /* The calculations with the current value of RHO are complete. Pick the */
+ /* next values of RHO and DELTA. */
+
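+ /* RHO decreases roughly geometrically from RHOBEG to RHOEND: it is set */
+ /* to RHOEND when within a factor of 16 of it, to the geometric mean */
+ /* sqrt(RHO*RHOEND) when within a factor of 250, and to RHO/10 */
+ /* otherwise, while DELTA is halved but kept at least RHO. */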
+L680:
+ if (rho > rhoend) {
+ delta = half * rho;
+ ratio = rho / rhoend;
+ if (ratio <= 16.) {
+ rho = rhoend;
+ } else if (ratio <= 250.) {
+ rho = std::sqrt(ratio) * rhoend;
+ } else {
+ rho = tenth * rho;
+ }
+ delta = std::max(delta,rho);
+ ntrits = 0;
+ nfsav = nf;
+ goto L60;
+ }
+
+ /* Return from the calculation, after another Newton-Raphson step, if */
+ /* it is too short to have been tried before. */
+
+ if (ntrits == -1) {
+ goto L360;
+ }
+L720:
+ if (fval[kopt] <= fsave) {
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ /* Computing MIN */
+ /* Computing MAX */
+ d__3 = xl[i__], d__4 = xbase[i__] + xopt[i__];
+ d__1 = std::max(d__3,d__4), d__2 = xu[i__];
+ x[i__] = std::min(d__1,d__2);
+ if (xopt[i__] == sl[i__]) {
+ x[i__] = xl[i__];
+ }
+ if (xopt[i__] == su[i__]) {
+ x[i__] = xu[i__];
+ }
+ /* L730: */
+ }
+ f = fval[kopt];
+ }
+
+ return f;
+ } /* bobyqb_ */
+
+ // ----------------------------------------------------------------------------------------
+
+ void altmov_(
+ const integer n,
+ const integer npt,
+ const doublereal *xpt,
+ const doublereal *xopt,
+ const doublereal *bmat,
+ const doublereal *zmat,
+ const integer ndim,
+ const doublereal *sl,
+ const doublereal *su,
+ const integer kopt,
+ const integer knew,
+ const doublereal adelt,
+ doublereal *xnew,
+ doublereal *xalt,
+ doublereal& alpha,
+ doublereal& cauchy,
+ doublereal *glag,
+ doublereal *hcol,
+ doublereal *w
+ ) const
+ {
+ /* System generated locals */
+ integer xpt_dim1, xpt_offset, bmat_dim1, bmat_offset, zmat_dim1,
+ zmat_offset, i__1, i__2;
+ doublereal d__1, d__2, d__3, d__4;
+
+
+ /* Local variables */
+ integer i__, j, k;
+ doublereal ha, gw, one, diff, half;
+ integer ilbd, isbd;
+ doublereal slbd;
+ integer iubd;
+ doublereal vlag, subd, temp;
+ integer ksav = 0;
+ doublereal step = 0, zero = 0, curv = 0;
+ integer iflag;
+ doublereal scale = 0, csave = 0, tempa = 0, tempb = 0, tempd = 0, const__ = 0, sumin = 0,
+ ggfree = 0;
+ integer ibdsav = 0;
+ doublereal dderiv = 0, bigstp = 0, predsq = 0, presav = 0, distsq = 0, stpsav = 0, wfixsq = 0, wsqsav = 0;
+
+
+ /* The arguments N, NPT, XPT, XOPT, BMAT, ZMAT, NDIM, SL and SU all have */
+ /* the same meanings as the corresponding arguments of BOBYQB. */
+ /* KOPT is the index of the optimal interpolation point. */
+ /* KNEW is the index of the interpolation point that is going to be moved. */
+ /* ADELT is the current trust region bound. */
+ /* XNEW will be set to a suitable new position for the interpolation point */
+ /* XPT(KNEW,.). Specifically, it satisfies the SL, SU and trust region */
+ /* bounds and it should provide a large denominator in the next call of */
+ /* UPDATE. The step XNEW-XOPT from XOPT is restricted to moves along the */
+ /* straight lines through XOPT and another interpolation point. */
+ /* XALT also provides a large value of the modulus of the KNEW-th Lagrange */
+ /* function subject to the constraints that have been mentioned, its main */
+ /* difference from XNEW being that XALT-XOPT is a constrained version of */
+ /* the Cauchy step within the trust region. An exception is that XALT is */
+ /* not calculated if all components of GLAG (see below) are zero. */
+ /* ALPHA will be set to the KNEW-th diagonal element of the H matrix. */
+ /* CAUCHY will be set to the square of the KNEW-th Lagrange function at */
+ /* the step XALT-XOPT from XOPT for the vector XALT that is returned, */
+ /* except that CAUCHY is set to zero if XALT is not calculated. */
+ /* GLAG is a working space vector of length N for the gradient of the */
+ /* KNEW-th Lagrange function at XOPT. */
+ /* HCOL is a working space vector of length NPT for the second derivative */
+ /* coefficients of the KNEW-th Lagrange function. */
+ /* W is a working space vector of length 2N that is going to hold the */
+ /* constrained Cauchy step from XOPT of the Lagrange function, followed */
+ /* by the downhill version of XALT when the uphill step is calculated. */
+
+ /* Set the first NPT components of W to the leading elements of the */
+ /* KNEW-th column of the H matrix. */
+
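+ /* In outline (coordinates measured from XBASE): the KNEW-th Lagrange */
+ /* function is L(x) = c + g'*x + 0.5*sum_k HCOL(K)*(XPT(K,.)'*x)^2, */
+ /* where HCOL = ZMAT*(ZMAT'*e_KNEW) is the KNEW-th column of the */
+ /* leading NPT by NPT block of H. The loops below form HCOL and then */
+ /* GLAG, the gradient of L at XOPT. */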
+ /* Parameter adjustments */
+ zmat_dim1 = npt;
+ zmat_offset = 1 + zmat_dim1;
+ zmat -= zmat_offset;
+ xpt_dim1 = npt;
+ xpt_offset = 1 + xpt_dim1;
+ xpt -= xpt_offset;
+ --xopt;
+ bmat_dim1 = ndim;
+ bmat_offset = 1 + bmat_dim1;
+ bmat -= bmat_offset;
+ --sl;
+ --su;
+ --xnew;
+ --xalt;
+ --glag;
+ --hcol;
+ --w;
+
+ /* Function Body */
+ half = .5;
+ one = 1.;
+ zero = 0.;
+ const__ = one + std::sqrt(2.);
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ /* L10: */
+ hcol[k] = zero;
+ }
+ i__1 = npt - n - 1;
+ for (j = 1; j <= i__1; ++j) {
+ temp = zmat[knew + j * zmat_dim1];
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ /* L20: */
+ hcol[k] += temp * zmat[k + j * zmat_dim1];
+ }
+ }
+ alpha = hcol[knew];
+ ha = half * alpha;
+
+ /* Calculate the gradient of the KNEW-th Lagrange function at XOPT. */
+
+ i__2 = n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ /* L30: */
+ glag[i__] = bmat[knew + i__ * bmat_dim1];
+ }
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ temp = zero;
+ i__1 = n;
+ for (j = 1; j <= i__1; ++j) {
+ /* L40: */
+ temp += xpt[k + j * xpt_dim1] * xopt[j];
+ }
+ temp = hcol[k] * temp;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ /* L50: */
+ glag[i__] += temp * xpt[k + i__ * xpt_dim1];
+ }
+ }
+
+ /* Search for a large denominator along the straight lines through XOPT */
+ /* and another interpolation point. SLBD and SUBD will be lower and upper */
+ /* bounds on the step along each of these lines in turn. PREDSQ will be */
+ /* set to the square of the predicted denominator for each line. PRESAV */
+ /* will be set to the largest admissible value of PREDSQ that occurs. */
+
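+ /* For a step XOPT+t*(XPT(K,.)-XOPT), the predicted squared denominator */
+ /* used below is PREDSQ = VLAG^2*(VLAG^2 + 0.5*ALPHA*T^2) with */
+ /* T = t*(1-t)*DISTSQ, VLAG being the value of the KNEW-th Lagrange */
+ /* function at the trial point. */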
+ presav = zero;
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ if (k == kopt) {
+ goto L80;
+ }
+ dderiv = zero;
+ distsq = zero;
+ i__2 = n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = xpt[k + i__ * xpt_dim1] - xopt[i__];
+ dderiv += glag[i__] * temp;
+ /* L60: */
+ distsq += temp * temp;
+ }
+ subd = adelt / std::sqrt(distsq);
+ slbd = -subd;
+ ilbd = 0;
+ iubd = 0;
+ sumin = std::min(one,subd);
+
+ /* Revise SLBD and SUBD if necessary because of the bounds in SL and SU. */
+
+ i__2 = n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ temp = xpt[k + i__ * xpt_dim1] - xopt[i__];
+ if (temp > zero) {
+ if (slbd * temp < sl[i__] - xopt[i__]) {
+ slbd = (sl[i__] - xopt[i__]) / temp;
+ ilbd = -i__;
+ }
+ if (subd * temp > su[i__] - xopt[i__]) {
+ /* Computing MAX */
+ d__1 = sumin, d__2 = (su[i__] - xopt[i__]) / temp;
+ subd = std::max(d__1,d__2);
+ iubd = i__;
+ }
+ } else if (temp < zero) {
+ if (slbd * temp > su[i__] - xopt[i__]) {
+ slbd = (su[i__] - xopt[i__]) / temp;
+ ilbd = i__;
+ }
+ if (subd * temp < sl[i__] - xopt[i__]) {
+ /* Computing MAX */
+ d__1 = sumin, d__2 = (sl[i__] - xopt[i__]) / temp;
+ subd = std::max(d__1,d__2);
+ iubd = -i__;
+ }
+ }
+ /* L70: */
+ }
+
+ /* Seek a large modulus of the KNEW-th Lagrange function when the index */
+ /* of the other interpolation point on the line through XOPT is KNEW. */
+
+ if (k == knew) {
+ diff = dderiv - one;
+ step = slbd;
+ vlag = slbd * (dderiv - slbd * diff);
+ isbd = ilbd;
+ temp = subd * (dderiv - subd * diff);
+ if (std::abs(temp) > std::abs(vlag)) {
+ step = subd;
+ vlag = temp;
+ isbd = iubd;
+ }
+ tempd = half * dderiv;
+ tempa = tempd - diff * slbd;
+ tempb = tempd - diff * subd;
+ if (tempa * tempb < zero) {
+ temp = tempd * tempd / diff;
+ if (std::abs(temp) > std::abs(vlag)) {
+ step = tempd / diff;
+ vlag = temp;
+ isbd = 0;
+ }
+ }
+
+ /* Search along each of the other lines through XOPT and another point. */
+
+ } else {
+ step = slbd;
+ vlag = slbd * (one - slbd);
+ isbd = ilbd;
+ temp = subd * (one - subd);
+ if (std::abs(temp) > std::abs(vlag)) {
+ step = subd;
+ vlag = temp;
+ isbd = iubd;
+ }
+ if (subd > half) {
+ if (std::abs(vlag) < .25) {
+ step = half;
+ vlag = .25;
+ isbd = 0;
+ }
+ }
+ vlag *= dderiv;
+ }
+
+ /* Calculate PREDSQ for the current line search and maintain PRESAV. */
+
+ temp = step * (one - step) * distsq;
+ predsq = vlag * vlag * (vlag * vlag + ha * temp * temp);
+ if (predsq > presav) {
+ presav = predsq;
+ ksav = k;
+ stpsav = step;
+ ibdsav = isbd;
+ }
+L80:
+ ;
+ }
+
+ /* Construct XNEW in a way that satisfies the bound constraints exactly. */
+
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = xopt[i__] + stpsav * (xpt[ksav + i__ * xpt_dim1] - xopt[i__]);
+ /* L90: */
+ /* Computing MAX */
+ /* Computing MIN */
+ d__3 = su[i__];
+ d__1 = sl[i__], d__2 = std::min(d__3,temp);
+ xnew[i__] = std::max(d__1,d__2);
+ }
+ if (ibdsav < 0) {
+ xnew[-ibdsav] = sl[-ibdsav];
+ }
+ if (ibdsav > 0) {
+ xnew[ibdsav] = su[ibdsav];
+ }
+
+ /* Prepare for the iterative method that assembles the constrained Cauchy */
+ /* step in W. The sum of squares of the fixed components of W is formed in */
+ /* WFIXSQ, and the free components of W are set to BIGSTP. */
+
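+ /* The constrained Cauchy step is built iteratively: components of the */
+ /* steepest descent step that would violate SL or SU are fixed at the */
+ /* bound (their squared lengths accumulating in WFIXSQ), and the free */
+ /* components are rescaled so that the whole step has length ADELT. */
+ /* The procedure is run for both signs of GLAG (IFLAG=0 and IFLAG=1) */
+ /* and the candidate giving the larger CAUCHY is kept. */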
+ bigstp = adelt + adelt;
+ iflag = 0;
+L100:
+ wfixsq = zero;
+ ggfree = zero;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ w[i__] = zero;
+ /* Computing MIN */
+ d__1 = xopt[i__] - sl[i__], d__2 = glag[i__];
+ tempa = std::min(d__1,d__2);
+ /* Computing MAX */
+ d__1 = xopt[i__] - su[i__], d__2 = glag[i__];
+ tempb = std::max(d__1,d__2);
+ if (tempa > zero || tempb < zero) {
+ w[i__] = bigstp;
+ /* Computing 2nd power */
+ d__1 = glag[i__];
+ ggfree += d__1 * d__1;
+ }
+ /* L110: */
+ }
+ if (ggfree == zero) {
+ cauchy = zero;
+ goto L200;
+ }
+
+ /* Investigate whether more components of W can be fixed. */
+
+L120:
+ temp = adelt * adelt - wfixsq;
+ if (temp > zero) {
+ wsqsav = wfixsq;
+ step = std::sqrt(temp / ggfree);
+ ggfree = zero;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (w[i__] == bigstp) {
+ temp = xopt[i__] - step * glag[i__];
+ if (temp <= sl[i__]) {
+ w[i__] = sl[i__] - xopt[i__];
+ /* Computing 2nd power */
+ d__1 = w[i__];
+ wfixsq += d__1 * d__1;
+ } else if (temp >= su[i__]) {
+ w[i__] = su[i__] - xopt[i__];
+ /* Computing 2nd power */
+ d__1 = w[i__];
+ wfixsq += d__1 * d__1;
+ } else {
+ /* Computing 2nd power */
+ d__1 = glag[i__];
+ ggfree += d__1 * d__1;
+ }
+ }
+ /* L130: */
+ }
+ if (wfixsq > wsqsav && ggfree > zero) {
+ goto L120;
+ }
+ }
+
+ /* Set the remaining free components of W and all components of XALT, */
+ /* except that W may be scaled later. */
+
+ gw = zero;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (w[i__] == bigstp) {
+ w[i__] = -step * glag[i__];
+ /* Computing MAX */
+ /* Computing MIN */
+ d__3 = su[i__], d__4 = xopt[i__] + w[i__];
+ d__1 = sl[i__], d__2 = std::min(d__3,d__4);
+ xalt[i__] = std::max(d__1,d__2);
+ } else if (w[i__] == zero) {
+ xalt[i__] = xopt[i__];
+ } else if (glag[i__] > zero) {
+ xalt[i__] = sl[i__];
+ } else {
+ xalt[i__] = su[i__];
+ }
+ /* L140: */
+ gw += glag[i__] * w[i__];
+ }
+
+ /* Set CURV to the curvature of the KNEW-th Lagrange function along W. */
+ /* Scale W by a factor less than one if that can reduce the modulus of */
+ /* the Lagrange function at XOPT+W. Set CAUCHY to the final value of */
+ /* the square of this function. */
+
+ curv = zero;
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ temp = zero;
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ /* L150: */
+ temp += xpt[k + j * xpt_dim1] * w[j];
+ }
+ /* L160: */
+ curv += hcol[k] * temp * temp;
+ }
+ if (iflag == 1) {
+ curv = -curv;
+ }
+ if (curv > -gw && curv < -const__ * gw) {
+ scale = -gw / curv;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = xopt[i__] + scale * w[i__];
+ /* L170: */
+ /* Computing MAX */
+ /* Computing MIN */
+ d__3 = su[i__];
+ d__1 = sl[i__], d__2 = std::min(d__3,temp);
+ xalt[i__] = std::max(d__1,d__2);
+ }
+ /* Computing 2nd power */
+ d__1 = half * gw * scale;
+ cauchy = d__1 * d__1;
+ } else {
+ /* Computing 2nd power */
+ d__1 = gw + half * curv;
+ cauchy = d__1 * d__1;
+ }
+
+ /* If IFLAG is zero, then XALT is calculated as before after reversing */
+ /* the sign of GLAG. Thus two XALT vectors become available. The one that */
+ /* is chosen is the one that gives the larger value of CAUCHY. */
+
+ if (iflag == 0) {
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ glag[i__] = -glag[i__];
+ /* L180: */
+ w[n + i__] = xalt[i__];
+ }
+ csave = cauchy;
+ iflag = 1;
+ goto L100;
+ }
+ if (csave > cauchy) {
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ /* L190: */
+ xalt[i__] = w[n + i__];
+ }
+ cauchy = csave;
+ }
+L200:
+ ;
+ } /* altmov_ */
+
+ // ----------------------------------------------------------------------------------------
+
+ template <typename funct>
+ void prelim_(
+ const funct& calfun,
+ const integer n,
+ const integer npt,
+ doublereal *x,
+ const doublereal *xl,
+ const doublereal *xu,
+ const doublereal rhobeg,
+ const integer maxfun,
+ doublereal *xbase,
+ doublereal *xpt,
+ doublereal *fval,
+ doublereal *gopt,
+ doublereal *hq,
+ doublereal *pq,
+ doublereal *bmat,
+ doublereal *zmat,
+ const integer ndim,
+ const doublereal *sl,
+ const doublereal *su,
+ integer& nf,
+ integer& kopt
+ ) const
+ {
+ /* System generated locals */
+ integer xpt_dim1, xpt_offset, bmat_dim1, bmat_offset, zmat_dim1,
+ zmat_offset, i__1, i__2;
+ doublereal d__1, d__2, d__3, d__4;
+
+
+ /* Local variables */
+ doublereal f;
+ integer i__, j, k, ih, np, nfm;
+ doublereal one;
+ integer nfx = 0, ipt = 0, jpt = 0;
+ doublereal two = 0, fbeg = 0, diff = 0, half = 0, temp = 0, zero = 0, recip = 0, stepa = 0, stepb = 0;
+ integer itemp;
+ doublereal rhosq;
+
+
+
+ /* The arguments N, NPT, X, XL, XU, RHOBEG, IPRINT and MAXFUN are the */
+ /* same as the corresponding arguments in SUBROUTINE BOBYQA. */
+ /* The arguments XBASE, XPT, FVAL, HQ, PQ, BMAT, ZMAT, NDIM, SL and SU */
+ /* are the same as the corresponding arguments in BOBYQB, the elements */
+ /* of SL and SU being set in BOBYQA. */
+ /* GOPT is usually the gradient of the quadratic model at XOPT+XBASE, but */
+ /* it is set by PRELIM to the gradient of the quadratic model at XBASE. */
+ /* If XOPT is nonzero, BOBYQB will change it to its usual value later. */
+ /* NF is maintained as the number of calls of CALFUN so far. */
+ /* KOPT will be such that the least calculated value of F so far is at */
+ /* the point XPT(KOPT,.)+XBASE in the space of the variables. */
+
+ /* SUBROUTINE PRELIM sets the elements of XBASE, XPT, FVAL, GOPT, HQ, PQ, */
+ /* BMAT and ZMAT for the first iteration, and it maintains the values of */
+ /* NF and KOPT. The vector X is also changed by PRELIM. */
+
+ /* Set some constants. */
+
+ /* Parameter adjustments */
+ zmat_dim1 = npt;
+ zmat_offset = 1 + zmat_dim1;
+ zmat -= zmat_offset;
+ xpt_dim1 = npt;
+ xpt_offset = 1 + xpt_dim1;
+ xpt -= xpt_offset;
+ --x;
+ --xl;
+ --xu;
+ --xbase;
+ --fval;
+ --gopt;
+ --hq;
+ --pq;
+ bmat_dim1 = ndim;
+ bmat_offset = 1 + bmat_dim1;
+ bmat -= bmat_offset;
+ --sl;
+ --su;
+
+ /* Function Body */
+ half = .5;
+ one = 1.;
+ two = 2.;
+ zero = 0.;
+ rhosq = rhobeg * rhobeg;
+ recip = one / rhosq;
+ np = n + 1;
+
+ /* Set XBASE to the initial vector of variables, and set the initial */
+ /* elements of XPT, BMAT, HQ, PQ and ZMAT to zero. */
+
+ i__1 = n;
+ for (j = 1; j <= i__1; ++j) {
+ xbase[j] = x[j];
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ /* L10: */
+ xpt[k + j * xpt_dim1] = zero;
+ }
+ i__2 = ndim;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ /* L20: */
+ bmat[i__ + j * bmat_dim1] = zero;
+ }
+ }
+ i__2 = n * np / 2;
+ for (ih = 1; ih <= i__2; ++ih) {
+ /* L30: */
+ hq[ih] = zero;
+ }
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ pq[k] = zero;
+ i__1 = npt - np;
+ for (j = 1; j <= i__1; ++j) {
+ /* L40: */
+ zmat[k + j * zmat_dim1] = zero;
+ }
+ }
+
+ /* Begin the initialization procedure. NF becomes one more than the number */
+ /* of function values so far. The coordinates of the displacement of the */
+ /* next initial interpolation point from XBASE are set in XPT(NF+1,.). */
+
+ nf = 0;
+L50:
+ nfm = nf;
+ nfx = nf - n;
+ ++(nf);
+ if (nfm <= n << 1) {
+ if (nfm >= 1 && nfm <= n) {
+ stepa = rhobeg;
+ if (su[nfm] == zero) {
+ stepa = -stepa;
+ }
+ xpt[nf + nfm * xpt_dim1] = stepa;
+ } else if (nfm > n) {
+ stepa = xpt[nf - n + nfx * xpt_dim1];
+ stepb = -(rhobeg);
+ if (sl[nfx] == zero) {
+ /* Computing MIN */
+ d__1 = two * rhobeg, d__2 = su[nfx];
+ stepb = std::min(d__1,d__2);
+ }
+ if (su[nfx] == zero) {
+ /* Computing MAX */
+ d__1 = -two * rhobeg, d__2 = sl[nfx];
+ stepb = std::max(d__1,d__2);
+ }
+ xpt[nf + nfx * xpt_dim1] = stepb;
+ }
+ } else {
+ itemp = (nfm - np) / n;
+ jpt = nfm - itemp * n - n;
+ ipt = jpt + itemp;
+ if (ipt > n) {
+ itemp = jpt;
+ jpt = ipt - n;
+ ipt = itemp;
+ }
+ xpt[nf + ipt * xpt_dim1] = xpt[ipt + 1 + ipt * xpt_dim1];
+ xpt[nf + jpt * xpt_dim1] = xpt[jpt + 1 + jpt * xpt_dim1];
+ }
+
+ /* Calculate the next value of F. The least function value so far and */
+ /* its index are required. */
+
+ i__1 = n;
+ for (j = 1; j <= i__1; ++j) {
+ /* Computing MIN */
+ /* Computing MAX */
+ d__3 = xl[j], d__4 = xbase[j] + xpt[nf + j * xpt_dim1];
+ d__1 = std::max(d__3,d__4), d__2 = xu[j];
+ x[j] = std::min(d__1,d__2);
+ if (xpt[nf + j * xpt_dim1] == sl[j]) {
+ x[j] = xl[j];
+ }
+ if (xpt[nf + j * xpt_dim1] == su[j]) {
+ x[j] = xu[j];
+ }
+ /* L60: */
+ }
+ f = calfun(mat(&x[1],n));
+ fval[nf] = f;
+ if (nf == 1) {
+ fbeg = f;
+ kopt = 1;
+ } else if (f < fval[kopt]) {
+ kopt = nf;
+ }
+
+ /* Set the nonzero initial elements of BMAT and the quadratic model in the */
+ /* cases when NF is at most 2*N+1. If NF exceeds N+1, then the positions */
+ /* of the NF-th and (NF-N)-th interpolation points may be switched, in */
+ /* order that the function value at the first of them contributes to the */
+ /* off-diagonal second derivative terms of the initial quadratic model. */
+
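+ /* In other words, the first 2*N+1 points are XBASE and XBASE +/- */
+ /* RHOBEG along each coordinate (the sign and length adjusted at the */
+ /* bounds), so simple difference formulas give the initial gradient */
+ /* GOPT and the diagonal of HQ; each later point perturbs two */
+ /* coordinates at once and supplies one off-diagonal second derivative. */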
+ if (nf <= (n << 1) + 1) {
+ if (nf >= 2 && nf <= n + 1) {
+ gopt[nfm] = (f - fbeg) / stepa;
+ if (npt < nf + n) {
+ bmat[nfm * bmat_dim1 + 1] = -one / stepa;
+ bmat[nf + nfm * bmat_dim1] = one / stepa;
+ bmat[npt + nfm + nfm * bmat_dim1] = -half * rhosq;
+ }
+ } else if (nf >= n + 2) {
+ ih = nfx * (nfx + 1) / 2;
+ temp = (f - fbeg) / stepb;
+ diff = stepb - stepa;
+ hq[ih] = two * (temp - gopt[nfx]) / diff;
+ gopt[nfx] = (gopt[nfx] * stepb - temp * stepa) / diff;
+ if (stepa * stepb < zero) {
+ if (f < fval[nf - n]) {
+ fval[nf] = fval[nf - n];
+ fval[nf - n] = f;
+ if (kopt == nf) {
+ kopt = nf - n;
+ }
+ xpt[nf - n + nfx * xpt_dim1] = stepb;
+ xpt[nf + nfx * xpt_dim1] = stepa;
+ }
+ }
+ bmat[nfx * bmat_dim1 + 1] = -(stepa + stepb) / (stepa * stepb);
+ bmat[nf + nfx * bmat_dim1] = -half / xpt[nf - n + nfx *
+ xpt_dim1];
+ bmat[nf - n + nfx * bmat_dim1] = -bmat[nfx * bmat_dim1 + 1] -
+ bmat[nf + nfx * bmat_dim1];
+ zmat[nfx * zmat_dim1 + 1] = std::sqrt(two) / (stepa * stepb);
+ zmat[nf + nfx * zmat_dim1] = std::sqrt(half) / rhosq;
+ zmat[nf - n + nfx * zmat_dim1] = -zmat[nfx * zmat_dim1 + 1] -
+ zmat[nf + nfx * zmat_dim1];
+ }
+
+ /* Set the off-diagonal second derivatives of the Lagrange functions and */
+ /* the initial quadratic model. */
+
+ } else {
+ ih = ipt * (ipt - 1) / 2 + jpt;
+ zmat[nfx * zmat_dim1 + 1] = recip;
+ zmat[nf + nfx * zmat_dim1] = recip;
+ zmat[ipt + 1 + nfx * zmat_dim1] = -recip;
+ zmat[jpt + 1 + nfx * zmat_dim1] = -recip;
+ temp = xpt[nf + ipt * xpt_dim1] * xpt[nf + jpt * xpt_dim1];
+ hq[ih] = (fbeg - fval[ipt + 1] - fval[jpt + 1] + f) / temp;
+ }
+ if (nf < npt && nf < maxfun) {
+ goto L50;
+ }
+
+ } /* prelim_ */
+
+ // ----------------------------------------------------------------------------------------
+
+ template <typename funct>
+ void rescue_ (
+ const funct& calfun,
+ const integer n,
+ const integer npt,
+ const doublereal *xl,
+ const doublereal *xu,
+ const integer maxfun,
+ doublereal *xbase,
+ doublereal *xpt,
+ doublereal *fval,
+ doublereal *xopt,
+ doublereal *gopt,
+ doublereal *hq,
+ doublereal *pq,
+ doublereal *bmat,
+ doublereal *zmat,
+ const integer ndim,
+ doublereal *sl,
+ doublereal *su,
+ integer& nf,
+ const doublereal delta,
+ integer& kopt,
+ doublereal *vlag,
+ doublereal * ptsaux,
+ doublereal *ptsid,
+ doublereal *w
+ ) const
+ {
+ /* System generated locals */
+ integer xpt_dim1, xpt_offset, bmat_dim1, bmat_offset, zmat_dim1,
+ zmat_offset, i__1, i__2, i__3;
+ doublereal d__1, d__2, d__3, d__4;
+
+
+ /* Local variables */
+ doublereal f;
+ integer i__, j, k, ih, jp, ip, iq, np, iw;
+ doublereal xp = 0, xq = 0, den = 0;
+ integer ihp = 0;
+ doublereal one;
+ integer ihq, jpn, kpt;
+ doublereal sum = 0, diff = 0, half = 0, beta = 0;
+ integer kold;
+ doublereal winc;
+ integer nrem, knew;
+ doublereal temp, bsum;
+ integer nptm;
+ doublereal zero = 0, hdiag = 0, fbase = 0, sfrac = 0, denom = 0, vquad = 0, sumpq = 0;
+ doublereal dsqmin, distsq, vlmxsq;
+
+
+
+ /* The arguments N, NPT, XL, XU, IPRINT, MAXFUN, XBASE, XPT, FVAL, XOPT, */
+ /* GOPT, HQ, PQ, BMAT, ZMAT, NDIM, SL and SU have the same meanings as */
+ /* the corresponding arguments of BOBYQB on the entry to RESCUE. */
+ /* NF is maintained as the number of calls of CALFUN so far, except that */
+ /* NF is set to -1 if the value of MAXFUN prevents further progress. */
+ /* KOPT is maintained so that FVAL(KOPT) is the least calculated function */
+ /* value. Its correct value must be given on entry. It is updated if a */
+ /* new least function value is found, but the corresponding changes to */
+ /* XOPT and GOPT have to be made later by the calling program. */
+ /* DELTA is the current trust region radius. */
+ /* VLAG is a working space vector that will be used for the values of the */
+ /* provisional Lagrange functions at each of the interpolation points. */
+ /* They are part of a product that requires VLAG to be of length NDIM. */
+ /* PTSAUX is also a working space array. For J=1,2,...,N, PTSAUX(1,J) and */
+ /* PTSAUX(2,J) specify the two positions of provisional interpolation */
+ /* points when a nonzero step is taken along e_J (the J-th coordinate */
+ /* direction) through XBASE+XOPT, as specified below. Usually these */
+ /* steps have length DELTA, but other lengths are chosen if necessary */
+ /* in order to satisfy the given bounds on the variables. */
+ /* PTSID is also a working space array. It has NPT components that denote */
+ /* provisional new positions of the original interpolation points, in */
+ /* case changes are needed to restore the linear independence of the */
+ /* interpolation conditions. The K-th point is a candidate for change */
+ /* if and only if PTSID(K) is nonzero. In this case let p and q be the */
+ /* integer parts of PTSID(K) and (PTSID(K)-p) multiplied by N+1. If p */
+ /* and q are both positive, the step from XBASE+XOPT to the new K-th */
+ /* interpolation point is PTSAUX(1,p)*e_p + PTSAUX(1,q)*e_q. Otherwise */
+ /* the step is PTSAUX(1,p)*e_p or PTSAUX(2,q)*e_q in the cases q=0 or */
+ /* p=0, respectively. */
+ /* The first NDIM+NPT elements of the array W are used for working space. */
+ /* The final elements of BMAT and ZMAT are set in a well-conditioned way */
+ /* to the values that are appropriate for the new interpolation points. */
+ /* The elements of GOPT, HQ and PQ are also revised to the values that are */
+ /* appropriate to the final quadratic model. */
+
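+ /* The overall plan of RESCUE, as a sketch: a fresh, well-conditioned */
+ /* set of NPT provisional points is placed along the coordinate */
+ /* directions through XBASE+XOPT, and the original points are then */
+ /* swapped back in one at a time, each exchange being accepted only if */
+ /* the denominator of the corresponding update of H is healthy. Any */
+ /* provisional points that survive are given new function values. */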
+ /* Set some constants. */
+
+ /* Parameter adjustments */
+ zmat_dim1 = npt;
+ zmat_offset = 1 + zmat_dim1;
+ zmat -= zmat_offset;
+ xpt_dim1 = npt;
+ xpt_offset = 1 + xpt_dim1;
+ xpt -= xpt_offset;
+ --xl;
+ --xu;
+ --xbase;
+ --fval;
+ --xopt;
+ --gopt;
+ --hq;
+ --pq;
+ bmat_dim1 = ndim;
+ bmat_offset = 1 + bmat_dim1;
+ bmat -= bmat_offset;
+ --sl;
+ --su;
+ --vlag;
+ ptsaux -= 3;
+ --ptsid;
+ --w;
+
+ /* Function Body */
+ half = .5;
+ one = 1.;
+ zero = 0.;
+ np = n + 1;
+ sfrac = half / (doublereal) np;
+ nptm = npt - np;
+
+ /* Shift the interpolation points so that XOPT becomes the origin, and set */
+ /* the elements of ZMAT to zero. The value of SUMPQ is required in the */
+ /* updating of HQ below. The squares of the distances from XOPT to the */
+ /* other interpolation points are set at the end of W. Increments of WINC */
+ /* may be added later to these squares to balance the consideration of */
+ /* the choice of point that is going to become current. */
+
+ sumpq = zero;
+ winc = zero;
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ distsq = zero;
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ xpt[k + j * xpt_dim1] -= xopt[j];
+ /* L10: */
+ /* Computing 2nd power */
+ d__1 = xpt[k + j * xpt_dim1];
+ distsq += d__1 * d__1;
+ }
+ sumpq += pq[k];
+ w[ndim + k] = distsq;
+ winc = std::max(winc,distsq);
+ i__2 = nptm;
+ for (j = 1; j <= i__2; ++j) {
+ /* L20: */
+ zmat[k + j * zmat_dim1] = zero;
+ }
+ }
+
+ /* Update HQ so that HQ and PQ define the second derivatives of the model */
+ /* after XBASE has been shifted to the trust region centre. */
+
+ ih = 0;
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ w[j] = half * sumpq * xopt[j];
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ /* L30: */
+ w[j] += pq[k] * xpt[k + j * xpt_dim1];
+ }
+ i__1 = j;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ ++ih;
+ /* L40: */
+ hq[ih] = hq[ih] + w[i__] * xopt[j] + w[j] * xopt[i__];
+ }
+ }
+
+ /* Shift XBASE, SL, SU and XOPT. Set the elements of BMAT to zero, and */
+ /* also set the elements of PTSAUX. */
+
+ i__1 = n;
+ for (j = 1; j <= i__1; ++j) {
+ xbase[j] += xopt[j];
+ sl[j] -= xopt[j];
+ su[j] -= xopt[j];
+ xopt[j] = zero;
+ /* Computing MIN */
+ d__1 = delta, d__2 = su[j];
+ ptsaux[(j << 1) + 1] = std::min(d__1,d__2);
+ /* Computing MAX */
+ d__1 = -(delta), d__2 = sl[j];
+ ptsaux[(j << 1) + 2] = std::max(d__1,d__2);
+ if (ptsaux[(j << 1) + 1] + ptsaux[(j << 1) + 2] < zero) {
+ temp = ptsaux[(j << 1) + 1];
+ ptsaux[(j << 1) + 1] = ptsaux[(j << 1) + 2];
+ ptsaux[(j << 1) + 2] = temp;
+ }
+ if ((d__2 = ptsaux[(j << 1) + 2], std::abs(d__2)) < half * (d__1 = ptsaux[(
+ j << 1) + 1], std::abs(d__1))) {
+ ptsaux[(j << 1) + 2] = half * ptsaux[(j << 1) + 1];
+ }
+ i__2 = ndim;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ /* L50: */
+ bmat[i__ + j * bmat_dim1] = zero;
+ }
+ }
+ fbase = fval[kopt];
+
+ /* Set the identifiers of the artificial interpolation points that are */
+ /* along a coordinate direction from XOPT, and set the corresponding */
+ /* nonzero elements of BMAT and ZMAT. */
+
+ ptsid[1] = sfrac;
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ jp = j + 1;
+ jpn = jp + n;
+ ptsid[jp] = (doublereal) j + sfrac;
+ if (jpn <= npt) {
+ ptsid[jpn] = (doublereal) j / (doublereal) np + sfrac;
+ temp = one / (ptsaux[(j << 1) + 1] - ptsaux[(j << 1) + 2]);
+ bmat[jp + j * bmat_dim1] = -temp + one / ptsaux[(j << 1) + 1];
+ bmat[jpn + j * bmat_dim1] = temp + one / ptsaux[(j << 1) + 2];
+ bmat[j * bmat_dim1 + 1] = -bmat[jp + j * bmat_dim1] - bmat[jpn +
+ j * bmat_dim1];
+ zmat[j * zmat_dim1 + 1] = std::sqrt(2.) / (d__1 = ptsaux[(j << 1) + 1]
+ * ptsaux[(j << 1) + 2], std::abs(d__1));
+ zmat[jp + j * zmat_dim1] = zmat[j * zmat_dim1 + 1] * ptsaux[(j <<
+ 1) + 2] * temp;
+ zmat[jpn + j * zmat_dim1] = -zmat[j * zmat_dim1 + 1] * ptsaux[(j
+ << 1) + 1] * temp;
+ } else {
+ bmat[j * bmat_dim1 + 1] = -one / ptsaux[(j << 1) + 1];
+ bmat[jp + j * bmat_dim1] = one / ptsaux[(j << 1) + 1];
+ /* Computing 2nd power */
+ d__1 = ptsaux[(j << 1) + 1];
+ bmat[j + npt + j * bmat_dim1] = -half * (d__1 * d__1);
+ }
+ /* L60: */
+ }
+
+ /* Set any remaining identifiers with their nonzero elements of ZMAT. */
+
+ if (npt >= n + np) {
+ i__2 = npt;
+ for (k = np << 1; k <= i__2; ++k) {
+ iw = (integer) (((doublereal) (k - np) - half) / (doublereal) (n)
+ );
+ ip = k - np - iw * n;
+ iq = ip + iw;
+ if (iq > n) {
+ iq -= n;
+ }
+ ptsid[k] = (doublereal) ip + (doublereal) iq / (doublereal) np +
+ sfrac;
+ temp = one / (ptsaux[(ip << 1) + 1] * ptsaux[(iq << 1) + 1]);
+ zmat[(k - np) * zmat_dim1 + 1] = temp;
+ zmat[ip + 1 + (k - np) * zmat_dim1] = -temp;
+ zmat[iq + 1 + (k - np) * zmat_dim1] = -temp;
+ /* L70: */
+ zmat[k + (k - np) * zmat_dim1] = temp;
+ }
+ }
+ nrem = npt;
+ kold = 1;
+ knew = kopt;
+
+ /* Reorder the provisional points in the way that exchanges PTSID(KOLD) */
+ /* with PTSID(KNEW). */
+
+L80:
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ temp = bmat[kold + j * bmat_dim1];
+ bmat[kold + j * bmat_dim1] = bmat[knew + j * bmat_dim1];
+ /* L90: */
+ bmat[knew + j * bmat_dim1] = temp;
+ }
+ i__2 = nptm;
+ for (j = 1; j <= i__2; ++j) {
+ temp = zmat[kold + j * zmat_dim1];
+ zmat[kold + j * zmat_dim1] = zmat[knew + j * zmat_dim1];
+ /* L100: */
+ zmat[knew + j * zmat_dim1] = temp;
+ }
+ ptsid[kold] = ptsid[knew];
+ ptsid[knew] = zero;
+ w[ndim + knew] = zero;
+ --nrem;
+ if (knew != kopt) {
+ temp = vlag[kold];
+ vlag[kold] = vlag[knew];
+ vlag[knew] = temp;
+
+ /* Update the BMAT and ZMAT matrices so that the status of the KNEW-th */
+ /* interpolation point can be changed from provisional to original. The */
+ /* branch to label 350 occurs if all the original points are reinstated. */
+ /* The nonnegative values of W(NDIM+K) are required in the search below. */
+
+ update_(n, npt, &bmat[bmat_offset], &zmat[zmat_offset], ndim, &vlag[1],
+ beta, denom, knew, &w[1]);
+ if (nrem == 0) {
+ goto L350;
+ }
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ /* L110: */
+ w[ndim + k] = (d__1 = w[ndim + k], std::abs(d__1));
+ }
+ }
+
+ /* Pick the index KNEW of an original interpolation point that has not */
+ /* yet replaced one of the provisional interpolation points, giving */
+ /* attention to the closeness to XOPT and to previous tries with KNEW. */
+
+L120:
+ dsqmin = zero;
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ if (w[ndim + k] > zero) {
+ if (dsqmin == zero || w[ndim + k] < dsqmin) {
+ knew = k;
+ dsqmin = w[ndim + k];
+ }
+ }
+ /* L130: */
+ }
+ if (dsqmin == zero) {
+ goto L260;
+ }
+
+ /* Form the W-vector of the chosen original interpolation point. */
+
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ /* L140: */
+ w[npt + j] = xpt[knew + j * xpt_dim1];
+ }
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ sum = zero;
+ if (k == kopt) {
+ } else if (ptsid[k] == zero) {
+ i__1 = n;
+ for (j = 1; j <= i__1; ++j) {
+ /* L150: */
+ sum += w[npt + j] * xpt[k + j * xpt_dim1];
+ }
+ } else {
+ ip = (integer) ptsid[k];
+ if (ip > 0) {
+ sum = w[npt + ip] * ptsaux[(ip << 1) + 1];
+ }
+ iq = (integer) ((doublereal) np * ptsid[k] - (doublereal) (ip *
+ np));
+ if (iq > 0) {
+ iw = 1;
+ if (ip == 0) {
+ iw = 2;
+ }
+ sum += w[npt + iq] * ptsaux[iw + (iq << 1)];
+ }
+ }
+ /* L160: */
+ w[k] = half * sum * sum;
+ }
+
+ /* Calculate VLAG and BETA for the required updating of the H matrix if */
+ /* XPT(KNEW,.) is reinstated in the set of interpolation points. */
+
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ sum = zero;
+ i__1 = n;
+ for (j = 1; j <= i__1; ++j) {
+ /* L170: */
+ sum += bmat[k + j * bmat_dim1] * w[npt + j];
+ }
+ /* L180: */
+ vlag[k] = sum;
+ }
+ beta = zero;
+ i__2 = nptm;
+ for (j = 1; j <= i__2; ++j) {
+ sum = zero;
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ /* L190: */
+ sum += zmat[k + j * zmat_dim1] * w[k];
+ }
+ beta -= sum * sum;
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ /* L200: */
+ vlag[k] += sum * zmat[k + j * zmat_dim1];
+ }
+ }
+ bsum = zero;
+ distsq = zero;
+ i__1 = n;
+ for (j = 1; j <= i__1; ++j) {
+ sum = zero;
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ /* L210: */
+ sum += bmat[k + j * bmat_dim1] * w[k];
+ }
+ jp = j + npt;
+ bsum += sum * w[jp];
+ i__2 = ndim;
+ for (ip = npt + 1; ip <= i__2; ++ip) {
+ /* L220: */
+ sum += bmat[ip + j * bmat_dim1] * w[ip];
+ }
+ bsum += sum * w[jp];
+ vlag[jp] = sum;
+ /* L230: */
+ /* Computing 2nd power */
+ d__1 = xpt[knew + j * xpt_dim1];
+ distsq += d__1 * d__1;
+ }
+ beta = half * distsq * distsq + beta - bsum;
+ vlag[kopt] += one;
+
+ /* KOLD is set to the index of the provisional interpolation point that is */
+ /* going to be deleted to make way for the KNEW-th original interpolation */
+ /* point. The choice of KOLD is governed by the avoidance of a small value */
+ /* of the denominator in the updating calculation of UPDATE. */
+
+ denom = zero;
+ vlmxsq = zero;
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ if (ptsid[k] != zero) {
+ hdiag = zero;
+ i__2 = nptm;
+ for (j = 1; j <= i__2; ++j) {
+ /* L240: */
+ /* Computing 2nd power */
+ d__1 = zmat[k + j * zmat_dim1];
+ hdiag += d__1 * d__1;
+ }
+ /* Computing 2nd power */
+ d__1 = vlag[k];
+ den = beta * hdiag + d__1 * d__1;
+ if (den > denom) {
+ kold = k;
+ denom = den;
+ }
+ }
+ /* L250: */
+ /* Computing MAX */
+ /* Computing 2nd power */
+ d__3 = vlag[k];
+ d__1 = vlmxsq, d__2 = d__3 * d__3;
+ vlmxsq = std::max(d__1,d__2);
+ }
+ if (denom <= vlmxsq * .01) {
+ w[ndim + knew] = -w[ndim + knew] - winc;
+ goto L120;
+ }
+ goto L80;
+
+ /* When label 260 is reached, all the final positions of the interpolation */
+ /* points have been chosen, although the changes have not yet been included */
+ /* in XPT. Also the final BMAT and ZMAT matrices are complete, but, apart */
+ /* from the shift of XBASE, the updating of the quadratic model remains to */
+ /* be done. The following cycle through the new interpolation points begins */
+ /* by putting the new point in XPT(KPT,.) and by setting PQ(KPT) to zero, */
+ /* except that a RETURN occurs if MAXFUN prohibits another value of F. */
+
+L260:
+ i__1 = npt;
+ for (kpt = 1; kpt <= i__1; ++kpt) {
+ if (ptsid[kpt] == zero) {
+ goto L340;
+ }
+ if (nf >= maxfun) {
+ nf = -1;
+ goto L350;
+ }
+ ih = 0;
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ w[j] = xpt[kpt + j * xpt_dim1];
+ xpt[kpt + j * xpt_dim1] = zero;
+ temp = pq[kpt] * w[j];
+ i__3 = j;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ ++ih;
+ /* L270: */
+ hq[ih] += temp * w[i__];
+ }
+ }
+ pq[kpt] = zero;
+ ip = (integer) ptsid[kpt];
+ iq = (integer) ((doublereal) np * ptsid[kpt] - (doublereal) (ip * np))
+ ;
+ if (ip > 0) {
+ xp = ptsaux[(ip << 1) + 1];
+ xpt[kpt + ip * xpt_dim1] = xp;
+ }
+ if (iq > 0) {
+ xq = ptsaux[(iq << 1) + 1];
+ if (ip == 0) {
+ xq = ptsaux[(iq << 1) + 2];
+ }
+ xpt[kpt + iq * xpt_dim1] = xq;
+ }
+
+ /* Set VQUAD to the value of the current model at the new point. */
+
+ vquad = fbase;
+ if (ip > 0) {
+ ihp = (ip + ip * ip) / 2;
+ vquad += xp * (gopt[ip] + half * xp * hq[ihp]);
+ }
+ if (iq > 0) {
+ ihq = (iq + iq * iq) / 2;
+ vquad += xq * (gopt[iq] + half * xq * hq[ihq]);
+ if (ip > 0) {
+ iw = std::max(ihp,ihq) - (i__3 = ip - iq, std::abs(i__3));
+ vquad += xp * xq * hq[iw];
+ }
+ }
+ i__3 = npt;
+ for (k = 1; k <= i__3; ++k) {
+ temp = zero;
+ if (ip > 0) {
+ temp += xp * xpt[k + ip * xpt_dim1];
+ }
+ if (iq > 0) {
+ temp += xq * xpt[k + iq * xpt_dim1];
+ }
+ /* L280: */
+ vquad += half * pq[k] * temp * temp;
+ }
+
+ /* Calculate F at the new interpolation point, and set DIFF to the factor */
+ /* that is going to multiply the KPT-th Lagrange function when the model */
+ /* is updated to provide interpolation to the new function value. */
+
+ i__3 = n;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ /* Computing MIN */
+ /* Computing MAX */
+ d__3 = xl[i__], d__4 = xbase[i__] + xpt[kpt + i__ * xpt_dim1];
+ d__1 = std::max(d__3,d__4), d__2 = xu[i__];
+ w[i__] = std::min(d__1,d__2);
+ if (xpt[kpt + i__ * xpt_dim1] == sl[i__]) {
+ w[i__] = xl[i__];
+ }
+ if (xpt[kpt + i__ * xpt_dim1] == su[i__]) {
+ w[i__] = xu[i__];
+ }
+ /* L290: */
+ }
+ ++(nf);
+ f = calfun(mat(&w[1],n));
+ fval[kpt] = f;
+ if (f < fval[kopt]) {
+ kopt = kpt;
+ }
+ diff = f - vquad;
+
+ /* Update the quadratic model. The RETURN from the subroutine occurs when */
+ /* all the new interpolation points are included in the model. */
+
+ i__3 = n;
+ for (i__ = 1; i__ <= i__3; ++i__) {
+ /* L310: */
+ gopt[i__] += diff * bmat[kpt + i__ * bmat_dim1];
+ }
+ i__3 = npt;
+ for (k = 1; k <= i__3; ++k) {
+ sum = zero;
+ i__2 = nptm;
+ for (j = 1; j <= i__2; ++j) {
+ /* L320: */
+ sum += zmat[k + j * zmat_dim1] * zmat[kpt + j * zmat_dim1];
+ }
+ temp = diff * sum;
+ if (ptsid[k] == zero) {
+ pq[k] += temp;
+ } else {
+ ip = (integer) ptsid[k];
+ iq = (integer) ((doublereal) np * ptsid[k] - (doublereal) (ip
+ * np));
+ ihq = (iq * iq + iq) / 2;
+ if (ip == 0) {
+ /* Computing 2nd power */
+ d__1 = ptsaux[(iq << 1) + 2];
+ hq[ihq] += temp * (d__1 * d__1);
+ } else {
+ ihp = (ip * ip + ip) / 2;
+ /* Computing 2nd power */
+ d__1 = ptsaux[(ip << 1) + 1];
+ hq[ihp] += temp * (d__1 * d__1);
+ if (iq > 0) {
+ /* Computing 2nd power */
+ d__1 = ptsaux[(iq << 1) + 1];
+ hq[ihq] += temp * (d__1 * d__1);
+ iw = std::max(ihp,ihq) - (i__2 = iq - ip, std::abs(i__2));
+ hq[iw] += temp * ptsaux[(ip << 1) + 1] * ptsaux[(iq <<
+ 1) + 1];
+ }
+ }
+ }
+ /* L330: */
+ }
+ ptsid[kpt] = zero;
+L340:
+ ;
+ }
+L350:
+ ;
+ } /* rescue_ */
+
+ // ----------------------------------------------------------------------------------------
+
+ void trsbox_(
+ const integer n,
+ const integer npt,
+ const doublereal *xpt,
+ const doublereal *xopt,
+ const doublereal *gopt,
+ const doublereal *hq,
+ const doublereal *pq,
+ const doublereal *sl,
+ const doublereal *su,
+ const doublereal delta,
+ doublereal *xnew,
+ doublereal *d__,
+ doublereal *gnew,
+ doublereal *xbdi,
+ doublereal *s,
+ doublereal *hs,
+ doublereal *hred,
+ doublereal *dsq,
+ doublereal *crvmin
+ ) const
+ {
+ /* System generated locals */
+ integer xpt_dim1, xpt_offset, i__1, i__2;
+ doublereal d__1, d__2, d__3, d__4;
+
+ /* Local variables */
+ integer i__, j, k, ih;
+ doublereal ds;
+ integer iu;
+ doublereal dhd, dhs, cth, one, shs, sth, ssq, half, beta, sdec, blen;
+ integer iact = 0, nact = 0;
+ doublereal angt, qred;
+ integer isav;
+ doublereal temp = 0, zero = 0, xsav = 0, xsum = 0, angbd = 0, dredg = 0, sredg = 0;
+ integer iterc;
+ doublereal resid = 0, delsq = 0, ggsav = 0, tempa = 0, tempb = 0,
+ redmax = 0, dredsq = 0, redsav = 0, onemin = 0, gredsq = 0, rednew = 0;
+ integer itcsav = 0;
+ doublereal rdprev = 0, rdnext = 0, stplen = 0, stepsq = 0;
+ integer itermax = 0;
+
+
+ /* The arguments N, NPT, XPT, XOPT, GOPT, HQ, PQ, SL and SU have the same */
+ /* meanings as the corresponding arguments of BOBYQB. */
+ /* DELTA is the trust region radius for the present calculation, which */
+ /* seeks a small value of the quadratic model within distance DELTA of */
+ /* XOPT subject to the bounds on the variables. */
+ /* XNEW will be set to a new vector of variables that is approximately */
+ /* the one that minimizes the quadratic model within the trust region */
+ /* subject to the SL and SU constraints on the variables. It satisfies */
+ /* as equations the bounds that become active during the calculation. */
+ /* D is the calculated trial step from XOPT, generated iteratively from an */
+ /* initial value of zero. Thus XNEW is XOPT+D after the final iteration. */
+ /* GNEW holds the gradient of the quadratic model at XOPT+D. It is updated */
+ /* when D is updated. */
+ /* XBDI is a working space vector. For I=1,2,...,N, the element XBDI(I) is */
+ /* set to -1.0, 0.0, or 1.0, the value being nonzero if and only if the */
+ /* I-th variable has become fixed at a bound, the bound being SL(I) or */
+ /* SU(I) in the case XBDI(I)=-1.0 or XBDI(I)=1.0, respectively. This */
+ /* information is accumulated during the construction of XNEW. */
+ /* The arrays S, HS and HRED are also used for working space. They hold the */
+ /* current search direction, and the changes in the gradient of Q along S */
+ /* and the reduced D, respectively, where the reduced D is the same as D, */
+ /* except that the components of the fixed variables are zero. */
+ /* DSQ will be set to the square of the length of XNEW-XOPT. */
+ /* CRVMIN is set to zero if D reaches the trust region boundary. Otherwise */
+ /* it is set to the least curvature of H that occurs in the conjugate */
+ /* gradient searches that are not restricted by any constraints. The */
+ /* value CRVMIN=-1.0D0 is set, however, if all of these searches are */
+ /* constrained. */
+
+ /* A version of the truncated conjugate gradient is applied. If a line */
+ /* search is restricted by a constraint, then the procedure is restarted, */
+ /* the values of the variables that are at their bounds being fixed. If */
+ /* the trust region boundary is reached, then further changes may be made */
+ /* to D, each one being in the two-dimensional space that is spanned */
+ /* by the current D and the gradient of Q at XOPT+D, staying on the trust */
+ /* region boundary. Termination occurs when the reduction in Q seems to */
+ /* be close to the greatest reduction that can be achieved. */
+
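+ /* So the method has two phases, in outline: first a truncated */
+ /* conjugate gradient iteration inside the box, restarted whenever a */
+ /* variable hits SL or SU, and then, once the trust region boundary is */
+ /* reached, a sequence of two-dimensional searches on that boundary in */
+ /* the plane spanned by D and the reduced gradient, parameterized by */
+ /* ANGT, the tangent of half the rotation angle. */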
+ /* Set some constants. */
+
+ /* Parameter adjustments */
+ xpt_dim1 = npt;
+ xpt_offset = 1 + xpt_dim1;
+ xpt -= xpt_offset;
+ --xopt;
+ --gopt;
+ --hq;
+ --pq;
+ --sl;
+ --su;
+ --xnew;
+ --d__;
+ --gnew;
+ --xbdi;
+ --s;
+ --hs;
+ --hred;
+
+ /* Function Body */
+ half = .5;
+ one = 1.;
+ onemin = -1.;
+ zero = 0.;
+
+ /* The sign of GOPT(I) gives the sign of the change to the I-th variable */
+ /* that will reduce Q from its value at XOPT. Thus XBDI(I) shows whether */
+ /* or not to fix the I-th variable at one of its bounds initially, with */
+ /* NACT being set to the number of fixed variables. D and GNEW are also */
+ /* set for the first iteration. DELSQ is the upper bound on the sum of */
+ /* squares of the free variables. QRED is the reduction in Q so far. */
+
+ iterc = 0;
+ nact = 0;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ xbdi[i__] = zero;
+ if (xopt[i__] <= sl[i__]) {
+ if (gopt[i__] >= zero) {
+ xbdi[i__] = onemin;
+ }
+ } else if (xopt[i__] >= su[i__]) {
+ if (gopt[i__] <= zero) {
+ xbdi[i__] = one;
+ }
+ }
+ if (xbdi[i__] != zero) {
+ ++nact;
+ }
+ d__[i__] = zero;
+ /* L10: */
+ gnew[i__] = gopt[i__];
+ }
+ delsq = delta * delta;
+ qred = zero;
+ *crvmin = onemin;
+
+ /* Set the next search direction of the conjugate gradient method. It is */
+ /* the steepest descent direction initially and when the iterations are */
+ /* restarted because a variable has just been fixed by a bound, and of */
+ /* course the components of the fixed variables are zero. ITERMAX is an */
+ /* upper bound on the indices of the conjugate gradient iterations. */
+
+L20:
+ beta = zero;
+L30:
+ stepsq = zero;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (xbdi[i__] != zero) {
+ s[i__] = zero;
+ } else if (beta == zero) {
+ s[i__] = -gnew[i__];
+ } else {
+ s[i__] = beta * s[i__] - gnew[i__];
+ }
+ /* L40: */
+ /* Computing 2nd power */
+ d__1 = s[i__];
+ stepsq += d__1 * d__1;
+ }
+ if (stepsq == zero) {
+ goto L190;
+ }
+ if (beta == zero) {
+ gredsq = stepsq;
+ itermax = iterc + n - nact;
+ }
+ if (gredsq * delsq <= qred * 1e-4 * qred) {
+ goto L190;
+ }
+
+ /* Multiply the search direction by the second derivative matrix of Q and */
+ /* calculate some scalars for the choice of steplength. Then set BLEN to */
+ /* the length of the step to the trust region boundary and STPLEN to */
+ /* the steplength, ignoring the simple bounds. */
+
+ goto L210;
+L50:
+ resid = delsq;
+ ds = zero;
+ shs = zero;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (xbdi[i__] == zero) {
+ /* Computing 2nd power */
+ d__1 = d__[i__];
+ resid -= d__1 * d__1;
+ ds += s[i__] * d__[i__];
+ shs += s[i__] * hs[i__];
+ }
+ /* L60: */
+ }
+ if (resid <= zero) {
+ goto L90;
+ }
+ temp = std::sqrt(stepsq * resid + ds * ds);
+ if (ds < zero) {
+ blen = (temp - ds) / stepsq;
+ } else {
+ blen = resid / (temp + ds);
+ }
+ stplen = blen;
+ if (shs > zero) {
+ /* Computing MIN */
+ d__1 = blen, d__2 = gredsq / shs;
+ stplen = std::min(d__1,d__2);
+ }
+
+ /* Reduce STPLEN if necessary in order to preserve the simple bounds, */
+ /* letting IACT be the index of the new constrained variable. */
+
+ iact = 0;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (s[i__] != zero) {
+ xsum = xopt[i__] + d__[i__];
+ if (s[i__] > zero) {
+ temp = (su[i__] - xsum) / s[i__];
+ } else {
+ temp = (sl[i__] - xsum) / s[i__];
+ }
+ if (temp < stplen) {
+ stplen = temp;
+ iact = i__;
+ }
+ }
+ /* L70: */
+ }
+
+ /* Update CRVMIN, GNEW and D. Set SDEC to the decrease that occurs in Q. */
+
+ sdec = zero;
+ if (stplen > zero) {
+ ++iterc;
+ temp = shs / stepsq;
+ if (iact == 0 && temp > zero) {
+ *crvmin = std::min(*crvmin,temp);
+ if (*crvmin == onemin) {
+ *crvmin = temp;
+ }
+ }
+ ggsav = gredsq;
+ gredsq = zero;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ gnew[i__] += stplen * hs[i__];
+ if (xbdi[i__] == zero) {
+ /* Computing 2nd power */
+ d__1 = gnew[i__];
+ gredsq += d__1 * d__1;
+ }
+ /* L80: */
+ d__[i__] += stplen * s[i__];
+ }
+ /* Computing MAX */
+ d__1 = stplen * (ggsav - half * stplen * shs);
+ sdec = std::max(d__1,zero);
+ qred += sdec;
+ }
+
+ /* Restart the conjugate gradient method if it has hit a new bound. */
+
+ if (iact > 0) {
+ ++nact;
+ xbdi[iact] = one;
+ if (s[iact] < zero) {
+ xbdi[iact] = onemin;
+ }
+ /* Computing 2nd power */
+ d__1 = d__[iact];
+ delsq -= d__1 * d__1;
+ if (delsq <= zero) {
+ goto L90;
+ }
+ goto L20;
+ }
+
+ /* If STPLEN is less than BLEN, then either apply another conjugate */
+ /* gradient iteration or RETURN. */
+
+ if (stplen < blen) {
+ if (iterc == itermax) {
+ goto L190;
+ }
+ if (sdec <= qred * .01) {
+ goto L190;
+ }
+ beta = gredsq / ggsav;
+ goto L30;
+ }
+L90:
+ *crvmin = zero;
+
+ /* Prepare for the alternative iteration by calculating some scalars */
+ /* and by multiplying the reduced D by the second derivative matrix of */
+ /* Q, where S holds the reduced D in the call of GGMULT. */
+
+L100:
+ if (nact >= n - 1) {
+ goto L190;
+ }
+ dredsq = zero;
+ dredg = zero;
+ gredsq = zero;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (xbdi[i__] == zero) {
+ /* Computing 2nd power */
+ d__1 = d__[i__];
+ dredsq += d__1 * d__1;
+ dredg += d__[i__] * gnew[i__];
+ /* Computing 2nd power */
+ d__1 = gnew[i__];
+ gredsq += d__1 * d__1;
+ s[i__] = d__[i__];
+ } else {
+ s[i__] = zero;
+ }
+ /* L110: */
+ }
+ itcsav = iterc;
+ goto L210;
+
+ /* Let the search direction S be a linear combination of the reduced D */
+ /* and the reduced G that is orthogonal to the reduced D. */
+
+L120:
+ ++iterc;
+ temp = gredsq * dredsq - dredg * dredg;
+ if (temp <= qred * 1e-4 * qred) {
+ goto L190;
+ }
+ temp = std::sqrt(temp);
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (xbdi[i__] == zero) {
+ s[i__] = (dredg * d__[i__] - dredsq * gnew[i__]) / temp;
+ } else {
+ s[i__] = zero;
+ }
+ /* L130: */
+ }
+ sredg = -temp;
+
+ /* By considering the simple bounds on the variables, calculate an upper */
+ /* bound on the tangent of half the angle of the alternative iteration, */
+ /* namely ANGBD, except that, if a free variable has already reached a */
+ /* bound, there is a branch back to label 100 after fixing that variable. */
+
+ angbd = one;
+ iact = 0;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (xbdi[i__] == zero) {
+ tempa = xopt[i__] + d__[i__] - sl[i__];
+ tempb = su[i__] - xopt[i__] - d__[i__];
+ if (tempa <= zero) {
+ ++nact;
+ xbdi[i__] = onemin;
+ goto L100;
+ } else if (tempb <= zero) {
+ ++nact;
+ xbdi[i__] = one;
+ goto L100;
+ }
+ /* Computing 2nd power */
+ d__1 = d__[i__];
+ /* Computing 2nd power */
+ d__2 = s[i__];
+ ssq = d__1 * d__1 + d__2 * d__2;
+ /* Computing 2nd power */
+ d__1 = xopt[i__] - sl[i__];
+ temp = ssq - d__1 * d__1;
+ if (temp > zero) {
+ temp = std::sqrt(temp) - s[i__];
+ if (angbd * temp > tempa) {
+ angbd = tempa / temp;
+ iact = i__;
+ xsav = onemin;
+ }
+ }
+ /* Computing 2nd power */
+ d__1 = su[i__] - xopt[i__];
+ temp = ssq - d__1 * d__1;
+ if (temp > zero) {
+ temp = std::sqrt(temp) + s[i__];
+ if (angbd * temp > tempb) {
+ angbd = tempb / temp;
+ iact = i__;
+ xsav = one;
+ }
+ }
+ }
+ /* L140: */
+ }
+
+ /* Calculate HHD and some curvatures for the alternative iteration. */
+
+ goto L210;
+L150:
+ shs = zero;
+ dhs = zero;
+ dhd = zero;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ if (xbdi[i__] == zero) {
+ shs += s[i__] * hs[i__];
+ dhs += d__[i__] * hs[i__];
+ dhd += d__[i__] * hred[i__];
+ }
+ /* L160: */
+ }
+
+ /* Seek the greatest reduction in Q for a range of equally spaced values */
+ /* of ANGT in [0,ANGBD], where ANGT is the tangent of half the angle of */
+ /* the alternative iteration. */
+
+ redmax = zero;
+ isav = 0;
+ redsav = zero;
+ iu = (integer) (angbd * 17. + 3.1);
+ i__1 = iu;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ angt = angbd * (doublereal) i__ / (doublereal) iu;
+ sth = (angt + angt) / (one + angt * angt);
+ temp = shs + angt * (angt * dhd - dhs - dhs);
+ rednew = sth * (angt * dredg - sredg - half * sth * temp);
+ if (rednew > redmax) {
+ redmax = rednew;
+ isav = i__;
+ rdprev = redsav;
+ } else if (i__ == isav + 1) {
+ rdnext = rednew;
+ }
+ /* L170: */
+ redsav = rednew;
+ }
+
+ /* Return if the reduction is zero. Otherwise, set the sine and cosine */
+ /* of the angle of the alternative iteration, and calculate SDEC. */
+
+ if (isav == 0) {
+ goto L190;
+ }
+ if (isav < iu) {
+ temp = (rdnext - rdprev) / (redmax + redmax - rdprev - rdnext);
+ angt = angbd * ((doublereal) isav + half * temp) / (doublereal) iu;
+ }
+ cth = (one - angt * angt) / (one + angt * angt);
+ sth = (angt + angt) / (one + angt * angt);
+ temp = shs + angt * (angt * dhd - dhs - dhs);
+ sdec = sth * (angt * dredg - sredg - half * sth * temp);
+ if (sdec <= zero) {
+ goto L190;
+ }
+
+ /* Update GNEW, D and HRED. If the angle of the alternative iteration */
+ /* is restricted by a bound on a free variable, that variable is fixed */
+ /* at the bound. */
+
+ dredg = zero;
+ gredsq = zero;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ gnew[i__] = gnew[i__] + (cth - one) * hred[i__] + sth * hs[i__];
+ if (xbdi[i__] == zero) {
+ d__[i__] = cth * d__[i__] + sth * s[i__];
+ dredg += d__[i__] * gnew[i__];
+ /* Computing 2nd power */
+ d__1 = gnew[i__];
+ gredsq += d__1 * d__1;
+ }
+ /* L180: */
+ hred[i__] = cth * hred[i__] + sth * hs[i__];
+ }
+ qred += sdec;
+ if (iact > 0 && isav == iu) {
+ ++nact;
+ xbdi[iact] = xsav;
+ goto L100;
+ }
+
+ /* If SDEC is sufficiently small, then RETURN after setting XNEW to */
+ /* XOPT+D, giving careful attention to the bounds. */
+
+ if (sdec > qred * .01) {
+ goto L120;
+ }
+L190:
+ *dsq = zero;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ /* Computing MAX */
+ /* Computing MIN */
+ d__3 = xopt[i__] + d__[i__], d__4 = su[i__];
+ d__1 = std::min(d__3,d__4), d__2 = sl[i__];
+ xnew[i__] = std::max(d__1,d__2);
+ if (xbdi[i__] == onemin) {
+ xnew[i__] = sl[i__];
+ }
+ if (xbdi[i__] == one) {
+ xnew[i__] = su[i__];
+ }
+ d__[i__] = xnew[i__] - xopt[i__];
+ /* L200: */
+ /* Computing 2nd power */
+ d__1 = d__[i__];
+ *dsq += d__1 * d__1;
+ }
+ return;
+ /* The following instructions multiply the current S-vector by the second */
+ /* derivative matrix of the quadratic model, putting the product in HS. */
+ /* They are reached from three different parts of the software above and */
+ /* they can be regarded as an external subroutine. */
+
+L210:
+ ih = 0;
+ i__1 = n;
+ for (j = 1; j <= i__1; ++j) {
+ hs[j] = zero;
+ i__2 = j;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ ++ih;
+ if (i__ < j) {
+ hs[j] += hq[ih] * s[i__];
+ }
+ /* L220: */
+ hs[i__] += hq[ih] * s[j];
+ }
+ }
+ i__2 = npt;
+ for (k = 1; k <= i__2; ++k) {
+ if (pq[k] != zero) {
+ temp = zero;
+ i__1 = n;
+ for (j = 1; j <= i__1; ++j) {
+ /* L230: */
+ temp += xpt[k + j * xpt_dim1] * s[j];
+ }
+ temp *= pq[k];
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ /* L240: */
+ hs[i__] += temp * xpt[k + i__ * xpt_dim1];
+ }
+ }
+ /* L250: */
+ }
+ if (*crvmin != zero) {
+ goto L50;
+ }
+ if (iterc > itcsav) {
+ goto L150;
+ }
+ i__2 = n;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ /* L260: */
+ hred[i__] = hs[i__];
+ }
+ goto L120;
+ } /* trsbox_ */
+
+ // ----------------------------------------------------------------------------------------
+
+ void update_(
+ const integer n,
+ const integer npt,
+ doublereal *bmat,
+ doublereal *zmat,
+ const integer ndim,
+ doublereal *vlag,
+ const doublereal beta,
+ const doublereal denom,
+ const integer knew,
+ doublereal *w
+ ) const
+ {
+ /* System generated locals */
+ integer bmat_dim1, bmat_offset, zmat_dim1, zmat_offset, i__1, i__2;
+ doublereal d__1, d__2, d__3;
+
+ /* Local variables */
+ integer i__, j, k, jp;
+ doublereal one, tau, temp;
+ integer nptm;
+ doublereal zero, alpha, tempa, tempb, ztest;
+
+
+ /* The arrays BMAT and ZMAT are updated, as required by the new position */
+ /* of the interpolation point that has the index KNEW. The vector VLAG has */
+ /* N+NPT components, set on entry to the first NPT and last N components */
+ /* of the product Hw in equation (4.11) of the Powell (2006) paper on */
+ /* NEWUOA. Further, BETA is set on entry to the value of the parameter */
+ /* with that name, and DENOM is set to the denominator of the updating */
+ /* formula. Elements of ZMAT may be treated as zero if their moduli are */
+ /* at most ZTEST. The first NDIM elements of W are used for working space. */
+
+ /* Set some constants. */
+
+ /* Parameter adjustments */
+ zmat_dim1 = npt;
+ zmat_offset = 1 + zmat_dim1;
+ zmat -= zmat_offset;
+ bmat_dim1 = ndim;
+ bmat_offset = 1 + bmat_dim1;
+ bmat -= bmat_offset;
+ --vlag;
+ --w;
+
+ /* Function Body */
+ one = 1.;
+ zero = 0.;
+ nptm = npt - n - 1;
+ ztest = zero;
+ i__1 = npt;
+ for (k = 1; k <= i__1; ++k) {
+ i__2 = nptm;
+ for (j = 1; j <= i__2; ++j) {
+ /* L10: */
+ /* Computing MAX */
+ d__2 = ztest, d__3 = (d__1 = zmat[k + j * zmat_dim1], std::abs(d__1));
+ ztest = std::max(d__2,d__3);
+ }
+ }
+ ztest *= 1e-20;
+
+ /* Apply the rotations that put zeros in the KNEW-th row of ZMAT. */
+
+ i__2 = nptm;
+ for (j = 2; j <= i__2; ++j) {
+ if ((d__1 = zmat[knew + j * zmat_dim1], std::abs(d__1)) > ztest) {
+ /* Computing 2nd power */
+ d__1 = zmat[knew + zmat_dim1];
+ /* Computing 2nd power */
+ d__2 = zmat[knew + j * zmat_dim1];
+ temp = std::sqrt(d__1 * d__1 + d__2 * d__2);
+ tempa = zmat[knew + zmat_dim1] / temp;
+ tempb = zmat[knew + j * zmat_dim1] / temp;
+ i__1 = npt;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ temp = tempa * zmat[i__ + zmat_dim1] + tempb * zmat[i__ + j *
+ zmat_dim1];
+ zmat[i__ + j * zmat_dim1] = tempa * zmat[i__ + j * zmat_dim1]
+ - tempb * zmat[i__ + zmat_dim1];
+ /* L20: */
+ zmat[i__ + zmat_dim1] = temp;
+ }
+ }
+ zmat[knew + j * zmat_dim1] = zero;
+ /* L30: */
+ }
+
+ /* Put the first NPT components of the KNEW-th column of HLAG into W, */
+ /* and calculate the parameters of the updating formula. */
+
+ i__2 = npt;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ w[i__] = zmat[knew + zmat_dim1] * zmat[i__ + zmat_dim1];
+ /* L40: */
+ }
+ alpha = w[knew];
+ tau = vlag[knew];
+ vlag[knew] -= one;
+
+ /* Complete the updating of ZMAT. */
+
+ temp = std::sqrt(denom);
+ tempb = zmat[knew + zmat_dim1] / temp;
+ tempa = tau / temp;
+ i__2 = npt;
+ for (i__ = 1; i__ <= i__2; ++i__) {
+ /* L50: */
+ zmat[i__ + zmat_dim1] = tempa * zmat[i__ + zmat_dim1] - tempb * vlag[
+ i__];
+ }
+
+ /* Finally, update the matrix BMAT. */
+
+ i__2 = n;
+ for (j = 1; j <= i__2; ++j) {
+ jp = npt + j;
+ w[jp] = bmat[knew + j * bmat_dim1];
+ tempa = (alpha * vlag[jp] - tau * w[jp]) / denom;
+ tempb = (-(beta) * w[jp] - tau * vlag[jp]) / denom;
+ i__1 = jp;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ bmat[i__ + j * bmat_dim1] = bmat[i__ + j * bmat_dim1] + tempa *
+ vlag[i__] + tempb * w[i__];
+ if (i__ > npt) {
+ bmat[jp + (i__ - npt) * bmat_dim1] = bmat[i__ + j *
+ bmat_dim1];
+ }
+ /* L60: */
+ }
+ }
+ } /* update_ */
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename funct,
+ typename T,
+ typename U
+ >
+ double find_min_bobyqa (
+ const funct& f,
+ T& x,
+ long npt,
+ const U& x_lower,
+ const U& x_upper,
+ const double rho_begin,
+ const double rho_end,
+ const long max_f_evals
+ )
+ {
+ // The starting point (i.e. x) must be a column vector.
+ COMPILE_TIME_ASSERT(T::NC <= 1);
+
+ // check the requirements. Also split the assert up so that the error message isn't huge.
+ DLIB_CASSERT(is_col_vector(x) && is_col_vector(x_lower) && is_col_vector(x_upper) &&
+ x.size() == x_lower.size() && x_lower.size() == x_upper.size() &&
+ x.size() > 1 && max_f_evals > 1,
+ "\tdouble find_min_bobyqa()"
+ << "\n\t Invalid arguments have been given to this function"
+ << "\n\t is_col_vector(x): " << is_col_vector(x)
+ << "\n\t is_col_vector(x_lower): " << is_col_vector(x_lower)
+ << "\n\t is_col_vector(x_upper): " << is_col_vector(x_upper)
+ << "\n\t x.size(): " << x.size()
+ << "\n\t x_lower.size(): " << x_lower.size()
+ << "\n\t x_upper.size(): " << x_upper.size()
+ << "\n\t max_f_evals: " << max_f_evals
+ );
+
+ DLIB_CASSERT(x.size() + 2 <= npt && npt <= (x.size()+1)*(x.size()+2)/2 &&
+ 0 < rho_end && rho_end < rho_begin &&
+ min(x_upper - x_lower) > 2*rho_begin &&
+ min(x - x_lower) >= 0 && min(x_upper - x) >= 0,
+ "\tdouble find_min_bobyqa()"
+ << "\n\t Invalid arguments have been given to this function"
+ << "\n\t ntp in valid range: " << (x.size() + 2 <= npt && npt <= (x.size()+1)*(x.size()+2)/2)
+ << "\n\t npt: " << npt
+ << "\n\t rho_begin: " << rho_begin
+ << "\n\t rho_end: " << rho_end
+ << "\n\t min(x_upper - x_lower) > 2*rho_begin: " << (min(x_upper - x_lower) > 2*rho_begin)
+ << "\n\t min(x - x_lower) >= 0 && min(x_upper - x) >= 0: " << (min(x - x_lower) >= 0 && min(x_upper - x) >= 0)
+ );
+
+
+ bobyqa_implementation impl;
+ return impl.find_min(f, x, npt, x_lower, x_upper, rho_begin, rho_end, max_f_evals);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename funct,
+ typename T,
+ typename U
+ >
+ double find_max_bobyqa (
+ const funct& f,
+ T& x,
+ long npt,
+ const U& x_lower,
+ const U& x_upper,
+ const double rho_begin,
+ const double rho_end,
+ const long max_f_evals
+ )
+ {
+ // The starting point (i.e. x) must be a column vector.
+ COMPILE_TIME_ASSERT(T::NC <= 1);
+
+ return -find_min_bobyqa(negate_function(f), x, npt, x_lower, x_upper, rho_begin, rho_end, max_f_evals);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_BOBYQA_Hh_
+
diff --git a/ml/dlib/dlib/optimization/optimization_bobyqa_abstract.h b/ml/dlib/dlib/optimization/optimization_bobyqa_abstract.h
new file mode 100644
index 000000000..46f9436af
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_bobyqa_abstract.h
@@ -0,0 +1,120 @@
+// Copyright (C) 2009 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATIOn_BOBYQA_ABSTRACT_Hh_
+#ifdef DLIB_OPTIMIZATIOn_BOBYQA_ABSTRACT_Hh_
+
+#include "../matrix.h"
+
+// ----------------------------------------------------------------------------------------
+
+/*
+    This file defines the dlib interface to the BOBYQA software developed by M.J.D. Powell.
+ BOBYQA is a method for optimizing a function in the absence of derivative information.
+ Powell described it as a method that seeks the least value of a function of many
+ variables, by applying a trust region method that forms quadratic models by
+ interpolation. There is usually some freedom in the interpolation conditions,
+ which is taken up by minimizing the Frobenius norm of the change to the second
+ derivative of the model, beginning with the zero matrix. The values of the variables
+ are constrained by upper and lower bounds.
+
+
+ The following paper, published in 2009 by Powell, describes the
+ detailed working of the BOBYQA algorithm.
+
+ The BOBYQA algorithm for bound constrained optimization
+ without derivatives by M.J.D. Powell
+*/
+
+// ----------------------------------------------------------------------------------------
+
+namespace dlib
+{
+ class bobyqa_failure : public error;
+ /*!
+ This is the exception class used by the functions defined in this file.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename funct,
+ typename T,
+ typename U
+ >
+ double find_min_bobyqa (
+ const funct& f,
+ T& x,
+ long npt,
+ const U& x_lower,
+ const U& x_upper,
+ const double rho_begin,
+ const double rho_end,
+ const long max_f_evals
+ );
+ /*!
+ requires
+ - f(x) must be a valid expression that evaluates to a double
+ - is_col_vector(x) == true
+ - is_col_vector(x_lower) == true
+ - is_col_vector(x_upper) == true
+ - x.size() == x_lower.size() == x_upper.size()
+ - x.size() > 1
+ - x.size() + 2 <= npt <= (x.size()+1)*(x.size()+2)/2
+ - 0 < rho_end < rho_begin
+ - min(x_upper - x_lower) > 2*rho_begin
+ (i.e. the lower and upper bounds on each x element must be larger than 2*rho_begin)
+ - min(x - x_lower) >= 0 && min(x_upper - x) >= 0
+ (i.e. the given x should be within the bounds defined by x_lower and x_upper)
+ - max_f_evals > 1
+ ensures
+ - Performs a constrained minimization of the function f() starting from
+ the initial point x.
+ - The BOBYQA algorithm uses a number of interpolating points to perform its
+ work. The npt argument controls how many points get used. Typically,
+ a good value to use is 2*x.size()+1.
+ - #x == the value of x (within the bounds defined by x_lower and x_upper) that
+ was found to minimize f(). More precisely:
+ - min(#x - x_lower) >= 0 && min(x_upper - #x) >= 0
+ - returns f(#x).
+ - rho_begin and rho_end are used as the initial and final values of a trust
+ region radius. Typically, rho_begin should be about one tenth of the greatest
+ expected change to a variable, while rho_end should indicate the accuracy that
+ is required in the final values of the variables.
+ throws
+ - bobyqa_failure
+ This exception is thrown if the algorithm is unable to make progress towards
+ solving the problem. This may occur because the algorithm detects excessive
+ numerical errors or because max_f_evals of f() have occurred without reaching
+ convergence.
+ !*/
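+
+    /*
+        As a rough usage sketch (illustrative only; the rosen() function here is an
+        example objective and is not defined by this file), a bound constrained
+        minimization of a two variable function could look like:
+
+            typedef matrix<double,0,1> column_vector;
+
+            double rosen (const column_vector& m)
+            {
+                const double x = m(0);
+                const double y = m(1);
+                return 100.0*pow(y - x*x, 2) + pow(1 - x, 2);
+            }
+
+            column_vector starting_point(2);
+            starting_point = 0.1, 0.1;
+
+            find_min_bobyqa(rosen,
+                            starting_point,
+                            5,                                    // npt, here 2*x.size()+1
+                            uniform_matrix<double>(2,1,-1e100),   // x_lower (effectively unbounded)
+                            uniform_matrix<double>(2,1, 1e100),   // x_upper
+                            1,                                    // rho_begin
+                            1e-6,                                 // rho_end
+                            10000);                               // max_f_evals
+
+        Wrap the call in a try/catch for bobyqa_failure if the objective may be
+        badly behaved.
+    */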
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename funct,
+ typename T,
+ typename U
+ >
+ double find_max_bobyqa (
+ const funct& f,
+ T& x,
+ long npt,
+ const U& x_lower,
+ const U& x_upper,
+ const double rho_begin,
+ const double rho_end,
+ const long max_f_evals
+ );
+ /*!
+ This function is identical to the find_min_bobyqa() routine defined above
+ except that it negates the f() function before performing optimization.
+ Thus this function will attempt to find the maximizer of f() rather than
+ the minimizer.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_BOBYQA_ABSTRACT_Hh_
+
diff --git a/ml/dlib/dlib/optimization/optimization_least_squares.h b/ml/dlib/dlib/optimization/optimization_least_squares.h
new file mode 100644
index 000000000..6d12a919d
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_least_squares.h
@@ -0,0 +1,345 @@
+// Copyright (C) 2010 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATION_LEAST_SQuARES_H_h_
+#define DLIB_OPTIMIZATION_LEAST_SQuARES_H_h_
+
+#include "../matrix.h"
+#include "optimization_trust_region.h"
+#include "optimization_least_squares_abstract.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename column_vector_type,
+ typename funct_type,
+ typename funct_der_type,
+ typename vector_type
+ >
+ class least_squares_function_model
+ {
+ public:
+ least_squares_function_model (
+ const funct_type& f_,
+ const funct_der_type& der_,
+ const vector_type& list_
+ ) : f(f_), der(der_), list(list_)
+ {
+ S = 0;
+ last_f = 0;
+ last_f2 = 0;
+
+ r.set_size(list.size(),1);
+ }
+
+ const funct_type& f;
+ const funct_der_type& der;
+ const vector_type& list;
+
+ typedef typename column_vector_type::type type;
+ typedef typename column_vector_type::mem_manager_type mem_manager_type;
+ typedef typename column_vector_type::layout_type layout_type;
+ const static long NR = column_vector_type::NR;
+
+ typedef column_vector_type column_vector;
+ typedef matrix<type,NR,NR,mem_manager_type,layout_type> general_matrix;
+
+
+ type operator() (
+ const column_vector& x
+ ) const
+ {
+ type result = 0;
+ for (long i = 0; i < list.size(); ++i)
+ {
+ const type temp = f(list(i), x);
+ // save the residual for later
+ r(i) = temp;
+ result += temp*temp;
+ }
+
+ last_f = 0.5*result;
+ return 0.5*result;
+ }
+
+ void get_derivative_and_hessian (
+ const column_vector& x,
+ column_vector& d,
+ general_matrix& h
+ ) const
+ {
+ J.set_size(list.size(), x.size());
+
+ // compute the Jacobian
+ for (long i = 0; i < list.size(); ++i)
+ {
+ set_rowm(J,i) = trans(der(list(i), x));
+ }
+
+ // Compute the Levenberg-Marquardt gradient and hessian
+ d = trans(J)*r;
+ h = trans(J)*J;
+
+ if (S.size() == 0)
+ {
+ S.set_size(x.size(), x.size());
+ S = 0;
+ }
+
+ // If this isn't the first iteration then check if using
+            // a quasi-Newton update helps things.
+ if (last_r.size() != 0)
+ {
+
+ s = x - last_x;
+ y = d - last_d;
+ yy = d - trans(last_J)*r;
+
+ const type ys = trans(y)*s;
+ vtemp = yy - S*s;
+ const type temp2 = std::abs(trans(s)*S*s);
+ type scale = (temp2 != 0) ? std::min<type>(1, std::abs(dot(s,yy))/temp2) : 1;
+
+ if (ys != 0)
+ S = scale*S + (vtemp*trans(y) + y*trans(vtemp))/(ys) - dot(vtemp,s)/ys/ys*y*trans(y);
+ else
+ S *= scale;
+
+ // check how well both the models fit the last change we saw in f()
+ const type measured_delta = last_f2 - last_f;
+ s = -s;
+ const type h_predicted_delta = 0.5*trans(s)*h*s + trans(d)*s;
+ const type s_predicted_delta = 0.5*trans(s)*(h+S)*s + trans(d)*s;
+
+ const type h_error = std::abs((h_predicted_delta/measured_delta) - 1);
+ const type s_error = std::abs((s_predicted_delta/measured_delta) - 1);
+
+ if (s_error < h_error && h_error > 0.01)
+ {
+ h += make_symmetric(S);
+ }
+ else if (s_error > 10)
+ {
+ S = 0;
+ }
+
+ // put r into last_r
+ r.swap(last_r);
+ }
+ else
+ {
+ // put r into last_r
+ last_r = r;
+ }
+
+ J.swap(last_J);
+ last_x = x;
+ last_d = d;
+
+ last_f2 = last_f;
+ }
+
+ mutable type last_f; // value of function we saw in last operator()
+ mutable type last_f2; // value of last_f we saw in get_derivative_and_hessian()
+ mutable matrix<type,0,1,mem_manager_type,layout_type> r;
+ mutable column_vector vtemp;
+ mutable column_vector s, y, yy;
+
+ mutable general_matrix S;
+ mutable column_vector last_x;
+ mutable column_vector last_d;
+ mutable matrix<type,0,1,mem_manager_type,layout_type> last_r;
+ mutable matrix<type,0,NR,mem_manager_type,layout_type> last_J;
+ mutable matrix<type,0,NR,mem_manager_type,layout_type> J;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename column_vector_type,
+ typename funct_type,
+ typename funct_der_type,
+ typename vector_type
+ >
+ least_squares_function_model<column_vector_type,funct_type,funct_der_type,vector_type> least_squares_model (
+ const funct_type& f,
+ const funct_der_type& der,
+ const vector_type& list
+ )
+ {
+ return least_squares_function_model<column_vector_type,funct_type,funct_der_type,vector_type>(f,der,list);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename stop_strategy_type,
+ typename funct_type,
+ typename funct_der_type,
+ typename vector_type,
+ typename T
+ >
+ double solve_least_squares (
+ stop_strategy_type stop_strategy,
+ const funct_type& f,
+ const funct_der_type& der,
+ const vector_type& list,
+ T& x,
+ double radius = 1
+ )
+ {
+ // The starting point (i.e. x) must be a column vector.
+ COMPILE_TIME_ASSERT(T::NC <= 1);
+
+ // make sure requires clause is not broken
+ DLIB_ASSERT(is_vector(mat(list)) && list.size() > 0 &&
+ is_col_vector(x) && radius > 0,
+ "\t double solve_least_squares()"
+ << "\n\t invalid arguments were given to this function"
+ << "\n\t is_vector(list): " << is_vector(mat(list))
+ << "\n\t list.size(): " << list.size()
+ << "\n\t is_col_vector(x): " << is_col_vector(x)
+ << "\n\t radius: " << radius
+ );
+
+ return find_min_trust_region(stop_strategy,
+ least_squares_model<T>(f, der, mat(list)),
+ x,
+ radius);
+ }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename column_vector_type,
+ typename funct_type,
+ typename funct_der_type,
+ typename vector_type
+ >
+ class least_squares_lm_function_model
+ {
+ public:
+ least_squares_lm_function_model (
+ const funct_type& f_,
+ const funct_der_type& der_,
+ const vector_type& list_
+ ) : f(f_), der(der_), list(list_)
+ {
+ r.set_size(list.size(),1);
+ }
+
+ const funct_type& f;
+ const funct_der_type& der;
+ const vector_type& list;
+
+ typedef typename column_vector_type::type type;
+ typedef typename column_vector_type::mem_manager_type mem_manager_type;
+ typedef typename column_vector_type::layout_type layout_type;
+ const static long NR = column_vector_type::NR;
+
+ typedef column_vector_type column_vector;
+ typedef matrix<type,NR,NR,mem_manager_type,layout_type> general_matrix;
+
+ mutable matrix<type,0,1,mem_manager_type,layout_type> r;
+ mutable column_vector vtemp;
+
+ type operator() (
+ const column_vector& x
+ ) const
+ {
+ type result = 0;
+ for (long i = 0; i < list.size(); ++i)
+ {
+ const type temp = f(list(i), x);
+ // save the residual for later
+ r(i) = temp;
+ result += temp*temp;
+ }
+
+ return 0.5*result;
+ }
+
+ void get_derivative_and_hessian (
+ const column_vector& x,
+ column_vector& d,
+ general_matrix& h
+ ) const
+ {
+ d = 0;
+ h = 0;
+ for (long i = 0; i < list.size(); ++i)
+ {
+ vtemp = der(list(i), x);
+ d += r(i)*vtemp;
+ h += vtemp*trans(vtemp);
+ }
+ }
+
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename column_vector_type,
+ typename funct_type,
+ typename funct_der_type,
+ typename vector_type
+ >
+ least_squares_lm_function_model<column_vector_type,funct_type,funct_der_type,vector_type> least_squares_lm_model (
+ const funct_type& f,
+ const funct_der_type& der,
+ const vector_type& list
+ )
+ {
+ return least_squares_lm_function_model<column_vector_type,funct_type,funct_der_type,vector_type>(f,der,list);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename stop_strategy_type,
+ typename funct_type,
+ typename funct_der_type,
+ typename vector_type,
+ typename T
+ >
+ double solve_least_squares_lm (
+ stop_strategy_type stop_strategy,
+ const funct_type& f,
+ const funct_der_type& der,
+ const vector_type& list,
+ T& x,
+ double radius = 1
+ )
+ {
+ // The starting point (i.e. x) must be a column vector.
+ COMPILE_TIME_ASSERT(T::NC <= 1);
+
+ // make sure requires clause is not broken
+ DLIB_ASSERT(is_vector(mat(list)) && list.size() > 0 &&
+ is_col_vector(x) && radius > 0,
+ "\t double solve_least_squares_lm()"
+ << "\n\t invalid arguments were given to this function"
+ << "\n\t is_vector(list): " << is_vector(mat(list))
+ << "\n\t list.size(): " << list.size()
+ << "\n\t is_col_vector(x): " << is_col_vector(x)
+ << "\n\t radius: " << radius
+ );
+
+ return find_min_trust_region(stop_strategy,
+ least_squares_lm_model<T>(f, der, mat(list)),
+ x,
+ radius);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATION_LEAST_SQuARES_H_h_
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_least_squares_abstract.h b/ml/dlib/dlib/optimization/optimization_least_squares_abstract.h
new file mode 100644
index 000000000..aaffb2221
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_least_squares_abstract.h
@@ -0,0 +1,112 @@
+// Copyright (C) 2010 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATIOn_LEAST_SQUARES_ABSTRACT_
+#ifdef DLIB_OPTIMIZATIOn_LEAST_SQUARES_ABSTRACT_
+
+#include "../matrix/matrix_abstract.h"
+#include "optimization_trust_region_abstract.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename stop_strategy_type,
+ typename funct_type,
+ typename funct_der_type,
+ typename vector_type,
+ typename T
+ >
+ double solve_least_squares (
+ stop_strategy_type stop_strategy,
+ const funct_type& f,
+ const funct_der_type& der,
+ const vector_type& list,
+ T& x,
+ double radius = 1
+ );
+ /*!
+ requires
+ - stop_strategy == an object that defines a stop strategy such as one of
+ the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+ - list == a matrix or something convertible to a matrix via mat()
+ such as a std::vector.
+ - is_vector(list) == true
+ - list.size() > 0
+ - is_col_vector(x) == true
+ - radius > 0
+ - for all valid i:
+ - f(list(i),x) must be a valid expression that evaluates to a floating point value.
+ - der(list(i),x) must be a valid expression that evaluates to the derivative of f(list(i),x)
+ with respect to x. This derivative must take the form of a column vector.
+ ensures
+ - This function performs an unconstrained minimization of the least squares
+ function g(x) defined by:
+ - g(x) = sum over all i: 0.5*pow( f(list(i),x), 2 )
+            - This method combines the Levenberg-Marquardt method with a quasi-Newton method
+              for approximating the second order terms of the hessian and is appropriate for
+ large residual problems (i.e. problems where the f() function isn't driven to 0).
+ In particular, it uses the method of Dennis, Gay, and Welsch as described in
+ Numerical Optimization by Nocedal and Wright (second edition).
+ - Since this is a trust region algorithm, the radius parameter defines the initial
+ size of the trust region.
+ - The function is optimized until stop_strategy decides that an acceptable
+ point has been found or the trust region subproblem fails to make progress.
+ - #x == the value of x that was found to minimize g()
+ - returns g(#x).
+ !*/
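+
+    /*
+        A minimal sketch of the expected call pattern (residual() and
+        residual_derivative() are illustrative names, not functions defined by
+        this file).  Given measured (input, output) pairs and a parameter vector
+        to fit:
+
+            typedef matrix<double,3,1> parameter_vector;
+            typedef std::pair<matrix<double,2,1>, double> data_sample;
+
+            // returns model(s.first, params) - s.second, i.e. the residual
+            double residual (const data_sample& s, const parameter_vector& params);
+
+            // gradient of residual() with respect to params, as a column vector
+            parameter_vector residual_derivative (const data_sample& s, const parameter_vector& params);
+
+            std::vector<data_sample> samples; // filled with the data to fit
+            parameter_vector x;
+            x = 1, 1, 1; // initial guess
+
+            solve_least_squares(objective_delta_stop_strategy(1e-7),
+                                residual, residual_derivative, samples, x);
+    */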
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename stop_strategy_type,
+ typename funct_type,
+ typename funct_der_type,
+ typename vector_type,
+ typename T
+ >
+ double solve_least_squares_lm (
+ stop_strategy_type stop_strategy,
+ const funct_type& f,
+ const funct_der_type& der,
+ const vector_type& list,
+ T& x,
+ double radius = 1
+ );
+ /*!
+ requires
+ - stop_strategy == an object that defines a stop strategy such as one of
+ the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+ - list == a matrix or something convertible to a matrix via mat()
+ such as a std::vector.
+ - is_vector(list) == true
+ - list.size() > 0
+ - is_col_vector(x) == true
+ - radius > 0
+ - for all valid i:
+ - f(list(i),x) must be a valid expression that evaluates to a floating point value.
+ - der(list(i),x) must be a valid expression that evaluates to the derivative of f(list(i),x)
+ with respect to x. This derivative must take the form of a column vector.
+ ensures
+ - This function performs an unconstrained minimization of the least squares
+ function g(x) defined by:
+ - g(x) = sum over all i: 0.5*pow( f(list(i),x), 2 )
+ - This method implements a plain Levenberg-Marquardt approach for approximating
+ the hessian of g(). Therefore, it is most appropriate for small residual problems
+ (i.e. problems where f() goes to 0 at the solution).
+ - Since this is a trust region algorithm, the radius parameter defines the initial
+ size of the trust region.
+ - The function is optimized until stop_strategy decides that an acceptable
+ point has been found or the trust region subproblem fails to make progress.
+ - #x == the value of x that was found to minimize g()
+ - returns g(#x).
+ !*/
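+
+    /*
+        The call pattern is identical to solve_least_squares() above; only the
+        hessian approximation differs.  So the sketch above applies verbatim with
+        solve_least_squares_lm() substituted, and is the better choice when the
+        residuals are expected to be near 0 at the solution.
+    */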
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_LEAST_SQUARES_ABSTRACT_
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_line_search.h b/ml/dlib/dlib/optimization/optimization_line_search.h
new file mode 100644
index 000000000..a91e3df84
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_line_search.h
@@ -0,0 +1,888 @@
+// Copyright (C) 2008 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATIOn_LINE_SEARCH_H_
+#define DLIB_OPTIMIZATIOn_LINE_SEARCH_H_
+
+#include <cmath>
+#include <limits>
+#include "../matrix.h"
+#include "../algs.h"
+#include "optimization_line_search_abstract.h"
+#include <utility>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename funct, typename T>
+ class line_search_funct
+ {
+ public:
+ line_search_funct(const funct& f_, const T& start_, const T& direction_)
+ : f(f_),start(start_), direction(direction_), matrix_r(0), scalar_r(0)
+ {}
+
+ line_search_funct(const funct& f_, const T& start_, const T& direction_, T& r)
+ : f(f_),start(start_), direction(direction_), matrix_r(&r), scalar_r(0)
+ {}
+
+ line_search_funct(const funct& f_, const T& start_, const T& direction_, double& r)
+ : f(f_),start(start_), direction(direction_), matrix_r(0), scalar_r(&r)
+ {}
+
+ double operator()(const double& x) const
+ {
+ return get_value(f(start + x*direction));
+ }
+
+ private:
+
+ double get_value (const double& r) const
+ {
+ // save a copy of this value for later
+ if (scalar_r)
+ *scalar_r = r;
+
+ return r;
+ }
+
+ template <typename U>
+ double get_value (const U& r) const
+ {
+ // U should be a matrix type
+ COMPILE_TIME_ASSERT(is_matrix<U>::value);
+
+ // save a copy of this value for later
+ if (matrix_r)
+ *matrix_r = r;
+
+ return dot(r,direction);
+ }
+
+ const funct& f;
+ const T& start;
+ const T& direction;
+ T* matrix_r;
+ double* scalar_r;
+ };
+
+ template <typename funct, typename T>
+ const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction)
+ {
+ COMPILE_TIME_ASSERT(is_matrix<T>::value);
+ DLIB_ASSERT (
+ is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(),
+ "\tline_search_funct make_line_search_function(f,start,direction)"
+ << "\n\tYou have to supply column vectors to this function"
+ << "\n\tstart.nc(): " << start.nc()
+ << "\n\tdirection.nc(): " << direction.nc()
+ << "\n\tstart.nr(): " << start.nr()
+ << "\n\tdirection.nr(): " << direction.nr()
+ );
+ return line_search_funct<funct,T>(f,start,direction);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename funct, typename T>
+ const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction, double& f_out)
+ {
+ COMPILE_TIME_ASSERT(is_matrix<T>::value);
+ DLIB_ASSERT (
+ is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(),
+ "\tline_search_funct make_line_search_function(f,start,direction)"
+ << "\n\tYou have to supply column vectors to this function"
+ << "\n\tstart.nc(): " << start.nc()
+ << "\n\tdirection.nc(): " << direction.nc()
+ << "\n\tstart.nr(): " << start.nr()
+ << "\n\tdirection.nr(): " << direction.nr()
+ );
+ return line_search_funct<funct,T>(f,start,direction, f_out);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename funct, typename T>
+ const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction, T& grad_out)
+ {
+ COMPILE_TIME_ASSERT(is_matrix<T>::value);
+ DLIB_ASSERT (
+ is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(),
+ "\tline_search_funct make_line_search_function(f,start,direction)"
+ << "\n\tYou have to supply column vectors to this function"
+ << "\n\tstart.nc(): " << start.nc()
+ << "\n\tdirection.nc(): " << direction.nc()
+ << "\n\tstart.nr(): " << start.nr()
+ << "\n\tdirection.nr(): " << direction.nr()
+ );
+ return line_search_funct<funct,T>(f,start,direction,grad_out);
+ }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ inline double poly_min_extrap (
+ double f0,
+ double d0,
+ double f1,
+ double d1,
+ double limit = 1
+ )
+ {
+ const double n = 3*(f1 - f0) - 2*d0 - d1;
+ const double e = d0 + d1 - 2*(f1 - f0);
+
+
+ // find the minimum of the derivative of the polynomial
+
+        // If the discriminant is negative then the derivative of the cubic has no
+        // real roots, so just take a guess in the middle of the range.
+        double temp = n*n - 3*e*d0;
+
+        if (temp < 0)
+            return 0.5;
+
+ temp = std::sqrt(temp);
+
+ if (std::abs(e) <= std::numeric_limits<double>::epsilon())
+ return 0.5;
+
+ // figure out the two possible min values
+ double x1 = (temp - n)/(3*e);
+ double x2 = -(temp + n)/(3*e);
+
+ // compute the value of the interpolating polynomial at these two points
+ double y1 = f0 + d0*x1 + n*x1*x1 + e*x1*x1*x1;
+ double y2 = f0 + d0*x2 + n*x2*x2 + e*x2*x2*x2;
+
+ // pick the best point
+ double x;
+ if (y1 < y2)
+ x = x1;
+ else
+ x = x2;
+
+ // now make sure the minimum is within the allowed range of [0,limit]
+ return put_in_range(0,limit,x);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ inline double poly_min_extrap (
+ double f0,
+ double d0,
+ double f1
+ )
+ {
+        const double temp = 2*(f1 - f0 - d0);
+        // guard against dividing by a denominator that is effectively zero
+        if (std::abs(temp) <= std::abs(d0)*std::numeric_limits<double>::epsilon())
+            return 0.5;
+
+ const double alpha = -d0/temp;
+
+ // now make sure the minimum is within the allowed range of (0,1)
+ return put_in_range(0,1,alpha);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ inline double poly_min_extrap (
+ double f0,
+ double d0,
+ double x1,
+ double f_x1,
+ double x2,
+ double f_x2
+ )
+ {
+ DLIB_ASSERT(0 < x1 && x1 < x2,"Invalid inputs were given to this function");
+ // The contents of this function follow the equations described on page 58 of the
+ // book Numerical Optimization by Nocedal and Wright, second edition.
+ matrix<double,2,2> m;
+ matrix<double,2,1> v;
+
+ const double aa2 = x2*x2;
+ const double aa1 = x1*x1;
+ m = aa2, -aa1,
+ -aa2*x2, aa1*x1;
+ v = f_x1 - f0 - d0*x1,
+ f_x2 - f0 - d0*x2;
+
+
+ double temp = aa2*aa1*(x1-x2);
+
+ // just take a guess if this happens
+ if (temp == 0 || std::fpclassify(temp) == FP_SUBNORMAL)
+ {
+ return x1/2.0;
+ }
+
+ matrix<double,2,1> temp2;
+ temp2 = m*v/temp;
+ const double a = temp2(0);
+ const double b = temp2(1);
+
+ temp = b*b - 3*a*d0;
+ if (temp < 0 || a == 0)
+ {
+ // This is probably a line so just pick the lowest point
+ if (f0 < f_x2)
+ return 0;
+ else
+ return x2;
+ }
+ temp = (-b + std::sqrt(temp))/(3*a);
+ return put_in_range(0, x2, temp);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ inline double lagrange_poly_min_extrap (
+ double p1,
+ double p2,
+ double p3,
+ double f1,
+ double f2,
+ double f3
+ )
+ {
+ DLIB_ASSERT(p1 < p2 && p2 < p3 && f1 >= f2 && f2 <= f3,
+ " p1: " << p1
+ << " p2: " << p2
+ << " p3: " << p3
+ << " f1: " << f1
+ << " f2: " << f2
+ << " f3: " << f3);
+
+ // This formula is out of the book Nonlinear Optimization by Andrzej Ruszczynski. See section 5.2.
+ double temp1 = f1*(p3*p3 - p2*p2) + f2*(p1*p1 - p3*p3) + f3*(p2*p2 - p1*p1);
+ double temp2 = 2*(f1*(p3 - p2) + f2*(p1 - p3) + f3*(p2 - p1) );
+
+ if (temp2 == 0)
+ {
+ return p2;
+ }
+
+ const double result = temp1/temp2;
+
+ // do a final sanity check to make sure the result is in the right range
+ if (p1 <= result && result <= p3)
+ {
+ return result;
+ }
+ else
+ {
+ return std::min(std::max(p1,result),p3);
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename funct,
+ typename funct_der
+ >
+ double line_search (
+ const funct& f,
+ const double f0,
+ const funct_der& der,
+ const double d0,
+ double rho,
+ double sigma,
+ double min_f,
+ unsigned long max_iter
+ )
+ {
+ DLIB_ASSERT (
+ 0 < rho && rho < sigma && sigma < 1 && max_iter > 0,
+ "\tdouble line_search()"
+ << "\n\tYou have given invalid arguments to this function"
+ << "\n\t sigma: " << sigma
+ << "\n\t rho: " << rho
+ << "\n\t max_iter: " << max_iter
+ );
+
+ // The bracketing phase of this function is implemented according to block 2.6.2 from
+ // the book Practical Methods of Optimization by R. Fletcher. The sectioning
+ // phase is an implementation of 2.6.4 from the same book.
+
+ // 1 <= tau1a < tau1b. Controls the alpha jump size during the bracketing phase of
+ // the search.
+ const double tau1a = 1.4;
+ const double tau1b = 9;
+
+ // it must be the case that 0 < tau2 < tau3 <= 1/2 for the algorithm to function
+ // correctly but the specific values of tau2 and tau3 aren't super important.
+ const double tau2 = 1.0/10.0;
+ const double tau3 = 1.0/2.0;
+
+
+ // Stop right away and return a step size of 0 if the gradient is 0 at the starting point
+ if (std::abs(d0) <= std::abs(f0)*std::numeric_limits<double>::epsilon())
+ return 0;
+
+ // Stop right away if the current value is good enough according to min_f
+ if (f0 <= min_f)
+ return 0;
+
+ // Figure out a reasonable upper bound on how large alpha can get.
+ const double mu = (min_f-f0)/(rho*d0);
+
+
+ double alpha = 1;
+ if (mu < 0)
+ alpha = -alpha;
+ alpha = put_in_range(0, 0.65*mu, alpha);
+
+
+ double last_alpha = 0;
+ double last_val = f0;
+ double last_val_der = d0;
+
+ // The bracketing stage will find a range of points [a,b]
+ // that contains a reasonable solution to the line search
+ double a, b;
+
+ // These variables will hold the values and derivatives of f(a) and f(b)
+ double a_val, b_val, a_val_der, b_val_der;
+
+ // This thresh value represents the Wolfe curvature condition
+ const double thresh = std::abs(sigma*d0);
+
+ unsigned long itr = 0;
+ // do the bracketing stage to find the bracket range [a,b]
+ while (true)
+ {
+ ++itr;
+ const double val = f(alpha);
+ const double val_der = der(alpha);
+
+ // we are done with the line search since we found a value smaller
+ // than the minimum f value
+ if (val <= min_f)
+ return alpha;
+
+ if (val > f0 + rho*alpha*d0 || val >= last_val)
+ {
+ a_val = last_val;
+ a_val_der = last_val_der;
+ b_val = val;
+ b_val_der = val_der;
+
+ a = last_alpha;
+ b = alpha;
+ break;
+ }
+
+ if (std::abs(val_der) <= thresh)
+ return alpha;
+
+ // if we are stuck not making progress then quit with the current alpha
+ if (last_alpha == alpha || itr >= max_iter)
+ return alpha;
+
+ if (val_der >= 0)
+ {
+ a_val = val;
+ a_val_der = val_der;
+ b_val = last_val;
+ b_val_der = last_val_der;
+
+ a = alpha;
+ b = last_alpha;
+ break;
+ }
+
+
+
+ const double temp = alpha;
+ // Pick a larger range [first, last]. We will pick the next alpha in that
+ // range.
+ double first, last;
+ if (mu > 0)
+ {
+ first = std::min(mu, alpha + tau1a*(alpha - last_alpha));
+ last = std::min(mu, alpha + tau1b*(alpha - last_alpha));
+ }
+ else
+ {
+ first = std::max(mu, alpha + tau1a*(alpha - last_alpha));
+ last = std::max(mu, alpha + tau1b*(alpha - last_alpha));
+ }
+
+
+
+ // pick a point between first and last by doing some kind of interpolation
+ if (last_alpha < alpha)
+ alpha = last_alpha + (alpha-last_alpha)*poly_min_extrap(last_val, last_val_der, val, val_der, 1e10);
+ else
+ alpha = alpha + (last_alpha-alpha)*poly_min_extrap(val, val_der, last_val, last_val_der, 1e10);
+
+ alpha = put_in_range(first,last,alpha);
+
+ last_alpha = temp;
+
+ last_val = val;
+ last_val_der = val_der;
+
+ }
+
+
+ // Now do the sectioning phase from 2.6.4
+ while (true)
+ {
+ ++itr;
+ double first = a + tau2*(b-a);
+ double last = b - tau3*(b-a);
+
+ // use interpolation to pick alpha between first and last
+ alpha = a + (b-a)*poly_min_extrap(a_val, a_val_der, b_val, b_val_der);
+ alpha = put_in_range(first,last,alpha);
+
+ const double val = f(alpha);
+ const double val_der = der(alpha);
+
+ // we are done with the line search since we found a value smaller
+ // than the minimum f value or we ran out of iterations.
+ if (val <= min_f || itr >= max_iter)
+ return alpha;
+
+ // stop if the interval gets so small that it isn't shrinking any more due to rounding error
+ if (a == first || b == last)
+ {
+ return b;
+ }
+
+            // If alpha has basically become zero then just stop.  Think of it like this:
+            // if we take the largest possible alpha step, will the objective function
+            // change at all?  If not, then there isn't any point looking for a better
+            // alpha.
+ const double max_possible_alpha = std::max(std::abs(a),std::abs(b));
+ if (std::abs(max_possible_alpha*d0) <= std::abs(f0)*std::numeric_limits<double>::epsilon())
+ return alpha;
+
+
+ if (val > f0 + rho*alpha*d0 || val >= a_val)
+ {
+ b = alpha;
+ b_val = val;
+ b_val_der = val_der;
+ }
+ else
+ {
+ if (std::abs(val_der) <= thresh)
+ return alpha;
+
+ if ( (b-a)*val_der >= 0)
+ {
+ b = a;
+ b_val = a_val;
+ b_val_der = a_val_der;
+ }
+
+ a = alpha;
+ a_val = val;
+ a_val_der = val_der;
+ }
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename funct
+ >
+ double backtracking_line_search (
+ const funct& f,
+ double f0,
+ double d0,
+ double alpha,
+ double rho,
+ unsigned long max_iter
+ )
+ {
+ DLIB_ASSERT (
+ 0 < rho && rho < 1 && max_iter > 0,
+ "\tdouble backtracking_line_search()"
+ << "\n\tYou have given invalid arguments to this function"
+ << "\n\t rho: " << rho
+ << "\n\t max_iter: " << max_iter
+ );
+
+ // make sure alpha is going in the right direction. That is, it should be opposite
+ // the direction of the gradient.
+ if ((d0 > 0 && alpha > 0) ||
+ (d0 < 0 && alpha < 0))
+ {
+ alpha *= -1;
+ }
+
+ bool have_prev_alpha = false;
+ double prev_alpha = 0;
+ double prev_val = 0;
+ unsigned long iter = 0;
+ while (true)
+ {
+ ++iter;
+ const double val = f(alpha);
+ if (val <= f0 + alpha*rho*d0 || iter >= max_iter)
+ {
+ return alpha;
+ }
+ else
+ {
+ // Interpolate a new alpha. We also make sure the step by which we
+ // reduce alpha is not super small.
+ double step;
+ if (!have_prev_alpha)
+ {
+ if (d0 < 0)
+ step = alpha*put_in_range(0.1,0.9, poly_min_extrap(f0, d0, val));
+ else
+ step = alpha*put_in_range(0.1,0.9, poly_min_extrap(f0, -d0, val));
+ have_prev_alpha = true;
+ }
+ else
+ {
+ if (d0 < 0)
+ step = put_in_range(0.1*alpha,0.9*alpha, poly_min_extrap(f0, d0, alpha, val, prev_alpha, prev_val));
+ else
+ step = put_in_range(0.1*alpha,0.9*alpha, -poly_min_extrap(f0, -d0, -alpha, val, -prev_alpha, prev_val));
+ }
+
+ prev_alpha = alpha;
+ prev_val = val;
+
+ alpha = step;
+ }
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ class optimize_single_variable_failure : public error {
+ public: optimize_single_variable_failure(const std::string& s):error(s){}
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename funct>
+ double find_min_single_variable (
+ const funct& f,
+ double& starting_point,
+ const double begin = -1e200,
+ const double end = 1e200,
+ const double eps = 1e-3,
+ const long max_iter = 100,
+ const double initial_search_radius = 1
+ )
+ {
+ DLIB_CASSERT( eps > 0 &&
+ max_iter > 1 &&
+ begin <= starting_point && starting_point <= end &&
+ initial_search_radius > 0,
+ "eps: " << eps
+ << "\n max_iter: "<< max_iter
+ << "\n begin: "<< begin
+ << "\n end: "<< end
+ << "\n starting_point: "<< starting_point
+ << "\n initial_search_radius: "<< initial_search_radius
+ );
+
+ double search_radius = initial_search_radius;
+
+ double p1=0, p2=0, p3=0, f1=0, f2=0, f3=0;
+ long f_evals = 1;
+
+ if (begin == end)
+ {
+ return f(starting_point);
+ }
+
+ using std::abs;
+ using std::min;
+ using std::max;
+
+        // find three bracketing points such that f1 > f2 < f3.  Do this by generating a sequence
+        // of points expanding away from the starting point.  Also note that, in the following
+        // code, it is always the case that p1 < p2 < p3.
+
+
+
+ // The first thing we do is get a starting set of 3 points that are inside the [begin,end] bounds
+ p1 = max(starting_point-search_radius, begin);
+ p3 = min(starting_point+search_radius, end);
+ f1 = f(p1);
+ f3 = f(p3);
+
+ if (starting_point == p1 || starting_point == p3)
+ {
+ p2 = (p1+p3)/2;
+ f2 = f(p2);
+ }
+ else
+ {
+ p2 = starting_point;
+ f2 = f(starting_point);
+ }
+
+ f_evals += 2;
+
+ // Now we have 3 points on the function. Start looking for a bracketing set such that
+ // f1 > f2 < f3 is the case.
+ while ( !(f1 > f2 && f2 < f3))
+ {
+ // check for hitting max_iter or if the interval is now too small
+ if (f_evals >= max_iter)
+ {
+ throw optimize_single_variable_failure(
+ "The max number of iterations of single variable optimization have been reached\n"
+ "without converging.");
+ }
+ if (p3-p1 < eps)
+ {
+ if (f1 < min(f2,f3))
+ {
+ starting_point = p1;
+ return f1;
+ }
+
+ if (f2 < min(f1,f3))
+ {
+ starting_point = p2;
+ return f2;
+ }
+
+ starting_point = p3;
+ return f3;
+ }
+
+            // If the leftmost points are identical in function value then expand out the
+            // left a bit, unless it's already at a bound or we would drop that leftmost
+            // point anyway because it's bad.
+ if (f1==f2 && f1<f3 && p1!=begin)
+ {
+ p1 = max(p1 - search_radius, begin);
+ f1 = f(p1);
+ ++f_evals;
+ search_radius *= 2;
+ continue;
+ }
+ if (f2==f3 && f3<f1 && p3!=end)
+ {
+ p3 = min(p3 + search_radius, end);
+ f3 = f(p3);
+ ++f_evals;
+ search_radius *= 2;
+ continue;
+ }
+
+
+ // if f1 is small then take a step to the left
+ if (f1 <= f3)
+ {
+ // check if the minimum is butting up against the bounds and if so then pick
+ // a point between p1 and p2 in the hopes that shrinking the interval will
+            // be a good thing to do.  Or, if p1 and p2 have identical function values,
+            // try to get them to take on different values.
+ if (p1 == begin || (f1 == f2 && (end-begin) < search_radius ))
+ {
+ p3 = p2;
+ f3 = f2;
+
+ p2 = (p1+p2)/2.0;
+ f2 = f(p2);
+ }
+ else
+ {
+ // pick a new point to the left of our current bracket
+ p3 = p2;
+ f3 = f2;
+
+ p2 = p1;
+ f2 = f1;
+
+ p1 = max(p1 - search_radius, begin);
+ f1 = f(p1);
+
+ search_radius *= 2;
+ }
+
+ }
+ // otherwise f3 is small and we should take a step to the right
+ else
+ {
+ // check if the minimum is butting up against the bounds and if so then pick
+ // a point between p2 and p3 in the hopes that shrinking the interval will
+            // be a good thing to do.  Or, if p2 and p3 have identical function values,
+            // try to get them to take on different values.
+ if (p3 == end || (f2 == f3 && (end-begin) < search_radius))
+ {
+ p1 = p2;
+ f1 = f2;
+
+ p2 = (p3+p2)/2.0;
+ f2 = f(p2);
+ }
+ else
+ {
+ // pick a new point to the right of our current bracket
+ p1 = p2;
+ f1 = f2;
+
+ p2 = p3;
+ f2 = f3;
+
+ p3 = min(p3 + search_radius, end);
+ f3 = f(p3);
+
+ search_radius *= 2;
+ }
+ }
+
+ ++f_evals;
+ }
+
+
+ // Loop until we have done the max allowable number of iterations or
+ // the bracketing window is smaller than eps.
+ // Within this loop we maintain the invariant that: f1 > f2 < f3 and p1 < p2 < p3
+ const double tau = 0.1;
+ while( f_evals < max_iter && p3-p1 > eps)
+ {
+ double p_min = lagrange_poly_min_extrap(p1,p2,p3, f1,f2,f3);
+
+
+ // make sure p_min isn't too close to the three points we already have
+ if (p_min < p2)
+ {
+ const double min_dist = (p2-p1)*tau;
+ if (abs(p1-p_min) < min_dist)
+ {
+ p_min = p1 + min_dist;
+ }
+ else if (abs(p2-p_min) < min_dist)
+ {
+ p_min = p2 - min_dist;
+ }
+ }
+ else
+ {
+ const double min_dist = (p3-p2)*tau;
+ if (abs(p2-p_min) < min_dist)
+ {
+ p_min = p2 + min_dist;
+ }
+ else if (abs(p3-p_min) < min_dist)
+ {
+ p_min = p3 - min_dist;
+ }
+ }
+
+ // make sure one side of the bracket isn't super huge compared to the other
+ // side. If it is then contract it.
+ const double bracket_ratio = abs(p1-p2)/abs(p2-p3);
+ if ( !( bracket_ratio < 10 && bracket_ratio > 0.1) )
+ {
+ // Force p_min to be on a reasonable side. But only if lagrange_poly_min_extrap()
+ // didn't put it on a good side already.
+ if (bracket_ratio > 1 && p_min > p2)
+ p_min = (p1+p2)/2;
+ else if (p_min < p2)
+ p_min = (p2+p3)/2;
+ }
+
+
+ const double f_min = f(p_min);
+
+
+ // Remove one of the endpoints of our bracket depending on where the new point falls.
+ if (p_min < p2)
+ {
+ if (f1 > f_min && f_min < f2)
+ {
+ p3 = p2;
+ f3 = f2;
+ p2 = p_min;
+ f2 = f_min;
+ }
+ else
+ {
+ p1 = p_min;
+ f1 = f_min;
+ }
+ }
+ else
+ {
+ if (f2 > f_min && f_min < f3)
+ {
+ p1 = p2;
+ f1 = f2;
+ p2 = p_min;
+ f2 = f_min;
+ }
+ else
+ {
+ p3 = p_min;
+ f3 = f_min;
+ }
+ }
+
+
+ ++f_evals;
+ }
+
+ if (f_evals >= max_iter)
+ {
+ throw optimize_single_variable_failure(
+ "The max number of iterations of single variable optimization have been reached\n"
+ "without converging.");
+ }
+
+ starting_point = p2;
+ return f2;
+ }
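+
+    /*
+        A minimal usage sketch (parabola() is an illustrative example function):
+
+            double parabola (double x) { return (x - 2)*(x - 2); }
+
+            double x = 0;
+            double fx = find_min_single_variable(parabola, x, -10, 10);
+            // x is now approximately 2 and fx is approximately 0
+    */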
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename funct>
+ class negate_function_object
+ {
+ public:
+ negate_function_object(const funct& f_) : f(f_){}
+
+ template <typename T>
+ double operator()(const T& x) const
+ {
+ return -f(x);
+ }
+
+ private:
+ const funct& f;
+ };
+
+ template <typename funct>
+ const negate_function_object<funct> negate_function(const funct& f) { return negate_function_object<funct>(f); }
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename funct>
+ double find_max_single_variable (
+ const funct& f,
+ double& starting_point,
+ const double begin = -1e200,
+ const double end = 1e200,
+ const double eps = 1e-3,
+ const long max_iter = 100,
+ const double initial_search_radius = 1
+ )
+ {
+ return -find_min_single_variable(negate_function(f), starting_point, begin, end, eps, max_iter, initial_search_radius);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_LINE_SEARCH_H_
+
diff --git a/ml/dlib/dlib/optimization/optimization_line_search_abstract.h b/ml/dlib/dlib/optimization/optimization_line_search_abstract.h
new file mode 100644
index 000000000..2aa221da4
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_line_search_abstract.h
@@ -0,0 +1,361 @@
+// Copyright (C) 2008 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATIOn_ABSTRACT_
+#ifdef DLIB_OPTIMIZATIOn_ABSTRACT_
+
+#include <cmath>
+#include <limits>
+#include "../matrix/matrix_abstract.h"
+#include "../algs.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename funct,
+ typename T
+ >
+ class line_search_funct;
+ /*!
+ This object is a function object that represents a line search function.
+
+ Moreover, it represents a function with the signature:
+ double l(double x)
+ !*/
+
+ template <
+ typename funct,
+ typename T
+ >
+ const line_search_funct<funct,T> make_line_search_function (
+ const funct& f,
+ const T& start,
+ const T& direction
+ );
+ /*!
+ requires
+ - is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size()
+ (i.e. start and direction should be column vectors of the same size)
+ - f must return either a double or a column vector the same length as start
+ - f(start + 1.5*direction) should be a valid expression
+ ensures
+ - if (f returns a double) then
+ - returns a line search function that computes l(x) == f(start + x*direction)
+ - else
+ - returns a line search function that computes l(x) == dot(f(start + x*direction),direction).
+ That is, we assume f is the derivative of some other function and that what
+ f returns is a gradient vector.
+ So the following two expressions both create the derivative of l(x):
+ - derivative(make_line_search_function(funct,start,direction))
+ - make_line_search_function(derivative(funct),start,direction)
+ !*/
+
+ template <
+ typename funct,
+ typename T
+ >
+ const line_search_funct<funct,T> make_line_search_function (
+ const funct& f,
+ const T& start,
+ const T& direction,
+ double& f_out
+ );
+ /*!
+ This function is identical to the above three argument version of make_line_search_function()
+ except that, if f() outputs a double, every time f() is evaluated its output is also stored
+ into f_out.
+ !*/
+
+ template <
+ typename funct,
+ typename T
+ >
+ const line_search_funct<funct,T> make_line_search_function (
+ const funct& f,
+ const T& start,
+ const T& direction,
+ T& gradient_out
+ );
+ /*!
+ This function is identical to the above three argument version of make_line_search_function()
+ except that, if f() outputs a column vector, every time f() is evaluated its output is also
+ stored into gradient_out.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ inline double poly_min_extrap (
+ double f0,
+ double d0,
+ double f1,
+ double d1,
+ double limit = 1
+ );
+ /*!
+ ensures
+ - let c(x) be a 3rd degree polynomial such that:
+ - c(0) == f0
+ - c(1) == f1
+ - derivative of c(x) at x==0 is d0
+ - derivative of c(x) at x==1 is d1
+ - returns the point in the range [0,limit] that minimizes the polynomial c(x)
+ !*/
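+
+    /*
+        For example, for c(x) = (x-0.5)^2 we have f0 = 0.25, d0 = -1, f1 = 0.25,
+        and d1 = 1, so poly_min_extrap(0.25, -1, 0.25, 1) returns 0.5, the true
+        minimizer.
+    */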
+
+// ----------------------------------------------------------------------------------------
+
+ inline double poly_min_extrap (
+ double f0,
+ double d0,
+ double f1
+ );
+ /*!
+ ensures
+ - let c(x) be a 2nd degree polynomial such that:
+ - c(0) == f0
+ - c(1) == f1
+ - derivative of c(x) at x==0 is d0
+ - returns the point in the range [0,1] that minimizes the polynomial c(x)
+ !*/
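+
+    /*
+        In closed form this is alpha = -d0/(2*(f1 - f0 - d0)), clamped to [0,1].
+        E.g. for c(x) = (x-0.25)^2 we have f0 = 0.0625, d0 = -0.5, f1 = 0.5625,
+        and poly_min_extrap(0.0625, -0.5, 0.5625) returns 0.25.
+    */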
+
+// ----------------------------------------------------------------------------------------
+
+ inline double poly_min_extrap (
+ double f0,
+ double d0,
+ double x1,
+ double f_x1,
+ double x2,
+ double f_x2
+    );
+ /*!
+ requires
+ - 0 < x1 < x2
+ ensures
+ - let f(x) be a 3rd degree polynomial such that:
+ - f(0) == f0
+ - derivative of f(x) at x==0 is d0
+ - f(x1) == f_x1
+ - f(x2) == f_x2
+ - returns the point in the range [0,x2] that minimizes the polynomial f(x)
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ inline double lagrange_poly_min_extrap (
+ double p1,
+ double p2,
+ double p3,
+ double f1,
+ double f2,
+ double f3
+ );
+ /*!
+ requires
+ - f1 >= f2 <= f3
+ - p1 < p2 < p3
+ ensures
+ - let c(x) be the second order Lagrange polynomial that interpolates the
+ points p1, p2, and p3 where c(p1)==f1, c(p2)==f2, and c(p3)==f3
+ - this function returns the point in the range [p1,p3] that minimizes
+ the polynomial c(x)
+ !*/
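+
+    /*
+        For example, with p1 = 0, p2 = 1, p3 = 2 and samples of f(x) = (x-1.2)^2
+        (so f1 = 1.44, f2 = 0.04, f3 = 0.64), lagrange_poly_min_extrap(0, 1, 2,
+        1.44, 0.04, 0.64) returns 1.2 exactly, since the quadratic interpolant of
+        a quadratic is the function itself.
+    */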
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename funct,
+ typename funct_der
+ >
+ double line_search (
+ const funct& f,
+ const double f0,
+ const funct_der& der,
+ const double d0,
+ double rho,
+ double sigma,
+ double min_f,
+ unsigned long max_iter
+    );
+ /*!
+ requires
+ - 0 < rho < sigma < 1
+ - f and der are scalar functions of scalars
+ (e.g. line_search_funct objects)
+ - der is the derivative of f
+ - f0 == f(0)
+ - d0 == der(0)
+ - max_iter > 0
+ ensures
+ - Performs a line search and uses the strong Wolfe conditions to decide when
+ the search can stop.
+ - rho == the parameter of the Wolfe sufficient decrease condition
+ - sigma == the parameter of the Wolfe curvature condition
+ - max_iter == the maximum number of iterations allowable. After this
+              many evaluations of f(), line_search() is guaranteed to terminate.
+ - returns a value alpha such that f(alpha) is significantly closer to
+ the minimum of f than f(0).
+ - It is assumed that the minimum possible value of f(x) is min_f. So if
+ an alpha is found such that f(alpha) <= min_f then the search stops
+ immediately.
+ - This function is also optimized for the case where der(0) is negative. I.e.
+ positive values of the argument to f() should be in a descent direction.
+ - When this function makes calls to f() and der() it always does so by
+ first calling f() and then calling der(). That is, these two functions
+ are always called in pairs with f() being called first and then der()
+ being called second.
+ !*/
+
+ /*
+ A good discussion of the Wolfe conditions and line search algorithms in
+ general can be found in the book Practical Methods of Optimization by R. Fletcher
+ and also in the more recent book Numerical Optimization by Nocedal and Wright.
+ */
+
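+    /*
+        Example (a sketch): line_search() is normally driven by scalar functions
+        built with make_line_search_function().  Assuming f and der are a
+        multivariate objective and its gradient, x the current point, and dir a
+        descent direction:
+
+            auto fl  = make_line_search_function(f, x, dir);
+            auto dfl = make_line_search_function(der, x, dir);
+            double alpha = line_search(fl, fl(0), dfl, dfl(0),
+                                       0.01,    // rho, sufficient decrease parameter
+                                       0.9,     // sigma, curvature parameter
+                                       -1e100,  // min_f, effectively no lower bound
+                                       100);    // max_iter
+            x += alpha*dir;
+    */
+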
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename funct
+ >
+ double backtracking_line_search (
+ const funct& f,
+ double f0,
+ double d0,
+ double alpha,
+ double rho,
+ unsigned long max_iter
+ );
+ /*!
+ requires
+ - 0 < rho < 1
+ - f is a scalar function of scalars
+ (e.g. a line_search_funct object)
+ - f0 == f(0)
+ - d0 == the derivative of f() at f(0).
+ - max_iter > 0
+ ensures
+ - Performs a backtracking line search and uses the Armijo sufficient decrease
+ rule to decide when the search can stop.
+ - rho == the parameter of the sufficient decrease condition.
+ - max_iter == the maximum number of iterations allowable. After this many
+              evaluations of f(), backtracking_line_search() is guaranteed to terminate.
+ - The line search starts with the input alpha value and then backtracks until
+ it finds a good enough alpha value. Once found, it returns the alpha value
+ such that f(alpha) is significantly closer to the minimum of f than f(0).
+ - The returned value of alpha will always be the last value of alpha which was
+ passed to f(). That is, it will always be the case that the last call to f()
+ made by backtracking_line_search() was f(alpha) where alpha is the return
+ value from this function.
+ !*/
+
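+    /*
+        Example (a sketch): given a scalar function fl along a descent direction
+        (e.g. built with make_line_search_function()) whose derivative at 0 is d0 < 0:
+
+            double alpha = backtracking_line_search(fl, fl(0), d0,
+                                                    1.0,   // initial step size
+                                                    0.1,   // rho, Armijo parameter
+                                                    100);  // max_iter
+    */
+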
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename funct
+ >
+ const negate_function_object<funct> negate_function(
+ const funct& f
+ );
+ /*!
+ requires
+ - f == a function that returns a scalar
+ ensures
+ - returns a function that represents the negation of f. That is,
+ the returned function object represents g(x) == -f(x)
+ !*/
+
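+    /*
+        Example (a sketch): since maximizing f is the same as minimizing -f, a
+        maximizer can be built from any of the minimization routines:
+
+            auto g = negate_function(f);  // g(x) == -f(x)
+            // minimizing g now maximizes f
+    */
+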
+// ----------------------------------------------------------------------------------------
+
+ class optimize_single_variable_failure : public error;
+ /*!
+ This is the exception class used by the functions defined below.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename funct
+ >
+ double find_min_single_variable (
+ const funct& f,
+ double& starting_point,
+ const double begin = -1e200,
+ const double end = 1e200,
+ const double eps = 1e-3,
+ const long max_iter = 100,
+ const double initial_search_radius = 1
+    );
+ /*!
+ requires
+ - eps > 0
+ - max_iter > 1
+ - begin <= starting_point <= end
+ - f must be a function of a double that returns a double
+ (e.g. f(starting_point) should be a valid expression that evaluates to a double)
+ - initial_search_radius > 0
+ ensures
+ - Finds a point P such that:
+ - P is a local minimum of the function f().
+ - begin <= P <= end
+ - Evaluates f() no more than max_iter times
+ - Stops searching when the window around the minimum point is smaller than eps.
+ The search will begin with the given starting_point and expand out to the
+              left and right by initial_search_radius-sized steps.  So if you think the
+ minimum is likely to be found within X distance from the starting_point then
+ set initial_search_radius to X.
+ - #starting_point == P
+ - returns f(P)
+ throws
+ - optimize_single_variable_failure
+ This exception is thrown if max_iter iterations are performed without
+ determining the min point to the requested accuracy of eps.
+ !*/
+
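+    /*
+        Example (a sketch): minimize a univariate function near a starting guess.
+
+            double x = 2.0;  // starting guess, overwritten with the minimizer
+            double fmin = find_min_single_variable(
+                [](double v){ return (v-3.1)*(v-3.1) + 7; }, x);
+            // on return: x is approximately 3.1 and fmin is approximately 7
+    */
+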
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename funct
+ >
+ double find_max_single_variable (
+ const funct& f,
+ double& starting_point,
+ const double begin = -1e200,
+ const double end = 1e200,
+ const double eps = 1e-3,
+ const long max_iter = 100,
+ const double initial_search_radius = 1
+    );
+ /*!
+ requires
+ - eps > 0
+ - max_iter > 1
+ - begin <= starting_point <= end
+ - f must be a function of a double that returns a double
+ (e.g. f(starting_point) should be a valid expression that evaluates to a double)
+ - initial_search_radius > 0
+ ensures
+ - Finds a point P such that:
+ - P is a local maximum of the function f().
+ - begin <= P <= end
+ - Evaluates f() no more than max_iter times
+ - Stops searching when the window around the maximum point is smaller than eps.
+ The search will begin with the given starting_point and expand out to the
+              left and right by initial_search_radius-sized steps.  So if you think the
+ maximum is likely to be found within X distance from the starting_point then
+ set initial_search_radius to X.
+ - #starting_point == P
+ - returns f(P)
+ throws
+ - optimize_single_variable_failure
+ This exception is thrown if max_iter iterations are performed without
+ determining the max point to the requested accuracy of eps.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_ABSTRACT_
+
diff --git a/ml/dlib/dlib/optimization/optimization_oca.h b/ml/dlib/dlib/optimization/optimization_oca.h
new file mode 100644
index 000000000..4ca9cd7ae
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_oca.h
@@ -0,0 +1,407 @@
+// Copyright (C) 2010 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATIoN_OCA_Hh_
+#define DLIB_OPTIMIZATIoN_OCA_Hh_
+
+#include "optimization_oca_abstract.h"
+
+#include "../matrix.h"
+#include "optimization_solve_qp_using_smo.h"
+#include <vector>
+#include "../sequence.h"
+
+// ----------------------------------------------------------------------------------------
+
+namespace dlib
+{
+ template <typename matrix_type>
+ class oca_problem
+ {
+ public:
+ typedef typename matrix_type::type scalar_type;
+
+ virtual ~oca_problem() {}
+
+ virtual bool risk_has_lower_bound (
+ scalar_type&
+ ) const { return false; }
+
+ virtual bool optimization_status (
+ scalar_type ,
+ scalar_type ,
+ scalar_type ,
+ scalar_type ,
+ unsigned long,
+ unsigned long
+ ) const = 0;
+
+ virtual scalar_type get_c (
+ ) const = 0;
+
+ virtual long get_num_dimensions (
+ ) const = 0;
+
+ virtual void get_risk (
+ matrix_type& current_solution,
+ scalar_type& risk_value,
+ matrix_type& risk_subgradient
+ ) const = 0;
+
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ class oca
+ {
+ public:
+
+ oca ()
+ {
+ sub_eps = 1e-2;
+ sub_max_iter = 50000;
+
+ inactive_thresh = 20;
+ }
+
+ void set_subproblem_epsilon (
+ double eps_
+ ) { sub_eps = eps_; }
+
+ double get_subproblem_epsilon (
+ ) const { return sub_eps; }
+
+ void set_subproblem_max_iterations (
+ unsigned long sub_max_iter_
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(sub_max_iter_ > 0,
+ "\t void oca::set_subproblem_max_iterations"
+ << "\n\t max iterations must be greater than 0"
+ << "\n\t sub_max_iter_: " << sub_max_iter_
+ << "\n\t this: " << this
+ );
+
+ sub_max_iter = sub_max_iter_;
+ }
+
+ unsigned long get_subproblem_max_iterations (
+ ) const { return sub_max_iter; }
+
+ void set_inactive_plane_threshold (
+ unsigned long inactive_thresh_
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(inactive_thresh_ > 0,
+ "\t void oca::set_inactive_plane_threshold"
+ << "\n\t inactive threshold must be greater than 0"
+ << "\n\t inactive_thresh_: " << inactive_thresh_
+ << "\n\t this: " << this
+ );
+
+ inactive_thresh = inactive_thresh_;
+ }
+
+ unsigned long get_inactive_plane_threshold (
+ ) const { return inactive_thresh; }
+
+ template <
+ typename matrix_type
+ >
+ typename matrix_type::type operator() (
+ const oca_problem<matrix_type>& problem,
+ matrix_type& w,
+ unsigned long num_nonnegative = 0,
+ unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max()
+ ) const
+ {
+ matrix_type empty_prior;
+ return oca_impl(problem, w, empty_prior, false, num_nonnegative, force_weight_to_1, 0);
+ }
+
+ template <
+ typename matrix_type
+ >
+ typename matrix_type::type solve_with_elastic_net (
+ const oca_problem<matrix_type>& problem,
+ matrix_type& w,
+ double lasso_lambda,
+ unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max()
+ ) const
+ {
+ matrix_type empty_prior;
+ return oca_impl(problem, w, empty_prior, false, 0, force_weight_to_1, lasso_lambda);
+ }
+
+ template <
+ typename matrix_type
+ >
+ typename matrix_type::type operator() (
+ const oca_problem<matrix_type>& problem,
+ matrix_type& w,
+ const matrix_type& prior
+ ) const
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(is_col_vector(prior) && prior.size() == problem.get_num_dimensions(),
+ "\t scalar_type oca::operator()"
+ << "\n\t The prior vector does not have the correct dimensions."
+ << "\n\t is_col_vector(prior): " << is_col_vector(prior)
+ << "\n\t prior.size(): " << prior.size()
+ << "\n\t problem.get_num_dimensions(): " << problem.get_num_dimensions()
+ << "\n\t this: " << this
+ );
+ // disable the force weight to 1 option for this mode. We also disable the
+ // non-negative constraints.
+ unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max();
+ return oca_impl(problem, w, prior, true, 0, force_weight_to_1, 0);
+ }
+
+ private:
+
+ template <
+ typename matrix_type
+ >
+ typename matrix_type::type oca_impl (
+ const oca_problem<matrix_type>& problem,
+ matrix_type& w,
+ const matrix_type& prior,
+ bool have_prior,
+ unsigned long num_nonnegative,
+ unsigned long force_weight_to_1,
+ const double lasso_lambda
+ ) const
+ {
+ const unsigned long num_dims = problem.get_num_dimensions();
+
+ // make sure requires clause is not broken
+ DLIB_ASSERT(problem.get_c() > 0 &&
+ problem.get_num_dimensions() > 0 &&
+ 0 <= lasso_lambda && lasso_lambda < 1,
+ "\t scalar_type oca::operator()"
+ << "\n\t The oca_problem is invalid"
+ << "\n\t problem.get_c(): " << problem.get_c()
+ << "\n\t problem.get_num_dimensions(): " << num_dims
+ << "\n\t lasso_lambda: " << lasso_lambda
+ << "\n\t this: " << this
+ );
+ if (have_prior)
+ {
+ DLIB_ASSERT(lasso_lambda == 0, "Solver doesn't support using a prior with lasso.");
+ DLIB_ASSERT(num_nonnegative == 0, "Solver doesn't support using a prior with non-negative constraints.");
+ }
+ else if (lasso_lambda != 0)
+ {
+ DLIB_ASSERT(num_nonnegative == 0, "Solver doesn't support using lasso with non-negative constraints.");
+ }
+
+ const double ridge_lambda = 1-lasso_lambda;
+
+ if (num_nonnegative > num_dims)
+ num_nonnegative = num_dims;
+
+ typedef typename matrix_type::type scalar_type;
+ typedef typename matrix_type::layout_type layout_type;
+ typedef typename matrix_type::mem_manager_type mem_manager_type;
+ typedef matrix_type vect_type;
+
+ const scalar_type C = problem.get_c();
+
+ typename sequence<vect_type>::kernel_2a planes;
+ std::vector<scalar_type> bs, miss_count;
+
+ vect_type new_plane, alpha, btemp;
+
+ w.set_size(num_dims, 1);
+ w = 0;
+
+ // The current objective value. Note also that w always contains
+ // the current solution.
+ scalar_type cur_obj = std::numeric_limits<scalar_type>::max();
+
+ // This will hold the cutting plane objective value. This value is
+ // a lower bound on the true optimal objective value.
+ scalar_type cp_obj = 0;
+
+ matrix<scalar_type,0,0,mem_manager_type, layout_type> K, Ktmp;
+ matrix<scalar_type,0,1,mem_manager_type, layout_type> lambda, d;
+ if (lasso_lambda != 0)
+ d.set_size(num_dims);
+ else
+ d.set_size(num_nonnegative);
+ d = lasso_lambda*ones_matrix(d);
+
+ scalar_type R_lower_bound;
+ if (problem.risk_has_lower_bound(R_lower_bound))
+ {
+ // The flat lower bounding plane is always good to have if we know
+ // what it is.
+ bs.push_back(R_lower_bound);
+ new_plane = zeros_matrix(w);
+ planes.add(0, new_plane);
+ alpha = uniform_matrix<scalar_type>(1,1, C);
+ miss_count.push_back(0);
+
+ K.set_size(1,1);
+ K(0,0) = 0;
+ }
+
+ const double prior_norm = have_prior ? 0.5*dot(prior,prior) : 0;
+
+ unsigned long counter = 0;
+ while (true)
+ {
+
+ // add the next cutting plane
+ scalar_type cur_risk;
+ if (force_weight_to_1 < (unsigned long)w.size())
+ w(force_weight_to_1) = 1;
+
+ problem.get_risk(w, cur_risk, new_plane);
+
+ if (force_weight_to_1 < (unsigned long)w.size())
+ {
+ // We basically arrange for the w(force_weight_to_1) element and all
+ // subsequent elements of w to not be involved in the optimization at
+                // all.  An easy way to do this is to just make sure these elements of w
+                // and the corresponding elements of the subgradient are always set to zero
+ // while we run the cutting plane algorithm. The only time
+ // w(force_weight_to_1) is 1 is when we pass it to the oca_problem.
+ set_rowm(w, range(force_weight_to_1, w.size()-1)) = 0;
+ set_rowm(new_plane, range(force_weight_to_1, new_plane.size()-1)) = 0;
+ }
+
+ if (have_prior)
+ bs.push_back(cur_risk - dot(w,new_plane) + dot(prior,new_plane));
+ else
+ bs.push_back(cur_risk - dot(w,new_plane));
+ planes.add(planes.size(), new_plane);
+ miss_count.push_back(0);
+
+ // If alpha is empty then initialize it (we must always have sum(alpha) == C).
+ // But otherwise, just append a zero.
+ if (alpha.size() == 0)
+ alpha = uniform_matrix<scalar_type>(1,1, C);
+ else
+ alpha = join_cols(alpha,zeros_matrix<scalar_type>(1,1));
+
+ const scalar_type wnorm = 0.5*ridge_lambda*trans(w)*w + lasso_lambda*sum(abs(w));
+ const double prior_part = have_prior? dot(w,prior) : 0;
+ cur_obj = wnorm + C*cur_risk + prior_norm-prior_part;
+
+ // report current status
+ const scalar_type risk_gap = cur_risk - (cp_obj-wnorm+prior_part-prior_norm)/C;
+ if (counter > 0 && problem.optimization_status(cur_obj, cur_obj - cp_obj,
+ cur_risk, risk_gap, planes.size(), counter))
+ {
+ break;
+ }
+
+ // compute kernel matrix for all the planes
+ K.swap(Ktmp);
+ K.set_size(planes.size(), planes.size());
+ // copy over the old K matrix
+ set_subm(K, 0,0, Ktmp.nr(), Ktmp.nc()) = Ktmp;
+
+ // now add the new row and column to K
+ for (unsigned long c = 0; c < planes.size(); ++c)
+ {
+ K(c, Ktmp.nc()) = dot(planes[c], planes[planes.size()-1]);
+ K(Ktmp.nc(), c) = K(c,Ktmp.nc());
+ }
+
+
+ // solve the cutting plane subproblem for the next w. We solve it to an
+ // accuracy that is related to how big the error gap is. Also, we multiply
+ // by ridge_lambda because the objective function for the QP we solve was
+ // implicitly scaled by ridge_lambda. That is, we want to ask the QP
+ // solver to solve the problem until the duality gap is 0.1 times smaller
+ // than what it is now. So the factor of ridge_lambda is necessary to make
+ // this happen.
+ scalar_type eps = std::min<scalar_type>(sub_eps, 0.1*ridge_lambda*(cur_obj-cp_obj));
+ // just a sanity check
+ if (eps < 1e-16)
+ eps = 1e-16;
+ // Note that we warm start this optimization by using the alpha from the last
+ // iteration as the starting point.
+ if (lasso_lambda != 0)
+ {
+ // copy planes into a matrix so we can call solve_qp4_using_smo()
+ matrix<scalar_type,0,0,mem_manager_type, layout_type> planes_mat(num_dims,planes.size());
+ for (unsigned long i = 0; i < planes.size(); ++i)
+ set_colm(planes_mat,i) = planes[i];
+
+ btemp = ridge_lambda*mat(bs) - trans(planes_mat)*d;
+ solve_qp4_using_smo(planes_mat, K, btemp, d, alpha, lambda, eps, sub_max_iter, (scalar_type)(2*lasso_lambda));
+ }
+ else if (num_nonnegative != 0)
+ {
+ // copy planes into a matrix so we can call solve_qp4_using_smo()
+ matrix<scalar_type,0,0,mem_manager_type, layout_type> planes_mat(num_nonnegative,planes.size());
+ for (unsigned long i = 0; i < planes.size(); ++i)
+ set_colm(planes_mat,i) = colm(planes[i],0,num_nonnegative);
+
+ solve_qp4_using_smo(planes_mat, K, mat(bs), d, alpha, lambda, eps, sub_max_iter);
+ }
+ else
+ {
+ solve_qp_using_smo(K, mat(bs), alpha, eps, sub_max_iter);
+ }
+
+ // construct the w that minimized the subproblem.
+ w = -alpha(0)*planes[0];
+ for (unsigned long i = 1; i < planes.size(); ++i)
+ w -= alpha(i)*planes[i];
+ if (lasso_lambda != 0)
+ w = (lambda-d+w)/ridge_lambda;
+ else if (num_nonnegative != 0) // threshold the first num_nonnegative w elements if necessary.
+ set_rowm(w,range(0,num_nonnegative-1)) = lowerbound(rowm(w,range(0,num_nonnegative-1)),0);
+
+ for (long i = 0; i < alpha.size(); ++i)
+ {
+ if (alpha(i) != 0)
+ miss_count[i] = 0;
+ else
+ miss_count[i] += 1;
+ }
+
+ // Compute the lower bound on the true objective given to us by the cutting
+ // plane subproblem.
+ cp_obj = -0.5*ridge_lambda*trans(w)*w + trans(alpha)*mat(bs);
+ if (have_prior)
+ w += prior;
+
+ // If it has been a while since a cutting plane was an active constraint then
+ // we should throw it away.
+ while (max(mat(miss_count)) >= inactive_thresh)
+ {
+ const long idx = index_of_max(mat(miss_count));
+ bs.erase(bs.begin()+idx);
+ miss_count.erase(miss_count.begin()+idx);
+ K = removerc(K, idx, idx);
+ alpha = remove_row(alpha,idx);
+ planes.remove(idx, new_plane);
+ }
+
+ ++counter;
+ }
+
+ if (force_weight_to_1 < (unsigned long)w.size())
+ w(force_weight_to_1) = 1;
+
+ return cur_obj;
+ }
+
+ double sub_eps;
+
+ unsigned long sub_max_iter;
+
+ unsigned long inactive_thresh;
+ };
+}
+
+// ----------------------------------------------------------------------------------------
+
+#endif // DLIB_OPTIMIZATIoN_OCA_Hh_
+
diff --git a/ml/dlib/dlib/optimization/optimization_oca_abstract.h b/ml/dlib/dlib/optimization/optimization_oca_abstract.h
new file mode 100644
index 000000000..859dbdcfc
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_oca_abstract.h
@@ -0,0 +1,334 @@
+// Copyright (C) 2010 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATION_OCA_ABsTRACT_Hh_
+#ifdef DLIB_OPTIMIZATION_OCA_ABsTRACT_Hh_
+
+// ----------------------------------------------------------------------------------------
+
+namespace dlib
+{
+ template <typename matrix_type>
+ class oca_problem
+ {
+ /*!
+ REQUIREMENTS ON matrix_type
+ - matrix_type == a dlib::matrix capable of storing column vectors
+
+ WHAT THIS OBJECT REPRESENTS
+ This object is the interface used to define the optimization
+ problems solved by the oca optimizer defined later in this file.
+
+ OCA solves optimization problems with the following form:
+ Minimize: f(w) == 0.5*length_squared(w) + C*R(w)
+
+ Where R(w) is a user-supplied convex function and C > 0. Optionally,
+ there can also be non-negativity constraints on some or all of the
+ elements of w.
+
+ Or it can alternatively solve:
+ Minimize: f(w) == 0.5*length_squared(w-prior) + C*R(w)
+
+ Where prior is a user supplied vector and R(w) has the same
+ interpretation as above.
+
+ Or it can use the elastic net regularizer:
+ Minimize: f(w) == 0.5*(1-lasso_lambda)*length_squared(w) + lasso_lambda*sum(abs(w)) + C*R(w)
+
+                Where lasso_lambda is a number in the range [0, 1) and controls the
+                trade-off between L1 and L2 regularization.  R(w) has the same
+ interpretation as above.
+
+
+ Note that the stopping condition must be provided by the user
+ in the form of the optimization_status() function.
+ !*/
+
+ public:
+
+ typedef typename matrix_type::type scalar_type;
+
+ virtual ~oca_problem() {}
+
+ virtual bool risk_has_lower_bound (
+ scalar_type& lower_bound
+ ) const { return false; }
+ /*!
+ ensures
+ - if (R(w) >= a constant for all values of w) then
+ - returns true
+ - #lower_bound == the constant that lower bounds R(w)
+ - else
+ - returns false
+ !*/
+
+ virtual bool optimization_status (
+ scalar_type current_objective_value,
+ scalar_type current_error_gap,
+ scalar_type current_risk_value,
+ scalar_type current_risk_gap,
+ unsigned long num_cutting_planes,
+ unsigned long num_iterations
+ ) const = 0;
+ /*!
+ requires
+ - This function is called by the OCA optimizer each iteration.
+ - current_objective_value == the current value of the objective function f(w)
+ - current_error_gap == The bound on how much lower the objective function
+ can drop before we reach the optimal point. At the optimal solution the
+ error gap is equal to 0.
+ - current_risk_value == the current value of the R(w) term of the objective function.
+ - current_risk_gap == the bound on how much lower the risk term can go. At the optimal
+ solution the risk gap is zero.
+ - num_cutting_planes == the number of cutting planes the algorithm is currently using.
+ - num_iterations == A count of the total number of iterations that have executed
+ since we started running the optimization.
+ ensures
+ - If it is appropriate to terminate the optimization then this function returns true
+ and false otherwise.
+ !*/
+
+ virtual scalar_type get_c (
+ ) const = 0;
+ /*!
+ ensures
+ - returns the C parameter
+ !*/
+
+ virtual long get_num_dimensions (
+ ) const = 0;
+ /*!
+ ensures
+ - returns the number of free variables in this optimization problem
+ !*/
+
+ virtual void get_risk (
+ matrix_type& current_solution,
+ scalar_type& risk_value,
+ matrix_type& risk_subgradient
+ ) const = 0;
+ /*!
+ requires
+ - is_col_vector(current_solution) == true
+ - current_solution.size() == get_num_dimensions()
+ ensures
+ - #current_solution will be set to one of the following:
+ - current_solution (i.e. it won't be modified at all)
+ - The result of a line search passing through current_solution.
+ - #risk_value == R(#current_solution)
+ - #risk_subgradient == an element of the subgradient of R() at the
+ point #current_solution
+ - Note that #risk_value and #risk_subgradient are NOT multiplied by get_c()
+ !*/
+
+ };
+
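+    /*
+        Example (a sketch, not part of the dlib API): a minimal oca_problem for
+        the hinge loss R(w) = max(0, 1 - y*dot(x,w)) of a single training
+        example (x,y).  The toy_problem name, the C value, and the stopping
+        tolerance are illustrative choices.
+
+            class toy_problem : public oca_problem<matrix<double,0,1> >
+            {
+            public:
+                toy_problem(const matrix<double,0,1>& x_, double y_) : x(x_), y(y_) {}
+
+                bool optimization_status (
+                    double, double gap, double, double,
+                    unsigned long, unsigned long
+                ) const { return gap < 0.001; }
+
+                double get_c () const { return 10; }
+
+                long get_num_dimensions () const { return x.size(); }
+
+                void get_risk (
+                    matrix<double,0,1>& w,
+                    double& risk,
+                    matrix<double,0,1>& subgradient
+                ) const
+                {
+                    const double margin = y*dot(x,w);
+                    if (margin < 1)
+                    {
+                        risk = 1 - margin;
+                        subgradient = -y*x;
+                    }
+                    else
+                    {
+                        risk = 0;
+                        subgradient = zeros_matrix(x);
+                    }
+                }
+
+            private:
+                matrix<double,0,1> x;
+                double y;
+            };
+
+        which could then be solved with:
+
+            matrix<double,0,1> w;
+            oca solver;
+            double objective = solver(toy_problem(x, y), w);
+    */
+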
+// ----------------------------------------------------------------------------------------
+
+ class oca
+ {
+ /*!
+ INITIAL VALUE
+ - get_subproblem_epsilon() == 1e-2
+ - get_subproblem_max_iterations() == 50000
+ - get_inactive_plane_threshold() == 20
+
+ WHAT THIS OBJECT REPRESENTS
+ This object is a tool for solving the optimization problem defined above
+ by the oca_problem abstract class.
+
+ For reference, OCA solves optimization problems with the following form:
+ Minimize: f(w) == 0.5*length_squared(w) + C*R(w)
+
+ Where R(w) is a user-supplied convex function and C > 0. Optionally,
+ this object can also add non-negativity constraints to some or all
+ of the elements of w.
+
+ Or it can alternatively solve:
+ Minimize: f(w) == 0.5*length_squared(w-prior) + C*R(w)
+
+ Where prior is a user supplied vector and R(w) has the same
+ interpretation as above.
+
+ Or it can use the elastic net regularizer:
+ Minimize: f(w) == 0.5*(1-lasso_lambda)*length_squared(w) + lasso_lambda*sum(abs(w)) + C*R(w)
+
+                Where lasso_lambda is a number in the range [0, 1) and controls the
+                trade-off between L1 and L2 regularization.  R(w) has the same
+ interpretation as above.
+
+
+ For a detailed discussion you should consult the following papers
+ from the Journal of Machine Learning Research:
+ Optimized Cutting Plane Algorithm for Large-Scale Risk Minimization
+ Vojtech Franc, Soren Sonnenburg; 10(Oct):2157--2192, 2009.
+
+ Bundle Methods for Regularized Risk Minimization
+ Choon Hui Teo, S.V.N. Vishwanthan, Alex J. Smola, Quoc V. Le; 11(Jan):311-365, 2010.
+ !*/
+ public:
+
+ oca (
+ );
+ /*!
+ ensures
+ - this object is properly initialized
+ !*/
+
+ template <
+ typename matrix_type
+ >
+ typename matrix_type::type operator() (
+ const oca_problem<matrix_type>& problem,
+ matrix_type& w,
+ unsigned long num_nonnegative = 0,
+ unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max()
+ ) const;
+ /*!
+ requires
+ - problem.get_c() > 0
+ - problem.get_num_dimensions() > 0
+ ensures
+ - solves the given oca problem and stores the solution in #w. In particular,
+ this function solves:
+ Minimize: f(w) == 0.5*length_squared(w) + C*R(w)
+ - The optimization algorithm runs until problem.optimization_status()
+ indicates it is time to stop.
+ - returns the objective value at the solution #w
+ - if (num_nonnegative != 0) then
+ - Adds the constraint that #w(i) >= 0 for all i < num_nonnegative.
+ That is, the first num_nonnegative elements of #w will always be
+ non-negative. This includes the copies of w passed to get_risk()
+ in the form of the current_solution vector as well as the final
+ output of this function.
+ - if (force_weight_to_1 < problem.get_num_dimensions()) then
+ - The optimizer enforces the following constraints:
+ - #w(force_weight_to_1) == 1
+ - for all i > force_weight_to_1:
+ - #w(i) == 0
+ - That is, the element in the weight vector at the index indicated
+ by force_weight_to_1 will have a value of 1 upon completion of
+ this function, while all subsequent elements of w will have
+ values of 0.
+ !*/
+
+ template <
+ typename matrix_type
+ >
+ typename matrix_type::type operator() (
+ const oca_problem<matrix_type>& problem,
+ matrix_type& w,
+ const matrix_type& prior
+ ) const;
+ /*!
+ requires
+ - problem.get_c() > 0
+ - problem.get_num_dimensions() > 0
+ - is_col_vector(prior) == true
+ - prior.size() == problem.get_num_dimensions()
+ ensures
+ - solves the given oca problem and stores the solution in #w.
+ - In this mode, we solve a version of the problem with a different
+ regularizer. In particular, this function solves:
+ Minimize: f(w) == 0.5*length_squared(w-prior) + C*R(w)
+ - The optimization algorithm runs until problem.optimization_status()
+ indicates it is time to stop.
+ - returns the objective value at the solution #w
+ !*/
+
+ template <
+ typename matrix_type
+ >
+ typename matrix_type::type solve_with_elastic_net (
+ const oca_problem<matrix_type>& problem,
+ matrix_type& w,
+            double lasso_lambda,
+ unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max()
+ ) const;
+ /*!
+ requires
+ - problem.get_c() > 0
+ - problem.get_num_dimensions() > 0
+ - 0 <= lasso_lambda < 1
+ ensures
+ - Solves the given oca problem and stores the solution in #w, but uses an
+ elastic net regularizer instead of the normal L2 regularizer. In
+ particular, this function solves:
+ Minimize: f(w) == 0.5*(1-lasso_lambda)*length_squared(w) + lasso_lambda*sum(abs(w)) + C*R(w)
+ - The optimization algorithm runs until problem.optimization_status()
+ indicates it is time to stop.
+ - returns the objective value at the solution #w
+ - if (force_weight_to_1 < problem.get_num_dimensions()) then
+ - The optimizer enforces the following constraints:
+ - #w(force_weight_to_1) == 1
+ - for all i > force_weight_to_1:
+ - #w(i) == 0
+ - That is, the element in the weight vector at the index indicated
+ by force_weight_to_1 will have a value of 1 upon completion of
+ this function, while all subsequent elements of w will have
+ values of 0.
+ !*/
+
+ void set_subproblem_epsilon (
+ double eps
+ );
+ /*!
+ requires
+ - eps > 0
+ ensures
+ - #get_subproblem_epsilon() == eps
+ !*/
+
+ double get_subproblem_epsilon (
+ ) const;
+ /*!
+ ensures
+ - returns the accuracy used in solving the quadratic programming
+ subproblem that is part of the overall OCA algorithm.
+ !*/
+
+ void set_subproblem_max_iterations (
+ unsigned long sub_max_iter
+ );
+ /*!
+ requires
+ - sub_max_iter > 0
+ ensures
+ - #get_subproblem_max_iterations() == sub_max_iter
+ !*/
+
+ unsigned long get_subproblem_max_iterations (
+ ) const;
+ /*!
+ ensures
+ - returns the maximum number of iterations this object will perform
+ while attempting to solve each quadratic programming subproblem.
+ !*/
+
+ void set_inactive_plane_threshold (
+ unsigned long inactive_thresh
+ );
+ /*!
+ requires
+ - inactive_thresh > 0
+ ensures
+ - #get_inactive_plane_threshold() == inactive_thresh
+ !*/
+
+ unsigned long get_inactive_plane_threshold (
+ ) const;
+ /*!
+ ensures
+ - As OCA runs it builds up a set of cutting planes. Typically
+ cutting planes become inactive after a certain point and can then
+ be removed. This function returns the number of iterations of
+ inactivity required before a cutting plane is removed.
+ !*/
+
+ };
+}
+
+// ----------------------------------------------------------------------------------------
+
+#endif // DLIB_OPTIMIZATION_OCA_ABsTRACT_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_search_strategies.h b/ml/dlib/dlib/optimization/optimization_search_strategies.h
new file mode 100644
index 000000000..eda637fa1
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_search_strategies.h
@@ -0,0 +1,324 @@
+// Copyright (C) 2008 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_
+#define DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_
+
+#include <cmath>
+#include <limits>
+#include "../matrix.h"
+#include "../algs.h"
+#include "optimization_search_strategies_abstract.h"
+#include "../sequence.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class cg_search_strategy
+ {
+ public:
+ cg_search_strategy() : been_used(false) {}
+
+ double get_wolfe_rho (
+ ) const { return 0.001; }
+
+ double get_wolfe_sigma (
+ ) const { return 0.01; }
+
+ unsigned long get_max_line_search_iterations (
+ ) const { return 100; }
+
+ template <typename T>
+ const matrix<double,0,1>& get_next_direction (
+ const T& ,
+ const double ,
+ const T& funct_derivative
+ )
+ {
+ if (been_used == false)
+ {
+ been_used = true;
+ prev_direction = -funct_derivative;
+ }
+ else
+ {
+ // Use the Polak-Ribiere (4.1.12) conjugate gradient described by Fletcher on page 83
+ const double temp = trans(prev_derivative)*prev_derivative;
+ // If this value hits zero then just use the direction of steepest descent.
+ if (std::abs(temp) < std::numeric_limits<double>::epsilon())
+ {
+ prev_derivative = funct_derivative;
+ prev_direction = -funct_derivative;
+ return prev_direction;
+ }
+
+ double b = trans(funct_derivative-prev_derivative)*funct_derivative/(temp);
+ prev_direction = -funct_derivative + b*prev_direction;
+
+ }
+
+ prev_derivative = funct_derivative;
+ return prev_direction;
+ }
+
+ private:
+ bool been_used;
+ matrix<double,0,1> prev_derivative;
+ matrix<double,0,1> prev_direction;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ class bfgs_search_strategy
+ {
+ public:
+ bfgs_search_strategy() : been_used(false), been_used_twice(false) {}
+
+ double get_wolfe_rho (
+ ) const { return 0.01; }
+
+ double get_wolfe_sigma (
+ ) const { return 0.9; }
+
+ unsigned long get_max_line_search_iterations (
+ ) const { return 100; }
+
+ template <typename T>
+ const matrix<double,0,1>& get_next_direction (
+ const T& x,
+ const double ,
+ const T& funct_derivative
+ )
+ {
+ if (been_used == false)
+ {
+ been_used = true;
+ H = identity_matrix<double>(x.size());
+ }
+ else
+ {
+ // update H with the BFGS formula from (3.2.12) on page 55 of Fletcher
+ delta = (x-prev_x);
+ gamma = funct_derivative-prev_derivative;
+
+ double dg = dot(delta,gamma);
+
+ // Try to set the initial value of the H matrix to something reasonable if we are still
+                // in the early stages of figuring out what it is.  The formula below is the one suggested
+                // in the book Numerical Optimization by Nocedal and Wright in the chapter on Quasi-Newton methods.
+ if (been_used_twice == false)
+ {
+ double gg = trans(gamma)*gamma;
+ if (std::abs(gg) > std::numeric_limits<double>::epsilon())
+ {
+ const double temp = put_in_range(0.01, 100, dg/gg);
+ H = diagm(uniform_matrix<double>(x.size(),1, temp));
+ been_used_twice = true;
+ }
+ }
+
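+                // The update below applies the standard BFGS inverse Hessian formula,
+                //   H += (1 + g'Hg/(d'g))*d*d'/(d'g) - (d*(Hg)' + (Hg)*d')/(d'g),
+                // with d = delta and g = gamma.  If d'g == 0 or the terms are not
+                // finite, H is instead reset to the identity matrix.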
+ Hg = H*gamma;
+ gH = trans(trans(gamma)*H);
+ double gHg = trans(gamma)*H*gamma;
+ if (gHg < std::numeric_limits<double>::infinity() && dg < std::numeric_limits<double>::infinity() &&
+ dg != 0)
+ {
+ H += (1 + gHg/dg)*delta*trans(delta)/(dg) - (delta*trans(gH) + Hg*trans(delta))/(dg);
+ }
+ else
+ {
+ H = identity_matrix<double>(H.nr());
+ been_used_twice = false;
+ }
+ }
+
+ prev_x = x;
+ prev_direction = -H*funct_derivative;
+ prev_derivative = funct_derivative;
+ return prev_direction;
+ }
+
+ private:
+ bool been_used;
+ bool been_used_twice;
+ matrix<double,0,1> prev_x;
+ matrix<double,0,1> prev_derivative;
+ matrix<double,0,1> prev_direction;
+ matrix<double> H;
+ matrix<double,0,1> delta, gamma, Hg, gH;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ class lbfgs_search_strategy
+ {
+ public:
+ explicit lbfgs_search_strategy(unsigned long max_size_) : max_size(max_size_), been_used(false)
+ {
+ DLIB_ASSERT (
+ max_size > 0,
+ "\t lbfgs_search_strategy(max_size)"
+ << "\n\t max_size can't be zero"
+ );
+ }
+
+ lbfgs_search_strategy(const lbfgs_search_strategy& item)
+ {
+ max_size = item.max_size;
+ been_used = item.been_used;
+ prev_x = item.prev_x;
+ prev_derivative = item.prev_derivative;
+ prev_direction = item.prev_direction;
+ alpha = item.alpha;
+ dh_temp = item.dh_temp;
+ }
+
+ double get_wolfe_rho (
+ ) const { return 0.01; }
+
+ double get_wolfe_sigma (
+ ) const { return 0.9; }
+
+ unsigned long get_max_line_search_iterations (
+ ) const { return 100; }
+
+ template <typename T>
+ const matrix<double,0,1>& get_next_direction (
+ const T& x,
+ const double ,
+ const T& funct_derivative
+ )
+ {
+ prev_direction = -funct_derivative;
+
+ if (been_used == false)
+ {
+ been_used = true;
+ }
+ else
+ {
+ // add an element into the stored data sequence
+ dh_temp.s = x - prev_x;
+ dh_temp.y = funct_derivative - prev_derivative;
+ double temp = dot(dh_temp.s, dh_temp.y);
+ // only accept this bit of data if temp isn't zero
+ if (std::abs(temp) > std::numeric_limits<double>::epsilon())
+ {
+ dh_temp.rho = 1/temp;
+ data.add(data.size(), dh_temp);
+ }
+ else
+ {
+ data.clear();
+ }
+
+ if (data.size() > 0)
+ {
+ // This block of code is from algorithm 7.4 in the Nocedal book.
+
+ alpha.resize(data.size());
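+                    // This loop runs from data.size()-1 down to 0.  It terminates because
+                    // decrementing an unsigned index below zero wraps around to a huge
+                    // value, which fails the i < data.size() test.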
+ for (unsigned long i = data.size()-1; i < data.size(); --i)
+ {
+ alpha[i] = data[i].rho*dot(data[i].s, prev_direction);
+ prev_direction -= alpha[i]*data[i].y;
+ }
+
+                    // Take a guess at what the first H matrix should be.  The formula below is the one suggested
+ // in the book Numerical Optimization by Nocedal and Wright in the chapter on Large Scale
+ // Unconstrained Optimization (in the L-BFGS section).
+ double H_0 = 1.0/data[data.size()-1].rho/dot(data[data.size()-1].y, data[data.size()-1].y);
+ H_0 = put_in_range(0.001, 1000.0, H_0);
+ prev_direction *= H_0;
+
+ for (unsigned long i = 0; i < data.size(); ++i)
+ {
+ double beta = data[i].rho*dot(data[i].y, prev_direction);
+ prev_direction += data[i].s * (alpha[i] - beta);
+ }
+ }
+
+ }
+
+ if (data.size() > max_size)
+ {
+ // remove the oldest element in the data sequence
+ data.remove(0, dh_temp);
+ }
+
+ prev_x = x;
+ prev_derivative = funct_derivative;
+ return prev_direction;
+ }
+
+ private:
+
+ struct data_helper
+ {
+ matrix<double,0,1> s;
+ matrix<double,0,1> y;
+ double rho;
+
+ friend void swap(data_helper& a, data_helper& b)
+ {
+ a.s.swap(b.s);
+ a.y.swap(b.y);
+ std::swap(a.rho, b.rho);
+ }
+ };
+ sequence<data_helper>::kernel_2a data;
+
+ unsigned long max_size;
+ bool been_used;
+ matrix<double,0,1> prev_x;
+ matrix<double,0,1> prev_derivative;
+ matrix<double,0,1> prev_direction;
+ std::vector<double> alpha;
+
+ data_helper dh_temp;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename hessian_funct>
+ class newton_search_strategy_obj
+ {
+ public:
+ explicit newton_search_strategy_obj(
+ const hessian_funct& hess
+ ) : hessian(hess) {}
+
+ double get_wolfe_rho (
+ ) const { return 0.01; }
+
+ double get_wolfe_sigma (
+ ) const { return 0.9; }
+
+ unsigned long get_max_line_search_iterations (
+ ) const { return 100; }
+
+ template <typename T>
+ const matrix<double,0,1> get_next_direction (
+ const T& x,
+ const double ,
+ const T& funct_derivative
+ )
+ {
+ return -inv(hessian(x))*funct_derivative;
+ }
+
+ private:
+ hessian_funct hessian;
+ };
+
+ template <typename hessian_funct>
+ newton_search_strategy_obj<hessian_funct> newton_search_strategy (
+ hessian_funct hessian
+ ) { return newton_search_strategy_obj<hessian_funct>(hessian); }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_
+
diff --git a/ml/dlib/dlib/optimization/optimization_search_strategies_abstract.h b/ml/dlib/dlib/optimization/optimization_search_strategies_abstract.h
new file mode 100644
index 000000000..44b894bfc
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_search_strategies_abstract.h
@@ -0,0 +1,330 @@
+// Copyright (C) 2008 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_ABSTRACT_
+#ifdef DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_ABSTRACT_
+
+#include <cmath>
+#include <limits>
+#include "../matrix/matrix_abstract.h"
+#include "../algs.h"
+
+
+namespace dlib
+{
+ /*
+ A good discussion of the search strategies in this file can be found in the
+ following book: Numerical Optimization by Nocedal and Wright.
+ */
+
+// ----------------------------------------------------------------------------------------
+
+ class cg_search_strategy
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object represents a strategy for determining which direction
+ a line search should be carried out along. This particular object
+ is an implementation of the Polak-Ribiere conjugate gradient method
+ for determining this direction.
+
+ This method uses an amount of memory that is linear in the number
+ of variables to be optimized. So it is capable of handling problems
+ with a very large number of variables. However, it is generally
+ not as good as the L-BFGS algorithm (which is defined below in
+ the lbfgs_search_strategy class).
+ !*/
+
+ public:
+ cg_search_strategy(
+ );
+ /*!
+ ensures
+ - This object is properly initialized and ready to generate
+ search directions.
+ !*/
+
+ double get_wolfe_rho (
+ ) const;
+ /*!
+ ensures
+ - returns the value of the Wolfe rho parameter that should be used when
+ this search strategy is used with the line_search() function.
+ !*/
+
+ double get_wolfe_sigma (
+ ) const;
+ /*!
+ ensures
+ - returns the value of the Wolfe sigma parameter that should be used when
+ this search strategy is used with the line_search() function.
+ !*/
+
+ unsigned long get_max_line_search_iterations (
+ ) const;
+ /*!
+ ensures
+ - returns the value of the max iterations parameter that should be used when
+ this search strategy is used with the line_search() function.
+ !*/
+
+ template <typename T>
+ const matrix<double,0,1>& get_next_direction (
+ const T& x,
+ const double funct_value,
+ const T& funct_derivative
+ );
+ /*!
+ requires
+ - this function is only called once per search iteration
+ - for some objective function f():
+ - x == the search point for the current iteration
+ - funct_value == f(x)
+ - funct_derivative == derivative(f)(x)
+ ensures
+ - Assuming that a line search is going to be conducted starting from the point x,
+ this function returns the direction in which the search should proceed.
+ !*/
+
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ class bfgs_search_strategy
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object represents a strategy for determining which direction
+ a line search should be carried out along. This particular object
+ is an implementation of the BFGS quasi-newton method for determining
+ this direction.
+
+ This method uses an amount of memory that is quadratic in the number
+ of variables to be optimized. It is generally very effective but
+ if your problem has a very large number of variables then it isn't
+            appropriate.  Instead, you should try the lbfgs_search_strategy.
+ !*/
+
+ public:
+ bfgs_search_strategy(
+ );
+ /*!
+ ensures
+ - This object is properly initialized and ready to generate
+ search directions.
+ !*/
+
+ double get_wolfe_rho (
+ ) const;
+ /*!
+ ensures
+ - returns the value of the Wolfe rho parameter that should be used when
+ this search strategy is used with the line_search() function.
+ !*/
+
+ double get_wolfe_sigma (
+ ) const;
+ /*!
+ ensures
+ - returns the value of the Wolfe sigma parameter that should be used when
+ this search strategy is used with the line_search() function.
+ !*/
+
+ unsigned long get_max_line_search_iterations (
+ ) const;
+ /*!
+ ensures
+ - returns the value of the max iterations parameter that should be used when
+ this search strategy is used with the line_search() function.
+ !*/
+
+ template <typename T>
+ const matrix<double,0,1>& get_next_direction (
+ const T& x,
+ const double funct_value,
+ const T& funct_derivative
+ );
+ /*!
+ requires
+ - this function is only called once per search iteration
+ - for some objective function f():
+ - x == the search point for the current iteration
+ - funct_value == f(x)
+ - funct_derivative == derivative(f)(x)
+ ensures
+ - Assuming that a line search is going to be conducted starting from the point x,
+ this function returns the direction in which the search should proceed.
+ !*/
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ class lbfgs_search_strategy
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object represents a strategy for determining which direction
+ a line search should be carried out along. This particular object
+ is an implementation of the L-BFGS quasi-newton method for determining
+ this direction.
+
+ This method uses an amount of memory that is linear in the number
+ of variables to be optimized. This makes it an excellent method
+ to use when an optimization problem has a large number of variables.
+ !*/
+ public:
+ explicit lbfgs_search_strategy(
+ unsigned long max_size
+ );
+ /*!
+ requires
+ - max_size > 0
+ ensures
+ - This object is properly initialized and ready to generate
+ search directions.
+ - L-BFGS works by remembering a certain number of position and gradient
+ pairs. It uses this remembered information to compute search directions.
+ The max_size argument determines how many of these pairs will be remembered.
+ Typically, using between 3 and 30 pairs performs well for many problems.
+ !*/
+
+ double get_wolfe_rho (
+ ) const;
+ /*!
+ ensures
+ - returns the value of the Wolfe rho parameter that should be used when
+ this search strategy is used with the line_search() function.
+ !*/
+
+ double get_wolfe_sigma (
+ ) const;
+ /*!
+ ensures
+ - returns the value of the Wolfe sigma parameter that should be used when
+ this search strategy is used with the line_search() function.
+ !*/
+
+ unsigned long get_max_line_search_iterations (
+ ) const;
+ /*!
+ ensures
+ - returns the value of the max iterations parameter that should be used when
+ this search strategy is used with the line_search() function.
+ !*/
+
+ template <typename T>
+ const matrix<double,0,1>& get_next_direction (
+ const T& x,
+ const double funct_value,
+ const T& funct_derivative
+ );
+ /*!
+ requires
+ - this function is only called once per search iteration
+ - for some objective function f():
+ - x == the search point for the current iteration
+ - funct_value == f(x)
+ - funct_derivative == derivative(f)(x)
+ ensures
+ - Assuming that a line search is going to be conducted starting from the point x,
+ this function returns the direction in which the search should proceed.
+ !*/
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename hessian_funct
+ >
+ class newton_search_strategy_obj
+ {
+ /*!
+ REQUIREMENTS ON hessian_funct
+ Objects of hessian_funct type must be function objects which
+ take a single argument and return a dlib::matrix of doubles. The
+ single argument must be a dlib::matrix capable of representing
+            column vectors of doubles.  hessian_funct must also be copy
+            constructible.
+
+ WHAT THIS OBJECT REPRESENTS
+ This object represents a strategy for determining which direction
+ a line search should be carried out along. This particular object
+ is an implementation of the newton method for determining this
+ direction. That is, it uses the following formula to determine
+ the direction:
+ search_direction = -inv(hessian(x))*derivative
+ !*/
+ public:
+ explicit newton_search_strategy_obj(
+ const hessian_funct& hess
+ );
+ /*!
+ ensures
+ - This object is properly initialized and ready to generate
+ search directions.
+ - hess will be used by this object to generate the needed hessian
+ matrices every time get_next_direction() is called.
+ !*/
+
+ double get_wolfe_rho (
+ ) const;
+ /*!
+ ensures
+ - returns the value of the Wolfe rho parameter that should be used when
+ this search strategy is used with the line_search() function.
+ !*/
+
+ double get_wolfe_sigma (
+ ) const;
+ /*!
+ ensures
+ - returns the value of the Wolfe sigma parameter that should be used when
+ this search strategy is used with the line_search() function.
+ !*/
+
+ unsigned long get_max_line_search_iterations (
+ ) const;
+ /*!
+ ensures
+ - returns the value of the max iterations parameter that should be used when
+ this search strategy is used with the line_search() function.
+ !*/
+
+ template <typename T>
+ const matrix<double,0,1> get_next_direction (
+ const T& x,
+ const double funct_value,
+ const T& funct_derivative
+ );
+ /*!
+ requires
+ - for some objective function f():
+ - x == the search point for the current iteration
+ - funct_value == f(x)
+ - funct_derivative == derivative(f)(x)
+ ensures
+ - Assuming that a line search is going to be conducted starting from the
+ point x, this function returns the direction in which the search should
+ proceed.
+ - In particular, the search direction will be given by:
+ - search_direction = -inv(hessian(x))*funct_derivative
+ !*/
+
+ };
+
+ template <typename hessian_funct>
+ newton_search_strategy_obj<hessian_funct> newton_search_strategy (
+ hessian_funct hessian
+ ) { return newton_search_strategy_obj<hessian_funct>(hessian); }
+ /*!
+ ensures
+ - constructs and returns a newton_search_strategy_obj.
+ This function is just a helper to make the syntax for creating
+ these objects a little simpler.
+ !*/
+
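+    /*
+        Example (a sketch): these strategy objects are meant to be passed, along
+        with a stop strategy, to the find_min() routine declared in
+        optimization_abstract.h.  Assuming f and der compute an objective and its
+        gradient for a matrix<double,0,1> argument:
+
+            matrix<double,0,1> x = starting_point;
+            find_min(bfgs_search_strategy(),
+                     objective_delta_stop_strategy(1e-7),
+                     f, der, x, -1e100);
+            // for problems with very many variables, substitute
+            // lbfgs_search_strategy(10) for bfgs_search_strategy()
+    */
+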
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_ABSTRACT_
+
diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo.h b/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo.h
new file mode 100644
index 000000000..88cad0cf3
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo.h
@@ -0,0 +1,468 @@
+// Copyright (C) 2007 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_SOLVE_QP2_USING_SMo_Hh_
+#define DLIB_SOLVE_QP2_USING_SMo_Hh_
+
+#include "optimization_solve_qp2_using_smo_abstract.h"
+#include <cmath>
+#include <limits>
+#include <sstream>
+#include "../matrix.h"
+#include "../algs.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class invalid_nu_error : public dlib::error
+ {
+ public:
+ invalid_nu_error(const std::string& msg, double nu_) : dlib::error(msg), nu(nu_) {};
+ const double nu;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename T
+ >
+ typename T::type maximum_nu_impl (
+ const T& y
+ )
+ {
+ typedef typename T::type scalar_type;
+ // make sure requires clause is not broken
+ DLIB_ASSERT(y.size() > 1 && is_col_vector(y),
+ "\ttypedef T::type maximum_nu(y)"
+ << "\n\ty should be a column vector with more than one entry"
+ << "\n\ty.nr(): " << y.nr()
+ << "\n\ty.nc(): " << y.nc()
+ );
+
+ long pos_count = 0;
+ long neg_count = 0;
+ for (long r = 0; r < y.nr(); ++r)
+ {
+ if (y(r) == 1.0)
+ {
+ ++pos_count;
+ }
+ else if (y(r) == -1.0)
+ {
+ ++neg_count;
+ }
+ else
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(y(r) == -1.0 || y(r) == 1.0,
+ "\ttypedef T::type maximum_nu(y)"
+                               << "\n\ty should contain only +1 and -1 entries"
+ << "\n\tr: " << r
+ << "\n\ty(r): " << y(r)
+ );
+ }
+ }
+ return static_cast<scalar_type>(2.0*(scalar_type)std::min(pos_count,neg_count)/(scalar_type)y.nr());
+ }
+
+ template <
+ typename T
+ >
+ typename T::type maximum_nu (
+ const T& y
+ )
+ {
+ return maximum_nu_impl(mat(y));
+ }
+
+ template <
+ typename T
+ >
+ typename T::value_type maximum_nu (
+ const T& y
+ )
+ {
+ return maximum_nu_impl(mat(y));
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename matrix_type
+ >
+ class solve_qp2_using_smo
+ {
+ public:
+ typedef typename matrix_type::mem_manager_type mem_manager_type;
+ typedef typename matrix_type::type scalar_type;
+ typedef typename matrix_type::layout_type layout_type;
+ typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix;
+ typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix;
+
+
+ template <
+ typename EXP1,
+ typename EXP2,
+ long NR
+ >
+ unsigned long operator() (
+ const matrix_exp<EXP1>& Q,
+ const matrix_exp<EXP2>& y,
+ const scalar_type nu,
+ matrix<scalar_type,NR,1,mem_manager_type, layout_type>& alpha,
+ scalar_type eps
+ )
+ {
+ DLIB_ASSERT(Q.nr() == Q.nc() && y.size() == Q.nr() && y.size() > 1 && is_col_vector(y) &&
+ sum((y == +1) + (y == -1)) == y.size() &&
+ 0 < nu && nu <= 1 &&
+ eps > 0,
+ "\t void solve_qp2_using_smo::operator()"
+ << "\n\t invalid arguments were given to this function"
+ << "\n\t Q.nr(): " << Q.nr()
+ << "\n\t Q.nc(): " << Q.nc()
+ << "\n\t is_col_vector(y): " << is_col_vector(y)
+ << "\n\t y.size(): " << y.size()
+ << "\n\t sum((y == +1) + (y == -1)): " << sum((y == +1) + (y == -1))
+ << "\n\t nu: " << nu
+ << "\n\t eps: " << eps
+ );
+
+ alpha.set_size(Q.nr(),1);
+ df.set_size(Q.nr());
+
+ // now initialize alpha
+ set_initial_alpha(y, nu, alpha);
+
+ const scalar_type tau = 1e-12;
+
+ typedef typename colm_exp<EXP1>::type col_type;
+
+ set_all_elements(df, 0);
+ // initialize df. Compute df = Q*alpha
+ for (long r = 0; r < df.nr(); ++r)
+ {
+ if (alpha(r) != 0)
+ {
+ df += alpha(r)*matrix_cast<scalar_type>(colm(Q,r));
+ }
+ }
+
+ unsigned long count = 0;
+
+ // now perform the actual optimization of alpha
+ long i=0, j=0;
+ while (find_working_group(y,alpha,Q,df,tau,eps,i,j))
+ {
+ ++count;
+ const scalar_type old_alpha_i = alpha(i);
+ const scalar_type old_alpha_j = alpha(j);
+
+ optimize_working_pair(alpha,Q,df,tau,i,j);
+
+ // update the df vector now that we have modified alpha(i) and alpha(j)
+ scalar_type delta_alpha_i = alpha(i) - old_alpha_i;
+ scalar_type delta_alpha_j = alpha(j) - old_alpha_j;
+
+ col_type Q_i = colm(Q,i);
+ col_type Q_j = colm(Q,j);
+ for(long k = 0; k < df.nr(); ++k)
+ df(k) += Q_i(k)*delta_alpha_i + Q_j(k)*delta_alpha_j;
+ }
+
+ return count;
+ }
+
+ const column_matrix& get_gradient (
+ ) const { return df; }
+
+ private:
+
+ // -------------------------------------------------------------------------------------
+
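+        // Initializes alpha so that, within each class, the first floor(nu*l/2)
+        // eligible elements get alpha == 1 and one further element gets the
+        // fractional remainder, making each per-class sum equal nu*l/2.  Throws
+        // invalid_nu_error if nu is too large for the given class counts.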
+ template <
+ typename scalar_type,
+ typename scalar_vector_type,
+ typename scalar_vector_type2
+ >
+ inline void set_initial_alpha (
+ const scalar_vector_type& y,
+ const scalar_type nu,
+ scalar_vector_type2& alpha
+ ) const
+ {
+ set_all_elements(alpha,0);
+ const scalar_type l = y.nr();
+ scalar_type temp = nu*l/2;
+ long num = (long)std::floor(temp);
+ long num_total = (long)std::ceil(temp);
+
+ bool has_slack = false;
+ int count = 0;
+ for (int i = 0; i < alpha.nr(); ++i)
+ {
+ if (y(i) == 1)
+ {
+ if (count < num)
+ {
+ ++count;
+ alpha(i) = 1;
+ }
+ else
+ {
+ has_slack = true;
+ if (num_total > num)
+ {
+ ++count;
+ alpha(i) = temp - std::floor(temp);
+ }
+ break;
+ }
+ }
+ }
+
+ if (count != num_total || has_slack == false)
+ {
+ std::ostringstream sout;
+ sout << "Invalid nu of " << nu << ". It is required that: 0 < nu < " << 2*(scalar_type)count/y.nr();
+ throw invalid_nu_error(sout.str(),nu);
+ }
+
+ has_slack = false;
+ count = 0;
+ for (int i = 0; i < alpha.nr(); ++i)
+ {
+ if (y(i) == -1)
+ {
+ if (count < num)
+ {
+ ++count;
+ alpha(i) = 1;
+ }
+ else
+ {
+ has_slack = true;
+ if (num_total > num)
+ {
+ ++count;
+ alpha(i) = temp - std::floor(temp);
+ }
+ break;
+ }
+ }
+ }
+
+ if (count != num_total || has_slack == false)
+ {
+ std::ostringstream sout;
+ sout << "Invalid nu of " << nu << ". It is required that: 0 < nu < " << 2*(scalar_type)count/y.nr();
+ throw invalid_nu_error(sout.str(),nu);
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
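+        // Selects the next working pair (i_out, j_out) using second order working
+        // set selection as done in LIBSVM: i is the maximal violating index for its
+        // class and j is chosen to minimize the estimated objective decrease -b*b/a.
+        // Returns false once the maximal KKT violation falls below eps, signaling
+        // convergence.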
+ template <
+ typename scalar_vector_type,
+ typename scalar_type,
+ typename EXP,
+ typename U, typename V
+ >
+ inline bool find_working_group (
+ const V& y,
+ const U& alpha,
+ const matrix_exp<EXP>& Q,
+ const scalar_vector_type& df,
+ const scalar_type tau,
+ const scalar_type eps,
+ long& i_out,
+ long& j_out
+ ) const
+ {
+ using namespace std;
+ long ip = 0;
+ long jp = 0;
+ long in = 0;
+ long jn = 0;
+
+
+ typedef typename colm_exp<EXP>::type col_type;
+ typedef typename diag_exp<EXP>::type diag_type;
+
+ scalar_type ip_val = -numeric_limits<scalar_type>::infinity();
+ scalar_type jp_val = numeric_limits<scalar_type>::infinity();
+ scalar_type in_val = -numeric_limits<scalar_type>::infinity();
+ scalar_type jn_val = numeric_limits<scalar_type>::infinity();
+
+ // loop over the alphas and find the maximum ip and in indices.
+ for (long i = 0; i < alpha.nr(); ++i)
+ {
+ if (y(i) == 1)
+ {
+ if (alpha(i) < 1.0)
+ {
+ if (-df(i) > ip_val)
+ {
+ ip_val = -df(i);
+ ip = i;
+ }
+ }
+ }
+ else
+ {
+ if (alpha(i) > 0.0)
+ {
+ if (df(i) > in_val)
+ {
+ in_val = df(i);
+ in = i;
+ }
+ }
+ }
+ }
+
+ scalar_type Mp = numeric_limits<scalar_type>::infinity();
+ scalar_type Mn = numeric_limits<scalar_type>::infinity();
+
+ // Pick out the columns and diagonal of Q we need below. Doing
+ // it this way is faster if Q is actually a symmetric_matrix_cache
+ // object.
+ col_type Q_ip = colm(Q,ip);
+ col_type Q_in = colm(Q,in);
+ diag_type Q_diag = diag(Q);
+
+
+
+ // now we need to find the minimum jp and jn indices
+ for (long j = 0; j < alpha.nr(); ++j)
+ {
+ if (y(j) == 1)
+ {
+ if (alpha(j) > 0.0)
+ {
+ scalar_type b = ip_val + df(j);
+ if (-df(j) < Mp)
+ Mp = -df(j);
+
+ if (b > 0)
+ {
+ scalar_type a = Q_ip(ip) + Q_diag(j) - 2*Q_ip(j);
+ if (a <= 0)
+ a = tau;
+ scalar_type temp = -b*b/a;
+ if (temp < jp_val)
+ {
+ jp_val = temp;
+ jp = j;
+ }
+ }
+ }
+ }
+ else
+ {
+ if (alpha(j) < 1.0)
+ {
+ scalar_type b = in_val - df(j);
+ if (df(j) < Mn)
+ Mn = df(j);
+
+ if (b > 0)
+ {
+ scalar_type a = Q_in(in) + Q_diag(j) - 2*Q_in(j);
+ if (a <= 0)
+ a = tau;
+ scalar_type temp = -b*b/a;
+ if (temp < jn_val)
+ {
+ jn_val = temp;
+ jn = j;
+ }
+ }
+ }
+ }
+ }
+
+ // if we are at the optimal point then return false so the caller knows
+ // to stop optimizing
+ if (std::max(ip_val - Mp, in_val - Mn) < eps)
+ return false;
+
+ if (jp_val < jn_val)
+ {
+ i_out = ip;
+ j_out = jp;
+ }
+ else
+ {
+ i_out = in;
+ j_out = jn;
+ }
+
+ return true;
+ }
+
+ // ------------------------------------------------------------------------------------
+
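+        // Analytically solves the two-variable subproblem restricted to alpha(i) and
+        // alpha(j): takes a Newton step of size (df(i)-df(j))/quad_coef along the
+        // direction that keeps alpha(i)+alpha(j) constant, then clips both variables
+        // back into [0,1] without changing their sum.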
+ template <
+ typename EXP,
+ typename T, typename U
+ >
+ inline void optimize_working_pair (
+ T& alpha,
+ const matrix_exp<EXP>& Q,
+ const U& df,
+ const scalar_type tau,
+ const long i,
+ const long j
+ ) const
+ {
+ scalar_type quad_coef = Q(i,i)+Q(j,j)-2*Q(j,i);
+ if (quad_coef <= 0)
+ quad_coef = tau;
+ scalar_type delta = (df(i)-df(j))/quad_coef;
+ scalar_type sum = alpha(i) + alpha(j);
+ alpha(i) -= delta;
+ alpha(j) += delta;
+
+ if(sum > 1)
+ {
+ if(alpha(i) > 1)
+ {
+ alpha(i) = 1;
+ alpha(j) = sum - 1;
+ }
+ else if(alpha(j) > 1)
+ {
+ alpha(j) = 1;
+ alpha(i) = sum - 1;
+ }
+ }
+ else
+ {
+ if(alpha(j) < 0)
+ {
+ alpha(j) = 0;
+ alpha(i) = sum;
+ }
+ else if(alpha(i) < 0)
+ {
+ alpha(i) = 0;
+ alpha(j) = sum;
+ }
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ column_matrix df; // gradient of f(alpha)
+ };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_SOLVE_QP2_USING_SMo_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo_abstract.h b/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo_abstract.h
new file mode 100644
index 000000000..962541c25
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo_abstract.h
@@ -0,0 +1,150 @@
+// Copyright (C) 2007 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATION_SOLVE_QP2_USING_SMO_ABSTRACT_H_
+#ifdef DLIB_OPTIMIZATION_SOLVE_QP2_USING_SMO_ABSTRACT_H_
+
+#include "../matrix/matrix_abstract.h"
+#include "../algs.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class invalid_nu_error : public dlib::error
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object is an exception class used to indicate that a
+ value of nu given to the solve_qp2_using_smo object is incompatible
+ with the constraints of the quadratic program.
+
+ this->nu will be set to the invalid value of nu used.
+ !*/
+
+ public:
+ invalid_nu_error(const std::string& msg, double nu_) : dlib::error(msg), nu(nu_) {};
+ const double nu;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename T
+ >
+ typename T::type maximum_nu (
+ const T& y
+ );
+ /*!
+ requires
+ - T == a matrix object or an object convertible to a matrix via mat()
+ - is_col_vector(y) == true
+ - y.size() > 1
+ - sum((y == +1) + (y == -1)) == y.size()
+ (i.e. all elements of y must be equal to +1 or -1)
+ ensures
+ - returns the maximum valid nu that can be used with solve_qp2_using_smo and
+ the given y vector.
+ (i.e. 2.0*min(sum(y == +1), sum(y == -1))/y.size())
+ !*/
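+
+    // For example, if y == [+1, +1, +1, -1] then maximum_nu(y) returns
+    // 2.0*min(3,1)/4 == 0.5.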
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename matrix_type
+ >
+ class solve_qp2_using_smo
+ {
+ /*!
+ REQUIREMENTS ON matrix_type
+ Must be some type of dlib::matrix.
+
+ WHAT THIS OBJECT REPRESENTS
+ This object is a tool for solving the following quadratic programming
+ problem using the sequential minimal optimization algorithm:
+
+ Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha
+ subject to the following constraints:
+ - sum(alpha) == nu*y.size()
+ - 0 <= min(alpha) && max(alpha) <= 1
+ - trans(y)*alpha == 0
+
+ Where all elements of y must be equal to +1 or -1 and f is convex.
+ This means that Q should be symmetric and positive-semidefinite.
+
+
+ This object implements the strategy used by the LIBSVM tool. The following papers
+ can be consulted for additional details:
+ - Chang and Lin, Training {nu}-Support Vector Classifiers: Theory and Algorithms
+ - Chih-Chung Chang and Chih-Jen Lin, LIBSVM : a library for support vector
+ machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm
+ !*/
+
+ public:
+
+ typedef typename matrix_type::mem_manager_type mem_manager_type;
+ typedef typename matrix_type::type scalar_type;
+ typedef typename matrix_type::layout_type layout_type;
+ typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix;
+ typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix;
+
+ template <
+ typename EXP1,
+ typename EXP2,
+ long NR
+ >
+ unsigned long operator() (
+ const matrix_exp<EXP1>& Q,
+ const matrix_exp<EXP2>& y,
+ const scalar_type nu,
+ matrix<scalar_type,NR,1,mem_manager_type, layout_type>& alpha,
+ scalar_type eps
+ );
+ /*!
+ requires
+ - Q.nr() == Q.nc()
+ - is_col_vector(y) == true
+ - y.size() == Q.nr()
+ - y.size() > 1
+ - sum((y == +1) + (y == -1)) == y.size()
+ (i.e. all elements of y must be equal to +1 or -1)
+ - alpha must be capable of representing a vector of size y.size() elements
+ - 0 < nu <= 1
+ - eps > 0
+ ensures
+ - This function solves the quadratic program defined in this class's main comment.
+ - The solution to the quadratic program will be stored in #alpha.
+ - #alpha.size() == y.size()
+ - This function uses an implementation of the sequential minimal optimization
+ algorithm. It runs until the KKT violation is less than eps. So eps controls
+ how accurate the solution is and smaller values result in better solutions.
+ (a reasonable eps is usually about 1e-3)
+ - #get_gradient() == Q*(#alpha)
+ (i.e. stores the gradient of f() at #alpha in get_gradient())
+ - returns the number of iterations performed.
+ throws
+ - invalid_nu_error
+ This exception is thrown if nu >= maximum_nu(y).
+ (some values of nu cause the constraints to become impossible to satisfy.
+ If this is detected then an exception is thrown).
+ !*/
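+
+        /*
+            A minimal usage sketch with made up values (illustrative only, not
+            part of the documented API):
+
+                solve_qp2_using_smo<matrix<double>> solver;
+                matrix<double,4,4> Q = identity_matrix<double>(4);
+                matrix<double,4,1> y, alpha;
+                y = +1, +1, -1, -1;
+                // maximum_nu(y) == 1 here, so any 0 < nu <= 1 is valid.
+                solver(Q, y, 0.5, alpha, 1e-3);
+                // alpha now holds the solution and solver.get_gradient() == Q*alpha.
+        */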
+
+ const column_matrix& get_gradient (
+ ) const;
+ /*!
+ ensures
+ - returns the gradient vector at the solution of the last problem solved
+ by this object. If no problem has been solved then returns an empty
+ vector.
+ !*/
+
+ };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATION_SOLVE_QP2_USING_SMO_ABSTRACT_H_
+
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo.h b/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo.h
new file mode 100644
index 000000000..617ecc408
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo.h
@@ -0,0 +1,455 @@
+// Copyright (C) 2007 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_SOLVE_QP3_USING_SMo_Hh_
+#define DLIB_SOLVE_QP3_USING_SMo_Hh_
+
+#include "optimization_solve_qp3_using_smo_abstract.h"
+#include <cmath>
+#include <limits>
+#include <sstream>
+#include "../matrix.h"
+#include "../algs.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class invalid_qp3_error : public dlib::error
+ {
+
+ public:
+ invalid_qp3_error(
+ const std::string& msg,
+ double B_,
+ double Cp_,
+ double Cn_
+ ) :
+ dlib::error(msg),
+ B(B_),
+ Cp(Cp_),
+ Cn(Cn_)
+ {};
+
+ const double B;
+ const double Cp;
+ const double Cn;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename matrix_type
+ >
+ class solve_qp3_using_smo
+ {
+ public:
+ typedef typename matrix_type::mem_manager_type mem_manager_type;
+ typedef typename matrix_type::type scalar_type;
+ typedef typename matrix_type::layout_type layout_type;
+ typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix;
+ typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix;
+
+
+ template <
+ typename EXP1,
+ typename EXP2,
+ typename EXP3,
+ long NR
+ >
+ unsigned long operator() (
+ const matrix_exp<EXP1>& Q,
+ const matrix_exp<EXP2>& p,
+ const matrix_exp<EXP3>& y,
+ const scalar_type B,
+ const scalar_type Cp,
+ const scalar_type Cn,
+ matrix<scalar_type,NR,1,mem_manager_type, layout_type>& alpha,
+ scalar_type eps
+ )
+ {
+ DLIB_ASSERT(Q.nr() == Q.nc() && y.size() == Q.nr() && p.size() == y.size() &&
+ y.size() > 0 && is_col_vector(y) && is_col_vector(p) &&
+ sum((y == +1) + (y == -1)) == y.size() &&
+ Cp > 0 && Cn > 0 &&
+ eps > 0,
+ "\t void solve_qp3_using_smo::operator()"
+ << "\n\t invalid arguments were given to this function"
+ << "\n\t Q.nr(): " << Q.nr()
+ << "\n\t Q.nc(): " << Q.nc()
+ << "\n\t is_col_vector(p): " << is_col_vector(p)
+ << "\n\t p.size(): " << p.size()
+ << "\n\t is_col_vector(y): " << is_col_vector(y)
+ << "\n\t y.size(): " << y.size()
+ << "\n\t sum((y == +1) + (y == -1)): " << sum((y == +1) + (y == -1))
+ << "\n\t Cp: " << Cp
+ << "\n\t Cn: " << Cn
+ << "\n\t eps: " << eps
+ );
+
+
+
+ set_initial_alpha(y, B, Cp, Cn, alpha);
+
+
+ const scalar_type tau = 1e-12;
+
+ typedef typename colm_exp<EXP1>::type col_type;
+
+ // initialize df. Compute df = Q*alpha + p
+ df = p;
+ for (long r = 0; r < df.nr(); ++r)
+ {
+ if (alpha(r) != 0)
+ {
+ df += alpha(r)*matrix_cast<scalar_type>(colm(Q,r));
+ }
+ }
+
+ unsigned long count = 0;
+ // now perform the actual optimization of alpha
+ long i=0, j=0;
+ while (find_working_group(y,alpha,Q,df,Cp,Cn,tau,eps,i,j))
+ {
+ ++count;
+ const scalar_type old_alpha_i = alpha(i);
+ const scalar_type old_alpha_j = alpha(j);
+
+ optimize_working_pair(alpha,Q,y,df,tau,i,j, Cp, Cn );
+
+ // update the df vector now that we have modified alpha(i) and alpha(j)
+ scalar_type delta_alpha_i = alpha(i) - old_alpha_i;
+ scalar_type delta_alpha_j = alpha(j) - old_alpha_j;
+
+ col_type Q_i = colm(Q,i);
+ col_type Q_j = colm(Q,j);
+ for(long k = 0; k < df.nr(); ++k)
+ df(k) += Q_i(k)*delta_alpha_i + Q_j(k)*delta_alpha_j;
+ }
+
+ return count;
+ }
+
+ const column_matrix& get_gradient (
+ ) const { return df; }
+
+ private:
+
+ // -------------------------------------------------------------------------------------
+
+ template <
+ typename scalar_type,
+ typename scalar_vector_type,
+ typename scalar_vector_type2
+ >
+ inline void set_initial_alpha (
+ const scalar_vector_type& y,
+ const scalar_type B,
+ const scalar_type Cp,
+ const scalar_type Cn,
+ scalar_vector_type2& alpha
+ ) const
+ {
+ alpha.set_size(y.size());
+
+ set_all_elements(alpha,0);
+
+ // It's easy in the B == 0 case
+ if (B == 0)
+ return;
+
+ const scalar_type C = (B > 0)? Cp : Cn;
+
+ scalar_type temp = std::abs(B)/C;
+ long num = (long)std::floor(temp);
+ long num_total = (long)std::ceil(temp);
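+            // To make trans(y)*alpha == B we set num of the alphas on the B_sign
+            // side to the box bound C and, when |B| isn't an exact multiple of C,
+            // one more alpha to the fractional remainder C*(temp - floor(temp)).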
+
+ const scalar_type B_sign = (B > 0)? 1 : -1;
+
+ long count = 0;
+ for (long i = 0; i < alpha.nr(); ++i)
+ {
+ if (y(i) == B_sign)
+ {
+ if (count < num)
+ {
+ ++count;
+ alpha(i) = C;
+ }
+ else
+ {
+ if (count < num_total)
+ {
+ ++count;
+ alpha(i) = C*(temp - std::floor(temp));
+ }
+ break;
+ }
+ }
+ }
+
+ if (count != num_total)
+ {
+ std::ostringstream sout;
+ sout << "Invalid QP3 constraint parameters of B: " << B << ", Cp: " << Cp << ", Cn: "<< Cn;
+ throw invalid_qp3_error(sout.str(),B,Cp,Cn);
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ template <
+ typename scalar_vector_type,
+ typename scalar_type,
+ typename EXP,
+ typename U, typename V
+ >
+ inline bool find_working_group (
+ const V& y,
+ const U& alpha,
+ const matrix_exp<EXP>& Q,
+ const scalar_vector_type& df,
+ const scalar_type Cp,
+ const scalar_type Cn,
+ const scalar_type tau,
+ const scalar_type eps,
+ long& i_out,
+ long& j_out
+ ) const
+ {
+ using namespace std;
+
+ long ip = 0;
+ long jp = 0;
+
+
+ typedef typename colm_exp<EXP>::type col_type;
+ typedef typename diag_exp<EXP>::type diag_type;
+
+ scalar_type ip_val = -numeric_limits<scalar_type>::infinity();
+ scalar_type jp_val = numeric_limits<scalar_type>::infinity();
+
+            // loop over the alphas and find the maximum ip index.
+ for (long i = 0; i < alpha.nr(); ++i)
+ {
+ if (y(i) == 1)
+ {
+ if (alpha(i) < Cp)
+ {
+ if (-df(i) > ip_val)
+ {
+ ip_val = -df(i);
+ ip = i;
+ }
+ }
+ }
+ else
+ {
+ if (alpha(i) > 0.0)
+ {
+ if (df(i) > ip_val)
+ {
+ ip_val = df(i);
+ ip = i;
+ }
+ }
+ }
+ }
+
+ scalar_type Mp = -numeric_limits<scalar_type>::infinity();
+
+ // Pick out the column and diagonal of Q we need below. Doing
+ // it this way is faster if Q is actually a symmetric_matrix_cache
+ // object.
+ col_type Q_ip = colm(Q,ip);
+ diag_type Q_diag = diag(Q);
+
+
+
+            // now we need to find the minimum jp index
+ for (long j = 0; j < alpha.nr(); ++j)
+ {
+ if (y(j) == 1)
+ {
+ if (alpha(j) > 0.0)
+ {
+ scalar_type b = ip_val + df(j);
+ if (df(j) > Mp)
+ Mp = df(j);
+
+ if (b > 0)
+ {
+ scalar_type a = Q_ip(ip) + Q_diag(j) - 2*y(ip)*Q_ip(j);
+ if (a <= 0)
+ a = tau;
+ scalar_type temp = -b*b/a;
+ if (temp < jp_val)
+ {
+ jp_val = temp;
+ jp = j;
+ }
+ }
+ }
+ }
+ else
+ {
+ if (alpha(j) < Cn)
+ {
+ scalar_type b = ip_val - df(j);
+ if (-df(j) > Mp)
+ Mp = -df(j);
+
+ if (b > 0)
+ {
+ scalar_type a = Q_ip(ip) + Q_diag(j) + 2*y(ip)*Q_ip(j);
+ if (a <= 0)
+ a = tau;
+ scalar_type temp = -b*b/a;
+ if (temp < jp_val)
+ {
+ jp_val = temp;
+ jp = j;
+ }
+ }
+ }
+ }
+ }
+
+ // if we are at the optimal point then return false so the caller knows
+ // to stop optimizing
+ if (Mp + ip_val < eps)
+ return false;
+
+
+ i_out = ip;
+ j_out = jp;
+
+ return true;
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ template <
+ typename EXP,
+ typename EXP2,
+ typename T, typename U
+ >
+ inline void optimize_working_pair (
+ T& alpha,
+ const matrix_exp<EXP>& Q,
+ const matrix_exp<EXP2>& y,
+ const U& df,
+ const scalar_type& tau,
+ const long i,
+ const long j,
+ const scalar_type& Cp,
+ const scalar_type& Cn
+ ) const
+ {
+ const scalar_type Ci = (y(i) > 0 )? Cp : Cn;
+ const scalar_type Cj = (y(j) > 0 )? Cp : Cn;
+
+ if (y(i) != y(j))
+ {
+ scalar_type quad_coef = Q(i,i)+Q(j,j)+2*Q(j,i);
+ if (quad_coef <= 0)
+ quad_coef = tau;
+ scalar_type delta = (-df(i)-df(j))/quad_coef;
+ scalar_type diff = alpha(i) - alpha(j);
+ alpha(i) += delta;
+ alpha(j) += delta;
+
+ if (diff > 0)
+ {
+ if (alpha(j) < 0)
+ {
+ alpha(j) = 0;
+ alpha(i) = diff;
+ }
+ }
+ else
+ {
+ if (alpha(i) < 0)
+ {
+ alpha(i) = 0;
+ alpha(j) = -diff;
+ }
+ }
+
+ if (diff > Ci - Cj)
+ {
+ if (alpha(i) > Ci)
+ {
+ alpha(i) = Ci;
+ alpha(j) = Ci - diff;
+ }
+ }
+ else
+ {
+ if (alpha(j) > Cj)
+ {
+ alpha(j) = Cj;
+ alpha(i) = Cj + diff;
+ }
+ }
+ }
+ else
+ {
+ scalar_type quad_coef = Q(i,i)+Q(j,j)-2*Q(j,i);
+ if (quad_coef <= 0)
+ quad_coef = tau;
+ scalar_type delta = (df(i)-df(j))/quad_coef;
+ scalar_type sum = alpha(i) + alpha(j);
+ alpha(i) -= delta;
+ alpha(j) += delta;
+
+ if(sum > Ci)
+ {
+ if(alpha(i) > Ci)
+ {
+ alpha(i) = Ci;
+ alpha(j) = sum - Ci;
+ }
+ }
+ else
+ {
+ if(alpha(j) < 0)
+ {
+ alpha(j) = 0;
+ alpha(i) = sum;
+ }
+ }
+
+ if(sum > Cj)
+ {
+ if(alpha(j) > Cj)
+ {
+ alpha(j) = Cj;
+ alpha(i) = sum - Cj;
+ }
+ }
+ else
+ {
+ if(alpha(i) < 0)
+ {
+ alpha(i) = 0;
+ alpha(j) = sum;
+ }
+ }
+
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ column_matrix df; // gradient of f(alpha)
+ };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_SOLVE_QP3_USING_SMo_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo_abstract.h b/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo_abstract.h
new file mode 100644
index 000000000..8efd7215b
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo_abstract.h
@@ -0,0 +1,139 @@
+// Copyright (C) 2010 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATION_SOLVE_QP3_USING_SMO_ABSTRACT_H_
+#ifdef DLIB_OPTIMIZATION_SOLVE_QP3_USING_SMO_ABSTRACT_H_
+
+#include "../matrix/matrix_abstract.h"
+#include "../algs.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class invalid_qp3_error : public dlib::error
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object is an exception class used to indicate that the values
+ of B, Cp, and Cn given to the solve_qp3_using_smo object are incompatible
+ with the constraints of the quadratic program.
+
+ this->B, this->Cp, and this->Cn will be set to the invalid values used.
+ !*/
+
+ public:
+ invalid_qp3_error( const std::string& msg, double B_, double Cp_, double Cn_) :
+ dlib::error(msg), B(B_), Cp(Cp_), Cn(Cn_) {};
+
+ const double B;
+ const double Cp;
+ const double Cn;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename matrix_type
+ >
+ class solve_qp3_using_smo
+ {
+ /*!
+ REQUIREMENTS ON matrix_type
+ Must be some type of dlib::matrix.
+
+ WHAT THIS OBJECT REPRESENTS
+ This object is a tool for solving the following quadratic programming
+ problem using the sequential minimal optimization algorithm:
+
+ Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha + trans(p)*alpha
+ subject to the following constraints:
+ - for all i such that y(i) == +1: 0 <= alpha(i) <= Cp
+ - for all i such that y(i) == -1: 0 <= alpha(i) <= Cn
+ - trans(y)*alpha == B
+
+ Where all elements of y must be equal to +1 or -1 and f is convex.
+ This means that Q should be symmetric and positive-semidefinite.
+
+
+ This object implements the strategy used by the LIBSVM tool. The following papers
+ can be consulted for additional details:
+ - Chih-Chung Chang and Chih-Jen Lin, LIBSVM : a library for support vector
+ machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm
+ - Working Set Selection Using Second Order Information for Training Support Vector Machines by
+ Fan, Chen, and Lin. In the Journal of Machine Learning Research 2005.
+ !*/
+
+ public:
+
+ typedef typename matrix_type::mem_manager_type mem_manager_type;
+ typedef typename matrix_type::type scalar_type;
+ typedef typename matrix_type::layout_type layout_type;
+ typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix;
+ typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix;
+
+ template <
+ typename EXP1,
+ typename EXP2,
+ typename EXP3,
+ long NR
+ >
+ unsigned long operator() (
+ const matrix_exp<EXP1>& Q,
+ const matrix_exp<EXP2>& p,
+ const matrix_exp<EXP3>& y,
+ const scalar_type B,
+ const scalar_type Cp,
+ const scalar_type Cn,
+ matrix<scalar_type,NR,1,mem_manager_type, layout_type>& alpha,
+ scalar_type eps
+ );
+ /*!
+ requires
+ - Q.nr() == Q.nc()
+ - is_col_vector(y) == true
+ - is_col_vector(p) == true
+ - p.size() == y.size() == Q.nr()
+ - y.size() > 0
+ - sum((y == +1) + (y == -1)) == y.size()
+ (i.e. all elements of y must be equal to +1 or -1)
+ - alpha must be capable of representing a vector of size y.size() elements
+ - Cp > 0
+ - Cn > 0
+ - eps > 0
+ ensures
+ - This function solves the quadratic program defined in this class's main comment.
+ - The solution to the quadratic program will be stored in #alpha.
+ - #alpha.size() == y.size()
+ - This function uses an implementation of the sequential minimal optimization
+ algorithm. It runs until the KKT violation is less than eps. So eps controls
+ how accurate the solution is and smaller values result in better solutions.
+ (a reasonable eps is usually about 1e-3)
+ - #get_gradient() == Q*(#alpha)
+ (i.e. stores the gradient of f() at #alpha in get_gradient())
+ - returns the number of iterations performed.
+ throws
+ - invalid_qp3_error
+ This exception is thrown if the given parameters cause the constraints
+ of the quadratic programming problem to be impossible to satisfy.
+ !*/
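+
+        /*
+            A minimal usage sketch with made up values (illustrative only, not
+            part of the documented API):
+
+                solve_qp3_using_smo<matrix<double>> solver;
+                matrix<double,4,4> Q = identity_matrix<double>(4);
+                matrix<double,4,1> p, y, alpha;
+                p = -1, -1, -1, -1;
+                y = +1, +1, -1, -1;
+                // B == 0 keeps the equality constraint trivially satisfiable.
+                solver(Q, p, y, 0, 1.0, 1.0, alpha, 1e-3);
+        */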
+
+ const column_matrix& get_gradient (
+ ) const;
+ /*!
+ ensures
+ - returns the gradient vector at the solution of the last problem solved
+ by this object. If no problem has been solved then returns an empty
+ vector.
+ !*/
+
+ };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATION_SOLVE_QP3_USING_SMO_ABSTRACT_H_
+
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo.h b/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo.h
new file mode 100644
index 000000000..b9cc74df3
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo.h
@@ -0,0 +1,937 @@
+// Copyright (C) 2010 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_Hh_
+#define DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_Hh_
+
+#include "optimization_solve_qp_using_smo_abstract.h"
+#include "../matrix.h"
+#include <map>
+#include "../unordered_pair.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ /*
+ The algorithm defined in the solve_qp_using_smo() function below can be
+ derived by using an important theorem from the theory of constrained optimization.
+ This theorem tells us that any optimal point of a constrained function must
+ satisfy what are called the KKT conditions (also sometimes called just the KT
+ conditions, especially in older literature). A very good book to consult
+ regarding this topic is Practical Methods of Optimization (second edition) by
+ R. Fletcher. Below I will try to explain the general idea of how this is
+ applied.
+
+ Let e == ones_matrix(alpha.size(),1)
+
+ First, note that the function below solves the following quadratic program.
+ Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha - trans(alpha)*b
+ subject to the following constraints:
+ - trans(e)*alpha == C (i.e. the sum of alpha values doesn't change)
+ - min(alpha) >= 0 (i.e. all alpha values are nonnegative)
+ Where f is convex. This means that Q should be positive-semidefinite.
+
+
+ To get from this problem formulation to the algorithm below we have to
+ consider the KKT conditions. They tell us that any solution to the above
+ problem must satisfy the following 5 conditions:
+ 1. trans(e)*alpha == C
+ 2. min(alpha) >= 0
+
+ 3. Let L(alpha, x, y) == f(alpha) - trans(x)*alpha - y*(trans(e)*alpha - C)
+ Where x is a vector of length alpha.size() and y is a single scalar.
+ Then the derivative of L with respect to alpha must == 0
+ So we get the following as our 3rd condition:
+ f'(alpha) - x - y*e == 0
+
+ 4. min(x) >= 0 (i.e. all x values are nonnegative)
+ 5. pointwise_multiply(x, alpha) == 0
+ (i.e. only one member of each x(i) and alpha(i) pair can be non-zero)
+
+
+ From 3 we can easily obtain this rule:
+ for all i: f'(alpha)(i) - x(i) == y
+
+ If we then consider 4 and 5 we see that we can infer that the following
+ must also be the case:
+ - if (alpha(i) > 0) then
+ - x(i) == 0
+ - f'(alpha)(i) == y
+ - else
+ - x(i) == some nonnegative number
+ - f'(alpha)(i) >= y
+
+
+ The important thing to take away is the final rule. It tells us that at the
+ optimal solution all elements of the gradient of f have the same value if
+ their corresponding alpha is non-zero. It also tells us that all the other
+ gradient values are bigger than y. We can use this information to help us
+ pick which alpha variables to optimize at each iteration.
+ */
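+
+    /*
+        A tiny worked illustration of that final rule (values made up): let
+        Q = identity_matrix<double>(2), b = [1, 0.2], and C = 1. The optimum is
+        alpha = [0.9, 0.1], where f'(alpha) = Q*alpha - b = [-0.1, -0.1]; both
+        alphas are non-zero and both gradient elements equal the same value
+        y = -0.1, exactly as required. At a non-optimal point such as
+        alpha = [1, 0] we instead get f'(alpha) = [0, -0.2], and the solver
+        below shifts weight from the element with the largest gradient (among
+        those with alpha > 0) to the element with the smallest gradient.
+    */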
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename EXP1,
+ typename EXP2,
+ typename T, long NR, long NC, typename MM, typename L
+ >
+ unsigned long solve_qp_using_smo (
+ const matrix_exp<EXP1>& Q,
+ const matrix_exp<EXP2>& b,
+ matrix<T,NR,NC,MM,L>& alpha,
+ T eps,
+ unsigned long max_iter
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(Q.nr() == Q.nc() &&
+ is_col_vector(b) &&
+ is_col_vector(alpha) &&
+ b.size() == alpha.size() &&
+ b.size() == Q.nr() &&
+ alpha.size() > 0 &&
+ min(alpha) >= 0 &&
+ eps > 0 &&
+ max_iter > 0,
+ "\t unsigned long solve_qp_using_smo()"
+ << "\n\t Invalid arguments were given to this function"
+ << "\n\t Q.nr(): " << Q.nr()
+ << "\n\t Q.nc(): " << Q.nc()
+ << "\n\t is_col_vector(b): " << is_col_vector(b)
+ << "\n\t is_col_vector(alpha): " << is_col_vector(alpha)
+ << "\n\t b.size(): " << b.size()
+ << "\n\t alpha.size(): " << alpha.size()
+ << "\n\t Q.nr(): " << Q.nr()
+ << "\n\t min(alpha): " << min(alpha)
+ << "\n\t eps: " << eps
+ << "\n\t max_iter: " << max_iter
+ );
+
+ const T C = sum(alpha);
+
+ // Compute f'(alpha) (i.e. the gradient of f(alpha)) for the current alpha.
+ matrix<T,NR,NC,MM,L> df = Q*alpha - b;
+
+ const T tau = 1000*std::numeric_limits<T>::epsilon();
+
+ T big, little;
+ unsigned long iter = 0;
+ for (; iter < max_iter; ++iter)
+ {
+ // Find the two elements of df that satisfy the following:
+ // - little_idx == index_of_min(df)
+ // - big_idx == the index of the largest element in df such that alpha(big_idx) > 0
+ // These two indices will tell us which two alpha values are most in violation of the KKT
+ // optimality conditions.
+ big = -std::numeric_limits<T>::max();
+ long big_idx = 0;
+ little = std::numeric_limits<T>::max();
+ long little_idx = 0;
+ for (long i = 0; i < df.nr(); ++i)
+ {
+ if (df(i) > big && alpha(i) > 0)
+ {
+ big = df(i);
+ big_idx = i;
+ }
+ if (df(i) < little)
+ {
+ little = df(i);
+ little_idx = i;
+ }
+ }
+
+ // Check if the KKT conditions are still violated and stop if so.
+ //if (alpha(little_idx) > 0 && (big - little) < eps)
+ // break;
+
+ // Check how big the duality gap is and stop when it goes below eps.
+ // The duality gap is the gap between the objective value of the function
+ // we are optimizing and the value of its primal form. This value is always
+ // greater than or equal to the distance to the optimum solution so it is a
+ // good way to decide if we should stop. See the book referenced above for
+ // more information. In particular, see the part about the Wolfe Dual.
+ if (trans(alpha)*df - C*little < eps)
+ break;
+
+
+ // Save these values, we will need them later.
+ const T old_alpha_big = alpha(big_idx);
+ const T old_alpha_little = alpha(little_idx);
+
+
+ // Now optimize the two variables we just picked.
+ T quad_coef = Q(big_idx,big_idx) + Q(little_idx,little_idx) - 2*Q(big_idx, little_idx);
+ if (quad_coef <= tau)
+ quad_coef = tau;
+ const T delta = (big - little)/quad_coef;
+ alpha(big_idx) -= delta;
+ alpha(little_idx) += delta;
+
+ // Make sure alpha stays feasible. That is, make sure the updated alpha doesn't
+ // violate the non-negativity constraint.
+ if (alpha(big_idx) < 0)
+ {
+ // Since an alpha can't be negative we will just set it to 0 and shift all the
+ // weight to the other alpha.
+ alpha(big_idx) = 0;
+ alpha(little_idx) = old_alpha_big + old_alpha_little;
+ }
+
+ // Every 300 iterations
+ if ((iter%300) == 299)
+ {
+ // Perform this form of the update every so often because doing so can help
+ // avoid the buildup of numerical errors you get with the alternate update
+ // below.
+ df = Q*alpha - b;
+ }
+ else
+ {
+ // Now update the gradient. We will perform the equivalent of: df = Q*alpha - b;
+ const T delta_alpha_big = alpha(big_idx) - old_alpha_big;
+ const T delta_alpha_little = alpha(little_idx) - old_alpha_little;
+
+ for(long k = 0; k < df.nr(); ++k)
+                    df(k) += Q(big_idx,k)*delta_alpha_big + Q(little_idx,k)*delta_alpha_little;
+ }
+ }
+
+ /*
+ using namespace std;
+ cout << "SMO: " << endl;
+ cout << " duality gap: "<< trans(alpha)*df - C*min(df) << endl;
+ cout << " KKT gap: "<< big-little << endl;
+ cout << " iter: "<< iter+1 << endl;
+ cout << " eps: "<< eps << endl;
+ */
+
+ return iter+1;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename EXP1,
+ typename EXP2,
+ typename EXP3,
+ typename EXP4,
+ typename T, long NR, long NC, typename MM, typename L,
+ long NR2, long NC2
+ >
+ unsigned long solve_qp4_using_smo (
+ const matrix_exp<EXP1>& A,
+ const matrix_exp<EXP2>& Q,
+ const matrix_exp<EXP3>& b,
+ const matrix_exp<EXP4>& d,
+ matrix<T,NR,NC,MM,L>& alpha,
+ matrix<T,NR2,NC2,MM,L>& lambda,
+ T eps,
+ unsigned long max_iter,
+ T max_lambda = std::numeric_limits<T>::infinity()
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(A.nc() == alpha.size() &&
+ Q.nr() == Q.nc() &&
+ is_col_vector(b) &&
+ is_col_vector(alpha) &&
+ b.size() == alpha.size() &&
+ b.size() == Q.nr() &&
+ alpha.size() > 0 &&
+ min(alpha) >= 0 &&
+ eps > 0 &&
+ max_iter > 0,
+ "\t void solve_qp4_using_smo()"
+ << "\n\t Invalid arguments were given to this function"
+ << "\n\t A.nc(): " << A.nc()
+ << "\n\t Q.nr(): " << Q.nr()
+ << "\n\t Q.nc(): " << Q.nc()
+ << "\n\t is_col_vector(b): " << is_col_vector(b)
+ << "\n\t is_col_vector(alpha): " << is_col_vector(alpha)
+ << "\n\t b.size(): " << b.size()
+ << "\n\t alpha.size(): " << alpha.size()
+ << "\n\t Q.nr(): " << Q.nr()
+ << "\n\t min(alpha): " << min(alpha)
+ << "\n\t eps: " << eps
+ << "\n\t max_iter: " << max_iter
+ );
+ DLIB_ASSERT(is_col_vector(d) == true &&
+ max_lambda >= 0 &&
+ d.size() == A.nr(),
+ "\t void solve_qp4_using_smo()"
+ << "\n\t Invalid arguments were given to this function"
+ << "\n\t A.nr(): " << A.nr()
+ << "\n\t d.size(): " << d.size()
+ << "\n\t max_lambda: " << max_lambda
+ );
+
+ const T C = sum(alpha);
+
+        /*
+            For this optimization problem, the optimal value of lambda is given
+            by a simple closed form expression whenever alpha is known. So we
+            just optimize alpha and periodically reset lambda to its optimal
+            value. Other than that, we use essentially the same method as the
+            solve_qp_using_smo() routine.
+        */
+
+ const bool d_is_zero = d==zeros_matrix(d);
+
+ // compute optimal lambda for current alpha
+ if (d_is_zero)
+ lambda = A*alpha;
+ else
+ lambda = A*alpha + d;
+ lambda = dlib::clamp(lambda, 0, max_lambda);
+
+ // Compute f'(alpha) (i.e. the gradient of f(alpha) with respect to alpha) for the current alpha.
+ matrix<T,NR,NC,MM,L> df = Q*alpha - b - trans(A)*lambda;
+
+ const T tau = 1000*std::numeric_limits<T>::epsilon();
+
+ T big, little;
+ unsigned long iter = 0;
+ for (; iter < max_iter; ++iter)
+ {
+ // Find the two elements of df that satisfy the following:
+ // - little_idx == index_of_min(df)
+ // - big_idx == the index of the largest element in df such that alpha(big_idx) > 0
+ // These two indices will tell us which two alpha values are most in violation of the KKT
+ // optimality conditions.
+ big = -std::numeric_limits<T>::max();
+ long big_idx = 0;
+ little = std::numeric_limits<T>::max();
+ long little_idx = 0;
+ for (long i = 0; i < df.nr(); ++i)
+ {
+ if (df(i) > big && alpha(i) > 0)
+ {
+ big = df(i);
+ big_idx = i;
+ }
+ if (df(i) < little)
+ {
+ little = df(i);
+ little_idx = i;
+ }
+ }
+
+ // Check how big the duality gap is and stop when it goes below eps.
+ // The duality gap is the gap between the objective value of the function
+ // we are optimizing and the value of its primal form. This value is always
+ // greater than or equal to the distance to the optimum solution so it is a
+ // good way to decide if we should stop.
+ if (trans(alpha)*df - C*little < eps)
+ {
+ // compute optimal lambda and recheck the duality gap to make
+ // sure we have really converged.
+ if (d_is_zero)
+ lambda = A*alpha;
+ else
+ lambda = A*alpha + d;
+ lambda = dlib::clamp(lambda, 0, max_lambda);
+ df = Q*alpha - b - trans(A)*lambda;
+
+ if (trans(alpha)*df - C*min(df) < eps)
+ break;
+ else
+ continue;
+ }
+
+
+ // Save these values, we will need them later.
+ const T old_alpha_big = alpha(big_idx);
+ const T old_alpha_little = alpha(little_idx);
+
+
+ // Now optimize the two variables we just picked.
+ T quad_coef = Q(big_idx,big_idx) + Q(little_idx,little_idx) - 2*Q(big_idx, little_idx);
+ if (quad_coef <= tau)
+ quad_coef = tau;
+ const T delta = (big - little)/quad_coef;
+ alpha(big_idx) -= delta;
+ alpha(little_idx) += delta;
+
+ // Make sure alpha stays feasible. That is, make sure the updated alpha doesn't
+ // violate the non-negativity constraint.
+ if (alpha(big_idx) < 0)
+ {
+ // Since an alpha can't be negative we will just set it to 0 and shift all the
+ // weight to the other alpha.
+ alpha(big_idx) = 0;
+ alpha(little_idx) = old_alpha_big + old_alpha_little;
+ }
+
+
+ // Every 300 iterations
+ if ((iter%300) == 299)
+ {
+ // compute the optimal lambda for the current alpha
+ if (d_is_zero)
+ lambda = A*alpha;
+ else
+ lambda = A*alpha + d;
+ lambda = dlib::clamp(lambda, 0, max_lambda);
+
+ // Perform this form of the update every so often because doing so can help
+ // avoid the buildup of numerical errors you get with the alternate update
+ // below.
+ df = Q*alpha - b - trans(A)*lambda;
+ }
+ else
+ {
+ // Now update the gradient. We will perform the equivalent of: df = Q*alpha - b;
+ const T delta_alpha_big = alpha(big_idx) - old_alpha_big;
+ const T delta_alpha_little = alpha(little_idx) - old_alpha_little;
+
+ for(long k = 0; k < df.nr(); ++k)
+                    df(k) += Q(big_idx,k)*delta_alpha_big + Q(little_idx,k)*delta_alpha_little;
+ }
+ }
+
+ /*
+ using namespace std;
+ cout << "SMO: " << endl;
+ cout << " duality gap: "<< trans(alpha)*df - C*min(df) << endl;
+ cout << " KKT gap: "<< big-little << endl;
+ cout << " iter: "<< iter+1 << endl;
+ cout << " eps: "<< eps << endl;
+ */
+
+
+ return iter+1;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename EXP1,
+ typename EXP2,
+ typename T, long NR, long NC, typename MM, typename L
+ >
+ unsigned long solve_qp_box_constrained (
+ const matrix_exp<EXP1>& Q,
+ const matrix_exp<EXP2>& b,
+ matrix<T,NR,NC,MM,L>& alpha,
+ const matrix<T,NR,NC,MM,L>& lower,
+ const matrix<T,NR,NC,MM,L>& upper,
+ T eps,
+ unsigned long max_iter
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(Q.nr() == Q.nc() &&
+ alpha.size() == lower.size() &&
+ alpha.size() == upper.size() &&
+ is_col_vector(b) &&
+ is_col_vector(alpha) &&
+ is_col_vector(lower) &&
+ is_col_vector(upper) &&
+ b.size() == alpha.size() &&
+ b.size() == Q.nr() &&
+ alpha.size() > 0 &&
+ 0 <= min(alpha-lower) &&
+ 0 <= max(upper-alpha) &&
+ eps > 0 &&
+ max_iter > 0,
+ "\t unsigned long solve_qp_box_constrained()"
+ << "\n\t Invalid arguments were given to this function"
+ << "\n\t Q.nr(): " << Q.nr()
+ << "\n\t Q.nc(): " << Q.nc()
+ << "\n\t is_col_vector(b): " << is_col_vector(b)
+ << "\n\t is_col_vector(alpha): " << is_col_vector(alpha)
+ << "\n\t is_col_vector(lower): " << is_col_vector(lower)
+ << "\n\t is_col_vector(upper): " << is_col_vector(upper)
+ << "\n\t b.size(): " << b.size()
+ << "\n\t alpha.size(): " << alpha.size()
+ << "\n\t lower.size(): " << lower.size()
+ << "\n\t upper.size(): " << upper.size()
+ << "\n\t Q.nr(): " << Q.nr()
+ << "\n\t min(alpha-lower): " << min(alpha-lower)
+ << "\n\t max(upper-alpha): " << max(upper-alpha)
+ << "\n\t eps: " << eps
+ << "\n\t max_iter: " << max_iter
+ );
+
+
+ // Compute f'(alpha) (i.e. the gradient of f(alpha)) for the current alpha.
+ matrix<T,NR,NC,MM,L> df = Q*alpha + b;
+ matrix<T,NR,NC,MM,L> QQ = reciprocal_max(diag(Q));
+
+ // First we use a coordinate descent method to initialize alpha.
+ double max_df = 0;
+ for (long iter = 0; iter < alpha.size()*2; ++iter)
+ {
+ max_df = 0;
+ long best_r =0;
+ // find the best alpha to optimize.
+ for (long r = 0; r < Q.nr(); ++r)
+ {
+ if (alpha(r) <= lower(r) && df(r) > 0)
+ ;//alpha(r) = lower(r);
+ else if (alpha(r) >= upper(r) && df(r) < 0)
+ ;//alpha(r) = upper(r);
+ else if (std::abs(df(r)) > max_df)
+ {
+ best_r = r;
+ max_df = std::abs(df(r));
+ }
+ }
+
+ // now optimize alpha(best_r)
+ const long r = best_r;
+ const T old_alpha = alpha(r);
+ alpha(r) = -(df(r)-Q(r,r)*alpha(r))*QQ(r);
+ if (alpha(r) < lower(r))
+ alpha(r) = lower(r);
+ else if (alpha(r) > upper(r))
+ alpha(r) = upper(r);
+
+ const T delta = old_alpha-alpha(r);
+
+ // Now update the gradient. We will perform the equivalent of: df = Q*alpha + b;
+ for(long k = 0; k < df.nr(); ++k)
+ df(k) -= Q(r,k)*delta;
+ }
+ //cout << "max_df: " << max_df << endl;
+ //cout << "objective value: " << 0.5*trans(alpha)*Q*alpha + trans(b)*alpha << endl;
+
+
+
+ // Now do the main iteration block of this solver. The coordinate descent method
+ // we used above can improve the objective rapidly in the beginning. However,
+ // Nesterov's method has more rapid convergence once it gets going so this is what
+ // we use for the main iteration.
+ matrix<T,NR,NC,MM,L> v, v_old;
+ v = alpha;
+ // We need to get an upper bound on the Lipschitz constant for this QP. Since that
+ // is just the max eigenvalue of Q we can do it using Gershgorin disks.
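+        // (Gershgorin's theorem: every eigenvalue of Q lies in some disk centered
+        // at Q(i,i) with radius equal to the sum of |Q(i,j)| over j != i, so the
+        // expression below bounds the largest eigenvalue from above.)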
+ const T lipschitz_bound = max(diag(Q) + (sum_cols(abs(Q)) - abs(diag(Q))));
+ double lambda = 0;
+ unsigned long iter;
+ for (iter = 0; iter < max_iter; ++iter)
+ {
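+            // This lambda/gamma recursion is Nesterov's usual momentum schedule;
+            // gamma controls how the new and previous iterates are mixed (it
+            // becomes negative, i.e. an extrapolation, after the first steps).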
+ const double next_lambda = (1 + std::sqrt(1+4*lambda*lambda))/2;
+ const double gamma = (1-lambda)/next_lambda;
+ lambda = next_lambda;
+
+ v_old = v;
+
+ df = Q*alpha + b;
+ // now take a projected gradient step using Nesterov's method.
+ v = clamp(alpha - 1.0/lipschitz_bound * df, lower, upper);
+ alpha = dlib::clamp((1-gamma)*v + gamma*v_old, lower, upper);
+
+
+ // check for convergence every 10 iterations
+ if (iter%10 == 0)
+ {
+ max_df = 0;
+ for (long r = 0; r < Q.nr(); ++r)
+ {
+ if (alpha(r) <= lower(r) && df(r) > 0)
+ ;//alpha(r) = lower(r);
+ else if (alpha(r) >= upper(r) && df(r) < 0)
+ ;//alpha(r) = upper(r);
+ else if (std::abs(df(r)) > max_df)
+ max_df = std::abs(df(r));
+ }
+ if (max_df < eps)
+ break;
+ }
+ }
+
+ //cout << "max_df: " << max_df << endl;
+ //cout << "objective value: " << 0.5*trans(alpha)*Q*alpha + trans(b)*alpha << endl;
+ return iter+1;
+ }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ // Check if each vector in Q_offdiag is actually a constant times the 1s vector.
+ template <
+ typename T, long NR, long NC, typename MM, typename L
+ >
+ bool has_uniform_offdiag_vectors(
+ const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag
+ )
+ {
+ for (auto& x : Q_offdiag)
+ {
+ auto ref = x.second(0);
+ for (auto& y : x.second)
+ if (ref != y)
+ return false;
+ }
+ return true;
+ }
+
+ template <
+ typename T, long NR, long NC, typename MM, typename L
+ >
+ matrix<T,0,0,MM,L> compact_offdiag(
+ const size_t& num_blocks,
+ const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag
+ )
+ {
+ matrix<T,0,0,MM,L> temp;
+            // We can only compact the offdiag information if all of its vectors are uniform.
+ if (!has_uniform_offdiag_vectors(Q_offdiag))
+ return temp;
+
+ temp.set_size(num_blocks, num_blocks);
+ temp = 0;
+
+ for (auto& x : Q_offdiag)
+ {
+ long r = x.first.first;
+ long c = x.first.second;
+ temp(r,c) = x.second(0);
+ temp(c,r) = x.second(0);
+ }
+
+ return temp;
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename T, long NR, long NC, typename MM, typename L
+ >
+ unsigned long solve_qp_box_constrained_blockdiag (
+ const std::vector<matrix<T,NR,NR,MM,L>>& Q_blocks,
+ const std::vector<matrix<T,NR,NC,MM,L>>& bs,
+ const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag,
+ std::vector<matrix<T,NR,NC,MM,L>>& alphas,
+ const std::vector<matrix<T,NR,NC,MM,L>>& lowers,
+ const std::vector<matrix<T,NR,NC,MM,L>>& uppers,
+ T eps,
+ unsigned long max_iter
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_CASSERT(Q_blocks.size() > 0);
+ DLIB_CASSERT(Q_blocks.size() == bs.size() &&
+ Q_blocks.size() == alphas.size() &&
+ Q_blocks.size() == lowers.size() &&
+ Q_blocks.size() == uppers.size(),
+ "Q_blocks.size(): "<< Q_blocks.size() << "\n" <<
+ "bs.size(): "<< bs.size() << "\n" <<
+ "alphas.size(): "<< alphas.size() << "\n" <<
+ "lowers.size(): "<< lowers.size() << "\n" <<
+ "uppers.size(): "<< uppers.size() << "\n"
+ );
+ for (auto& Q : Q_blocks)
+ {
+            DLIB_CASSERT(Q.nr() == Q.nc(), "All the matrices in Q_blocks must be square.");
+ DLIB_CASSERT(Q.size() > 0, "All the matrices in Q_blocks must be non-empty and have the same dimensions.");
+ DLIB_CASSERT(Q.nr() == Q_blocks[0].nr() && Q.nc() == Q_blocks[0].nc(), "All the matrices in Q_blocks have the same dimensions.");
+ }
+#ifdef ENABLE_ASSERTS
+ for (size_t i = 0; i < alphas.size(); ++i)
+ {
+ DLIB_CASSERT(is_col_vector(bs[i]) && bs[i].size() == Q_blocks[0].nr(),
+ "is_col_vector(bs["<<i<<"]): " << is_col_vector(bs[i]) << "\n" <<
+ "bs["<<i<<"].size(): " << bs[i].size() << "\n" <<
+ "Q_blocks[0].nr(): " << Q_blocks[0].nr());
+
+ for (auto& Qoffdiag : Q_offdiag)
+ {
+ auto& Q_offdiag_element = Qoffdiag.second;
+ long r = Qoffdiag.first.first;
+ long c = Qoffdiag.first.second;
+ DLIB_CASSERT(is_col_vector(Q_offdiag_element) && Q_offdiag_element.size() == Q_blocks[0].nr(),
+ "is_col_vector(Q_offdiag["<<r<<","<<c<<"]): " << is_col_vector(Q_offdiag_element) << "\n" <<
+ "Q_offdiag["<<r<<","<<c<<"].size(): " << Q_offdiag_element.size() << "\n" <<
+ "Q_blocks[0].nr(): " << Q_blocks[0].nr());
+ }
+
+ DLIB_CASSERT(is_col_vector(alphas[i]) && alphas[i].size() == Q_blocks[0].nr(),
+ "is_col_vector(alphas["<<i<<"]): " << is_col_vector(alphas[i]) << "\n" <<
+ "alphas["<<i<<"].size(): " << alphas[i].size() << "\n" <<
+ "Q_blocks[0].nr(): " << Q_blocks[0].nr());
+
+ DLIB_CASSERT(is_col_vector(lowers[i]) && lowers[i].size() == Q_blocks[0].nr(),
+ "is_col_vector(lowers["<<i<<"]): " << is_col_vector(lowers[i]) << "\n" <<
+ "lowers["<<i<<"].size(): " << lowers[i].size() << "\n" <<
+ "Q_blocks[0].nr(): " << Q_blocks[0].nr());
+
+ DLIB_CASSERT(is_col_vector(uppers[i]) && uppers[i].size() == Q_blocks[0].nr(),
+ "is_col_vector(uppers["<<i<<"]): " << is_col_vector(uppers[i]) << "\n" <<
+ "uppers["<<i<<"].size(): " << uppers[i].size() << "\n" <<
+ "Q_blocks[0].nr(): " << Q_blocks[0].nr());
+
+ DLIB_CASSERT(0 <= min(alphas[i]-lowers[i]), "min(alphas["<<i<<"]-lowers["<<i<<"]): " << min(alphas[i]-lowers[i]));
+ DLIB_CASSERT(0 <= max(uppers[i]-alphas[i]), "max(uppers["<<i<<"]-alphas["<<i<<"]): " << max(uppers[i]-alphas[i]));
+ }
+ DLIB_CASSERT(eps > 0 && max_iter > 0, "eps: " << eps << "\nmax_iter: "<< max_iter);
+#endif // ENABLE_ASSERTS
+
+
+ const auto offdiag_compact = impl::compact_offdiag(Q_blocks.size(), Q_offdiag);
+ matrix<T,0,0,MM,L> temp, alphas_compact;
+
+ // Compute f'(alpha) (i.e. the gradient of f(alpha)) for the current alpha.
+ std::vector<matrix<T,NR,NC,MM,L>> df;// = Q*alpha + b;
+ auto compute_df = [&]()
+ {
+ df.resize(Q_blocks.size());
+ for (size_t i = 0; i < df.size(); ++i)
+ df[i] = Q_blocks[i]*alphas[i] + bs[i];
+
+
+ // Don't forget to include the Q_offdiag terms in the computation. Note that
+ // we have two options for how we can compute this part. If Q_offdiag is
+ // uniform and can be compacted into a simple matrix and there are a lot of off
+ // diagonal entries then it's faster to do it as a matrix multiply. Otherwise
+ // we do the more general computation.
+ if (offdiag_compact.size() != 0 && Q_offdiag.size() > Q_blocks.size()*5)
+ {
+ // Do it as a matrix multiply (with a bit of data shuffling)
+ alphas_compact.set_size(alphas[0].size(), offdiag_compact.nr());
+ for (long c = 0; c < alphas_compact.nc(); ++c)
+ set_colm(alphas_compact,c) = alphas[c];
+ temp = alphas_compact*offdiag_compact;
+ for (size_t i = 0; i < df.size(); ++i)
+ df[i] += colm(temp,i);
+ }
+ else
+ {
+ // Do the fully general computation that allows for non-uniform values in
+ // the off diagonal vectors.
+ for (auto& p : Q_offdiag)
+ {
+ long r = p.first.first;
+ long c = p.first.second;
+ df[r] += pointwise_multiply(p.second, alphas[c]);
+ if (r != c)
+ df[c] += pointwise_multiply(p.second, alphas[r]);
+ }
+ }
+ };
+ compute_df();
+
+
+
+ std::vector<matrix<T,NR,NC,MM,L>> Q_diag, Q_ggd;
+ std::vector<matrix<T,NR,NC,MM,L>> QQ;// = reciprocal_max(diag(Q));
+ QQ.resize(Q_blocks.size());
+ Q_diag.resize(Q_blocks.size());
+ Q_ggd.resize(Q_blocks.size());
+
+ // We need to get an upper bound on the Lipschitz constant for this QP. Since that
+ // is just the max eigenvalue of Q we can do it using Gershgorin disks.
+ //const T lipschitz_bound = max(diag(Q) + (sum_cols(abs(Q)) - abs(diag(Q))));
+ for (size_t i = 0; i < QQ.size(); ++i)
+ {
+ auto f = Q_offdiag.find(make_unordered_pair(i,i));
+ if (f != Q_offdiag.end())
+ Q_diag[i] = diag(Q_blocks[i]) + f->second;
+ else
+ Q_diag[i] = diag(Q_blocks[i]);
+ QQ[i] = reciprocal_max(Q_diag[i]);
+
+ Q_ggd[i] = Q_diag[i] + (sum_cols(abs(Q_blocks[i]))-abs(diag(Q_blocks[i])));
+ }
+ for (auto& p : Q_offdiag)
+ {
+ long r = p.first.first;
+ long c = p.first.second;
+ if (r != c)
+ {
+ Q_ggd[r] += abs(p.second);
+ Q_ggd[c] += abs(p.second);
+ }
+ }
+ T lipschitz_bound = -std::numeric_limits<T>::infinity();
+ for (auto& x : Q_ggd)
+ lipschitz_bound = std::max(lipschitz_bound, max(x));
+
+
+ const long num_variables = alphas.size()*alphas[0].size();
+
+ // First we use a coordinate descent method to initialize alpha.
+ double max_df = 0;
+ for (long iter = 0; iter < num_variables*2; ++iter)
+ {
+ max_df = 0;
+ long best_r =0;
+ size_t best_r2 =0;
+ // find the best alpha to optimize.
+ for (size_t r2 = 0; r2 < alphas.size(); ++r2)
+ {
+ auto& alpha = alphas[r2];
+ auto& df_ = df[r2];
+ auto& lower = lowers[r2];
+ auto& upper = uppers[r2];
+ for (long r = 0; r < alpha.nr(); ++r)
+ {
+ if (alpha(r) <= lower(r) && df_(r) > 0)
+ ;//alpha(r) = lower(r);
+ else if (alpha(r) >= upper(r) && df_(r) < 0)
+ ;//alpha(r) = upper(r);
+ else if (std::abs(df_(r)) > max_df)
+ {
+ best_r = r;
+ best_r2 = r2;
+ max_df = std::abs(df_(r));
+ }
+ }
+ }
+
+ // now optimize alphas[best_r2](best_r)
+ const long r = best_r;
+ auto& alpha = alphas[best_r2];
+ auto& lower = lowers[best_r2];
+ auto& upper = uppers[best_r2];
+ auto& df_ = df[best_r2];
+ const T old_alpha = alpha(r);
+ alpha(r) = -(df_(r)-Q_diag[best_r2](r)*alpha(r))*QQ[best_r2](r);
+ if (alpha(r) < lower(r))
+ alpha(r) = lower(r);
+ else if (alpha(r) > upper(r))
+ alpha(r) = upper(r);
+
+ const T delta = old_alpha-alpha(r);
+
+ // Now update the gradient. We will perform the equivalent of: df = Q*alpha +
+ // b; except we only need to compute one column of the matrix multiply because
+ // only one element of alpha changed.
+ auto& Q = Q_blocks[best_r2];
+ for(long k = 0; k < df_.nr(); ++k)
+ df_(k) -= Q(r,k)*delta;
+ for(size_t j = 0; j < Q_blocks.size(); ++j)
+ {
+ auto f = Q_offdiag.find(make_unordered_pair(best_r2, j));
+ if (f != Q_offdiag.end())
+ df[j](r) -= f->second(r)*delta;
+ }
+ }
+
+
+
+
+ std::vector<matrix<T,NR,NC,MM,L>> v(alphas), v_old(alphas.size());
+ double lambda = 0;
+ unsigned long iter;
+ // Now do the main iteration block of this solver. The coordinate descent method
+ // we used above can improve the objective rapidly in the beginning. However,
+ // Nesterov's method has more rapid convergence once it gets going so this is what
+ // we use for the main iteration.
+ for (iter = 0; iter < max_iter; ++iter)
+ {
+ const double next_lambda = (1 + std::sqrt(1+4*lambda*lambda))/2;
+ const double gamma = (1-lambda)/next_lambda;
+ lambda = next_lambda;
+
+ v_old.swap(v);
+
+ //df = Q*alpha + b;
+ compute_df();
+
+ // now take a projected gradient step using Nesterov's method.
+ for (size_t j = 0; j < alphas.size(); ++j)
+ {
+ v[j] = clamp(alphas[j] - 1.0/lipschitz_bound * df[j], lowers[j], uppers[j]);
+ alphas[j] = clamp((1-gamma)*v[j] + gamma*v_old[j], lowers[j], uppers[j]);
+ }
+
+
+ // check for convergence every 10 iterations
+ if (iter%10 == 0)
+ {
+ max_df = 0;
+ for (size_t r2 = 0; r2 < alphas.size(); ++r2)
+ {
+ auto& alpha = alphas[r2];
+ auto& df_ = df[r2];
+ auto& lower = lowers[r2];
+ auto& upper = uppers[r2];
+ for (long r = 0; r < alpha.nr(); ++r)
+ {
+ if (alpha(r) <= lower(r) && df_(r) > 0)
+ ;//alpha(r) = lower(r);
+ else if (alpha(r) >= upper(r) && df_(r) < 0)
+ ;//alpha(r) = upper(r);
+ else if (std::abs(df_(r)) > max_df)
+ max_df = std::abs(df_(r));
+ }
+ }
+ if (max_df < eps)
+ break;
+ }
+ }
+
+ return iter+1;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename EXP1,
+ typename EXP2,
+ typename T, long NRa, long NRb
+ >
+ unsigned long find_gap_between_convex_hulls (
+ const matrix_exp<EXP1>& A,
+ const matrix_exp<EXP2>& B,
+ matrix<T,NRa,1>& cA,
+ matrix<T,NRb,1>& cB,
+ const double eps,
+ const unsigned long max_iter = 1000
+ )
+ {
+ DLIB_CASSERT(A.size() != 0);
+ DLIB_CASSERT(B.size() != 0);
+ DLIB_CASSERT(A.nr() == B.nr(), "The dimensionality of the points in both convex hull sets must match");
+ DLIB_CASSERT(eps > 0);
+ DLIB_CASSERT(max_iter > 0);
+
+ cA.set_size(A.nc());
+ cB.set_size(B.nc());
+
+        // Initialize cA and cB to uniform weights so that A*cA and B*cB start at
+        // the centroids of A and B respectively.
+ cA = 1.0/cA.size();
+ cB = 1.0/cB.size();
+
+
+ matrix<T> AA, BB, AB, ABb, ABa;
+
+ AA = trans(A)*A;
+ BB = trans(B)*B;
+ AB = trans(A)*B;
+
+ unsigned long iter = 0;
+ for (iter = 0; iter < max_iter; ++iter)
+ {
+ // find the convex combination of A that is nearest to B*cB
+ ABb = AB*cB;
+ const auto smo_iter1 = solve_qp_using_smo(AA, ABb, cA, eps, cA.size());
+
+ // now find the convex combination of B that is nearest to A*cA
+ ABa = trans(AB)*cA;
+ const auto smo_iter2 = solve_qp_using_smo(BB, ABa, cB, eps, cB.size());
+
+ // stop if the QP solvers failed to improve
+ if (smo_iter1 == 1 && smo_iter2 == 1)
+ break;
+ }
+
+
+ return iter+1;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_Hh_
+
diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo_abstract.h b/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo_abstract.h
new file mode 100644
index 000000000..5e7d5ec3f
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo_abstract.h
@@ -0,0 +1,282 @@
+// Copyright (C) 2010 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_ABSTRACT_Hh_
+#ifdef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_ABSTRACT_Hh_
+
+#include "../matrix.h"
+#include <map>
+#include "../unordered_pair.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename EXP1,
+ typename EXP2,
+ typename T, long NR, long NC, typename MM, typename L
+ >
+ unsigned long solve_qp_using_smo (
+ const matrix_exp<EXP1>& Q,
+ const matrix_exp<EXP2>& b,
+ matrix<T,NR,NC,MM,L>& alpha,
+ T eps,
+ unsigned long max_iter
+ );
+ /*!
+ requires
+ - Q.nr() == Q.nc()
+ - is_col_vector(b) == true
+ - is_col_vector(alpha) == true
+ - b.size() == alpha.size() == Q.nr()
+ - alpha.size() > 0
+ - min(alpha) >= 0
+ - eps > 0
+ - max_iter > 0
+ ensures
+ - Let C == sum(alpha) (i.e. C is the sum of the alpha values you
+ supply to this function)
+ - This function solves the following quadratic program:
+ Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha - trans(alpha)*b
+ subject to the following constraints:
+ - sum(alpha) == C (i.e. the sum of alpha values doesn't change)
+ - min(alpha) >= 0 (i.e. all alpha values are nonnegative)
+ Where f is convex. This means that Q should be positive-semidefinite.
+ - The solution to the above QP will be stored in #alpha.
+ - This function uses a simple implementation of the sequential minimal
+ optimization algorithm. It starts the algorithm with the given alpha
+ and it works on the problem until the duality gap (i.e. how far away
+ we are from the optimum solution) is less than eps. So eps controls
+ how accurate the solution is and smaller values result in better solutions.
+ - At most max_iter iterations of optimization will be performed.
+ - returns the number of iterations performed. If this method fails to
+ converge to eps accuracy then the number returned will be max_iter+1.
+ !*/
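+
+    /*
+        A minimal usage sketch with made up values (illustrative only, not part
+        of the documented API):
+
+            matrix<double,2,2> Q = identity_matrix<double>(2);
+            matrix<double,2,1> b, alpha;
+            b = 1.0, 0.2;
+            alpha = 1.0, 0.0;              // feasible start, so C == 1
+            solve_qp_using_smo(Q, b, alpha, 1e-8, 1000);
+            // alpha is now approximately [0.9, 0.1].
+    */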
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename EXP1,
+ typename EXP2,
+        typename EXP3,
+        typename EXP4,
+ typename T, long NR, long NC, typename MM, typename L,
+ long NR2, long NC2
+ >
+ unsigned long solve_qp4_using_smo (
+ const matrix_exp<EXP1>& A,
+ const matrix_exp<EXP2>& Q,
+ const matrix_exp<EXP3>& b,
+ const matrix_exp<EXP4>& d,
+ matrix<T,NR,NC,MM,L>& alpha,
+ matrix<T,NR2,NC2,MM,L>& lambda,
+ T eps,
+ unsigned long max_iter,
+ T max_lambda = std::numeric_limits<T>::infinity()
+ );
+ /*!
+ requires
+ - A.nc() == alpha.size()
+ - Q.nr() == Q.nc()
+ - is_col_vector(b) == true
+ - is_col_vector(d) == true
+ - is_col_vector(alpha) == true
+ - b.size() == alpha.size() == Q.nr()
+ - d.size() == A.nr()
+ - alpha.size() > 0
+ - min(alpha) >= 0
+ - eps > 0
+ - max_iter > 0
+ - max_lambda >= 0
+ ensures
+ - Let C == sum(alpha) (i.e. C is the sum of the alpha values you
+ supply to this function)
+ - This function solves the following quadratic program:
+ Minimize: f(alpha,lambda) == 0.5*trans(alpha)*Q*alpha - trans(alpha)*b +
+ 0.5*trans(lambda)*lambda - trans(lambda)*A*alpha - trans(lambda)*d
+ subject to the following constraints:
+ - sum(alpha) == C (i.e. the sum of alpha values doesn't change)
+ - min(alpha) >= 0 (i.e. all alpha values are nonnegative)
+ - min(lambda) >= 0 (i.e. all lambda values are nonnegative)
+ - max(lambda) <= max_lambda (i.e. all lambda values are less than max_lambda)
+ Where f is convex. This means that Q should be positive-semidefinite.
+ - If you don't want an upper limit on lambda then max_lambda can be set to
+ infinity.
+ - The solution to the above QP will be stored in #alpha and #lambda.
+ - This function uses a simple implementation of the sequential minimal
+ optimization algorithm. It starts the algorithm with the given alpha
+ and it works on the problem until the duality gap (i.e. how far away
+ we are from the optimum solution) is less than eps. So eps controls
+ how accurate the solution is and smaller values result in better solutions.
+ The initial value of lambda is ignored since the optimal lambda can be
+ obtained via a simple closed form expression given alpha.
+ - At most max_iter iterations of optimization will be performed.
+ - returns the number of iterations performed. If this method fails to
+ converge to eps accuracy then the number returned will be max_iter+1.
+ !*/
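+
+    /*
+        A minimal usage sketch with made up values (illustrative only, not part
+        of the documented API):
+
+            matrix<double,3,3> Q = identity_matrix<double>(3);
+            matrix<double,2,3> A;
+            A = 1, 0, 1,
+                0, 1, 0;
+            matrix<double,3,1> b, alpha;
+            b = 1, 1, 1;
+            alpha = 1, 1, 1;               // feasible: min(alpha) >= 0
+            matrix<double,2,1> d, lambda;
+            d = 0, 0;
+            solve_qp4_using_smo(A, Q, b, d, alpha, lambda, 1e-6, 1000);
+    */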
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename EXP1,
+ typename EXP2,
+ typename T, long NR, long NC, typename MM, typename L
+ >
+ unsigned long solve_qp_box_constrained (
+ const matrix_exp<EXP1>& Q,
+ const matrix_exp<EXP2>& b,
+ matrix<T,NR,NC,MM,L>& alpha,
+ const matrix<T,NR,NC,MM,L>& lower,
+ const matrix<T,NR,NC,MM,L>& upper,
+ T eps,
+ unsigned long max_iter
+ );
+ /*!
+ requires
+ - Q.nr() == Q.nc()
+ - alpha.size() == lower.size() == upper.size()
+ - is_col_vector(b) == true
+ - is_col_vector(alpha) == true
+ - is_col_vector(lower) == true
+ - is_col_vector(upper) == true
+ - b.size() == alpha.size() == Q.nr()
+ - alpha.size() > 0
+ - 0 <= min(alpha-lower)
+ - 0 <= max(upper-alpha)
+ - eps > 0
+ - max_iter > 0
+ ensures
+ - This function solves the following quadratic program:
+ Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha + trans(b)*alpha
+ subject to the following box constraints on alpha:
+ - 0 <= min(alpha-lower)
+ - 0 <= max(upper-alpha)
+ Where f is convex. This means that Q should be positive-semidefinite.
+ - The solution to the above QP will be stored in #alpha.
+ - This function uses a combination of a SMO algorithm along with Nesterov's
+ method as the main iteration of the solver. It starts the algorithm with the
+ given alpha and it works on the problem until the derivative of f(alpha) is
+ smaller than eps for each element of alpha or the alpha value is at a box
+ constraint. So eps controls how accurate the solution is and smaller values
+ result in better solutions.
+ - At most max_iter iterations of optimization will be performed.
+ - returns the number of iterations performed. If this method fails to
+ converge to eps accuracy then the number returned will be max_iter+1.
+ !*/
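+
+    /*
+        A minimal usage sketch with made up values (illustrative only, not part
+        of the documented API):
+
+            matrix<double,2,2> Q;
+            Q = 2, 0,
+                0, 2;
+            matrix<double,2,1> b, alpha, lower, upper;
+            b = -4, -4;                    // unconstrained minimizer is [2, 2]
+            lower = 0, 0;
+            upper = 0.5, 3;
+            alpha = 0, 0;                  // any point inside the box is a valid start
+            solve_qp_box_constrained(Q, b, alpha, lower, upper, 1e-9, 10000);
+            // alpha is now approximately [0.5, 2]: clamped to the upper bound
+            // in the first coordinate, interior in the second.
+    */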
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename T, long NR, long NC, typename MM, typename L
+ >
+ unsigned long solve_qp_box_constrained_blockdiag (
+ const std::vector<matrix<T,NR,NR,MM,L>>& Q_blocks,
+ const std::vector<matrix<T,NR,NC,MM,L>>& bs,
+ const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag,
+ std::vector<matrix<T,NR,NC,MM,L>>& alphas,
+ const std::vector<matrix<T,NR,NC,MM,L>>& lowers,
+ const std::vector<matrix<T,NR,NC,MM,L>>& uppers,
+ T eps,
+ unsigned long max_iter
+ );
+ /*!
+ requires
+ - Q_blocks.size() > 0
+ - Q_blocks.size() == bs.size() == alphas.size() == lowers.size() == uppers.size()
+ - All the matrices in Q_blocks have the same dimensions. Moreover, they are
+ non-empty square matrices.
+ - All the matrices in bs, Q_offdiag, alphas, lowers, and uppers have the same
+ dimensions. Moreover, they are all column vectors.
+ - Q_blocks[0].nr() == alphas[0].size()
+ (i.e. the dimensionality of the column vectors in alphas must match the
+ dimensionality of the square matrices in Q_blocks.)
+ - for all valid i:
+ - 0 <= min(alphas[i]-lowers[i])
+ - 0 <= max(uppers[i]-alphas[i])
+ - eps > 0
+ - max_iter > 0
+ ensures
+ - This function solves the same QP as solve_qp_box_constrained(), except it is
+ optimized for the case where the Q matrix has a certain sparsity structure.
+ To be precise:
+ - Let Q1 be a block diagonal matrix with the elements of Q_blocks placed
+ along its diagonal, and in the order contained in Q_blocks.
+                - Let Q2 be a matrix with the same size as Q1, except instead of being block
+                  diagonal, it is block structured into Q_blocks.size() by Q_blocks.size()
+                  blocks. If we let (r,c) be the coordinate of each block then each block
+                  contains the matrix diagm(Q_offdiag[make_unordered_pair(r,c)]) or the zero
+                  matrix if Q_offdiag has no entry for the coordinate (r,c).
+ - Let Q == Q1+Q2
+ - Let b == the concatenation of all the vectors in bs into one big vector.
+ - Let alpha == the concatenation of all the vectors in alphas into one big vector.
+ - Let lower == the concatenation of all the vectors in lowers into one big vector.
+ - Let upper == the concatenation of all the vectors in uppers into one big vector.
+ - Then this function solves the following quadratic program:
+ Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha + trans(b)*alpha
+ subject to the following box constraints on alpha:
+ - 0 <= min(alpha-lower)
+                  - 0 <= min(upper-alpha)
+ Where f is convex. This means that Q should be positive-semidefinite.
+ - More specifically, this function is identical to
+ solve_qp_box_constrained(Q, b, alpha, lower, upper, eps, max_iter),
+ except that it runs faster since it avoids unnecessary computation by
+ taking advantage of the sparsity structure in the QP.
+ - The solution to the above QP will be stored in #alphas.
+            - This function uses a combination of an SMO algorithm and Nesterov's
+              method as the main iteration of the solver. It starts from the given
+              alphas and works on the problem until, for each element of alpha, either
+              the corresponding element of the derivative of f(alpha) is smaller than
+              eps or that element of alpha sits at a box constraint. So eps controls
+              how accurate the solution is, and smaller values result in better
+              solutions.
+ - At most max_iter iterations of optimization will be performed.
+ - returns the number of iterations performed. If this method fails to
+ converge to eps accuracy then the number returned will be max_iter+1.
+ !*/
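+
+    /*
+        Example (editor's sketch, not part of the original header): two 2x2
+        diagonal blocks coupled through one off-diagonal block.  All values
+        are illustrative assumptions.
+
+            matrix<double,2,2> B0;
+            B0 = 2, 0,
+                 0, 2;
+            std::vector<matrix<double,2,2>> Q_blocks = {B0, B0};
+
+            matrix<double,2,1> b0;
+            b0 = -1, -1;
+            std::vector<matrix<double,2,1>> bs = {b0, b0};
+
+            // Couple blocks 0 and 1 via diagm(coupling).
+            matrix<double,2,1> coupling;
+            coupling = 0.5, 0.5;
+            std::map<unordered_pair<size_t>, matrix<double,2,1>> Q_offdiag;
+            Q_offdiag[make_unordered_pair<size_t>(0,1)] = coupling;
+
+            matrix<double,2,1> zero, one;
+            zero = 0, 0;
+            one  = 1, 1;
+            std::vector<matrix<double,2,1>> alphas = {zero, zero};
+            std::vector<matrix<double,2,1>> lowers = {zero, zero};
+            std::vector<matrix<double,2,1>> uppers = {one, one};
+
+            solve_qp_box_constrained_blockdiag(Q_blocks, bs, Q_offdiag, alphas,
+                                               lowers, uppers, 1e-9, 10000);
+    */
+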
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename EXP1,
+ typename EXP2,
+ typename T, long NRa, long NRb
+ >
+ unsigned long find_gap_between_convex_hulls (
+ const matrix_exp<EXP1>& A,
+ const matrix_exp<EXP2>& B,
+ matrix<T,NRa,1>& cA,
+ matrix<T,NRb,1>& cB,
+ const double eps,
+ const unsigned long max_iter = 1000
+ );
+ /*!
+ requires
+ - A.nr() == B.nr()
+ - A.size() != 0
+ - B.size() != 0
+ - eps > 0
+ - max_iter > 0
+ ensures
+ - If you think of A and B as sets of column vectors, then we can identify the
+ convex sets hullA and hullB, which are the convex hulls of A and B
+ respectively. This function finds the pair of points in hullA and hullB that
+ are nearest to each other. To be precise, this function solves the following
+ quadratic program:
+ Minimize: f(cA,cB) == length_squared(A*cA - B*cB)
+ subject to the following constraints on cA and cB:
+ - is_col_vector(cA) == true && cA.size() == A.nc()
+ - is_col_vector(cB) == true && cB.size() == B.nc()
+ - sum(cA) == 1 && min(cA) >= 0
+ - sum(cB) == 1 && min(cB) >= 0
+ - This function uses an iterative block coordinate descent algorithm to solve
+ the QP. It runs until either max_iter iterations have been performed or the
+ QP is solved to at least eps accuracy.
+ - returns the number of iterations performed. If this method fails to
+ converge to eps accuracy then the number returned will be max_iter+1.
+ !*/
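+
+    /*
+        Example (editor's sketch, not part of the original header): two
+        triangles in the plane, where each column of A and B is a point.
+
+            matrix<double> A(2,3), B(2,3);
+            A = 0, 1, 0,
+                0, 0, 1;   // hull of A touches the line x == 1
+            B = 2, 3, 2,
+                0, 0, 1;   // hull of B touches the line x == 2
+            matrix<double,0,1> cA, cB;
+            find_gap_between_convex_hulls(A, B, cA, cB, 1e-8);
+            // A*cA and B*cB are now the nearest points of the two hulls, so
+            // length(A*cA - B*cB) comes out near 1.
+    */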
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_ABSTRACT_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_stop_strategies.h b/ml/dlib/dlib/optimization/optimization_stop_strategies.h
new file mode 100644
index 000000000..a0243cacf
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_stop_strategies.h
@@ -0,0 +1,173 @@
+// Copyright (C) 2008 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATIOn_STOP_STRATEGIES_H_
+#define DLIB_OPTIMIZATIOn_STOP_STRATEGIES_H_
+
+#include <cmath>
+#include <limits>
+#include "../matrix.h"
+#include "../algs.h"
+#include "optimization_stop_strategies_abstract.h"
+#include <iostream>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class objective_delta_stop_strategy
+ {
+ public:
+ explicit objective_delta_stop_strategy (
+ double min_delta = 1e-7
+ ) : _verbose(false), _been_used(false), _min_delta(min_delta), _max_iter(0), _cur_iter(0), _prev_funct_value(0)
+ {
+ DLIB_ASSERT (
+ min_delta >= 0,
+ "\t objective_delta_stop_strategy(min_delta)"
+ << "\n\t min_delta can't be negative"
+ << "\n\t min_delta: " << min_delta
+ );
+ }
+
+ objective_delta_stop_strategy (
+ double min_delta,
+ unsigned long max_iter
+ ) : _verbose(false), _been_used(false), _min_delta(min_delta), _max_iter(max_iter), _cur_iter(0), _prev_funct_value(0)
+ {
+ DLIB_ASSERT (
+ min_delta >= 0 && max_iter > 0,
+ "\t objective_delta_stop_strategy(min_delta, max_iter)"
+ << "\n\t min_delta can't be negative and max_iter can't be 0"
+ << "\n\t min_delta: " << min_delta
+ << "\n\t max_iter: " << max_iter
+ );
+ }
+
+ objective_delta_stop_strategy& be_verbose(
+ )
+ {
+ _verbose = true;
+ return *this;
+ }
+
+ template <typename T>
+ bool should_continue_search (
+ const T& ,
+ const double funct_value,
+ const T&
+ )
+ {
+ if (_verbose)
+ {
+ using namespace std;
+ cout << "iteration: " << _cur_iter << " objective: " << funct_value << endl;
+ }
+
+ ++_cur_iter;
+ if (_been_used)
+ {
+                // Check if we have hit the max allowable number of iterations (but
+                // only check if _max_iter is enabled, i.e. not 0).
+ if (_max_iter != 0 && _cur_iter > _max_iter)
+ return false;
+
+ // check if the function change was too small
+ if (std::abs(funct_value - _prev_funct_value) < _min_delta)
+ return false;
+ }
+
+ _been_used = true;
+ _prev_funct_value = funct_value;
+ return true;
+ }
+
+ private:
+ bool _verbose;
+
+ bool _been_used;
+ double _min_delta;
+ unsigned long _max_iter;
+ unsigned long _cur_iter;
+ double _prev_funct_value;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ class gradient_norm_stop_strategy
+ {
+ public:
+ explicit gradient_norm_stop_strategy (
+ double min_norm = 1e-7
+ ) : _verbose(false), _min_norm(min_norm), _max_iter(0), _cur_iter(0)
+ {
+ DLIB_ASSERT (
+ min_norm >= 0,
+ "\t gradient_norm_stop_strategy(min_norm)"
+ << "\n\t min_norm can't be negative"
+ << "\n\t min_norm: " << min_norm
+ );
+ }
+
+ gradient_norm_stop_strategy (
+ double min_norm,
+ unsigned long max_iter
+ ) : _verbose(false), _min_norm(min_norm), _max_iter(max_iter), _cur_iter(0)
+ {
+ DLIB_ASSERT (
+ min_norm >= 0 && max_iter > 0,
+ "\t gradient_norm_stop_strategy(min_norm, max_iter)"
+ << "\n\t min_norm can't be negative and max_iter can't be 0"
+ << "\n\t min_norm: " << min_norm
+ << "\n\t max_iter: " << max_iter
+ );
+ }
+
+ gradient_norm_stop_strategy& be_verbose(
+ )
+ {
+ _verbose = true;
+ return *this;
+ }
+
+ template <typename T>
+ bool should_continue_search (
+ const T& ,
+ const double funct_value,
+ const T& funct_derivative
+ )
+ {
+ if (_verbose)
+ {
+ using namespace std;
+ cout << "iteration: " << _cur_iter << " objective: " << funct_value << " gradient norm: " << length(funct_derivative) << endl;
+ }
+
+ ++_cur_iter;
+
+            // Check if we have hit the max allowable number of iterations (but only
+            // check if _max_iter is enabled, i.e. not 0).
+ if (_max_iter != 0 && _cur_iter > _max_iter)
+ return false;
+
+ // check if the gradient norm is too small
+ if (length(funct_derivative) < _min_norm)
+ return false;
+
+ return true;
+ }
+
+ private:
+ bool _verbose;
+
+ double _min_norm;
+ unsigned long _max_iter;
+ unsigned long _cur_iter;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_STOP_STRATEGIES_H_
+
diff --git a/ml/dlib/dlib/optimization/optimization_stop_strategies_abstract.h b/ml/dlib/dlib/optimization/optimization_stop_strategies_abstract.h
new file mode 100644
index 000000000..6a999f8d9
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_stop_strategies_abstract.h
@@ -0,0 +1,157 @@
+// Copyright (C) 2008 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATIOn_STOP_STRATEGIES_ABSTRACT_
+#ifdef DLIB_OPTIMIZATIOn_STOP_STRATEGIES_ABSTRACT_
+
+#include <cmath>
+#include <limits>
+#include "../matrix/matrix_abstract.h"
+#include "../algs.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class objective_delta_stop_strategy
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object represents a strategy for deciding if an optimization
+ algorithm should terminate. This particular object looks at the
+ change in the objective function from one iteration to the next and
+ bases its decision on how large this change is. If the change
+ is below a user given threshold then the search stops.
+ !*/
+
+ public:
+ explicit objective_delta_stop_strategy (
+ double min_delta = 1e-7
+ );
+ /*!
+ requires
+ - min_delta >= 0
+ ensures
+            - This stop strategy object will only consider a search to be complete
+              if the change in the objective function from one iteration to the next
+              is less than min_delta.
+ !*/
+
+ objective_delta_stop_strategy (
+ double min_delta,
+ unsigned long max_iter
+ );
+ /*!
+ requires
+ - min_delta >= 0
+ - max_iter > 0
+ ensures
+            - This stop strategy object will only consider a search to be complete
+              if the change in the objective function from one iteration to the next
+              is less than min_delta or more than max_iter iterations have been
+              executed.
+ !*/
+
+ objective_delta_stop_strategy& be_verbose(
+ );
+ /*!
+ ensures
+ - causes this object to print status messages to standard out
+ every time should_continue_search() is called.
+ - returns *this
+ !*/
+
+ template <typename T>
+ bool should_continue_search (
+ const T& x,
+ const double funct_value,
+ const T& funct_derivative
+ );
+ /*!
+ requires
+ - this function is only called once per search iteration
+ - for some objective function f():
+ - x == the search point for the current iteration
+ - funct_value == f(x)
+ - funct_derivative == derivative(f)(x)
+ ensures
+            - returns true if the point x does not satisfy the stopping condition and
+              false otherwise.
+ !*/
+
+ };
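+
+    /*
+        Example (editor's sketch): stop strategies are meant to be handed to the
+        optimizers declared in dlib/optimization/optimization_abstract.h.  Here
+        my_f and my_f_der are hypothetical callables and x is a column vector
+        holding the starting point:
+
+            find_min(bfgs_search_strategy(),
+                     objective_delta_stop_strategy(1e-7).be_verbose(),
+                     my_f, my_f_der, x, -1);
+    */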
+
+// ----------------------------------------------------------------------------------------
+
+ class gradient_norm_stop_strategy
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object represents a strategy for deciding if an optimization
+ algorithm should terminate. This particular object looks at the
+ norm (i.e. the length) of the current gradient vector and stops
+ if it is smaller than a user given threshold.
+ !*/
+
+ public:
+ explicit gradient_norm_stop_strategy (
+ double min_norm = 1e-7
+ );
+ /*!
+ requires
+ - min_norm >= 0
+ ensures
+            - This stop strategy object will only consider a search to be complete
+              if the current gradient norm is less than min_norm.
+ !*/
+
+ gradient_norm_stop_strategy (
+ double min_norm,
+ unsigned long max_iter
+ );
+ /*!
+ requires
+ - min_norm >= 0
+ - max_iter > 0
+ ensures
+            - This stop strategy object will only consider a search to be complete
+              if the current gradient norm is less than min_norm or more than
+              max_iter iterations have been executed.
+ !*/
+
+ gradient_norm_stop_strategy& be_verbose(
+ );
+ /*!
+ ensures
+ - causes this object to print status messages to standard out
+ every time should_continue_search() is called.
+ - returns *this
+ !*/
+
+ template <typename T>
+ bool should_continue_search (
+ const T& x,
+ const double funct_value,
+ const T& funct_derivative
+ );
+ /*!
+ requires
+ - this function is only called once per search iteration
+ - for some objective function f():
+ - x == the search point for the current iteration
+ - funct_value == f(x)
+ - funct_derivative == derivative(f)(x)
+ ensures
+            - returns true if the point x does not satisfy the stopping condition and
+              false otherwise.
+ !*/
+
+ };
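+
+    /*
+        Example (editor's sketch, with the same hypothetical my_f, my_f_der,
+        and x as in the example above):
+
+            find_min(lbfgs_search_strategy(10),
+                     gradient_norm_stop_strategy(1e-6),
+                     my_f, my_f_der, x, -1);
+    */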
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_STOP_STRATEGIES_ABSTRACT_
+
diff --git a/ml/dlib/dlib/optimization/optimization_trust_region.h b/ml/dlib/dlib/optimization/optimization_trust_region.h
new file mode 100644
index 000000000..5f0ad897f
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_trust_region.h
@@ -0,0 +1,564 @@
+// Copyright (C) 2010 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATION_TRUST_REGIoN_Hh_
+#define DLIB_OPTIMIZATION_TRUST_REGIoN_Hh_
+
+#include "../matrix.h"
+#include "optimization_trust_region_abstract.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename EXP1,
+ typename EXP2,
+ typename T, long NR, long NC, typename MM, typename L
+ >
+ unsigned long solve_trust_region_subproblem (
+ const matrix_exp<EXP1>& B,
+ const matrix_exp<EXP2>& g,
+ const typename EXP1::type radius,
+ matrix<T,NR,NC,MM,L>& p,
+ double eps,
+ unsigned long max_iter
+ )
+ {
+ /*
+            This is an implementation of algorithm 4.3 (Trust Region Subproblem)
+ from the book Numerical Optimization by Nocedal and Wright. Some of
+ the details are also from Practical Methods of Optimization by Fletcher.
+ */
+
+ // make sure requires clause is not broken
+ DLIB_ASSERT(B.nr() == B.nc() && is_col_vector(g) && g.size() == B.nr(),
+ "\t unsigned long solve_trust_region_subproblem()"
+ << "\n\t invalid arguments were given to this function"
+ << "\n\t B.nr(): " << B.nr()
+ << "\n\t B.nc(): " << B.nc()
+ << "\n\t is_col_vector(g): " << is_col_vector(g)
+ << "\n\t g.size(): " << g.size()
+ );
+ DLIB_ASSERT(radius > 0 && eps > 0 && max_iter > 0,
+ "\t unsigned long solve_trust_region_subproblem()"
+ << "\n\t invalid arguments were given to this function"
+ << "\n\t radius: " << radius
+ << "\n\t eps: " << eps
+ << "\n\t max_iter: " << max_iter
+ );
+
+
+ const_temp_matrix<EXP1> BB(B);
+ const_temp_matrix<EXP2> gg(g);
+
+ p.set_size(g.nr(),g.nc());
+ p = 0;
+
+
+ const T numeric_eps = max(diag(abs(BB)))*std::numeric_limits<T>::epsilon();
+
+ matrix<T,EXP1::NR,EXP2::NR,MM,L> R;
+
+ T lambda = 0;
+
+ // We need to put a bracket around lambda. It can't go below 0. We
+ // can get an upper bound using Gershgorin disks.
+ // This number is a lower bound on the eigenvalues in BB
+ const T BB_min_eigenvalue = min(diag(BB) - (sum_cols(abs(BB)) - abs(diag(BB))));
+
+ const T g_norm = length(gg);
+
+ T lambda_min = 0;
+ T lambda_max = put_in_range(0,
+ std::numeric_limits<T>::max(),
+ g_norm/radius - BB_min_eigenvalue);
+
+
+ // If we can tell that the minimum is at 0 then don't do anything. Just return the answer.
+ if (g_norm < numeric_eps && BB_min_eigenvalue > numeric_eps)
+ {
+ return 0;
+ }
+
+
+ // how much lambda has changed recently
+ T lambda_delta = 0;
+
+ for (unsigned long i = 0; i < max_iter; ++i)
+ {
+ R = chol(BB + lambda*identity_matrix<T>(BB.nr()));
+
+            // If the Cholesky decomposition doesn't exist (i.e. B + lambda*I
+            // is not positive definite).
+ if (R(R.nr()-1, R.nc()-1) <= 0)
+ {
+ // If B is indefinite and g is equal to 0 then we should
+ // quit this loop and go right to the eigenvalue decomposition method.
+ if (g_norm <= numeric_eps)
+ break;
+
+ // narrow the bracket on lambda. Obviously the current lambda is
+ // too small.
+ lambda_min = lambda;
+
+                // Jump towards the max value. Eventually there will be a lambda
+                // that results in a valid Cholesky decomposition.
+ const T alpha = 0.10;
+ lambda = (1-alpha)*lambda + alpha*lambda_max;
+ continue;
+ }
+
+ using namespace blas_bindings;
+
+ p = -gg;
+ // Solve RR'*p = -g for p.
+ // Solve R*q = -g for q where q = R'*p.
+ if (R.nr() == 2)
+ {
+ p(0) = p(0)/R(0,0);
+ p(1) = (p(1)-R(1,0)*p(0))/R(1,1);
+ }
+ else
+ {
+ triangular_solver(CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, R, p);
+ }
+ const T q_norm = length(p);
+
+ // Solve R'*p = q for p.
+ if (R.nr() == 2)
+ {
+ p(1) = p(1)/R(1,1);
+ p(0) = (p(0)-R(1,0)*p(1))/R(0,0);
+ }
+ else
+ {
+ triangular_solver(CblasLeft, CblasLower, CblasTrans, CblasNonUnit, R, p);
+ }
+ const T p_norm = length(p);
+
+ // check if we are done.
+ if (lambda == 0)
+ {
+ if (p_norm < radius)
+ {
+ // i will always be 0 in this case. So we return 1.
+ return i+1;
+ }
+ }
+ else
+ {
+ // if we are close enough to the solution then terminate
+ if (std::abs(p_norm - radius)/radius < eps)
+ return i+1;
+ }
+
+ // shrink our bracket on lambda
+ if (p_norm < radius)
+ lambda_max = lambda;
+ else
+ lambda_min = lambda;
+
+
+ if (p_norm <= radius*std::numeric_limits<T>::epsilon())
+ {
+ const T alpha = 0.01;
+ lambda = (1-alpha)*lambda_min + alpha*lambda_max;
+ continue;
+ }
+
+ const T old_lambda = lambda;
+
+ // figure out which lambda to try next
+ lambda = lambda + std::pow(q_norm/p_norm,2)*(p_norm - radius)/radius;
+
+ // make sure the chosen lambda is within our bracket (but not exactly at either end).
+ const T gap = (lambda_max-lambda_min)*0.01;
+ lambda = put_in_range(lambda_min+gap, lambda_max-gap, lambda);
+
+ // Keep track of how much lambda is thrashing around inside the search bracket. If it
+ // keeps moving around a whole lot then cut the search bracket in half.
+ lambda_delta += std::abs(lambda - old_lambda);
+ if (lambda_delta > 3*(lambda_max-lambda_min))
+ {
+ lambda = (lambda_min+lambda_max)/2;
+ lambda_delta = 0;
+ }
+ } // end for loop
+
+
+ // We are probably in the "hard case". Use an eigenvalue decomposition to sort things out.
+ // Either that or the eps was just set too tight and really we are already done.
+ eigenvalue_decomposition<EXP1> ed(make_symmetric(BB));
+
+ matrix<T,NR,NC,MM,L> ev = ed.get_real_eigenvalues();
+ const long min_eig_idx = index_of_min(ev);
+
+
+ ev -= min(ev);
+ // zero out any values which are basically zero
+ ev = pointwise_multiply(ev, ev > max(abs(ev))*std::numeric_limits<T>::epsilon());
+ ev = reciprocal(ev);
+
+
+ // figure out part of what p should be assuming we are in the hard case.
+ matrix<T,NR,NC,MM,L> p_hard;
+ p_hard = trans(ed.get_pseudo_v())*gg;
+ p_hard = diagm(ev)*p_hard;
+ p_hard = ed.get_pseudo_v()*p_hard;
+
+
+ // If we really are in the hard case then this if will be true. Otherwise, the p
+ // we found in the "easy case" loop up top is the best answer.
+ if (length(p_hard) < radius && length(p_hard) >= length(p))
+ {
+ // adjust the length of p_hard by adding a component along the eigenvector associated with
+ // the smallest eigenvalue. We want to make it the case that length(p) == radius.
+ const T tau = std::sqrt(radius*radius - length_squared(p_hard));
+ p = p_hard + tau*colm(ed.get_pseudo_v(),min_eig_idx);
+
+
+ // if we have to do an eigenvalue decomposition then say we did all the iterations
+ return max_iter;
+ }
+
+ // if we get this far it means we didn't converge to eps accuracy.
+ return max_iter+1;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ template <
+ typename EXP1,
+ typename EXP2,
+ typename EXP3
+ >
+ bool bounds_violated (
+ const matrix_exp<EXP1>& v,
+ const matrix_exp<EXP2>& l,
+ const matrix_exp<EXP3>& u
+ )
+ {
+ DLIB_ASSERT(v.nr() == l.nr() && v.nr() == u.nr());
+ DLIB_ASSERT(v.nc() == l.nc() && v.nc() == u.nc());
+ for (long r = 0; r < v.nr(); ++r)
+ {
+ for (long c = 0; c < v.nc(); c++)
+ {
+ if (!(l(r,c) <= v(r,c) && v(r,c) <= u(r,c)))
+ return true;
+ }
+ }
+ return false;
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename EXP1,
+ typename EXP2,
+ typename T, long NR, long NC, typename MM, typename L,
+ typename EXP3
+ >
+ void solve_trust_region_subproblem_bounded (
+ const matrix_exp<EXP1>& B_,
+ const matrix_exp<EXP2>& g_,
+ const typename EXP1::type radius_,
+ matrix<T,NR,NC,MM,L>& p_,
+ double eps,
+ unsigned long max_iter,
+ const matrix_exp<EXP3>& lower_,
+ const matrix_exp<EXP3>& upper_
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(B_.nr() == B_.nc() && is_col_vector(g_) && g_.size() == B_.nr(),
+ "\t unsigned long solve_trust_region_subproblem_bounded()"
+ << "\n\t invalid arguments were given to this function"
+ << "\n\t B_.nr(): " << B_.nr()
+ << "\n\t B_.nc(): " << B_.nc()
+ << "\n\t is_col_vector(g_): " << is_col_vector(g_)
+ << "\n\t g_.size(): " << g_.size()
+ );
+ DLIB_ASSERT(radius_ > 0 && eps > 0 && max_iter > 0,
+ "\t unsigned long solve_trust_region_subproblem_bounded()"
+ << "\n\t invalid arguments were given to this function"
+ << "\n\t radius_: " << radius_
+ << "\n\t eps: " << eps
+ << "\n\t max_iter: " << max_iter
+ );
+ DLIB_ASSERT(is_col_vector(lower_) && lower_.size() == g_.size());
+ DLIB_ASSERT(is_col_vector(upper_) && upper_.size() == g_.size());
+        DLIB_ASSERT(min(upper_-lower_) >= 0);
+ // make sure the problem is feasible. That is, there should be a point inside the
+ // lower and upper bounds that has a norm <= radius_
+ DLIB_ASSERT(length(clamp(zeros_matrix(lower_),lower_,upper_)) <= radius_,
+ "The lower and upper bounds are incompatible with the radius since there is no point within the bounds with a norm less than the radius.");
+
+ // We are going to solve this by greedily finding the most violated bound constraint,
+ // locking that variable to its constrained value, removing it from the problem,
+ // and then resolving. We do that until no more constraint violations are present.
+
+ solve_trust_region_subproblem(B_,g_,radius_,p_,eps,max_iter);
+
+
+ // just stop here if all the bounds are satisfied.
+ if (!impl::bounds_violated(p_, lower_, upper_))
+ return;
+
+ matrix<double> B = matrix_cast<double>(B_);
+ matrix<double,0,1> g = matrix_cast<double>(g_);
+ double radius = radius_;
+ matrix<double,0,1> p = matrix_cast<double>(p_);
+ matrix<double,0,1> lower = matrix_cast<double>(lower_);
+ matrix<double,0,1> upper = matrix_cast<double>(upper_);
+
+ // keep a table that tells us how to map any reduced QP back to the original QP
+ std::vector<long> idxs(g.size());
+ for (size_t i = 0; i < idxs.size(); ++i)
+ idxs[i] = i;
+
+
+ // while we haven't found a p that satisfies the bounds constraints
+ while(impl::bounds_violated(p, lower, upper) )
+ {
+ // Find the most violated variable and fix its value to a constant (the bound
+ // value).
+ long most_violated_idx = 0;
+ double max_violation = 0;
+ double bounded_value = 0;
+ for (long i = 0; i < lower.size(); ++i)
+ {
+ if (!(lower(i) <= p(i) && p(i) <= upper(i)))
+ {
+ if (lower(i)-p(i) > max_violation)
+ {
+ max_violation = lower(i)-p(i);
+ most_violated_idx = i;
+ bounded_value = lower(i);
+ }
+ else if (p(i)-upper(i) > max_violation)
+ {
+ max_violation = p(i)-upper(i);
+ most_violated_idx = i;
+ bounded_value = upper(i);
+ }
+ }
+ }
+
+ // assign this variable to its final value.
+ p_(idxs[most_violated_idx]) = bounded_value;
+
+
+ // now reduce the QP by removing the variable p_(idxs[most_violated_idx]).
+ idxs.erase(idxs.begin()+most_violated_idx);
+ // we are out of variables to remove since everything is at bounds.
+ if (idxs.size() == 0)
+ break;
+
+ lower = remove_row(lower,most_violated_idx);
+ upper = remove_row(upper,most_violated_idx);
+ g += colm(B,most_violated_idx)*bounded_value;
+ g = remove_row(g,most_violated_idx);
+ p = remove_row(p,most_violated_idx);
+ B = removerc(B,most_violated_idx, most_violated_idx);
+
+            // Removing a variable changes the norm budget, so subtract the bounded
+            // value's squared magnitude from the squared radius to keep the effective
+            // radius for the whole problem unchanged.
+ double squared_radius = radius*radius - bounded_value*bounded_value;
+ if (squared_radius <= 0)
+ {
+ p = 0;
+ break;
+ }
+ radius = std::sqrt(squared_radius);
+
+
+ solve_trust_region_subproblem(B,g,radius,p,eps,max_iter);
+ }
+
+ // assign the non-bound-constrained variables to their final values
+ for (size_t i = 0; i < idxs.size(); ++i)
+ p_(idxs[i]) = p(i);
+
+ }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename stop_strategy_type,
+ typename funct_model
+ >
+ double find_min_trust_region (
+ stop_strategy_type stop_strategy,
+ const funct_model& model,
+ typename funct_model::column_vector& x,
+ double radius = 1
+ )
+ {
+ /*
+            This is an implementation of algorithm 4.1 (Trust Region)
+ from the book Numerical Optimization by Nocedal and Wright.
+ */
+
+ // make sure requires clause is not broken
+ DLIB_ASSERT(is_col_vector(x) && radius > 0,
+ "\t double find_min_trust_region()"
+ << "\n\t invalid arguments were given to this function"
+ << "\n\t is_col_vector(x): " << is_col_vector(x)
+ << "\n\t radius: " << radius
+ );
+
+ const double initial_radius = radius;
+
+ typedef typename funct_model::column_vector T;
+ typedef typename T::type type;
+
+ typename funct_model::general_matrix h;
+ typename funct_model::column_vector g, p, d;
+ type f_value = model(x);
+
+ model.get_derivative_and_hessian(x,g,h);
+
+ DLIB_ASSERT(is_finite(x), "The objective function generated non-finite outputs");
+ DLIB_ASSERT(is_finite(g), "The objective function generated non-finite outputs");
+ DLIB_ASSERT(is_finite(h), "The objective function generated non-finite outputs");
+
+ // Sometimes the loop below won't modify x because the trust region step failed.
+ // This bool tells us when we are in that case.
+ bool stale_x = false;
+
+ while(stale_x || stop_strategy.should_continue_search(x, f_value, g))
+ {
+ const unsigned long iter = solve_trust_region_subproblem(h,
+ g,
+ radius,
+ p,
+ 0.1,
+ 20);
+
+
+ const type new_f_value = model(x+p);
+ const type predicted_improvement = -0.5*trans(p)*h*p - trans(g)*p;
+ const type measured_improvement = (f_value - new_f_value);
+
+ // If the sub-problem can't find a way to improve then stop. This only happens when p is essentially 0.
+ if (std::abs(predicted_improvement) <= std::abs(measured_improvement)*std::numeric_limits<type>::epsilon())
+ break;
+
+ // predicted_improvement shouldn't be negative but it might be if something went
+ // wrong in the trust region solver. So put abs() here to guard against that. This
+ // way the sign of rho is determined only by the sign of measured_improvement.
+ const type rho = measured_improvement/std::abs(predicted_improvement);
+
+
+ if (!is_finite(rho))
+ break;
+
+ if (rho < 0.25)
+ {
+ radius *= 0.25;
+
+ // something has gone horribly wrong if the radius has shrunk to zero. So just
+ // give up if that happens.
+ if (radius <= initial_radius*std::numeric_limits<double>::epsilon())
+ break;
+ }
+ else
+ {
+                // If rho > 0.75 and the radius constraint was active in the
+                // subproblem (iter > 1) then grow the trust region.
+ if (rho > 0.75 && iter > 1)
+ {
+ radius = std::min<type>(1000, 2*radius);
+ }
+ }
+
+ if (rho > 0)
+ {
+ x = x + p;
+ f_value = new_f_value;
+ model.get_derivative_and_hessian(x,g,h);
+ stale_x = false;
+ }
+ else
+ {
+ stale_x = true;
+ }
+
+ DLIB_ASSERT(is_finite(x), "The objective function generated non-finite outputs");
+ DLIB_ASSERT(is_finite(g), "The objective function generated non-finite outputs");
+ DLIB_ASSERT(is_finite(h), "The objective function generated non-finite outputs");
+ }
+
+ return f_value;
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename funct_model>
+ struct negate_tr_model
+ {
+ negate_tr_model( const funct_model& m) : model(m) {}
+
+ const funct_model& model;
+
+ typedef typename funct_model::column_vector column_vector;
+ typedef typename funct_model::general_matrix general_matrix;
+
+ template <typename T>
+ typename T::type operator() (const T& x) const
+ {
+ return -model(x);
+ }
+
+ template <typename T, typename U>
+ void get_derivative_and_hessian (
+ const T& x,
+ T& d,
+ U& h
+ ) const
+ {
+ model.get_derivative_and_hessian(x,d,h);
+ d = -d;
+ h = -h;
+ }
+
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename stop_strategy_type,
+ typename funct_model
+ >
+ double find_max_trust_region (
+ stop_strategy_type stop_strategy,
+ const funct_model& model,
+ typename funct_model::column_vector& x,
+ double radius = 1
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_ASSERT(is_col_vector(x) && radius > 0,
+ "\t double find_max_trust_region()"
+ << "\n\t invalid arguments were given to this function"
+ << "\n\t is_col_vector(x): " << is_col_vector(x)
+ << "\n\t radius: " << radius
+ );
+
+ return -find_min_trust_region(stop_strategy,
+ negate_tr_model<funct_model>(model),
+ x,
+ radius);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATION_TRUST_REGIoN_Hh_
+
diff --git a/ml/dlib/dlib/optimization/optimization_trust_region_abstract.h b/ml/dlib/dlib/optimization/optimization_trust_region_abstract.h
new file mode 100644
index 000000000..303ee746d
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_trust_region_abstract.h
@@ -0,0 +1,233 @@
+// Copyright (C) 2010 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATION_TRUST_REGIoN_H_ABSTRACTh_
+#ifdef DLIB_OPTIMIZATION_TRUST_REGIoN_H_ABSTRACTh_
+
+#include "../matrix/matrix_abstract.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename EXP1,
+ typename EXP2,
+ typename T, long NR, long NC, typename MM, typename L
+ >
+ unsigned long solve_trust_region_subproblem (
+ const matrix_exp<EXP1>& B,
+ const matrix_exp<EXP2>& g,
+ const typename EXP1::type radius,
+ matrix<T,NR,NC,MM,L>& p,
+ double eps,
+ unsigned long max_iter
+ );
+ /*!
+ requires
+ - B == trans(B)
+ (i.e. B should be a symmetric matrix)
+ - B.nr() == B.nc()
+ - is_col_vector(g) == true
+ - g.size() == B.nr()
+ - p is capable of containing a column vector the size of g
+ (i.e. p = g; should be a legal expression)
+ - radius > 0
+ - eps > 0
+ - max_iter > 0
+ ensures
+ - This function solves the following optimization problem:
+ Minimize: f(p) == 0.5*trans(p)*B*p + trans(g)*p
+ subject to the following constraint:
+ - length(p) <= radius
+ - returns the number of iterations performed. If this method fails to converge
+ to eps accuracy then the number returned will be max_iter+1.
+ - if (this function didn't terminate due to hitting the max_iter iteration limit) then
+                - if this function returns 0 or 1 then we are not hitting the radius bound.
+                  Otherwise, the radius constraint is active and
+                  std::abs(length(#p)-radius)/radius <= eps.
+ !*/
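+
+    /*
+        Example (editor's sketch, not part of the original header): minimize a
+        simple quadratic inside the unit ball.  All values are illustrative.
+
+            matrix<double,2,2> B;
+            B = 2, 0,
+                0, 2;
+            matrix<double,2,1> g, p;
+            g = -4, 0;   // the unconstrained minimizer (2,0) lies outside the ball
+            solve_trust_region_subproblem(B, g, 1.0, p, 1e-6, 100);
+            // p is now approximately (1,0), i.e. the radius constraint is active.
+    */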
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename EXP1,
+ typename EXP2,
+ typename T, long NR, long NC, typename MM, typename L,
+ typename EXP3
+ >
+ void solve_trust_region_subproblem_bounded (
+ const matrix_exp<EXP1>& B,
+ const matrix_exp<EXP2>& g,
+ const typename EXP1::type radius,
+ matrix<T,NR,NC,MM,L>& p,
+ double eps,
+ unsigned long max_iter,
+ const matrix_exp<EXP3>& lower,
+ const matrix_exp<EXP3>& upper
+ );
+ /*!
+ requires
+ - B == trans(B)
+ (i.e. B should be a symmetric matrix)
+ - B.nr() == B.nc()
+ - is_col_vector(g) == true
+ - is_col_vector(lower) == true
+ - is_col_vector(upper) == true
+ - g.size() == B.nr()
+ - lower.size() == B.nr()
+ - upper.size() == B.nr()
+ - p is capable of containing a column vector the size of g
+ (i.e. p = g; should be a legal expression)
+ - radius > 0
+ - eps > 0
+ - max_iter > 0
+ - min(upper-lower) >= 0
+ - length(clamp(zeros_matrix(lower),lower,upper)) <= radius
+ (i.e. the lower and upper bounds can't exclude all points with the desired radius.)
+ ensures
+ - This function solves the following optimization problem:
+ Minimize: f(p) == 0.5*trans(p)*B*p + trans(g)*p
+ subject to the following constraints:
+ - length(p) <= radius
+ - lower(i) <= p(i) <= upper(i), for all i
+ - Solves the problem to eps accuracy. We do this by greedily finding the most
+ violated bound constraint, locking that variable to its constrained value, removing
+ it from the problem, and then resolving. We do that until no more constraint
+              violations are present. Each reduced problem is solved with
+              solve_trust_region_subproblem(), and eps and max_iter are passed
+              directly to those calls.
+ !*/
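+
+    /*
+        Example (editor's sketch, continuing the B, g, and p from the example
+        above, but now capping the first variable at 0.5):
+
+            matrix<double,2,1> lower, upper;
+            lower = -1, -1;
+            upper = 0.5, 1;
+            solve_trust_region_subproblem_bounded(B, g, 1.0, p, 1e-6, 100, lower, upper);
+            // p is now approximately (0.5, 0), with the bound on p(0) active.
+    */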
+
+// ----------------------------------------------------------------------------------------
+
+ class function_model
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object defines the interface for a function model
+ used by the trust-region optimizers defined below.
+
+ In particular, this object represents a function f() and
+ its associated derivative and hessian.
+
+ !*/
+
+ public:
+
+ // Define the type used to represent column vectors
+ typedef matrix<double,0,1> column_vector;
+ // Define the type used to represent the hessian matrix
+ typedef matrix<double> general_matrix;
+
+ double operator() (
+ const column_vector& x
+ ) const;
+ /*!
+ ensures
+ - returns f(x)
+ (i.e. evaluates this model at the given point and returns the value)
+ !*/
+
+ void get_derivative_and_hessian (
+ const column_vector& x,
+ column_vector& d,
+ general_matrix& h
+ ) const;
+ /*!
+ ensures
+ - #d == the derivative of f() at x
+ - #h == the hessian matrix of f() at x
+ - is_col_vector(#d) == true
+ - #d.size() == x.size()
+ - #h.nr() == #h.nc() == x.size()
+ - #h == trans(#h)
+ !*/
+ };
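+
+    /*
+        Example (editor's sketch, not part of the original header): a concrete
+        model of the quadratic f(x) == 0.5*x'*x - sum(x), whose minimizer is
+        the all-ones vector.
+
+            struct quadratic_model
+            {
+                typedef matrix<double,0,1> column_vector;
+                typedef matrix<double>     general_matrix;
+
+                double operator() (const column_vector& x) const
+                {
+                    return 0.5*dot(x,x) - sum(x);
+                }
+
+                void get_derivative_and_hessian (
+                    const column_vector& x,
+                    column_vector& d,
+                    general_matrix& h
+                ) const
+                {
+                    d = x - ones_matrix<double>(x.size(), 1);
+                    h = identity_matrix<double>(x.size());
+                }
+            };
+    */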
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename stop_strategy_type,
+ typename funct_model
+ >
+ double find_min_trust_region (
+ stop_strategy_type stop_strategy,
+ const funct_model& model,
+ typename funct_model::column_vector& x,
+ double radius = 1
+ );
+ /*!
+ requires
+ - stop_strategy == an object that defines a stop strategy such as one of
+ the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+ - is_col_vector(x) == true
+ - radius > 0
+ - model must be an object with an interface as defined by the function_model
+ example object shown above.
+ ensures
+ - Performs an unconstrained minimization of the function defined by model
+ starting from the initial point x. This function uses a trust region
+ algorithm to perform the minimization. The radius parameter defines
+ the initial size of the trust region.
+ - The function is optimized until stop_strategy decides that an acceptable
+ point has been found or the trust region subproblem fails to make progress.
+ - #x == the value of x that was found to minimize model()
+ - returns model(#x).
+ - When this function makes calls to model.get_derivative_and_hessian() it always
+ does so by first calling model() and then calling model.get_derivative_and_hessian().
+ That is, any call to model.get_derivative_and_hessian(val) will always be
+ preceded by a call to model(val) with the same value. This way you can reuse
+ any redundant computations performed by model() and model.get_derivative_and_hessian()
+ as appropriate.
+ !*/
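+
+    /*
+        Example (editor's sketch, using the hypothetical quadratic_model shown
+        above):
+
+            matrix<double,0,1> x = zeros_matrix<double>(5,1);
+            double fmin = find_min_trust_region(objective_delta_stop_strategy(1e-10),
+                                                quadratic_model(), x, 1);
+            // x converges to the all-ones vector and fmin to -2.5.
+    */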
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename stop_strategy_type,
+ typename funct_model
+ >
+ double find_max_trust_region (
+ stop_strategy_type stop_strategy,
+ const funct_model& model,
+ typename funct_model::column_vector& x,
+ double radius = 1
+ );
+ /*!
+ requires
+ - stop_strategy == an object that defines a stop strategy such as one of
+ the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+ - is_col_vector(x) == true
+ - radius > 0
+ - model must be an object with an interface as defined by the function_model
+ example object shown above.
+ ensures
+ - Performs an unconstrained maximization of the function defined by model
+ starting from the initial point x. This function uses a trust region
+ algorithm to perform the maximization. The radius parameter defines
+ the initial size of the trust region.
+ - The function is optimized until stop_strategy decides that an acceptable
+ point has been found or the trust region subproblem fails to make progress.
+ - #x == the value of x that was found to maximize model()
+ - returns model(#x).
+ - When this function makes calls to model.get_derivative_and_hessian() it always
+ does so by first calling model() and then calling model.get_derivative_and_hessian().
+ That is, any call to model.get_derivative_and_hessian(val) will always be
+ preceded by a call to model(val) with the same value. This way you can reuse
+ any redundant computations performed by model() and model.get_derivative_and_hessian()
+ as appropriate.
+ - Note that this function solves the maximization problem by converting it
+ into a minimization problem. Therefore, the values of model() and its derivative
+ reported to the stopping strategy will be negated. That is, stop_strategy
+ will see -model() and -derivative. All this really means is that the status
+ messages from a stopping strategy in verbose mode will display a negated objective
+ value.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATION_TRUST_REGIoN_H_ABSTRACTh_
+
+