38 files changed, 14987 insertions, 0 deletions
diff --git a/ml/dlib/dlib/optimization/elastic_net.h b/ml/dlib/dlib/optimization/elastic_net.h
new file mode 100644
index 000000000..6c4b6d0b4
--- /dev/null
+++ b/ml/dlib/dlib/optimization/elastic_net.h
@@ -0,0 +1,389 @@
+// Copyright (C) 2016  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_ElASTIC_NET_Hh_
+#define DLIB_ElASTIC_NET_Hh_
+
+#include "../matrix.h"
+#include "elastic_net_abstract.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class elastic_net
+    {
+    public:
+
+        template <typename EXP>
+        explicit elastic_net(
+            const matrix_exp<EXP>& XX
+        ) : eps(1e-5), max_iterations(50000), verbose(false)
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(XX.size() > 0 &&
+                        XX.nr() == XX.nc(),
+                "\t elastic_net::elastic_net(XX)"
+                << " \n\t XX must be a non-empty square matrix."
+                << " \n\t XX.nr():   " << XX.nr() 
+                << " \n\t XX.nc():   " << XX.nc() 
+                << " \n\t this: " << this
+                );
+
+
+            // If the number of columns in X is big and in particular bigger than the number of
+            // rows then we can get rid of them by doing some SVD magic.  Doing this doesn't
+            // make the final results of anything change but makes all the matrices have
+            // dimensions that are X.nr() in size, which can be much smaller.
+            matrix<double,0,1> s;
+            svd3(XX,u,eig_vals,eig_vects);
+            s = sqrt(eig_vals);
+            X = eig_vects*diagm(s);
+            u = eig_vects*inv(diagm(s));
+
+
+
+            samples.resize(X.nr()*2);
+
+            for (size_t i = 0; i < samples.size(); ++i)
+                index.push_back(i);
+            active_size = index.size();
+
+
+            // setup the training samples used in the SVM optimizer below
+            for (size_t i = 0; i < samples.size(); ++i)
+            {
+                auto& x = samples[i];
+                const long idx = i/2;
+                if (i%2 == 0)
+                    x.label = +1;
+                else
+                    x.label = -1;
+
+                x.r = idx%X.nr();
+            }
+        }
+
+        template <typename EXP1, typename EXP2>
+        elastic_net(
+            const matrix_exp<EXP1>& XX,
+            const matrix_exp<EXP2>& XY
+        ) : elastic_net(XX)
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(XX.size() > 0 && 
+                        XX.nr() == XX.nc() &&
+                        is_col_vector(XY) && 
+                        XX.nc() == XY.size() ,
+                "\t elastic_net::elastic_net(XX,XY)"
+                << " \n\t Invalid inputs were given to this function."
+                << " \n\t XX.size(): " << XX.size() 
+                << " \n\t is_col_vector(XY): " << is_col_vector(XY) 
+                << " \n\t XX.nr():   " << XX.nr() 
+                << " \n\t XX.nc():   " << XX.nc() 
+                << " \n\t XY.size(): " << XY.size() 
+                << " \n\t this: " << this
+                );
+
+            set_xy(XY);
+        }
+
+        long size (
+        ) const { return u.nr(); }
+
+        template <typename EXP>
+        void set_xy(
+            const matrix_exp<EXP>& XY
+        )
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(is_col_vector(XY) && 
+                        XY.size() == size(),
+                "\t void elastic_net::set_y(Y)"
+                << " \n\t Invalid inputs were given to this function."
+                << " \n\t is_col_vector(XY): " << is_col_vector(XY) 
+                << " \n\t size():    " << size() 
+                << " \n\t XY.size(): " << XY.size() 
+                << " \n\t this: " << this
+                );
+
+            Y = trans(u)*XY;
+            // We can use the ynorm after it has been projected because the only place Y
+            // appears in the algorithm is in terms of dot products with w and x vectors.
+            // But those vectors are always in the span of X and therefore we only see the
+            // part of the norm of Y that is in the span of X (and hence u since u and X
+            // have the same span by construction)
+            ynorm = length_squared(Y); 
+            xdoty = X*Y;
+            eig_vects_xdoty = trans(eig_vects)*xdoty;
+
+            w.set_size(Y.size());
+            // zero out any memory of previous solutions
+            alpha.assign(X.nr()*2, 0);
+        }
+
+        bool have_target_values (
+        ) const { return Y.size() != 0; }
+
+        void set_epsilon(
+            double eps_
+        ) 
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(eps_ > 0,
+                "\t void elastic_net::set_epsilon()"
+                << " \n\t eps_ must be greater than 0"
+                << " \n\t eps_: " << eps_ 
+                << " \n\t this: " << this
+                );
+
+            eps = eps_;
+        }
+
+        unsigned long get_max_iterations (
+        ) const { return max_iterations; }
+
+        void set_max_iterations (
+            unsigned long max_iter
+        ) 
+        {
+            max_iterations = max_iter;
+        }
+
+        void be_verbose (
+        )
+        {
+            verbose = true;
+        }
+
+        void be_quiet (
+        )
+        {
+            verbose = false;
+        }
+
+        double get_epsilon (
+        ) const { return eps; }
+
+        matrix<double,0,1> operator() (
+            double ridge_lambda,
+            double lasso_budget = std::numeric_limits<double>::infinity()
+        )
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(have_target_values() && 
+                        ridge_lambda > 0 && 
+                        lasso_budget > 0 ,
+                "\t matrix<double,0,1> elastic_net::operator()()"
+                << " \n\t Invalid inputs were given to this function."
+                << " \n\t have_target_values(): " << have_target_values() 
+                << " \n\t ridge_lambda: " << ridge_lambda 
+                << " \n\t lasso_budget: " << lasso_budget 
+                << " \n\t this: " << this
+                );
+
+
+            // First check if lasso_budget is so big that it isn't even active.  We do this
+            // by doing just ridge regression and checking the result.
+            matrix<double,0,1> betas = eig_vects*tmp(inv(diagm(eig_vals + ridge_lambda))*eig_vects_xdoty);
+            if (sum(abs(betas)) <= lasso_budget)
+                return betas;
+
+
+            // Set w back to 0.  We will compute the w corresponding to what is currently
+            // in alpha layer on.  This way w and alpha are always in sync.
+            w = 0;
+            wy_mult = 0;
+            wdoty = 0;
+
+
+            // return dot(w,x)
+            auto dot = [&](const matrix<double,0,1>& w, const en_sample2& x)
+            {
+                const double xmul = -x.label*(1/lasso_budget);
+                // Do the base dot product but don't forget to add in the -(1/t)*y part from the svm reduction paper
+                double val = rowm(X,x.r)*w + xmul*wdoty + wy_mult*xdoty(x.r) + xmul*wy_mult*ynorm;
+
+                return val;
+            };
+
+
+            // perform w += scale*x;
+            auto add_to = [&](matrix<double,0,1>& w, double scale, const en_sample2& x)
+            {
+                const double xmul = -x.label*(1/lasso_budget);
+                wy_mult += scale*xmul;
+                wdoty += scale*xdoty(x.r);
+                w += scale*trans(rowm(X,x.r));
+
+            };
+
+            const double Dii = ridge_lambda;
+
+            // setup the training samples used in the SVM optimizer below
+            for (size_t i = 0; i < samples.size(); ++i)
+            {
+                auto& x = samples[i];
+
+                const double xmul = -x.label*(1/lasso_budget);
+                x.xdotx = xmul*xmul*ynorm;
+                for (long c = 0; c < X.nc(); ++c)
+                    x.xdotx += std::pow(X(x.r,c)+xmul*Y(c), 2.0) - std::pow(xmul*Y(c),2.0);
+
+                // compute the correct w given whatever might be in alpha.
+                if (alpha[i] != 0)
+                    add_to(w, x.label*alpha[i], samples[i]);
+            }
+
+
+            // Now run the optimizer
+            double PG_max_prev = std::numeric_limits<double>::infinity();
+            double PG_min_prev = -std::numeric_limits<double>::infinity();
+
+
+            unsigned int iter;
+            for (iter = 0; iter < max_iterations; ++iter)
+            {
+                // randomly shuffle the indices
+                for (unsigned long i = 0; i < active_size; ++i)
+                {
+                    // pick a random index >= i
+                    const long j = i + rnd.get_random_32bit_number()%(active_size-i);
+                    std::swap(index[i], index[j]);
+                }
+
+                double PG_max = -std::numeric_limits<double>::infinity();
+                double PG_min = std::numeric_limits<double>::infinity();
+                for (size_t ii = 0; ii < active_size; ++ii)
+                {
+                    const auto i = index[ii];
+                    const auto& x = samples[i];
+                    double G = x.label*dot(w, x) - 1 + Dii*alpha[i];
+
+                    double PG = 0;
+                    if (alpha[i] == 0)
+                    {
+                        if (G > PG_max_prev)
+                        {
+                            // shrink the active set of training examples
+                            --active_size;
+                            std::swap(index[ii], index[active_size]);
+                            --ii;
+                            continue;
+                        }
+
+                        if (G < 0)
+                            PG = G;
+                    }
+                    else
+                    {
+                        PG = G;
+                    }
+
+                    if (PG > PG_max) 
+                        PG_max = PG;
+                    if (PG < PG_min) 
+                        PG_min = PG;
+
+                    // if PG != 0
+                    if (std::abs(PG) > 1e-12)
+                    {
+                        const double alpha_old = alpha[i];
+                        alpha[i] = std::max(alpha[i] - G/(x.xdotx+Dii), (double)0.0);
+                        const double delta = (alpha[i]-alpha_old)*x.label;
+                        add_to(w, delta, x);
+                    }
+                }
+
+                if (verbose)
+                {
+                    using namespace std;
+                    cout << "gap:         " << PG_max - PG_min << endl;
+                    cout << "active_size: " << active_size << endl;
+                    cout << "iter:        " << iter << endl;
+                    cout << endl;
+                }
+
+                if (PG_max - PG_min <= eps)
+                {
+                    // stop if we are within eps tolerance and the last iteration
+                    // was over all the samples
+                    if (active_size == index.size())
+                        break;
+
+                    // Turn off shrinking on the next iteration.  We will stop if the
+                    // tolerance is still <= eps when shrinking is off.
+                    active_size = index.size();
+                    PG_max_prev = std::numeric_limits<double>::infinity();
+                    PG_min_prev = -std::numeric_limits<double>::infinity();
+                }
+                else
+                {
+                    PG_max_prev = PG_max;
+                    PG_min_prev = PG_min;
+                    if (PG_max_prev <= 0)
+                        PG_max_prev = std::numeric_limits<double>::infinity();
+                    if (PG_min_prev >= 0)
+                        PG_min_prev = -std::numeric_limits<double>::infinity();
+                }
+
+
+                // recalculate wdoty every so often to avoid drift.
+                if (iter%100 == 0)
+                    wdoty = dlib::dot(Y, w);
+            }
+
+
+            betas.set_size(alpha.size()/2);
+            for (long i = 0; i < betas.size(); ++i)
+                betas(i) = lasso_budget*(alpha[2*i] - alpha[2*i+1]);
+            betas /= sum(mat(alpha));
+            return betas;
+        }
+
+
+    private:
+
+        struct en_sample2
+        {
+            // X location
+            long r;
+
+
+            double label;
+
+            double xdotx;
+        };
+
+        std::vector<en_sample2> samples;
+        std::vector<double> alpha;
+        double ynorm;
+        matrix<double> X;
+        matrix<double,0,1> Y;
+        matrix<double,0,1> xdoty;
+        double wdoty;
+        double wy_mult; // logically, the real w is what is in the w vector + wy_mult*Y
+        matrix<double,0,1> w;
+        std::vector<long> index; 
+        unsigned long active_size;
+
+        matrix<double,0,1> eig_vects_xdoty;
+        matrix<double,0,1> eig_vals;
+        matrix<double> eig_vects;
+        matrix<double> u;
+
+        dlib::rand rnd;
+
+
+        double eps;
+        unsigned long max_iterations;
+        bool verbose;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_ElASTIC_NET_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/elastic_net_abstract.h b/ml/dlib/dlib/optimization/elastic_net_abstract.h
new file mode 100644
index 000000000..8ae69e37e
--- /dev/null
+++ b/ml/dlib/dlib/optimization/elastic_net_abstract.h
@@ -0,0 +1,190 @@
+// Copyright (C) 2016  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_ElASTIC_NET_ABSTRACT_Hh_
+#ifdef DLIB_ElASTIC_NET_ABSTRACT_Hh_
+
+#include "../matrix.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class elastic_net
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object is a tool for solving the following optimization problem:
+
+                    min_w:      length_squared(X*w - Y) + ridge_lambda*length_squared(w)
+                    such that:  sum(abs(w)) <= lasso_budget
+
+                That is, it solves the elastic net optimization problem.  This object also
+                has the special property that you can quickly obtain different solutions
+                for different settings of ridge_lambda, lasso_budget, and target Y values.
+
+                This is because a large amount of work is precomputed in the constructor.
+                The solver will also remember the previous solution and will use that to
+                warm start subsequent invocations.  Therefore, you can efficiently get
+                solutions for a wide range of regularization parameters.
+                
+                
+                The particular algorithm used to solve it is described in the paper:
+                    Zhou, Quan, et al. "A reduction of the elastic net to support vector
+                    machines with an application to gpu computing." arXiv preprint
+                    arXiv:1409.1976 (2014).  APA 
+
+                And for the SVM solver sub-component we use the algorithm from:
+                    Hsieh, Cho-Jui, et al. "A dual coordinate descent method for large-scale
+                    linear SVM." Proceedings of the 25th international conference on Machine
+                    learning. ACM, 2008. 
+        !*/
+
+    public:
+
+        template <typename EXP>
+        explicit elastic_net(
+            const matrix_exp<EXP>& XX
+        ); 
+        /*!
+            requires
+                - XX.size() != 0
+                - XX.nr() == XX.nc()
+            ensures
+                - #get_epsilon() == 1e-5
+                - #get_max_iterations() == 50000
+                - This object will not be verbose unless be_verbose() is called.
+                - #size() == XX.nc()
+                - #have_target_values() == false
+                - We interpret XX as trans(X)*X where X is as defined in the objective
+                  function discussed above in WHAT THIS OBJECT REPRESENTS.
+        !*/
+
+        template <typename EXP1, typename EXP2>
+        elastic_net(
+            const matrix_exp<EXP1>& XX,
+            const matrix_exp<EXP2>& XY
+        ); 
+        /*!
+            requires
+                - XX.size() != 0
+                - XX.nr() == XX.nc()
+                - is_col_vector(XY)
+                - XX.nc() == Y.size()
+            ensures
+                - constructs this object by calling the elastic_net(XX) constructor and
+                  then calling this->set_xy(XY).
+                - #have_target_values() == true 
+                - We interpret XX as trans(X)*X where X is as defined in the objective
+                  function discussed above in WHAT THIS OBJECT REPRESENTS.  Similarly, XY
+                  should be trans(X)*Y.
+        !*/
+
+        long size (
+        ) const; 
+        /*!
+            ensures
+                - returns the dimensionality of the data loaded into this object.  That is,
+                  how many elements are in the optimal w vector?  This function returns
+                  that number.
+        !*/
+
+        bool have_target_values (
+        ) const;
+        /*!
+            ensures
+                - returns true if set_xy() has been called and false otherwise.
+        !*/
+
+        template <typename EXP>
+        void set_xy(
+            const matrix_exp<EXP>& XY
+        );
+        /*!
+            requires
+                - is_col_vector(Y)
+                - Y.size() == size()
+            ensures
+                - #have_target_values() == true
+                - Sets the target values of the regression.  Note that we expect the given
+                  matrix, XY, to be equal to trans(X)*Y, where X and Y have the definitions
+                  discussed above in WHAT THIS OBJECT REPRESENTS.
+        !*/
+
+        void set_epsilon(
+            double eps
+        );
+        /*!
+            requires
+                - eps > 0
+            ensures
+                - #get_epsilon() == eps
+        !*/
+
+        double get_epsilon (
+        ) const;
+        /*!
+            ensures
+                - returns the error epsilon that determines when the solver should stop.
+                  Smaller values may result in a more accurate solution but take longer to
+                  execute.  
+        !*/
+
+        unsigned long get_max_iterations (
+        ) const; 
+        /*!
+            ensures
+                - returns the maximum number of iterations the optimizer is allowed to run
+                  before it is required to stop and return a result.
+        !*/
+
+        void set_max_iterations (
+            unsigned long max_iter
+        );
+        /*!
+            ensures
+                - #get_max_iterations() == max_iter
+        !*/
+
+        void be_verbose (
+        );
+        /*!
+            ensures
+                - This object will print status messages to standard out so that a 
+                  user can observe the progress of the algorithm.
+        !*/
+
+        void be_quiet (
+        );
+        /*!
+            ensures
+                - this object will not print anything to standard out.
+        !*/
+
+
+        matrix<double,0,1> operator() (
+            double ridge_lambda,
+            double lasso_budget = std::numeric_limits<double>::infinity()
+        );
+        /*!
+            requires
+                - have_target_values() == true
+                - ridge_lambda > 0
+                - lasso_budget > 0
+            ensures
+                - Solves the optimization problem described in the WHAT THIS OBJECT
+                  REPRESENTS section above and returns the optimal w.
+                - The returned vector has size() elements.
+                - if (lasso_budget == infinity) then
+                    - The lasso constraint is ignored 
+        !*/
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_ElASTIC_NET_ABSTRACT_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp.h b/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp.h
new file mode 100644
index 000000000..3dd7fd56f
--- /dev/null
+++ b/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp.h
@@ -0,0 +1,337 @@
+// Copyright (C) 2011  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_Hh_
+#define DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_Hh_
+
+#include "find_max_factor_graph_nmplp_abstract.h"
+#include <vector>
+#include <map>
+#include "../matrix.h"
+#include "../hash.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    namespace impl
+    {
+        class simple_hash_map
+        {
+        public:
+
+            simple_hash_map(
+            ) : 
+                scan_dist(6)
+            {
+                data.resize(5000);
+            }
+
+            void insert (
+                const unsigned long a,
+                const unsigned long b,
+                const unsigned long value
+            ) 
+            /*!
+                requires
+                    - a != std::numeric_limits<unsigned long>::max()
+                ensures
+                    - #(*this)(a,b) == value
+            !*/
+            {
+                const uint32 h = murmur_hash3_2(a,b)%(data.size()-scan_dist);
+
+                const unsigned long empty_bucket = std::numeric_limits<unsigned long>::max();
+
+                for (uint32 i = 0; i < scan_dist; ++i)
+                {
+                    if (data[i+h].key1 == empty_bucket)
+                    {
+                        data[i+h].key1 = a;
+                        data[i+h].key2 = b;
+                        data[i+h].value = value;
+                        return;
+                    }
+                }
+
+                // if we get this far it means the hash table is filling up.  So double its size.
+                std::vector<bucket> new_data;
+                new_data.resize(data.size()*2);
+                new_data.swap(data);
+                for (uint32 i = 0; i < new_data.size(); ++i)
+                {
+                    if (new_data[i].key1 != empty_bucket)
+                    {
+                        insert(new_data[i].key1, new_data[i].key2, new_data[i].value);
+                    }
+                }
+
+                insert(a,b,value);
+            }
+
+            unsigned long operator() (
+                const unsigned long a,
+                const unsigned long b
+            ) const
+            /*!
+                requires
+                    - this->insert(a,b,some_value) has been called
+                ensures
+                    - returns the value stored at key (a,b)
+            !*/
+            {
+                DLIB_ASSERT(a != b, "An invalid map_problem was given to find_max_factor_graph_nmplp()."
+                            << "\nNode " << a << " is listed as being a neighbor with itself, which is illegal.");
+
+                uint32 h = murmur_hash3_2(a,b)%(data.size()-scan_dist);
+
+
+                for (unsigned long i = 0; i < scan_dist; ++i)
+                {
+                    if (data[h].key1 == a && data[h].key2 == b)
+                    {
+                        return data[h].value;
+                    }
+                    ++h;
+                }
+                
+
+                // this should never happen (since this function requires (a,b) to be in the hash table
+                DLIB_ASSERT(false, "An invalid map_problem was given to find_max_factor_graph_nmplp()."
+                            << "\nThe nodes in the map_problem are inconsistent because node "<<a<<" is in the neighbor list"
+                            << "\nof node "<<b<< " but node "<<b<<" isn't in the neighbor list of node "<<a<<".  The neighbor relationship"
+                            << "\nis supposed to be symmetric."
+                            );
+                return 0;
+            }
+
+        private:
+
+            struct bucket
+            {
+                // having max() in key1 indicates that the bucket isn't used.
+                bucket() : key1(std::numeric_limits<unsigned long>::max()) {}
+                unsigned long key1;
+                unsigned long key2;
+                unsigned long value;
+            };
+
+            std::vector<bucket> data;
+            const unsigned int scan_dist;
+        };
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename map_problem
+        >
+    void find_max_factor_graph_nmplp (
+        const map_problem& prob,
+        std::vector<unsigned long>& map_assignment,
+        unsigned long max_iter,
+        double eps
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT( eps > 0,
+                     "\t void find_max_factor_graph_nmplp()"
+                     << "\n\t eps must be greater than zero"
+                     << "\n\t eps:  " << eps 
+                );
+
+        /*
+            This function is an implementation of the NMPLP algorithm introduced in the 
+            following papers:
+                Fixing Max-Product: Convergent Message Passing Algorithms for MAP LP-Relaxations (2008)
+                by Amir Globerson and Tommi Jaakkola
+
+                Introduction to dual decomposition for inference (2011)
+                by David Sontag, Amir Globerson, and Tommi Jaakkola 
+
+            In particular, this function implements the star MPLP update equations shown as
+            equation 1.20 from the paper Introduction to dual decomposition for inference
+            (the method was called NMPLP in the first paper).  It should also be noted that
+            the original description of the NMPLP in the first paper had an error in the
+            equations and the second paper contains corrected equations, which is what this 
+            function uses.
+        */
+
+        typedef typename map_problem::node_iterator node_iterator;
+        typedef typename map_problem::neighbor_iterator neighbor_iterator;
+
+        map_assignment.resize(prob.number_of_nodes());
+
+
+        if (prob.number_of_nodes() == 0)
+            return;
+
+
+        std::vector<double> delta_elements;
+        delta_elements.reserve(prob.number_of_nodes()*prob.num_states(prob.begin())*3);
+
+        impl::simple_hash_map delta_idx;
+
+
+
+        // Initialize delta to zero and fill up the hash table with the appropriate values
+        // so we can index into delta later on.
+        for (node_iterator i = prob.begin(); i != prob.end(); ++i)
+        {
+            const unsigned long id_i = prob.node_id(i);
+
+            for (neighbor_iterator j = prob.begin(i); j != prob.end(i); ++j)
+            {
+                const unsigned long id_j = prob.node_id(j);
+                delta_idx.insert(id_i, id_j, delta_elements.size());
+
+                const unsigned long num_states_xj = prob.num_states(j);
+                for (unsigned long xj = 0; xj < num_states_xj; ++xj)
+                    delta_elements.push_back(0);
+            }
+        }
+
+
+        std::vector<double> gamma_i;
+        std::vector<std::vector<double> > gamma_ji;
+        std::vector<std::vector<double> > delta_to_j_no_i;
+        // These arrays will end up with a length equal to the maximum number of neighbors
+        // of any node in the graph.  So reserve a bigish number of slots so that we are
+        // very unlikely to need to preform an expensive reallocation during the
+        // optimization.
+        gamma_ji.reserve(10000);
+        delta_to_j_no_i.reserve(10000);
+
+
+        double max_change = eps + 1; 
+        // Now do the main body of the optimization. 
+        unsigned long iter;
+        for (iter = 0; iter < max_iter && max_change > eps; ++iter)
+        {
+            max_change = -std::numeric_limits<double>::infinity();
+
+            for (node_iterator i = prob.begin(); i != prob.end(); ++i)
+            {
+                const unsigned long id_i = prob.node_id(i);
+                const unsigned long num_states_xi = prob.num_states(i);
+                gamma_i.assign(num_states_xi, 0);
+
+                double num_neighbors = 0;
+
+                unsigned int jcnt = 0;
+                // first we fill in the gamma vectors
+                for (neighbor_iterator j = prob.begin(i); j != prob.end(i); ++j)
+                {
+                    // Make sure these arrays are big enough to hold all the neighbor
+                    // information.
+                    if (jcnt >= gamma_ji.size())
+                    {
+                        gamma_ji.resize(gamma_ji.size()+1);
+                        delta_to_j_no_i.resize(delta_to_j_no_i.size()+1);
+                    }
+
+                    ++num_neighbors;
+                    const unsigned long id_j = prob.node_id(j);
+                    const unsigned long num_states_xj = prob.num_states(j);
+
+                    gamma_ji[jcnt].assign(num_states_xi, -std::numeric_limits<double>::infinity());
+                    delta_to_j_no_i[jcnt].assign(num_states_xj, 0);
+
+                    // compute delta_j^{-i} and store it in delta_to_j_no_i[jcnt]  
+                    for (neighbor_iterator k = prob.begin(j); k != prob.end(j); ++k)
+                    {
+                        const unsigned long id_k = prob.node_id(k);
+                        if (id_k==id_i)
+                            continue;
+                        const double* const delta_kj = &delta_elements[delta_idx(id_k,id_j)];
+                        for (unsigned long xj = 0; xj < num_states_xj; ++xj)
+                        {
+                            delta_to_j_no_i[jcnt][xj] += delta_kj[xj];
+                        }
+                    }
+
+                    // now compute gamma values
+                    for (unsigned long xi = 0; xi < num_states_xi; ++xi)
+                    {
+                        for (unsigned long xj = 0; xj < num_states_xj; ++xj)
+                        {
+                            gamma_ji[jcnt][xi] = std::max(gamma_ji[jcnt][xi], prob.factor_value(i,j,xi,xj) + delta_to_j_no_i[jcnt][xj]);
+                        }
+                        gamma_i[xi] += gamma_ji[jcnt][xi];
+                    }
+                    ++jcnt;
+                }
+
+                // now update the delta values
+                jcnt = 0;
+                for (neighbor_iterator j = prob.begin(i); j != prob.end(i); ++j)
+                {
+                    const unsigned long id_j = prob.node_id(j);
+                    const unsigned long num_states_xj = prob.num_states(j);
+
+                    // messages from j to i
+                    double* const delta_ji = &delta_elements[delta_idx(id_j,id_i)];
+
+                    // messages from i to j
+                    double* const delta_ij = &delta_elements[delta_idx(id_i,id_j)];
+
+                    for (unsigned long xj = 0; xj < num_states_xj; ++xj)
+                    {
+                        double best_val = -std::numeric_limits<double>::infinity();
+
+                        for (unsigned long xi = 0; xi < num_states_xi; ++xi)
+                        {
+                            double val = prob.factor_value(i,j,xi,xj) + 2/(num_neighbors+1)*gamma_i[xi] -gamma_ji[jcnt][xi];
+                            if (val > best_val)
+                                best_val = val;
+                        }
+                        best_val = -0.5*delta_to_j_no_i[jcnt][xj] + 0.5*best_val;
+
+                        if (std::abs(delta_ij[xj] - best_val) > max_change)
+                            max_change = std::abs(delta_ij[xj] - best_val);
+
+                        delta_ij[xj] = best_val;
+                    }
+
+                    for (unsigned long xi = 0; xi < num_states_xi; ++xi)
+                    {
+                        double new_val = -1/(num_neighbors+1)*gamma_i[xi] + gamma_ji[jcnt][xi];
+                        if (std::abs(delta_ji[xi] - new_val) > max_change)
+                            max_change = std::abs(delta_ji[xi] - new_val);
+                        delta_ji[xi] = new_val;
+                    }
+                    ++jcnt;
+                }
+            }
+        }
+
+
+        // now decode the "beliefs"
+        std::vector<double> b;
+        for (node_iterator i = prob.begin(); i != prob.end(); ++i)
+        {
+            const unsigned long id_i = prob.node_id(i);
+            b.assign(prob.num_states(i), 0);
+
+            for (neighbor_iterator k = prob.begin(i); k != prob.end(i); ++k)
+            {
+                const unsigned long id_k = prob.node_id(k);
+
+                for (unsigned long xi = 0; xi < b.size(); ++xi)
+                {
+                    const double* const delta_ki = &delta_elements[delta_idx(id_k,id_i)];
+                    b[xi] += delta_ki[xi];
+                }
+            }
+
+            map_assignment[id_i] = index_of_max(mat(b));
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_Hh_
+
diff --git a/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp_abstract.h b/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp_abstract.h
new file mode 100644
index 000000000..3dd9aead0
--- /dev/null
+++ b/ml/dlib/dlib/optimization/find_max_factor_graph_nmplp_abstract.h
@@ -0,0 +1,365 @@
+// Copyright (C) 2011  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_ABSTRACT_Hh_
+#ifdef DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_ABSTRACT_Hh_
+
+#include <vector>
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class map_problem 
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object represents a factor graph or graphical model.  In 
+                particular, this object defines the interface a MAP problem on
+                a factor graph must implement if it is to be solved using the 
+                find_max_factor_graph_nmplp() routine defined at the bottom of this file.  
+
+                Note that there is no dlib::map_problem object.  What you are
+                looking at here is simply the interface definition for a map problem.
+                You must implement your own version of this object for the problem
+                you wish to solve and then pass it to the find_max_factor_graph_nmplp() routine.
+
+
+                Note also that a factor graph should not have any nodes which are 
+                neighbors with themselves.  Additionally, the graph is undirected. This
+                mean that if A is a neighbor of B then B must be a neighbor of A for
+                the map problem to be valid.
+
+
+                Finally, note that the "neighbor" relationship between nodes means the
+                following:  Two nodes are neighbors if and only if there is a potential 
+                function (implemented by the factor_value() method) which operates on 
+                the nodes.
+        !*/
+
+    public:
+
+        class node_iterator
+        {
+            /*!
+                WHAT THIS OBJECT REPRESENTS
+                    This is a simple forward iterator for iterating over 
+                    the nodes/variables in this factor graph.  
+
+                    Note that you can't dereference the iterator and
+                    obtain a value.  That is, the iterator is opaque to 
+                    the user.  It is used only as an argument to the other 
+                    methods defined in this interface.
+            !*/
+
+        public:
+            node_iterator(
+            );
+            /*!
+                ensures
+                    - constructs an iterator in an undefined state
+            !*/
+
+            node_iterator(
+                const node_iterator& item
+            );
+            /*!
+                ensures
+                    - #*this is a copy of item
+            !*/
+
+            node_iterator& operator= (
+                const node_iterator& item
+            );
+            /*!
+                ensures
+                    - #*this is a copy of item
+                    - returns #*this
+            !*/
+
+            bool operator== (
+                const node_iterator& item
+            ) const; 
+            /*!
+                ensures
+                    - returns true if *this and item both reference
+                      the same node in the factor graph and false 
+                      otherwise.
+            !*/
+
+            bool operator!= (
+                const node_iterator& item
+            ) const;
+            /*!
+                ensures
+                    - returns false if *this and item both reference
+                      the same node in the factor graph and true 
+                      otherwise.
+            !*/
+
+            node_iterator& operator++(
+            );
+            /*!
+                ensures
+                    - advances *this to the next node in the factor graph.
+                    - returns a reference to the updated *this
+                      (i.e. this is the ++object form of the increment operator)
+            !*/
+        };
+
+        class neighbor_iterator
+        {
+            /*!
+                WHAT THIS OBJECT REPRESENTS
+                    This is a simple forward iterator for iterating over 
+                    the nodes/variables in this factor graph.  This version
+                    of the iterator is used for iterating over the neighbors
+                    of another node in the graph.
+            !*/
+
+        public:
+            neighbor_iterator(
+            ); 
+            /*!
+                ensures
+                    - constructs an iterator in an undefined state
+            !*/
+
+            neighbor_iterator(
+                const neighbor_iterator& item
+            );
+            /*!
+                ensures
+                    - #*this is a copy of item
+            !*/
+
+            neighbor_iterator& operator= (
+                const neighbor_iterator& item
+            );
+            /*!
+                ensures
+                    - #*this is a copy of item
+                    - returns #*this
+            !*/
+
+            bool operator== (
+                const neighbor_iterator& item
+            ) const; 
+            /*!
+                ensures
+                    - returns true if *this and item both reference
+                      the same node in the factor graph and false 
+                      otherwise.
+            !*/
+
+            bool operator!= (
+                const neighbor_iterator& item
+            ) const;
+            /*!
+                ensures
+                    - returns false if *this and item both reference
+                      the same node in the factor graph and true 
+                      otherwise.
+            !*/
+
+            neighbor_iterator& operator++(
+            ); 
+            /*!
+                ensures
+                    - advances *this to the next node in the factor graph.
+                    - returns a reference to the updated *this
+                      (i.e. this is the ++object form of the increment operator) 
+            !*/
+        };
+
+        unsigned long number_of_nodes (
+        ) const;
+        /*!
+            ensures
+                - returns the number of nodes in the factor graph.  Or in other words, 
+                  returns the number of variables in the MAP problem.
+        !*/
+
+        node_iterator begin(
+        ) const;
+        /*!
+            ensures
+                - returns an iterator to the first node in the graph.  If no such
+                  node exists then returns end().
+        !*/
+
+        node_iterator end(
+        ) const;
+        /*!
+            ensures
+                - returns an iterator to one past the last node in the graph.
+        !*/
+
+        neighbor_iterator begin(
+            const node_iterator& it
+        ) const;
+        /*!
+            requires
+                - it == a valid iterator (i.e. it must be in the range [begin(), end()))
+            ensures
+                - returns an iterator to the first neighboring node of the node
+                  referenced by it.  If no such node exists then returns end(it).
+        !*/
+
+        neighbor_iterator begin(
+            const neighbor_iterator& it
+        ) const;
+        /*!
+            requires
+                - it == a valid iterator. (i.e. it must be in the range 
+                  [begin(i), end(i)) where i is some valid iterator. ) 
+            ensures
+                - returns an iterator to the first neighboring node of the node
+                  referenced by it.  If no such node exists then returns end(it).
+        !*/
+
+        neighbor_iterator end(
+            const node_iterator& it
+        ) const;
+        /*!
+            requires
+                - it == a valid iterator (i.e. it must be in the range [begin(), end()))
+            ensures
+                - returns an iterator to one past the last neighboring node of the node
+                  referenced by it.
+        !*/
+
+        neighbor_iterator end(
+            const neighbor_iterator& it
+        ) const;
+        /*!
+            requires
+                - it == a valid iterator. (i.e. it must be in the range 
+                  [begin(i), end(i)) where i is some valid iterator. ) 
+            ensures
+                - returns an iterator to one past the last neighboring node of the node
+                  referenced by it.
+        !*/
+
+        unsigned long node_id (
+            const node_iterator& it
+        ) const;
+        /*!
+            requires
+                - it == a valid iterator (i.e. it must be in the range [begin(), end()))
+            ensures
+                - returns a number ID such that:
+                    - 0 <= ID < number_of_nodes()
+                    - ID == a number which uniquely identifies the node pointed to by it.
+        !*/
+
+        unsigned long node_id (
+            const neighbor_iterator& it
+        ) const;
+        /*!
+            requires
+                - it == a valid iterator. (i.e. it must be in the range 
+                  [begin(i), end(i)) where i is some valid iterator. ) 
+            ensures
+                - returns a number ID such that:
+                    - 0 <= ID < number_of_nodes()
+                    - ID == a number which uniquely identifies the node pointed to by it.
+        !*/
+
+        unsigned long num_states (
+            const node_iterator& it
+        ) const;
+        /*!
+            requires
+                - it == a valid iterator (i.e. it must be in the range [begin(), end()))
+            ensures
+                - returns the number of states attainable by the node/variable referenced by it.
+        !*/
+
+        unsigned long num_states (
+            const neighbor_iterator& it
+        ) const;
+        /*!
+            requires
+                - it == a valid iterator. (i.e. it must be in the range 
+                  [begin(i), end(i)) where i is some valid iterator. ) 
+            ensures
+                - returns the number of states attainable by the node/variable referenced by it.
+        !*/
+
+        // The next four functions all have the same contract.
+        double factor_value (const node_iterator& it1,     const node_iterator& it2,     unsigned long s1, unsigned long s2) const;
+        double factor_value (const neighbor_iterator& it1, const node_iterator& it2,     unsigned long s1, unsigned long s2) const;
+        double factor_value (const node_iterator& it1,     const neighbor_iterator& it2, unsigned long s1, unsigned long s2) const;
+        double factor_value (const neighbor_iterator& it1, const neighbor_iterator& it2, unsigned long s1, unsigned long s2) const;
+        /*!
+            requires
+                - it1 == a valid iterator
+                - it2 == a valid iterator
+                - 0 <= s1 < num_states(it1)
+                - 0 <= s2 < num_states(it2)
+                - it1 and it2 reference nodes which are neighbors in the factor graph
+            ensures
+                - returns the value of the factor/potential function for the given pair of 
+                  nodes, defined by it1 and it2, for the case where they take on the values
+                  s1 and s2 respectively.
+        !*/
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename map_problem
+        >
+    void find_max_factor_graph_nmplp (
+        const map_problem& prob,
+        std::vector<unsigned long>& map_assignment,
+        unsigned long max_iter,
+        double eps
+    );
+    /*!
+        requires
+            - for all valid i: prob.num_states(i) >= 2
+            - map_problem == an object with an interface compatible with the map_problem
+              object defined at the top of this file.
+            - eps > 0
+        ensures
+            - This function is a tool for approximately solving the given MAP problem in a graphical 
+              model or factor graph with pairwise potential functions.  That is, it attempts 
+              to solve a certain kind of optimization problem which can be defined as follows:
+                 maximize: f(X)
+                 where X is a set of integer valued variables and f(X) can be written as the 
+                 sum of functions which each involve only two variables from X.  In reference 
+                 to the prob object, the nodes in prob represent the variables in X and the 
+                 functions which are summed are represented by prob.factor_value().
+            - #map_assignment == the result of the optimization.   
+            - #map_assignment.size() == prob.number_of_nodes()
+            - for all valid i:
+                - #map_assignment[prob.node_id(i)] < prob.num_states(i)
+                - #map_assignment[prob.node_id(i)] == The approximate MAP assignment for node/variable i.
+            - eps controls the stopping condition, smaller values of eps lead to more accurate 
+              solutions of the relaxed linear program but may take more iterations.  Note that
+              the algorithm will never execute more than max_iter iterations regardless of
+              the setting of eps.
+            - If the graph is tree-structured then this routine always gives the exact solution 
+              to the MAP problem.  However, for graphs with cycles, the solution may be approximate.
+              
+
+            - This function is an implementation of the NMPLP algorithm introduced in the
+              following papers:
+                Fixing Max-Product: Convergent Message Passing Algorithms for MAP LP-Relaxations (2008)
+                by Amir Globerson and Tommi Jaakkola
+
+                Introduction to dual decomposition for inference (2011)
+                by David Sontag, Amir Globerson, and Tommi Jaakkola 
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_FIND_MAX_FACTOR_GRAPH_nMPLP_ABSTRACT_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi.h b/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi.h
new file mode 100644
index 000000000..f7cbcb8d9
--- /dev/null
+++ b/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi.h
@@ -0,0 +1,232 @@
+// Copyright (C) 2011  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_Hh_
+#define DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_Hh_
+
+#include "find_max_factor_graph_viterbi_abstract.h"
+#include <vector>
+#include "../matrix.h"
+#include "../array2d.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    namespace impl
+    {
+        struct viterbi_data
+        {
+            viterbi_data() :val(-std::numeric_limits<double>::infinity()), back_index(0) {}
+            double val;
+            unsigned long back_index;
+        };
+
+        template <long NC>
+        inline bool advance_state(
+            matrix<unsigned long,1,NC>& node_states,
+            unsigned long num_states
+        )
+        /*!
+            ensures
+                - advances node_states to the next state by adding 1
+                  to node_states(node_states.size()-1) and carrying any 
+                  rollover (modulo num_states).  Stores the result into #node_states.
+                - if (#node_states is all zeros) then
+                    - returns false
+                - else
+                    - returns true
+        !*/
+        {
+            for (long i = node_states.size()-1; i >= 0; --i)
+            {
+                node_states(i) += 1;
+                if (node_states(i) < num_states)
+                    return true;
+
+                node_states(i) = 0;
+            }
+            return false;
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename map_problem
+        >
+    void find_max_factor_graph_viterbi (
+        const map_problem& prob,
+        std::vector<unsigned long>& map_assignment
+    )
+    {
+        using namespace dlib::impl;
+        const unsigned long order = prob.order();
+        const unsigned long num_states = prob.num_states();
+
+
+        DLIB_ASSERT(prob.num_states() > 0,
+            "\t void find_max_factor_graph_viterbi()"
+            << "\n\t The nodes in a factor graph have to be able to take on more than 0 states."
+            );
+        DLIB_ASSERT(std::pow(num_states,(double)order) < std::numeric_limits<unsigned long>::max(),
+            "\t void find_max_factor_graph_viterbi()"
+            << "\n\t The order is way too large for this algorithm to handle."
+            << "\n\t order:      " << order
+            << "\n\t num_states: " << num_states 
+            << "\n\t std::pow(num_states,order):                " << std::pow(num_states,(double)order) 
+            << "\n\t std::numeric_limits<unsigned long>::max(): " << std::numeric_limits<unsigned long>::max() 
+            );
+
+        if (prob.number_of_nodes() == 0)
+        {
+            map_assignment.clear();
+            return;
+        }
+
+        if (order == 0)
+        {
+            map_assignment.resize(prob.number_of_nodes());
+            for (unsigned long i = 0; i < map_assignment.size(); ++i)
+            {
+                matrix<unsigned long,1,1> node_state;
+                unsigned long best_state = 0;
+                double best_val = -std::numeric_limits<double>::infinity();
+                for (unsigned long s = 0; s < num_states; ++s)
+                {
+                    node_state(0) = s;
+                    const double temp = prob.factor_value(i,node_state);
+                    if (temp > best_val)
+                    {
+                        best_val = temp;
+                        best_state = s;
+                    }
+                }
+                map_assignment[i] = best_state;
+            }
+            return;
+        }
+
+
+        const unsigned long trellis_size = static_cast<unsigned long>(std::pow(num_states,(double)order)); 
+        unsigned long init_ring_size = 1; 
+
+        array2d<impl::viterbi_data> trellis;
+        trellis.set_size(prob.number_of_nodes(), trellis_size);
+
+
+        for (unsigned long node = 0; node < prob.number_of_nodes(); ++node)
+        {
+
+            if (node < order)
+            {
+                matrix<unsigned long,1,0> node_states;
+                node_states.set_size(std::min<int>(node, order) + 1);
+                node_states = 0;
+
+                unsigned long idx = 0;
+                if (node == 0)
+                {
+                    do 
+                    {
+                        trellis[node][idx].val = prob.factor_value(node,node_states);
+                        ++idx;
+                    } while(advance_state(node_states,num_states));
+                }
+                else
+                {
+                    init_ring_size *= num_states;
+                    do 
+                    {
+                        const unsigned long back_index = idx%init_ring_size;
+                        trellis[node][idx].val = prob.factor_value(node,node_states) + trellis[node-1][back_index].val;
+                        trellis[node][idx].back_index = back_index;
+                        ++idx;
+                    } while(advance_state(node_states,num_states));
+
+                }
+            }
+            else if (order == 1)
+            {
+                /*
+                    WHAT'S THE DEAL WITH THIS PREPROCESSOR MACRO?
+                        Well, if we can declare the dimensions of node_states as a compile
+                        time constant then this function runs significantly faster.  So this macro
+                        is here to let us do that.  It just lets us avoid replicating this code
+                        block in the following if statements for different order sizes.
+                */
+#define DLIB_FMFGV_WORK                                                                                                     \
+                node_states = 0;                                                                                            \
+                unsigned long count = 0;                                                                                    \
+                for (unsigned long i = 0; i < trellis_size; ++i)                                                            \
+                {                                                                                                           \
+                    unsigned long back_index = 0;                                                                           \
+                    double best_score = -std::numeric_limits<double>::infinity();                                           \
+                    for (unsigned long s = 0; s < num_states; ++s)                                                          \
+                    {                                                                                                       \
+                        const double temp = prob.factor_value(node,node_states) + trellis[node-1][count%trellis_size].val;  \
+                        if (temp > best_score)                                                                              \
+                        {                                                                                                   \
+                            best_score = temp;                                                                              \
+                            back_index = count%trellis_size;                                                                \
+                        }                                                                                                   \
+                        advance_state(node_states,num_states);                                                              \
+                        ++count;                                                                                            \
+                    }                                                                                                       \
+                    trellis[node][i].val = best_score;                                                                      \
+                    trellis[node][i].back_index = back_index;                                                               \
+                }
+
+                matrix<unsigned long,1,2> node_states;
+                DLIB_FMFGV_WORK
+            }
+            else if (order == 2)
+            {
+                matrix<unsigned long,1,3> node_states;
+                DLIB_FMFGV_WORK
+            }
+            else if (order == 3)
+            {
+                matrix<unsigned long,1,4> node_states;
+                DLIB_FMFGV_WORK
+            }
+            else 
+            {
+                // The general case, here we don't define the size of node_states at compile time.
+                matrix<unsigned long,1,0> node_states(order+1);
+                DLIB_FMFGV_WORK
+            }
+        }
+
+
+        map_assignment.resize(prob.number_of_nodes());
+        // Figure out which state of the last node has the biggest value. 
+        unsigned long back_index = 0;
+        double best_val = -std::numeric_limits<double>::infinity();
+        for (long i = 0; i < trellis.nc(); ++i)
+        {
+            if (trellis[trellis.nr()-1][i].val > best_val)
+            {
+                best_val = trellis[trellis.nr()-1][i].val;
+                back_index = i;
+            }
+        }
+        // Follow the back links to find the decoding.
+        for (long node = map_assignment.size()-1; node >= 0; --node)
+        {
+            map_assignment[node] = back_index/init_ring_size;
+            back_index = trellis[node][back_index].back_index;
+            if (node < (long)order)
+                init_ring_size /= num_states;
+        }
+
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi_abstract.h b/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi_abstract.h
new file mode 100644
index 000000000..c19e4c7eb
--- /dev/null
+++ b/ml/dlib/dlib/optimization/find_max_factor_graph_viterbi_abstract.h
@@ -0,0 +1,131 @@
+// Copyright (C) 2011  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_ABSTRACT_Hh_
+#ifdef DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_ABSTRACT_Hh_
+
+#include <vector>
+#include "../matrix.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class map_problem 
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object represents a chain-structured factor graph or graphical 
+                model.  In particular, this object defines the interface a MAP problem 
+                on a factor graph must implement if it is to be solved using the 
+                find_max_factor_graph_viterbi() routine defined at the bottom of this file.  
+
+                Note that there is no dlib::map_problem object.  What you are looking 
+                at here is simply the interface definition for a map problem.  You must 
+                implement your own version of this object for the problem you wish to 
+                solve and then pass it to the find_max_factor_graph_viterbi() routine.
+        !*/
+
+    public:
+
+        unsigned long order (
+        ) const;
+        /*!
+            ensures
+                - returns the order of this model.  The order has the following interpretation:
+                  This model can represent a high order Markov chain.  If order()==1 then map_problem
+                  represents a basic chain-structured graph where nodes only depend on their immediate
+                  neighbors.  However, high order Markov models can also be used by setting order() > 1.
+        !*/
+
+        unsigned long num_states (
+        ) const;
+        /*!
+            ensures
+                - returns the number of states attainable by each variable/node in the graph.
+        !*/
+
+        unsigned long number_of_nodes (
+        ) const;
+        /*!
+            ensures
+                - returns the number of nodes in the factor graph.  Or in other words, 
+                  returns the number of variables in the MAP problem.
+        !*/
+
+        template <
+            typename EXP 
+            >
+        double factor_value (
+            unsigned long node_id,
+            const matrix_exp<EXP>& node_states
+        ) const;
+        /*!
+            requires
+                - EXP::type == unsigned long
+                  (i.e. node_states contains unsigned longs)
+                - node_id < number_of_nodes()
+                - node_states.size() == min(node_id, order()) + 1
+                - is_vector(node_states) == true
+                - max(node_states) < num_states()
+            ensures
+                - In a chain-structured graph, each node has a potential function associated with
+                  it.  The potential function operates on the variable given by the node as well
+                  as the order() previous variables.  Therefore, factor_value() returns the value 
+                  of the factor/potential function associated with node node_id where the following 
+                  nodes take on the values defined below:
+                    - node_states(0) == the value of the node with ID node_id
+                    - node_states(i) == the value of the node with ID node_id-i
+                - It is ok for this function to return a value of -std::numeric_limits<double>::infinity().
+        !*/
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename map_problem
+        >
+    void find_max_factor_graph_viterbi (
+        const map_problem& prob,
+        std::vector<unsigned long>& map_assignment
+    );
+    /*!
+        requires
+            - prob.num_states() > 0
+            - std::pow(prob.num_states(), prob.order()) < std::numeric_limits<unsigned long>::max()
+              (i.e. The Viterbi algorithm is exponential in the order of the map problem.  So don't 
+              make order too large.)
+            - map_problem == an object with an interface compatible with the map_problem
+              object defined at the top of this file.
+        ensures
+            - This function is a tool for exactly solving the MAP problem in a chain-structured 
+              graphical model or factor graph.  That is, it attempts to solve a certain kind of 
+              optimization problem which can be defined as follows:
+                - Let X denote a set of prob.number_of_nodes() integer valued variables, each taking
+                  a value in the range [0, prob.num_states()).
+                - Let X(i) = the ith variable in X.
+                - Let F(i) = factor_value_i(X(i), X(i-1), ..., X(i-prob.order()))
+                  (This is the value returned by prob.factor_value(i, node_states).  Note that
+                  each factor's value function operates on at most prob.order()+1 variables.
+                  Moreover, the variables are adjacent and hence the graph is "chain-structured".)
+
+                 Then this function finds the assignments to the X variables which  
+                 maximizes: sum over all valid i: F(i)
+
+            - #map_assignment == the result of the optimization.   
+            - #map_assignment.size() == prob.number_of_nodes()
+            - for all valid i:
+                - #map_assignment[i] < prob.num_states()
+                - #map_assignment[i] == The MAP assignment for node/variable i.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_FIND_MAX_FACTOR_GRAPH_VITERBi_ABSTRACT_Hh_
+
+
+
diff --git a/ml/dlib/dlib/optimization/find_max_parse_cky.h b/ml/dlib/dlib/optimization/find_max_parse_cky.h
new file mode 100644
index 000000000..79614792a
--- /dev/null
+++ b/ml/dlib/dlib/optimization/find_max_parse_cky.h
@@ -0,0 +1,414 @@
+// Copyright (C) 2012  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_FIND_MAX_PaRSE_CKY_Hh_
+#define DLIB_FIND_MAX_PaRSE_CKY_Hh_
+
+#include "find_max_parse_cky_abstract.h"
+#include <vector>
+#include <string>
+#include <sstream>
+#include "../serialize.h" 
+#include "../array2d.h"
+
+namespace dlib
+{
+
+// -----------------------------------------------------------------------------------------
+
+    template <typename T>
+    struct constituent 
+    {
+        unsigned long begin, end, k;
+        T left_tag; 
+        T right_tag;
+    };
+
+    template <typename T>
+    void serialize(
+        const constituent<T>& item,
+        std::ostream& out
+    )
+    {
+        serialize(item.begin, out);
+        serialize(item.end, out);
+        serialize(item.k, out);
+        serialize(item.left_tag, out);
+        serialize(item.right_tag, out);
+    }
+
+    template <typename T>
+    void deserialize(
+        constituent<T>& item,
+        std::istream& in 
+    )
+    {
+        deserialize(item.begin, in);
+        deserialize(item.end, in);
+        deserialize(item.k, in);
+        deserialize(item.left_tag, in);
+        deserialize(item.right_tag, in);
+    }
+
+// -----------------------------------------------------------------------------------------
+
+    const unsigned long END_OF_TREE = 0xFFFFFFFF;
+
+// -----------------------------------------------------------------------------------------
+
+    template <typename T>
+    struct parse_tree_element
+    {
+        constituent<T> c;
+        T tag; // id for the constituent corresponding to this level of the tree
+
+        unsigned long left;
+        unsigned long right; 
+        double score; 
+    };
+
+    template <typename T>
+    void serialize (
+        const parse_tree_element<T>& item,
+        std::ostream& out
+    )
+    {
+        serialize(item.c, out);
+        serialize(item.tag, out);
+        serialize(item.left, out);
+        serialize(item.right, out);
+        serialize(item.score, out);
+    }
+
+    template <typename T>
+    void deserialize (
+        parse_tree_element<T>& item,
+        std::istream& in 
+    )
+    {
+        deserialize(item.c, in);
+        deserialize(item.tag, in);
+        deserialize(item.left, in);
+        deserialize(item.right, in);
+        deserialize(item.score, in);
+    }
+
+// -----------------------------------------------------------------------------------------
+
+    namespace impl
+    {
+        template <typename T>
+        unsigned long fill_parse_tree(
+            std::vector<parse_tree_element<T> >& parse_tree, 
+            const T& tag,
+            const array2d<std::map<T, parse_tree_element<T> > >& back, 
+            long r, long c
+        )
+        /*!
+            requires
+                - back[r][c].size() == 0 || back[r][c].count(tag) != 0
+        !*/
+        {
+            // base case of the recursion 
+            if (back[r][c].size() == 0)
+            {
+                return END_OF_TREE;
+            }
+
+            const unsigned long idx = parse_tree.size();
+            const parse_tree_element<T>& item = back[r][c].find(tag)->second;
+            parse_tree.push_back(item);
+
+            const long k = item.c.k;
+            const unsigned long idx_left  = fill_parse_tree(parse_tree, item.c.left_tag, back, r, k-1); 
+            const unsigned long idx_right = fill_parse_tree(parse_tree, item.c.right_tag, back, k, c); 
+            parse_tree[idx].left = idx_left;
+            parse_tree[idx].right = idx_right;
+            return idx;
+        }
+    }
+
+    template <typename T, typename production_rule_function>
+    void find_max_parse_cky (
+        const std::vector<T>& sequence,
+        const production_rule_function& production_rules,
+        std::vector<parse_tree_element<T> >& parse_tree
+    )
+    {
+        parse_tree.clear();
+        if (sequence.size() == 0)
+            return;
+
+        array2d<std::map<T,double> > table(sequence.size(), sequence.size());
+        array2d<std::map<T,parse_tree_element<T> > > back(sequence.size(), sequence.size());
+        typedef typename std::map<T,double>::iterator itr;
+        typedef typename std::map<T,parse_tree_element<T> >::iterator itr_b;
+
+        for (long r = 0; r < table.nr(); ++r)
+            table[r][r][sequence[r]] = 0;
+
+        std::vector<std::pair<T,double> > possible_tags;
+
+        for (long r = table.nr()-2; r >= 0; --r)
+        {
+            for (long c = r+1; c < table.nc(); ++c)
+            {
+                for (long k = r; k < c; ++k)
+                {
+                    for (itr i = table[k+1][c].begin(); i != table[k+1][c].end(); ++i)
+                    {
+                        for (itr j = table[r][k].begin(); j != table[r][k].end(); ++j)
+                        {
+                            constituent<T> con;
+                            con.begin = r;
+                            con.end = c+1;
+                            con.k = k+1;
+                            con.left_tag = j->first;
+                            con.right_tag = i->first;
+                            possible_tags.clear();
+                            production_rules(sequence, con, possible_tags);
+                            for (unsigned long m = 0; m < possible_tags.size(); ++m)
+                            {
+                                const double score = possible_tags[m].second + i->second + j->second;
+                                itr match = table[r][c].find(possible_tags[m].first);
+                                if (match == table[r][c].end() || score > match->second)
+                                {
+                                    table[r][c][possible_tags[m].first] = score;
+                                    parse_tree_element<T> item;
+                                    item.c = con;
+                                    item.score = score;
+                                    item.tag = possible_tags[m].first;
+                                    item.left = END_OF_TREE;
+                                    item.right = END_OF_TREE;
+                                    back[r][c][possible_tags[m].first] = item;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+
+        // now use back pointers to build the parse trees
+        const long r = 0;
+        const long c = back.nc()-1;
+        if (back[r][c].size() != 0)
+        {
+
+            // find the max scoring element in back[r][c]
+            itr_b max_i = back[r][c].begin();
+            itr_b i = max_i;
+            ++i;
+            for (; i != back[r][c].end(); ++i)
+            {
+                if (i->second.score > max_i->second.score)
+                    max_i = i;
+            }
+
+            parse_tree.reserve(c);
+            impl::fill_parse_tree(parse_tree, max_i->second.tag, back, r, c);
+        }
+    }
+
+// -----------------------------------------------------------------------------------------
+
+    class parse_tree_to_string_error : public error
+    {
+    public:
+        parse_tree_to_string_error(const std::string& str): error(str) {}
+    };
+
+    namespace impl
+    {
+        template <bool enabled, typename T>
+        typename enable_if_c<enabled>::type conditional_print(
+            const T& item,
+            std::ostream& out
+        ) { out << item << " "; }
+
+        template <bool enabled, typename T>
+        typename disable_if_c<enabled>::type conditional_print(
+            const T& ,
+            std::ostream& 
+        ) {  }
+
+        template <bool print_tag, bool skip_tag, typename T, typename U >
+        void print_parse_tree_helper (
+            const std::vector<parse_tree_element<T> >& tree,
+            const std::vector<U>& words,
+            unsigned long i,
+            const T& tag_to_skip,
+            std::ostream& out
+        )
+        {
+            if (!skip_tag || tree[i].tag != tag_to_skip)
+                out << "[";
+
+            bool left_recurse = false;
+
+            // Only print if we are supposed to.  Doing it this funny way avoids compiler
+            // errors in parse_tree_to_string() for the case where tag isn't
+            // printable.
+            if (!skip_tag || tree[i].tag != tag_to_skip)
+                conditional_print<print_tag>(tree[i].tag, out);
+
+            if (tree[i].left < tree.size())
+            {
+                left_recurse = true;
+                print_parse_tree_helper<print_tag,skip_tag>(tree, words, tree[i].left, tag_to_skip, out);
+            }
+            else
+            {
+                if ((tree[i].c.begin) < words.size())
+                {
+                    out << words[tree[i].c.begin] << " ";
+                }
+                else
+                {
+                    std::ostringstream sout;
+                    sout << "Parse tree refers to element " << tree[i].c.begin 
+                         << " of sequence which is only of size " << words.size() << ".";
+                    throw parse_tree_to_string_error(sout.str());
+                }
+            }
+
+            if (left_recurse == true)
+                out << " ";
+
+            if (tree[i].right < tree.size())
+            {
+                print_parse_tree_helper<print_tag,skip_tag>(tree, words, tree[i].right, tag_to_skip, out);
+            }
+            else
+            {
+                if (tree[i].c.k < words.size())
+                {
+                    out << words[tree[i].c.k];
+                }
+                else
+                {
+                    std::ostringstream sout;
+                    sout << "Parse tree refers to element " << tree[i].c.k 
+                         << " of sequence which is only of size " << words.size() << ".";
+                    throw parse_tree_to_string_error(sout.str());
+                }
+            }
+
+
+            if (!skip_tag || tree[i].tag != tag_to_skip)
+                out << "]";
+        }
+    }
+
+// -----------------------------------------------------------------------------------------
+
+    template <typename T, typename U>
+    std::string parse_tree_to_string (
+        const std::vector<parse_tree_element<T> >& tree,
+        const std::vector<U>& words,
+        const unsigned long root_idx = 0
+    )
+    {
+        if (root_idx >= tree.size())
+            return "";
+
+        std::ostringstream sout;
+        impl::print_parse_tree_helper<false,false>(tree, words, root_idx, tree[root_idx].tag, sout);
+        return sout.str();
+    }
+
+// -----------------------------------------------------------------------------------------
+
+    template <typename T, typename U>
+    std::string parse_tree_to_string_tagged (
+        const std::vector<parse_tree_element<T> >& tree,
+        const std::vector<U>& words,
+        const unsigned long root_idx = 0
+    )
+    {
+        if (root_idx >= tree.size())
+            return "";
+
+        std::ostringstream sout;
+        impl::print_parse_tree_helper<true,false>(tree, words, root_idx, tree[root_idx].tag, sout);
+        return sout.str();
+    }
+
+// -----------------------------------------------------------------------------------------
+
+    template <typename T, typename U>
+    std::string parse_trees_to_string (
+        const std::vector<parse_tree_element<T> >& tree,
+        const std::vector<U>& words,
+        const T& tag_to_skip
+    )
+    {
+        if (tree.size() == 0)
+            return "";
+
+        std::ostringstream sout;
+        impl::print_parse_tree_helper<false,true>(tree, words, 0, tag_to_skip, sout);
+        return sout.str();
+    }
+
+// -----------------------------------------------------------------------------------------
+
+    template <typename T, typename U>
+    std::string parse_trees_to_string_tagged (
+        const std::vector<parse_tree_element<T> >& tree,
+        const std::vector<U>& words,
+        const T& tag_to_skip
+    )
+    {
+        if (tree.size() == 0)
+            return "";
+
+        std::ostringstream sout;
+        impl::print_parse_tree_helper<true,true>(tree, words, 0, tag_to_skip, sout);
+        return sout.str();
+    }
+
+// -----------------------------------------------------------------------------------------
+
+    namespace impl
+    {
+        template <typename T>
+        void helper_find_trees_without_tag (
+            const std::vector<parse_tree_element<T> >& tree,
+            const T& tag,
+            std::vector<unsigned long>& tree_roots,
+            unsigned long idx
+        )
+        {
+            if (idx < tree.size())
+            {
+                if (tree[idx].tag != tag)
+                {
+                    tree_roots.push_back(idx);
+                }
+                else
+                {
+                    helper_find_trees_without_tag(tree, tag, tree_roots, tree[idx].left);
+                    helper_find_trees_without_tag(tree, tag, tree_roots, tree[idx].right);
+                }
+            }
+        }
+    }
+
+    template <typename T>
+    void find_trees_not_rooted_with_tag (
+        const std::vector<parse_tree_element<T> >& tree,
+        const T& tag,
+        std::vector<unsigned long>& tree_roots 
+    )
+    {
+        tree_roots.clear();
+        impl::helper_find_trees_without_tag(tree, tag, tree_roots, 0);
+    }
+
+// -----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_FIND_MAX_PaRSE_CKY_Hh_
+
diff --git a/ml/dlib/dlib/optimization/find_max_parse_cky_abstract.h b/ml/dlib/dlib/optimization/find_max_parse_cky_abstract.h
new file mode 100644
index 000000000..52ffc787b
--- /dev/null
+++ b/ml/dlib/dlib/optimization/find_max_parse_cky_abstract.h
@@ -0,0 +1,388 @@
+// Copyright (C) 2012  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_
+#ifdef DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_
+
+#include <vector>
+#include <string>
+#include "../algs.h" 
+#include "../serialize.h" 
+
+namespace dlib
+{
+
+// -----------------------------------------------------------------------------------------
+
+    template <
+        typename T
+        >
+    struct constituent 
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object represents the linguistic idea of a constituent, that is, a
+                group of words that functions as a single unit.  In particular, it
+                represents a combination of two constituents into a new constituent.
+
+                Additionally, a constituent object represents a range of words relative to
+                some std::vector of words.  The range is from [begin, end) (i.e. including
+                begin but not including end, so using the normal C++ iterator notation).
+                Moreover, a constituent is always composed of two parts, each having a tag.
+                Therefore, the left part is composed of the words in the range [begin,k)
+                and has tag left_tag while the right part of the constituent contains the
+                words in the range [k,end) and has the tag right_tag.
+
+                The tags are user defined objects of type T.  In general, they are used to
+                represent syntactic categories such as noun phrase, verb phrase, etc.
+        !*/
+
+        unsigned long begin, end, k;
+        T left_tag; 
+        T right_tag;
+    };
+
+    template <
+        typename T
+        >
+    void serialize(
+        const constituent<T>& item,
+        std::ostream& out
+    );
+    /*!
+        provides serialization support
+    !*/
+
+    template <
+        typename T
+        >
+    void deserialize(
+        constituent<T>& item,
+        std::istream& in 
+    );
+    /*!
+        provides deserialization support 
+    !*/
+
+// -----------------------------------------------------------------------------------------
+
+    /*!A END_OF_TREE is used to indicate that parse_tree_element::left or
+         parse_tree_element::right doesn't point to another subtree.
+    !*/
+    const unsigned long END_OF_TREE = 0xFFFFFFFF;
+
+// -----------------------------------------------------------------------------------------
+
+    template <
+        typename T
+        >
+    struct parse_tree_element
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object is used to represent a node in a binary parse tree.  An entire
+                parse tree is represented by a std::vector of parse_tree_element objects.
+                We follow the convention that the first element of this vector is always
+                the root of the entire tree.
+
+                The fields of this object have the following interpretations:   
+                    - c == the constituent spanned by this node in the parse tree.
+                      Therefore, the node spans the words in the range [c.begin, c.end).
+                    - tag == the syntactic category of this node in the parse tree.
+                    - score == the score or log likelihood for this parse tree.  In
+                      general, this is the sum of scores of all the production rules used
+                      to build the tree rooted at the current node.
+                    - let PT denote the vector of parse_tree_elements that defines an
+                      entire parse tree.  Then we have:
+                        - if (left != END_OF_TREE) then
+                            - PT[left] == the left sub-tree of the current node.
+                            - PT[left] spans the words [c.begin, c.k)
+                            - PT[left].tag == c.left_tag
+                        - else
+                            - there is no left sub-tree
+
+                        - if (right != END_OF_TREE) then
+                            - PT[right] == the right sub-tree of the current node.
+                            - PT[right] spans the words [c.k, c.end)
+                            - PT[right].tag == c.right_tag
+                        - else
+                            - there is no right sub-tree
+        !*/
+
+        constituent<T> c;
+        T tag; 
+        double score; 
+
+        unsigned long left;
+        unsigned long right; 
+    };
+
+    template <
+        typename T
+        >
+    void serialize (
+        const parse_tree_element<T>& item,
+        std::ostream& out
+    );
+    /*!
+        provides serialization support
+    !*/
+
+    template <
+        typename T
+        >
+    void deserialize (
+        parse_tree_element<T>& item,
+        std::istream& in 
+    );
+    /*!
+        provides deserialization support 
+    !*/
+
+// -----------------------------------------------------------------------------------------
+// -----------------------------------------------------------------------------------------
+
+    void example_production_rule_function (
+        const std::vector<T>& words,
+        const constituent<T>& c,
+        std::vector<std::pair<T,double> >& possible_tags
+    )
+    /*!
+        requires
+            - 0 <= c.begin < c.k < c.end <= words.size()
+            - possible_tags.size() == 0 
+        ensures
+            - Finds all the syntactic categories that can be used to label c and puts those
+              categories, along with their scores, into possible_tags.  Or in other words,
+              this function determines which production rules can be used to turn the left
+              and right sub-constituents in c into a single constituent.  The contents of c
+              have the following interpretations:
+                - The left sub-constituent has syntactic category c.left_tag 
+                - for all i such that c.begin <= i < c.k: 
+                    - words[i] is part of the left sub-constituent.
+                - The right sub-constituent has syntactic category c.right_tag 
+                - for all i such that c.k <= i < c.end: 
+                    - words[i] is part of the right sub-constituent.
+
+            - Note that example_production_rule_function() is not a real function.  It is
+              here just to show you how to define production rule producing functions for
+              use with the find_max_parse_cky() routine defined below.
+    !*/
+
+    template <
+        typename T, 
+        typename production_rule_function
+        >
+    void find_max_parse_cky (
+        const std::vector<T>& words,
+        const production_rule_function& production_rules,
+        std::vector<parse_tree_element<T> >& parse_tree
+    );
+    /*!
+        requires
+            - production_rule_function == a function or function object with the same
+              interface as example_production_rule_function defined above.
+            - It must be possible to store T objects in a std::map.
+        ensures
+            - Uses the CKY algorithm to find the most probable/highest scoring binary parse
+              tree of the given vector of words.  
+            - if (#parse_tree.size() == 0) then
+                - There is no parse tree, using the given production_rules, that can cover
+                  the given word sequence.
+            - else
+                - #parse_tree == the highest scoring parse tree that covers all the
+                  elements of words.
+                - #parse_tree[0] == the root node of the parse tree.
+                - #parse_tree[0].score == the score of the parse tree.  This is the sum of
+                  the scores of all production rules used to construct the tree.
+                - #parse_tree[0].begin == 0
+                - #parse_tree[0].end == words.size()
+            - This function uses production_rules() to find out what the allowed production
+              rules are.  That is, production_rules() defines all properties of the grammar
+              used by find_max_parse_cky(). 
+    !*/
+
+// -----------------------------------------------------------------------------------------
+// -----------------------------------------------------------------------------------------
+
+    class parse_tree_to_string_error : public error
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This is the exception thrown by parse_tree_to_string() and
+                parse_tree_to_string_tagged() if the inputs are discovered to be invalid.
+        !*/
+    };
+
+// -----------------------------------------------------------------------------------------
+
+    template <
+        typename T, 
+        typename U
+        >
+    std::string parse_tree_to_string (
+        const std::vector<parse_tree_element<T> >& tree,
+        const std::vector<U>& words,
+        const unsigned long root_idx = 0
+    );
+    /*!
+        requires
+            - It must be possible to print U objects to an ostream using operator<<
+              (typically, U would be something like std::string)
+        ensures
+            - Interprets tree as a parse tree defined over the given sequence of words.  
+            - returns a bracketed string that represents the parse tree over the words.  
+              For example, suppose the following parse tree is input:
+
+                        /\
+                       /  \
+                      /\   \
+                     /  \   \
+                   the dog  ran
+
+              Then the output would be the string "[[the dog] ran]"
+            - Only the sub-tree rooted at tree[root_idx] will be output.  If root_idx >= 
+              tree.size() then the empty string is returned.
+        throws
+            - parse_tree_to_string_error
+                This exception is thrown if an invalid tree is detected.  This might happen
+                if the tree refers to elements of words that don't exist because words is
+                shorted than it is supposed to be.
+    !*/
+
+// -----------------------------------------------------------------------------------------
+
+    template <
+        typename T, 
+        typename U
+        >
+    std::string parse_tree_to_string_tagged (
+        const std::vector<parse_tree_element<T> >& tree,
+        const std::vector<U>& words,
+        const unsigned long root_idx = 0
+    );
+    /*!
+        requires
+            - It must be possible to print T objects to an ostream using operator<<
+            - It must be possible to print U objects to an ostream using operator<<
+              (typically, U would be something like std::string)
+        ensures
+            - This function does the same thing as parse_tree_to_string() except that it
+              also includes the parse_tree_element::tag object in the output.  Therefore,
+              the tag of each bracket will be included as the first token inside the
+              bracket.  For example, suppose the following parse tree is input (where tags
+              are shown at the vertices):
+
+                        S
+                        /\
+                      NP  \
+                      /\   \
+                     /  \   \
+                   the dog  ran
+
+              Then the output would be the string "[S [NP the dog] ran]"
+            - Only the sub-tree rooted at tree[root_idx] will be output.  If root_idx >=
+              tree.size() then the empty string is returned.
+        throws
+            - parse_tree_to_string_error
+                This exception is thrown if an invalid tree is detected.  This might happen
+                if the tree refers to elements of words that don't exist because words is
+                shorted than it is supposed to be.
+    !*/
+
+// -----------------------------------------------------------------------------------------
+
+    template <
+        typename T, 
+        typename U
+        >
+    std::string parse_trees_to_string (
+        const std::vector<parse_tree_element<T> >& tree,
+        const std::vector<U>& words,
+        const T& tag_to_skip
+    );
+    /*!
+        requires
+            - It must be possible to print U objects to an ostream using operator<<
+              (typically, U would be something like std::string)
+        ensures
+            - This function behaves just like parse_tree_to_string() except that it will
+              not print the brackets (i.e. []) for the top most parts of the tree which
+              have tags equal to tag_to_skip.  It will however print all the words.
+              Therefore, this function only includes brackets on the subtrees which begin
+              with a tag other than tag_to_skip.
+        throws
+            - parse_tree_to_string_error
+                This exception is thrown if an invalid tree is detected.  This might happen
+                if the tree refers to elements of words that don't exist because words is
+                shorted than it is supposed to be.
+    !*/
+
+// -----------------------------------------------------------------------------------------
+
+    template <
+        typename T, 
+        typename U
+        >
+    std::string parse_trees_to_string_tagged (
+        const std::vector<parse_tree_element<T> >& tree,
+        const std::vector<U>& words,
+        const T& tag_to_skip
+    );
+    /*!
+        requires
+            - It must be possible to print T objects to an ostream using operator<<
+            - It must be possible to print U objects to an ostream using operator<<
+              (typically, U would be something like std::string)
+        ensures
+            - This function behaves just like parse_tree_to_string_tagged() except that it
+              will not print the brackets (i.e. []) for the top most parts of the tree
+              which have tags equal to tag_to_skip.  It will however print all the words.
+              Therefore, this function only includes brackets on the subtrees which begin
+              with a tag other than tag_to_skip.
+        throws
+            - parse_tree_to_string_error
+                This exception is thrown if an invalid tree is detected.  This might happen
+                if the tree refers to elements of words that don't exist because words is
+                shorted than it is supposed to be.
+    !*/
+
+// -----------------------------------------------------------------------------------------
+
+    template <
+        typename T
+        >
+    void find_trees_not_rooted_with_tag (
+        const std::vector<parse_tree_element<T> >& tree,
+        const T& tag,
+        std::vector<unsigned long>& tree_roots 
+    );
+    /*!
+        requires
+            - objects of type T must be comparable using operator==
+        ensures
+            - Finds all the largest non-overlapping trees in tree that are not rooted with
+              the given tag.  
+            - find_trees_not_rooted_with_tag() is useful when you want to cut a parse tree
+              into a bunch of sub-trees and you know that the top level of the tree is all
+              composed of the same kind of tag.  So if you want to just "slice off" the top
+              of the tree where this tag lives then this function is useful for doing that.
+            - #tree_roots.size() == the number of sub-trees found.
+            - for all valid i:
+                - tree[#tree_roots[i]].tag != tag
+            - To make the operation of this function clearer, here are a few examples of
+              what it will do:
+                - if (tree[0].tag != tag) then 
+                    - #tree_roots.size() == 0
+                    - #tree_roots[0] == 0
+                - else if (tree[0].tag == tag but its immediate children's tags are not equal to tag) then 
+                    - #tree_roots.size() == 2
+                    - #tree_roots[0] == tree[0].left
+                    - #tree_roots[1] == tree[0].right
+    !*/
+
+// -----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_FIND_MAX_PARsE_CKY_ABSTRACT_Hh_
+
diff --git a/ml/dlib/dlib/optimization/find_optimal_parameters.h b/ml/dlib/dlib/optimization/find_optimal_parameters.h
new file mode 100644
index 000000000..0884778c9
--- /dev/null
+++ b/ml/dlib/dlib/optimization/find_optimal_parameters.h
@@ -0,0 +1,117 @@
+// Copyright (C) 2016  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_fIND_OPTIMAL_PARAMETERS_Hh_
+#define DLIB_fIND_OPTIMAL_PARAMETERS_Hh_
+
+#include "../matrix.h"
+#include "find_optimal_parameters_abstract.h"
+#include "optimization_bobyqa.h"
+#include "optimization_line_search.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename funct
+        >
+    double find_optimal_parameters (
+        double initial_search_radius,
+        double eps,
+        const unsigned int max_f_evals,
+        matrix<double,0,1>& x,
+        const matrix<double,0,1>& x_lower,
+        const matrix<double,0,1>& x_upper,
+        const funct& f
+    ) 
+    {
+        DLIB_CASSERT(x.size() == x_lower.size() && x_lower.size() == x_upper.size() && x.size() > 0,
+            "\t double find_optimal_parameters()"
+            << "\n\t x.size():       " << x.size()
+            << "\n\t x_lower.size(): " << x_lower.size()
+            << "\n\t x_upper.size(): " << x_upper.size()
+            );
+
+        // check the requirements.  Also split the assert up so that the error message isn't huge.
+        DLIB_CASSERT(max_f_evals > 1 && eps > 0 && initial_search_radius > eps,
+            "\t double find_optimal_parameters()"
+            << "\n\t Invalid arguments have been given to this function"
+            << "\n\t initial_search_radius: " << initial_search_radius
+            << "\n\t eps:                   " << eps
+            << "\n\t max_f_evals:           " << max_f_evals
+        );
+
+        DLIB_CASSERT( min(x_upper - x_lower) > 0 &&
+                     min(x - x_lower) >= 0 && min(x_upper - x) >= 0,
+            "\t double find_optimal_parameters()"
+            << "\n\t The bounds constraints have to make sense and also contain the starting point."
+            << "\n\t min(x_upper - x_lower):                         " << min(x_upper - x_lower) 
+            << "\n\t min(x - x_lower) >= 0 && min(x_upper - x) >= 0: " << (min(x - x_lower) >= 0 && min(x_upper - x) >= 0)
+        );
+
+        // if the search radius is too big then shrink it so it fits inside the bounds.
+        if (initial_search_radius*2 >= min(x_upper-x_lower))
+            initial_search_radius = 0.5*min(x_upper-x_lower)*0.99;
+
+
+        double objective_val = std::numeric_limits<double>::infinity();
+        size_t num_iter_used = 0;
+        if (x.size() == 1)
+        {
+            // BOBYQA requires x to have at least 2 variables in it.  So we can't call it in
+            // this case.  Instead we call find_min_single_variable().
+            matrix<double,0,1> temp(1);
+            auto ff = [&](const double& xx)
+            {
+                temp = xx;
+                double obj = f(temp);  
+                ++num_iter_used;
+                // keep track of the best x.
+                if (obj < objective_val)
+                {
+                    objective_val = obj;
+                    x = temp;
+                }
+                return obj;
+            };
+            try
+            {
+                double dx = x(0);
+                find_min_single_variable(ff, dx, x_lower(0), x_upper(0), eps, max_f_evals, initial_search_radius);
+            } catch (optimize_single_variable_failure& )
+            {
+            }
+        }
+        else
+        {
+            auto ff = [&](const matrix<double,0,1>& xx)
+            {
+                double obj = f(xx); 
+                ++num_iter_used;
+                // keep track of the best x.
+                if (obj < objective_val)
+                {
+                    objective_val = obj;
+                    x = xx;
+                }
+                return obj;
+            };
+            try
+            {
+                matrix<double,0,1> start_x = x;
+                find_min_bobyqa(ff, start_x, 2*x.size()+1, x_lower, x_upper, initial_search_radius, eps, max_f_evals);
+            } catch (bobyqa_failure& )
+            {
+            }
+        }
+
+        return objective_val;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_fIND_OPTIMAL_PARAMETERS_Hh_
+
diff --git a/ml/dlib/dlib/optimization/find_optimal_parameters_abstract.h b/ml/dlib/dlib/optimization/find_optimal_parameters_abstract.h
new file mode 100644
index 000000000..96dcee89b
--- /dev/null
+++ b/ml/dlib/dlib/optimization/find_optimal_parameters_abstract.h
@@ -0,0 +1,58 @@
+// Copyright (C) 2016  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_fIND_OPTIMAL_PARAMETERS_ABSTRACT_Hh_
+#ifdef DLIB_fIND_OPTIMAL_PARAMETERS_ABSTRACT_Hh_
+
+#include "../matrix.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename funct
+        >
+    double find_optimal_parameters (
+        double initial_search_radius,
+        double eps,
+        const unsigned int max_f_evals,
+        matrix<double,0,1>& x,
+        const matrix<double,0,1>& x_lower,
+        const matrix<double,0,1>& x_upper,
+        const funct& f
+    );
+    /*!
+        requires
+            - f(x) must be a valid expression that evaluates to a double
+            - x.size() == x_lower.size() == x_upper.size()
+            - x.size() > 0
+            - 0 < eps < initial_search_radius 
+            - max_f_evals > 1
+            - min(x_upper - x_lower) > 0 
+            - min(x - x_lower) >= 0 && min(x_upper - x) >= 0
+              (i.e. the given x should be within the bounds defined by x_lower and x_upper)
+        ensures
+            - Performs a constrained minimization of the function f() starting from 
+              the initial point x.  
+            - This function does not require derivatives of f().  Instead, it uses
+              derivative free methods to find the best setting of x.  In particular, it
+              will begin by searching within a sphere of radius initial_search_radius
+              around x and will continue searching until either f() has been called
+              max_f_evals times or the search area has been shrunk to less than eps radius.
+            - #x == the value of x (within the bounds defined by x_lower and x_upper) that 
+              was found to minimize f().  More precisely, it will always be true that:
+                - min(#x - x_lower) >= 0 && min(x_upper - #x) >= 0
+            - returns f(#x). 
+        throws
+            - No exception is thrown for executing max_f_evals iterations.  This function
+              will simply output the best x it has seen if it runs out of iterations.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_fIND_OPTIMAL_PARAMETERS_ABSTRACT_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/isotonic_regression.h b/ml/dlib/dlib/optimization/isotonic_regression.h
new file mode 100644
index 000000000..e89e70b48
--- /dev/null
+++ b/ml/dlib/dlib/optimization/isotonic_regression.h
@@ -0,0 +1,169 @@
+// Copyright (C) 2018  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_ISOTONIC_ReGRESSION_H_
+#define DLIB_ISOTONIC_ReGRESSION_H_
+
+#include "isotonic_regression_abstract.h"
+#include <vector>
+#include <utility>
+
+namespace dlib
+{
+    class isotonic_regression
+    {
+    public:
+
+        template <
+            typename const_iterator, 
+            typename iterator
+            >
+        void operator() (
+            const_iterator begin,
+            const_iterator end,
+            iterator obegin
+        )
+        {
+            do_isotonic_regression(begin, end);
+
+            // unpack blocks to output
+            for (auto& block : blocks)
+            {
+                for (size_t k = 0; k < block.num; ++k)
+                    set_val(*obegin++, block.avg);
+            }
+
+            blocks.clear();
+        }
+
+        void operator() (
+            std::vector<double>& vect
+        ) { (*this)(vect.begin(), vect.end(), vect.begin()); }
+
+        template <typename T, typename U>
+        void operator() (
+            std::vector<std::pair<T,U>>& vect
+        ) { (*this)(vect.begin(), vect.end(), vect.begin()); }
+
+
+        template <
+            typename const_iterator, 
+            typename iterator
+            >
+        void fit_with_linear_output_interpolation (
+            const_iterator begin,
+            const_iterator end,
+            iterator obegin
+        )
+        {
+            do_isotonic_regression(begin, end);
+
+            // Unpack blocks to output, but here instead of producing the step function
+            // output we linearly interpolate.  Note that this actually fits the data less
+            // than the step-function, but in many applications might be closer to what you
+            // really when when using isotonic_regression than the step function.
+            for (size_t i = 0; i < blocks.size(); ++i)
+            {
+                auto& block = blocks[i];
+
+                double prev = (blocks.front().avg + block.avg)/2;
+                if (i > 0)
+                    prev = (blocks[i-1].avg+block.avg)/2;
+
+                double next = (blocks.back().avg + block.avg)/2;
+                if (i+1 < blocks.size())
+                    next = (blocks[i+1].avg+block.avg)/2;
+
+                for (size_t k = 0; k < block.num; ++k)
+                {
+                    const auto mid = block.num/2.0;
+                    if (k < mid)
+                    {
+                        const double alpha = k/mid;
+                        set_val(*obegin++, (1-alpha)*prev + alpha*block.avg);
+                    }
+                    else
+                    {
+                        const double alpha = k/mid-1;
+                        set_val(*obegin++, alpha*next + (1-alpha)*block.avg);
+                    }
+                }
+            }
+
+            blocks.clear();
+        }
+
+        void fit_with_linear_output_interpolation (
+            std::vector<double>& vect
+        ) { fit_with_linear_output_interpolation(vect.begin(), vect.end(), vect.begin()); }
+
+        template <typename T, typename U>
+        void fit_with_linear_output_interpolation (
+            std::vector<std::pair<T,U>>& vect
+        ) { fit_with_linear_output_interpolation(vect.begin(), vect.end(), vect.begin()); }
+
+    private:
+
+        template <
+            typename const_iterator
+            >
+        void do_isotonic_regression (
+            const_iterator begin,
+            const_iterator end
+        )
+        {
+            blocks.clear();
+
+            // Do the actual isotonic regression.  The output is a step-function and is
+            // stored in the vector of blocks.
+            for (auto i = begin; i != end; ++i)
+            {
+                blocks.emplace_back(get_val(*i));
+                while (blocks.size() > 1 && prev_block().avg > current_block().avg)
+                {
+                    // merge the last two blocks.
+                    prev_block() = prev_block() + current_block();
+                    blocks.pop_back();
+                }
+            }
+        }
+
+
+        template <typename T>
+        static double get_val(const T& v) { return v;}
+
+        template <typename T, typename U>
+        static double get_val(const std::pair<T,U>& v) { return v.second;}
+
+        template <typename T>
+        static void set_val(T& v, double val) { v = val;}
+
+        template <typename T, typename U>
+        static void set_val(std::pair<T,U>& v, double val) { v.second = val;}
+
+
+
+        struct block_t
+        {
+            block_t(double val) : num(1), avg(val) {}
+            block_t(size_t n, double val) : num(n), avg(val) {}
+
+            size_t num;
+            double avg;
+
+            inline block_t operator+(const block_t& rhs) const
+            {
+                return block_t(num+rhs.num,
+                    (num*avg + rhs.num*rhs.avg)/(num+rhs.num));
+            }
+        };
+
+        inline block_t& prev_block() { return blocks[blocks.size()-2]; }
+        inline block_t& current_block() { return blocks.back(); }
+
+        std::vector<block_t> blocks;
+    };
+}
+
+#endif // DLIB_ISOTONIC_ReGRESSION_H_
+
+
diff --git a/ml/dlib/dlib/optimization/isotonic_regression_abstract.h b/ml/dlib/dlib/optimization/isotonic_regression_abstract.h
new file mode 100644
index 000000000..b00334bef
--- /dev/null
+++ b/ml/dlib/dlib/optimization/isotonic_regression_abstract.h
@@ -0,0 +1,128 @@
+// Copyright (C) 2018  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_ISOTONIC_ReGRESSION_ABSTRACT_H_
+#ifdef DLIB_ISOTONIC_ReGRESSION_ABSTRACT_H_
+
+#include <vector>
+#include <utility>
+
+namespace dlib
+{
+    class isotonic_regression
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object is a tool for performing 1-D isotonic regression. That is, it
+                finds the least squares fit of a non-parametric curve to some user supplied
+                data, subject to the constraint that the fitted curve is non-decreasing.
+
+                This is done using the fast O(n) pool adjacent violators algorithm.
+        !*/
+
+    public:
+
+        template <
+            typename const_iterator, 
+            typename iterator
+            >
+        void operator() (
+            const_iterator begin,
+            const_iterator end,
+            iterator obegin
+        );
+        /*!
+            requires
+                - [begin,end) is an iterator range of float or doubles or a range of
+                  std::pair<T,double> or std::pair<T,float> where T an be anything.
+                - obegin points to an iterator range at least std::distance(begin,end). 
+                - obegin points to an iterator range of objects of type float, double, std::pair<T,float>, or std::pair<T,double>.
+            ensures
+                - Given the range of real values stored in [begin,end), this method performs isotonic regression
+                  on this data and writes the results to obegin.  To be specific:
+                    - let IN refer to the input values stored in the iterator range [begin,end).
+                    - let OUT refer to the output values stored in the iterator range [obegin, obegin+std::distance(begin,end)).
+                    - This function populates OUT with values such that the sum_i of
+                      (IN[i]-OUT[i])^2 is minimized, subject to the constraint that 
+                      OUT[i] <= OUT[i+1], i.e. that OUT is monotonic.
+                - It is OK for [begin,end) to overlap with the range pointed to by obegin.
+                  That is, this function can run in-place.
+                - Note that when the inputs or outputs are std::pairs this algorithm only
+                  looks at the .second field of the pair.  It therefore still treats these
+                  iterator ranges as ranges of reals since it only looks at the .second
+                  field, which is a real number.  The .first field is entirely ignored.
+        !*/
+
+        void operator() (
+            std::vector<double>& vect
+        ) { (*this)(vect.begin(), vect.end(), vect.begin()); }
+        /*!
+            ensures
+                - performs in-place isotonic regression.  Therefore, #vect will contain the
+                  isotonic regression of vect.
+                - #vect.size() == vect.size()
+        !*/
+
+        template <typename T, typename U>
+        void operator() (
+            std::vector<std::pair<T,U>>& vect
+        ) { (*this)(vect.begin(), vect.end(), vect.begin()); }
+        /*!
+            ensures
+                - performs in-place isotonic regression.  Therefore, #vect will contain the
+                  isotonic regression of vect.
+                - #vect.size() == vect.size()
+        !*/
+
+
+        template <
+            typename const_iterator, 
+            typename iterator
+            >
+        void fit_with_linear_output_interpolation (
+            const_iterator begin,
+            const_iterator end,
+            iterator obegin
+        );
+        /*!
+            requires
+                - [begin,end) is an iterator range of float or doubles or a range of
+                  std::pair<T,double> or std::pair<T,float> where T an be anything.
+                - obegin points to an iterator range at least std::distance(begin,end). 
+                - obegin points to an iterator range of objects of type float, double, std::pair<T,float>, or std::pair<T,double>.
+            ensures
+                - This function behaves just like (*this)(begin,end,obegin) except that the
+                  output is interpolated.  To explain, not that the optimal output of
+                  isotonic regression is a step function.  However, in many applications
+                  that isn't really what you want.  You want something smoother.  So
+                  fit_with_linear_output_interpolation() does isotonic regression and then
+                  linearly interpolates the step function into a piecewise linear function.
+        !*/
+
+        void fit_with_linear_output_interpolation (
+            std::vector<double>& vect
+        ) { fit_with_linear_output_interpolation(vect.begin(), vect.end(), vect.begin()); }
+        /*!
+            ensures
+                - performs in-place isotonic regression.  Therefore, #vect will contain the
+                  isotonic regression of vect.
+                - #vect.size() == vect.size()
+        !*/
+
+        template <typename T, typename U>
+        void fit_with_linear_output_interpolation (
+            std::vector<std::pair<T,U>>& vect
+        ) { fit_with_linear_output_interpolation(vect.begin(), vect.end(), vect.begin()); }
+        /*!
+            ensures
+                - performs in-place isotonic regression.  Therefore, #vect will contain the
+                  isotonic regression of vect.
+                - #vect.size() == vect.size()
+        !*/
+
+    };
+}
+
+#endif // DLIB_ISOTONIC_ReGRESSION_ABSTRACT_H_
+
+
+
diff --git a/ml/dlib/dlib/optimization/max_cost_assignment.h b/ml/dlib/dlib/optimization/max_cost_assignment.h
new file mode 100644
index 000000000..db6c6f0d7
--- /dev/null
+++ b/ml/dlib/dlib/optimization/max_cost_assignment.h
@@ -0,0 +1,288 @@
+// Copyright (C) 2011  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_MAX_COST_ASSIgNMENT_Hh_
+#define DLIB_MAX_COST_ASSIgNMENT_Hh_
+
+#include "max_cost_assignment_abstract.h"
+#include "../matrix.h"
+#include <vector>
+#include <deque>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename EXP>
+    typename EXP::type assignment_cost (
+        const matrix_exp<EXP>& cost,
+        const std::vector<long>& assignment
+    )
+    {
+        DLIB_ASSERT(cost.nr() == cost.nc(),
+            "\t type assignment_cost(cost,assignment)"
+            << "\n\t cost.nr(): " << cost.nr()
+            << "\n\t cost.nc(): " << cost.nc()
+            );
+#ifdef ENABLE_ASSERTS
+        // can't call max on an empty vector. So put an if here to guard against it.
+        if (assignment.size() > 0)
+        {
+            DLIB_ASSERT(0 <= min(mat(assignment)) && max(mat(assignment)) < cost.nr(),
+                "\t type assignment_cost(cost,assignment)"
+                << "\n\t cost.nr(): " << cost.nr()
+                << "\n\t cost.nc(): " << cost.nc()
+                << "\n\t min(assignment): " << min(mat(assignment)) 
+                << "\n\t max(assignment): " << max(mat(assignment)) 
+                );
+        }
+#endif
+
+        typename EXP::type temp = 0;
+        for (unsigned long i = 0; i < assignment.size(); ++i)
+        {
+            temp += cost(i, assignment[i]);
+        }
+        return temp;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    namespace impl
+    {
+        template <typename EXP>
+        inline void compute_slack(
+            const long x,
+            std::vector<typename EXP::type>& slack,
+            std::vector<long>& slackx,
+            const matrix_exp<EXP>& cost,
+            const std::vector<typename EXP::type>& lx,
+            const std::vector<typename EXP::type>& ly
+        )
+        {
+            for (long y = 0; y < cost.nc(); ++y)
+            {
+                if (lx[x] + ly[y] - cost(x,y) < slack[y])
+                {
+                    slack[y] = lx[x] + ly[y] - cost(x,y);
+                    slackx[y] = x;
+                }
+            }
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename EXP>
+    std::vector<long> max_cost_assignment (
+        const matrix_exp<EXP>& cost_
+    )                         
+    {
+        const_temp_matrix<EXP> cost(cost_);
+        typedef typename EXP::type type;
+        // This algorithm only works if the elements of the cost matrix can be reliably 
+        // compared using operator==. However, comparing for equality with floating point
+        // numbers is not a stable operation. So you need to use an integer cost matrix.
+        COMPILE_TIME_ASSERT(std::numeric_limits<type>::is_integer);
+        DLIB_ASSERT(cost.nr() == cost.nc(),
+            "\t std::vector<long> max_cost_assignment(cost)"
+            << "\n\t cost.nr(): " << cost.nr()
+            << "\n\t cost.nc(): " << cost.nc()
+            );
+
+        using namespace dlib::impl;
+        /*
+            I based the implementation of this algorithm on the description of the
+            Hungarian algorithm on the following websites:
+                http://www.math.uwo.ca/~mdawes/courses/344/kuhn-munkres.pdf
+                http://www.topcoder.com/tc?module=Static&d1=tutorials&d2=hungarianAlgorithm
+
+            Note that this is the fast O(n^3) version of the algorithm.
+        */
+
+        if (cost.size() == 0)
+            return std::vector<long>();
+
+        std::vector<type> lx, ly;
+        std::vector<long> xy;
+        std::vector<long> yx;
+        std::vector<char> S, T;
+        std::vector<type> slack;
+        std::vector<long> slackx;
+        std::vector<long> aug_path;
+
+
+
+
+        // Initially, nothing is matched. 
+        xy.assign(cost.nc(), -1);
+        yx.assign(cost.nc(), -1);
+        /*
+            We maintain the following invariant:
+                Vertex x is matched to vertex xy[x] and
+                vertex y is matched to vertex yx[y].
+
+                A value of -1 means a vertex isn't matched to anything.  Moreover,
+                x corresponds to rows of the cost matrix and y corresponds to the
+                columns of the cost matrix.  So we are matching X to Y.
+        */
+
+
+        // Create an initial feasible labeling.  Moreover, in the following
+        // code we will always have: 
+        //     for all valid x and y:  lx[x] + ly[y] >= cost(x,y)
+        lx.resize(cost.nc());
+        ly.assign(cost.nc(),0);
+        for (long x = 0; x < cost.nr(); ++x)
+            lx[x] = max(rowm(cost,x));
+
+        // Now grow the match set by picking edges from the equality subgraph until
+        // we have a complete matching.
+        for (long match_size = 0; match_size < cost.nc(); ++match_size)
+        {
+            std::deque<long> q;
+
+            // Empty out the S and T sets
+            S.assign(cost.nc(), false);
+            T.assign(cost.nc(), false);
+
+            // clear out old slack values
+            slack.assign(cost.nc(), std::numeric_limits<type>::max());
+            slackx.resize(cost.nc());
+            /*
+                slack and slackx are maintained such that we always
+                have the following (once they get initialized by compute_slack() below):
+                    - for all y:
+                        - let x == slackx[y]
+                        - slack[y] == lx[x] + ly[y] - cost(x,y)
+            */
+
+            aug_path.assign(cost.nc(), -1);
+
+            for (long x = 0; x < cost.nc(); ++x)
+            {
+                // If x is not matched to anything
+                if (xy[x] == -1)
+                {
+                    q.push_back(x);
+                    S[x] = true;
+
+                    compute_slack(x, slack, slackx, cost, lx, ly);
+                    break;
+                }
+            }
+
+
+            long x_start = 0;
+            long y_start = 0;
+
+            // Find an augmenting path.  
+            bool found_augmenting_path = false;
+            while (!found_augmenting_path)
+            {
+                while (q.size() > 0 && !found_augmenting_path)
+                {
+                    const long x = q.front();
+                    q.pop_front();
+                    for (long y = 0; y < cost.nc(); ++y)
+                    {
+                        if (cost(x,y) == lx[x] + ly[y] && !T[y])
+                        {
+                            // if vertex y isn't matched with anything
+                            if (yx[y] == -1) 
+                            {
+                                y_start = y;
+                                x_start = x;
+                                found_augmenting_path = true;
+                                break;
+                            }
+
+                            T[y] = true;
+                            q.push_back(yx[y]);
+
+                            aug_path[yx[y]] = x;
+                            S[yx[y]] = true;
+                            compute_slack(yx[y], slack, slackx, cost, lx, ly);
+                        }
+                    }
+                }
+
+                if (found_augmenting_path)
+                    break;
+
+
+                // Since we didn't find an augmenting path we need to improve the 
+                // feasible labeling stored in lx and ly.  We also need to keep the
+                // slack updated accordingly.
+                type delta = std::numeric_limits<type>::max();
+                for (unsigned long i = 0; i < T.size(); ++i)
+                {
+                    if (!T[i])
+                        delta = std::min(delta, slack[i]);
+                }
+                for (unsigned long i = 0; i < T.size(); ++i)
+                {
+                    if (S[i])
+                        lx[i] -= delta;
+
+                    if (T[i])
+                        ly[i] += delta;
+                    else
+                        slack[i] -= delta;
+                }
+
+
+
+                q.clear();
+                for (long y = 0; y < cost.nc(); ++y)
+                {
+                    if (!T[y] && slack[y] == 0)
+                    {
+                        // if vertex y isn't matched with anything
+                        if (yx[y] == -1)
+                        {
+                            x_start = slackx[y];
+                            y_start = y;
+                            found_augmenting_path = true;
+                            break;
+                        }
+                        else
+                        {
+                            T[y] = true;
+                            if (!S[yx[y]])
+                            {
+                                q.push_back(yx[y]);
+
+                                aug_path[yx[y]] = slackx[y];
+                                S[yx[y]] = true;
+                                compute_slack(yx[y], slack, slackx, cost, lx, ly);
+                            }
+                        }
+                    }
+                }
+            } // end while (!found_augmenting_path)
+
+            // Flip the edges along the augmenting path.  This means we will add one more
+            // item to our matching.
+            for (long cx = x_start, cy = y_start, ty; 
+                 cx != -1; 
+                 cx = aug_path[cx], cy = ty)
+            {
+                ty = xy[cx];
+                yx[cy] = cx;
+                xy[cx] = cy;
+            }
+
+        }
+
+
+        return xy;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_MAX_COST_ASSIgNMENT_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/max_cost_assignment_abstract.h b/ml/dlib/dlib/optimization/max_cost_assignment_abstract.h
new file mode 100644
index 000000000..bbdb0abfb
--- /dev/null
+++ b/ml/dlib/dlib/optimization/max_cost_assignment_abstract.h
@@ -0,0 +1,63 @@
+// Copyright (C) 2011  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_MAX_COST_ASSIgNMENT_ABSTRACT_Hh_
+#ifdef DLIB_MAX_COST_ASSIgNMENT_ABSTRACT_Hh_
+
+#include "../matrix.h"
+#include <vector>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename EXP>
+    typename EXP::type assignment_cost (
+        const matrix_exp<EXP>& cost,
+        const std::vector<long>& assignment
+    );
+    /*!
+        requires
+            - cost.nr() == cost.nc()
+            - for all valid i:
+                - 0 <= assignment[i] < cost.nr()
+        ensures
+            - Interprets cost as a cost assignment matrix. That is, cost(i,j) 
+              represents the cost of assigning i to j.  
+            - Interprets assignment as a particular set of assignments. That is,
+              i is assigned to assignment[i].
+            - returns the cost of the given assignment. That is, returns
+              a number which is:
+                sum over i: cost(i,assignment[i])
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename EXP>
+    std::vector<long> max_cost_assignment (
+        const matrix_exp<EXP>& cost
+    );
+    /*!
+        requires
+            - EXP::type == some integer type (e.g. int)
+              (i.e. cost must contain integers rather than floats or doubles)
+            - cost.nr() == cost.nc()
+        ensures
+            - Finds and returns the solution to the following optimization problem:
+
+                Maximize: f(A) == assignment_cost(cost, A)
+                Subject to the following constraints:
+                    - The elements of A are unique. That is, there aren't any 
+                      elements of A which are equal.  
+                    - A.size() == cost.nr()
+
+            - This function implements the O(N^3) version of the Hungarian algorithm 
+              where N is the number of rows in the cost matrix.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_MAX_COST_ASSIgNMENT_ABSTRACT_Hh_
+
diff --git a/ml/dlib/dlib/optimization/max_sum_submatrix.h b/ml/dlib/dlib/optimization/max_sum_submatrix.h
new file mode 100644
index 000000000..1986cc26b
--- /dev/null
+++ b/ml/dlib/dlib/optimization/max_sum_submatrix.h
@@ -0,0 +1,285 @@
+// Copyright (C) 2011  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_MAX_SUM_SUBMaTRIX_Hh_
+#define DLIB_MAX_SUM_SUBMaTRIX_Hh_
+
+#include "max_sum_submatrix_abstract.h"
+#include "../matrix.h"
+#include <vector>
+#include <queue>
+#include "../geometry.h"
+
+namespace dlib
+{
+    namespace impl
+    {
+
+    // ------------------------------------------------------------------------------------
+
+        template <typename T>
+        struct range_set
+        {
+            int top_min;
+            int top_max;
+            int bottom_min;
+            int bottom_max;
+            T weight;
+
+            bool operator<(const range_set& item) const { return weight < item.weight; }
+        };
+
+    // ------------------------------------------------------------------------------------
+
+        template <typename T>
+        bool is_terminal_set (
+            const range_set<T>& item
+        )
+        {
+            return (item.top_min  >= item.top_max &&
+                    item.bottom_min >= item.bottom_max);
+        }
+
+    // ------------------------------------------------------------------------------------
+
+        template <typename T>
+        void split (
+            const range_set<T>& rset,
+            range_set<T>& a,
+            range_set<T>& b
+        )
+        {
+            if (rset.top_max - rset.top_min > rset.bottom_max - rset.bottom_min)
+            {
+                // split top
+                const int middle = (rset.top_max + rset.top_min)/2;
+                a.top_min = rset.top_min;
+                a.top_max = middle;
+                b.top_min = middle+1;
+                b.top_max = rset.top_max;
+
+                a.bottom_min = rset.bottom_min;
+                a.bottom_max = rset.bottom_max;
+                b.bottom_min = rset.bottom_min;
+                b.bottom_max = rset.bottom_max;
+            }
+            else
+            {
+                // split bottom
+                const int middle = (rset.bottom_max + rset.bottom_min)/2;
+                a.bottom_min = rset.bottom_min;
+                a.bottom_max = middle;
+                b.bottom_min = middle+1;
+                b.bottom_max = rset.bottom_max;
+
+                a.top_min = rset.top_min;
+                a.top_max = rset.top_max;
+                b.top_min = rset.top_min;
+                b.top_max = rset.top_max;
+            }
+        }
+
+    // ------------------------------------------------------------------------------------
+
+        template <typename EXP, typename T>
+        void find_best_column_range (
+            const matrix_exp<EXP>& sum_pos,
+            const matrix_exp<EXP>& sum_neg,
+            const range_set<T>& row_range,
+            T& weight,
+            int& left,
+            int& right
+        )
+        {
+            left = 0;
+            right = -1;
+            weight = 0;
+            T cur_sum = 0;
+            int cur_pos = 0;
+            for (long c = 0; c < sum_pos.nc(); ++c)
+            {
+                // compute the value for the current column
+                T temp = sum_pos(row_range.bottom_max+1,c) - sum_pos(row_range.top_min,c);
+                if (row_range.top_max <= row_range.bottom_min)
+                    temp += sum_neg(row_range.bottom_min+1,c) - sum_neg(row_range.top_max,c);
+
+
+                cur_sum += temp;
+                if (cur_sum > weight)
+                {
+                    left = cur_pos;
+                    right = c;
+                    weight = cur_sum;
+                }
+
+                if (cur_sum <= 0)
+                {
+                    cur_sum = 0;
+                    cur_pos = c+1;
+                }
+
+            }
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename EXP>
+    std::vector<rectangle> max_sum_submatrix(
+        const matrix_exp<EXP>& mat,
+        unsigned long max_rects,
+        double thresh_ = 0
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT(thresh_ >= 0 && mat.size() > 0,
+            "\t std::vector<rectangle> max_sum_submatrix()"
+            << "\n\t Invalid arguments were given to this function."
+            << "\n\t mat.size(): " << mat.size()
+            << "\n\t thresh_:    " << thresh_
+            );
+
+        /*
+            This function is basically an implementation of the efficient subwindow search (I-ESS)
+            algorithm presented in the following paper: 
+                Efficient Algorithms for Subwindow Search in Object Detection and Localization
+                by Senjian An, Patrick Peursum, Wanquan Liu and Svetha Venkatesh
+                In CVPR 2009
+
+        */
+
+
+        if (max_rects == 0)
+            return std::vector<rectangle>();
+
+        using namespace dlib::impl;
+        typedef typename EXP::type element_type;
+        typedef typename promote<element_type>::type scalar_type;
+
+        const scalar_type thresh = static_cast<scalar_type>(thresh_);
+
+
+        matrix<scalar_type> sum_pos;
+        matrix<scalar_type> sum_neg;
+        sum_pos.set_size(mat.nr()+1, mat.nc());
+        sum_neg.set_size(mat.nr()+1, mat.nc());
+        // integrate over the rows.  
+        for (long c = 0; c < mat.nc(); ++c)
+        {
+            sum_pos(0,c) = 0;
+            sum_neg(0,c) = 0;
+        }
+        for (long r = 0; r < mat.nr(); ++r)
+        {
+            for (long c = 0; c < mat.nc(); ++c)
+            {
+                if (mat(r,c) > 0)
+                {
+                    sum_pos(r+1,c) = mat(r,c) + sum_pos(r,c);
+                    sum_neg(r+1,c) = sum_neg(r,c);
+                }
+                else
+                {
+                    sum_pos(r+1,c) = sum_pos(r,c);
+                    sum_neg(r+1,c) = mat(r,c) + sum_neg(r,c);
+                }
+            }
+        }
+
+        std::priority_queue<range_set<scalar_type> > q;
+
+        // the range_sets will represent ranges of columns 
+        range_set<scalar_type> universe_set;
+        universe_set.bottom_min = 0;
+        universe_set.top_min = 0;
+        universe_set.bottom_max = mat.nr()-1;
+        universe_set.top_max = mat.nr()-1;
+        universe_set.weight = sum(rowm(dlib::mat(sum_pos),mat.nr()));
+
+        q.push(universe_set);
+
+        std::vector<rectangle> results;
+        std::vector<scalar_type> temp_pos(mat.nc());
+        std::vector<scalar_type> temp_neg(mat.nc());
+
+        while (q.size() > 0)
+        {
+            if (is_terminal_set(q.top()))
+            {
+                int left, right;
+                scalar_type weight;
+                find_best_column_range(sum_pos, sum_neg, q.top(), weight, left, right);
+
+                rectangle rect(left, q.top().top_min, 
+                               right, q.top().bottom_min);
+
+                if (weight <= thresh)
+                    break;
+
+                results.push_back(rect);
+
+                if (results.size() >= max_rects)
+                    break;
+
+                q = std::priority_queue<range_set<scalar_type> >();
+                // We are going to blank out the weights we just used.  So adjust the sum images appropriately.
+                for (long c = rect.left(); c <= rect.right(); ++c)
+                {
+                    temp_pos[c] = sum_pos(rect.bottom()+1,c) - sum_pos(rect.top(),c);
+                    temp_neg[c] = sum_neg(rect.bottom()+1,c) - sum_neg(rect.top(),c);
+                }
+                // blank out the area inside the rectangle
+                for (long r = rect.top(); r <= rect.bottom(); ++r)
+                {
+                    for (long c = rect.left(); c <= rect.right(); ++c)
+                    {
+                        sum_pos(r+1,c) = sum_pos(r,c);
+                        sum_neg(r+1,c) = sum_neg(r,c);
+                    }
+                }
+                // account for the area below the rectangle
+                for (long r = rect.bottom()+2; r < sum_pos.nr(); ++r)
+                {
+                    for (long c = rect.left(); c <= rect.right(); ++c)
+                    {
+                        sum_pos(r,c) -= temp_pos[c];
+                        sum_neg(r,c) -= temp_neg[c];
+                    }
+                }
+
+
+                universe_set.weight = sum(rowm(dlib::mat(sum_pos),mat.nr()));
+                if (universe_set.weight <= thresh)
+                    break;
+
+                q.push(universe_set);
+                continue;
+            }
+
+            range_set<scalar_type> a, b;
+            split(q.top(), a,b);
+            q.pop();
+
+            // these variables are not used at this point in the algorithm.
+            int a_left, a_right;
+            int b_left, b_right;
+
+            find_best_column_range(sum_pos, sum_neg, a, a.weight, a_left, a_right);
+            find_best_column_range(sum_pos, sum_neg, b, b.weight, b_left, b_right);
+
+            if (a.weight > thresh)
+                q.push(a);
+            if (b.weight > thresh)
+                q.push(b);
+
+        }
+
+
+        return results;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_MAX_SUM_SUBMaTRIX_Hh_
+
diff --git a/ml/dlib/dlib/optimization/max_sum_submatrix_abstract.h b/ml/dlib/dlib/optimization/max_sum_submatrix_abstract.h
new file mode 100644
index 000000000..6714dd7fe
--- /dev/null
+++ b/ml/dlib/dlib/optimization/max_sum_submatrix_abstract.h
@@ -0,0 +1,49 @@
+// Copyright (C) 2011  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_MAX_SUM_SUBMaTRIX_ABSTRACT_Hh_
+#ifdef DLIB_MAX_SUM_SUBMaTRIX_ABSTRACT_Hh_
+
+#include "../matrix.h"
+#include <vector>
+#include "../geometry.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename EXP
+        >
+    std::vector<rectangle> max_sum_submatrix(
+        const matrix_exp<EXP>& mat,
+        unsigned long max_rects,
+        double thresh = 0
+    );
+    /*!
+        requires
+            - thresh >= 0
+            - mat.size() != 0
+        ensures
+            - This function finds the submatrix within mat which has the largest sum.  It then
+              zeros out that submatrix and repeats the process until no more maximal submatrices can 
+              be found.  The std::vector returned will be ordered so that the rectangles with the
+              largest sum come first. 
+            - Each submatrix must have a sum greater than thresh.  If no such submatrix exists then
+              the algorithm terminates and returns an empty std::vector.  
+            - At most max_rects rectangles are returned. 
+
+            - This function is basically an implementation of the efficient subwindow search (I-ESS)
+              algorithm presented in the following paper: 
+                Efficient Algorithms for Subwindow Search in Object Detection and Localization
+                by Senjian An, Patrick Peursum, Wanquan Liu and Svetha Venkatesh
+                In CVPR 2009
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_MAX_SUM_SUBMaTRIX_ABSTRACT_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/optimization.h b/ml/dlib/dlib/optimization/optimization.h
new file mode 100644
index 000000000..561d64376
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization.h
@@ -0,0 +1,714 @@
+// Copyright (C) 2008  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATIOn_H_
+#define DLIB_OPTIMIZATIOn_H_
+
+#include <cmath>
+#include <limits>
+#include "optimization_abstract.h"
+#include "optimization_search_strategies.h"
+#include "optimization_stop_strategies.h"
+#include "optimization_line_search.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+//                    Functions that transform other functions  
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    template <typename funct>
+    class central_differences
+    {
+    public:
+        central_differences(const funct& f_, double eps_ = 1e-7) : f(f_), eps(eps_){}
+
+        template <typename T>
+        typename T::matrix_type operator()(const T& x) const
+        {
+            // T must be some sort of dlib matrix 
+            COMPILE_TIME_ASSERT(is_matrix<T>::value);
+
+            typename T::matrix_type der(x.size());
+            typename T::matrix_type e(x);
+            for (long i = 0; i < x.size(); ++i)
+            {
+                const double old_val = e(i);
+
+                e(i) += eps;
+                const double delta_plus = f(e);
+                e(i) = old_val - eps;
+                const double delta_minus = f(e);
+
+                der(i) = (delta_plus - delta_minus)/((old_val+eps)-(old_val-eps)); 
+
+                // and finally restore the old value of this element
+                e(i) = old_val;
+            }
+
+            return der;
+        }
+
+        template <typename T, typename U>
+        typename U::matrix_type operator()(const T& item, const U& x) const
+        {
+            // U must be some sort of dlib matrix 
+            COMPILE_TIME_ASSERT(is_matrix<U>::value);
+
+            typename U::matrix_type der(x.size());
+            typename U::matrix_type e(x);
+            for (long i = 0; i < x.size(); ++i)
+            {
+                const double old_val = e(i);
+
+                e(i) += eps;
+                const double delta_plus = f(item,e);
+                e(i) = old_val - eps;
+                const double delta_minus = f(item,e);
+
+                der(i) = (delta_plus - delta_minus)/((old_val+eps)-(old_val-eps)); 
+
+                // and finally restore the old value of this element
+                e(i) = old_val;
+            }
+
+            return der;
+        }
+        
+
+        double operator()(const double& x) const
+        {
+            return (f(x+eps)-f(x-eps))/((x+eps)-(x-eps));
+        }
+
+    private:
+        const funct& f;
+        const double eps;
+    };
+
+    template <typename funct>
+    const central_differences<funct> derivative(const funct& f) { return central_differences<funct>(f); }
+    template <typename funct>
+    const central_differences<funct> derivative(const funct& f, double eps) 
+    { 
+        DLIB_ASSERT (
+            eps > 0,
+            "\tcentral_differences derivative(f,eps)"
+            << "\n\tYou must give an epsilon > 0"
+            << "\n\teps:     " << eps 
+        );
+        return central_differences<funct>(f,eps); 
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename funct, typename EXP1, typename EXP2>
+    struct clamped_function_object
+    {
+        clamped_function_object(
+            const funct& f_,
+            const matrix_exp<EXP1>& x_lower_,
+            const matrix_exp<EXP2>& x_upper_ 
+        ) : f(f_), x_lower(x_lower_), x_upper(x_upper_)
+        {
+        }
+
+        template <typename T>
+        double operator() (
+            const T& x
+        ) const
+        {
+            return f(clamp(x,x_lower,x_upper));
+        }
+        
+        const funct& f;
+        const matrix_exp<EXP1>& x_lower;
+        const matrix_exp<EXP2>& x_upper; 
+    };
+
+    template <typename funct, typename EXP1, typename EXP2>
+    clamped_function_object<funct,EXP1,EXP2> clamp_function(
+        const funct& f,
+        const matrix_exp<EXP1>& x_lower,
+        const matrix_exp<EXP2>& x_upper 
+    ) { return clamped_function_object<funct,EXP1,EXP2>(f,x_lower,x_upper); }
+
+// ----------------------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+//                    Functions that perform unconstrained optimization 
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename search_strategy_type,
+        typename stop_strategy_type,
+        typename funct, 
+        typename funct_der, 
+        typename T
+        >
+    double find_min (
+        search_strategy_type search_strategy,
+        stop_strategy_type stop_strategy,
+        const funct& f, 
+        const funct_der& der, 
+        T& x, 
+        double min_f
+    )
+    {
+        COMPILE_TIME_ASSERT(is_matrix<T>::value);
+        // The starting point (i.e. x) must be a column vector.  
+        COMPILE_TIME_ASSERT(T::NC <= 1);
+
+        DLIB_CASSERT (
+            is_col_vector(x),
+            "\tdouble find_min()"
+            << "\n\tYou have to supply column vectors to this function"
+            << "\n\tx.nc():    " << x.nc()
+        );
+
+
+        T g, s;
+
+        double f_value = f(x);
+        g = der(x);
+
+        if (!is_finite(f_value))
+            throw error("The objective function generated non-finite outputs");
+        if (!is_finite(g))
+            throw error("The objective function generated non-finite outputs");
+
+        while(stop_strategy.should_continue_search(x, f_value, g) && f_value > min_f)
+        {
+            s = search_strategy.get_next_direction(x, f_value, g);
+
+            double alpha = line_search(
+                        make_line_search_function(f,x,s, f_value),
+                        f_value,
+                        make_line_search_function(der,x,s, g),
+                        dot(g,s), // compute initial gradient for the line search
+                        search_strategy.get_wolfe_rho(), search_strategy.get_wolfe_sigma(), min_f,
+                        search_strategy.get_max_line_search_iterations());
+
+            // Take the search step indicated by the above line search
+            x += alpha*s;
+
+            if (!is_finite(f_value))
+                throw error("The objective function generated non-finite outputs");
+            if (!is_finite(g))
+                throw error("The objective function generated non-finite outputs");
+        }
+
+        return f_value;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename search_strategy_type,
+        typename stop_strategy_type,
+        typename funct, 
+        typename funct_der, 
+        typename T
+        >
+    double find_max (
+        search_strategy_type search_strategy,
+        stop_strategy_type stop_strategy,
+        const funct& f, 
+        const funct_der& der, 
+        T& x, 
+        double max_f
+    )
+    {
+        COMPILE_TIME_ASSERT(is_matrix<T>::value);
+        // The starting point (i.e. x) must be a column vector.  
+        COMPILE_TIME_ASSERT(T::NC <= 1);
+
+        DLIB_CASSERT (
+            is_col_vector(x),
+            "\tdouble find_max()"
+            << "\n\tYou have to supply column vectors to this function"
+            << "\n\tx.nc():    " << x.nc()
+        );
+
+        T g, s;
+
+        // This function is basically just a copy of find_min() but with - put in the right places
+        // to flip things around so that it ends up looking for the max rather than the min.
+
+        double f_value = -f(x);
+        g = -der(x);
+
+        if (!is_finite(f_value))
+            throw error("The objective function generated non-finite outputs");
+        if (!is_finite(g))
+            throw error("The objective function generated non-finite outputs");
+
+        while(stop_strategy.should_continue_search(x, f_value, g) && f_value > -max_f)
+        {
+            s = search_strategy.get_next_direction(x, f_value, g);
+
+            double alpha = line_search(
+                        negate_function(make_line_search_function(f,x,s, f_value)),
+                        f_value,
+                        negate_function(make_line_search_function(der,x,s, g)),
+                        dot(g,s), // compute initial gradient for the line search
+                        search_strategy.get_wolfe_rho(), search_strategy.get_wolfe_sigma(), -max_f,
+                        search_strategy.get_max_line_search_iterations()
+                        );
+
+            // Take the search step indicated by the above line search
+            x += alpha*s;
+
+            // Don't forget to negate these outputs from the line search since they are 
+            // from the unnegated versions of f() and der()
+            g *= -1;
+            f_value *= -1;
+
+            if (!is_finite(f_value))
+                throw error("The objective function generated non-finite outputs");
+            if (!is_finite(g))
+                throw error("The objective function generated non-finite outputs");
+        }
+
+        return -f_value;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename search_strategy_type,
+        typename stop_strategy_type,
+        typename funct,
+        typename T
+        >
+    double find_min_using_approximate_derivatives (
+        search_strategy_type search_strategy,
+        stop_strategy_type stop_strategy,
+        const funct& f,
+        T& x,
+        double min_f,
+        double derivative_eps = 1e-7
+    )
+    {
+        COMPILE_TIME_ASSERT(is_matrix<T>::value);
+        // The starting point (i.e. x) must be a column vector.  
+        COMPILE_TIME_ASSERT(T::NC <= 1);
+
+        DLIB_CASSERT (
+            is_col_vector(x) && derivative_eps > 0,
+            "\tdouble find_min_using_approximate_derivatives()"
+            << "\n\tYou have to supply column vectors to this function"
+            << "\n\tx.nc():         " << x.nc()
+            << "\n\tderivative_eps: " << derivative_eps 
+        );
+
+        T g, s;
+
+        double f_value = f(x);
+        g = derivative(f,derivative_eps)(x);
+
+        if (!is_finite(f_value))
+            throw error("The objective function generated non-finite outputs");
+        if (!is_finite(g))
+            throw error("The objective function generated non-finite outputs");
+
+        while(stop_strategy.should_continue_search(x, f_value, g) && f_value > min_f)
+        {
+            s = search_strategy.get_next_direction(x, f_value, g);
+
+            double alpha = line_search(
+                        make_line_search_function(f,x,s,f_value),
+                        f_value,
+                        derivative(make_line_search_function(f,x,s),derivative_eps),
+                        dot(g,s),  // Sometimes the following line is a better way of determining the initial gradient. 
+                        //derivative(make_line_search_function(f,x,s),derivative_eps)(0),
+                        search_strategy.get_wolfe_rho(), search_strategy.get_wolfe_sigma(), min_f,
+                        search_strategy.get_max_line_search_iterations()
+                        );
+
+            // Take the search step indicated by the above line search
+            x += alpha*s;
+
+            g = derivative(f,derivative_eps)(x);
+
+            if (!is_finite(f_value))
+                throw error("The objective function generated non-finite outputs");
+            if (!is_finite(g))
+                throw error("The objective function generated non-finite outputs");
+        }
+
+        return f_value;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename search_strategy_type,
+        typename stop_strategy_type,
+        typename funct,
+        typename T
+        >
+    double find_max_using_approximate_derivatives (
+        search_strategy_type search_strategy,
+        stop_strategy_type stop_strategy,
+        const funct& f,
+        T& x,
+        double max_f,
+        double derivative_eps = 1e-7
+    )
+    {
+        COMPILE_TIME_ASSERT(is_matrix<T>::value);
+        // The starting point (i.e. x) must be a column vector.  
+        COMPILE_TIME_ASSERT(T::NC <= 1);
+
+        DLIB_CASSERT (
+            is_col_vector(x) && derivative_eps > 0,
+            "\tdouble find_max_using_approximate_derivatives()"
+            << "\n\tYou have to supply column vectors to this function"
+            << "\n\tx.nc():         " << x.nc()
+            << "\n\tderivative_eps: " << derivative_eps 
+        );
+
+        // Just negate the necessary things and call the find_min version of this function.
+        return -find_min_using_approximate_derivatives(
+            search_strategy, 
+            stop_strategy, 
+            negate_function(f),
+            x,
+            -max_f,
+            derivative_eps
+        );
+    }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+//                      Functions for box constrained optimization
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    template <typename T, typename U, typename V>
+    T zero_bounded_variables (
+        const double eps,
+        T vect,
+        const T& x,
+        const T& gradient,
+        const U& x_lower,
+        const V& x_upper
+    )
+    {
+        for (long i = 0; i < gradient.size(); ++i)
+        {
+            const double tol = eps*std::abs(x(i));
+            // if x(i) is an active bound constraint
+            if (x_lower(i)+tol >= x(i) && gradient(i) > 0)
+                vect(i) = 0;
+            else if (x_upper(i)-tol <= x(i) && gradient(i) < 0)
+                vect(i) = 0;
+        }
+        return vect;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename T, typename U, typename V>
+    T gap_step_assign_bounded_variables (
+        const double eps,
+        T vect,
+        const T& x,
+        const T& gradient,
+        const U& x_lower,
+        const V& x_upper
+    )
+    {
+        for (long i = 0; i < gradient.size(); ++i)
+        {
+            const double tol = eps*std::abs(x(i));
+            // If x(i) is an active bound constraint then we should set its search
+            // direction such that a single step along the direction either does nothing or
+            // closes the gap of size tol before hitting the bound exactly.
+            if (x_lower(i)+tol >= x(i) && gradient(i) > 0)
+                vect(i) = x_lower(i)-x(i);
+            else if (x_upper(i)-tol <= x(i) && gradient(i) < 0)
+                vect(i) = x_upper(i)-x(i);
+        }
+        return vect;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename search_strategy_type,
+        typename stop_strategy_type,
+        typename funct, 
+        typename funct_der, 
+        typename T,
+        typename EXP1,
+        typename EXP2
+        >
+    double find_min_box_constrained (
+        search_strategy_type search_strategy,
+        stop_strategy_type stop_strategy,
+        const funct& f, 
+        const funct_der& der, 
+        T& x,
+        const matrix_exp<EXP1>& x_lower,
+        const matrix_exp<EXP2>& x_upper
+    )
+    {
+        /*
+            The implementation of this function is more or less based on the discussion in
+            the paper Projected Newton-type Methods in Machine Learning by Mark Schmidt, et al.
+        */
+
+        // make sure the requires clause is not violated
+        COMPILE_TIME_ASSERT(is_matrix<T>::value);
+        // The starting point (i.e. x) must be a column vector.  
+        COMPILE_TIME_ASSERT(T::NC <= 1);
+
+        DLIB_CASSERT (
+            is_col_vector(x) && is_col_vector(x_lower) && is_col_vector(x_upper) &&
+            x.size() == x_lower.size() && x.size() == x_upper.size(),
+            "\tdouble find_min_box_constrained()"
+            << "\n\t The inputs to this function must be equal length column vectors."
+            << "\n\t is_col_vector(x):       " << is_col_vector(x)
+            << "\n\t is_col_vector(x_upper): " << is_col_vector(x_upper)
+            << "\n\t is_col_vector(x_upper): " << is_col_vector(x_upper)
+            << "\n\t x.size():               " << x.size()
+            << "\n\t x_lower.size():         " << x_lower.size()
+            << "\n\t x_upper.size():         " << x_upper.size()
+        );
+        DLIB_ASSERT (
+            min(x_upper-x_lower) >= 0,
+            "\tdouble find_min_box_constrained()"
+            << "\n\t You have to supply proper box constraints to this function."
+            << "\n\r min(x_upper-x_lower): " << min(x_upper-x_lower)
+        );
+
+
+        T g, s;
+        double f_value = f(x);
+        g = der(x);
+
+        if (!is_finite(f_value))
+            throw error("The objective function generated non-finite outputs");
+        if (!is_finite(g))
+            throw error("The objective function generated non-finite outputs");
+
+        // gap_eps determines how close we have to get to a bound constraint before we
+        // start basically dropping it from the optimization and consider it to be an
+        // active constraint.
+        const double gap_eps = 1e-8;
+
+        double last_alpha = 1;
+        while(stop_strategy.should_continue_search(x, f_value, g))
+        {
+            s = search_strategy.get_next_direction(x, f_value, zero_bounded_variables(gap_eps, g, x, g, x_lower, x_upper));
+            s = gap_step_assign_bounded_variables(gap_eps, s, x, g, x_lower, x_upper);
+
+            double alpha = backtracking_line_search(
+                        make_line_search_function(clamp_function(f,x_lower,x_upper), x, s, f_value),
+                        f_value,
+                        dot(g,s), // compute gradient for the line search
+                        last_alpha, 
+                        search_strategy.get_wolfe_rho(), 
+                        search_strategy.get_max_line_search_iterations());
+
+            // Do a trust region style thing for alpha.  The idea is that if we take a
+            // small step then we are likely to take another small step.  So we reuse the
+            // alpha from the last iteration unless the line search didn't shrink alpha at
+            // all, in that case, we start with a bigger alpha next time.
+            if (alpha == last_alpha)
+                last_alpha = std::min(last_alpha*10,1.0);
+            else
+                last_alpha = alpha;
+
+            // Take the search step indicated by the above line search
+            x = dlib::clamp(x + alpha*s, x_lower, x_upper);
+            g = der(x);
+
+            if (!is_finite(f_value))
+                throw error("The objective function generated non-finite outputs");
+            if (!is_finite(g))
+                throw error("The objective function generated non-finite outputs");
+        }
+
+        return f_value;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename search_strategy_type,
+        typename stop_strategy_type,
+        typename funct, 
+        typename funct_der, 
+        typename T
+        >
+    double find_min_box_constrained (
+        search_strategy_type search_strategy,
+        stop_strategy_type stop_strategy,
+        const funct& f, 
+        const funct_der& der, 
+        T& x,
+        double x_lower,
+        double x_upper
+    )
+    {
+        // The starting point (i.e. x) must be a column vector.  
+        COMPILE_TIME_ASSERT(T::NC <= 1);
+
+        typedef typename T::type scalar_type;
+        return find_min_box_constrained(search_strategy,
+                                        stop_strategy,
+                                        f,
+                                        der,
+                                        x,
+                                        uniform_matrix<scalar_type>(x.size(),1,x_lower),
+                                        uniform_matrix<scalar_type>(x.size(),1,x_upper) );
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename search_strategy_type,
+        typename stop_strategy_type,
+        typename funct, 
+        typename funct_der, 
+        typename T,
+        typename EXP1,
+        typename EXP2
+        >
+    double find_max_box_constrained (
+        search_strategy_type search_strategy,
+        stop_strategy_type stop_strategy,
+        const funct& f, 
+        const funct_der& der, 
+        T& x,
+        const matrix_exp<EXP1>& x_lower,
+        const matrix_exp<EXP2>& x_upper
+    )
+    {
+        // make sure the requires clause is not violated
+        COMPILE_TIME_ASSERT(is_matrix<T>::value);
+        // The starting point (i.e. x) must be a column vector.  
+        COMPILE_TIME_ASSERT(T::NC <= 1);
+
+        DLIB_CASSERT (
+            is_col_vector(x) && is_col_vector(x_lower) && is_col_vector(x_upper) &&
+            x.size() == x_lower.size() && x.size() == x_upper.size(),
+            "\tdouble find_max_box_constrained()"
+            << "\n\t The inputs to this function must be equal length column vectors."
+            << "\n\t is_col_vector(x):       " << is_col_vector(x)
+            << "\n\t is_col_vector(x_upper): " << is_col_vector(x_upper)
+            << "\n\t is_col_vector(x_upper): " << is_col_vector(x_upper)
+            << "\n\t x.size():               " << x.size()
+            << "\n\t x_lower.size():         " << x_lower.size()
+            << "\n\t x_upper.size():         " << x_upper.size()
+        );
+        DLIB_ASSERT (
+            min(x_upper-x_lower) >= 0,
+            "\tdouble find_max_box_constrained()"
+            << "\n\t You have to supply proper box constraints to this function."
+            << "\n\r min(x_upper-x_lower): " << min(x_upper-x_lower)
+        );
+
+        // This function is basically just a copy of find_min_box_constrained() but with - put 
+        // in the right places to flip things around so that it ends up looking for the max
+        // rather than the min.
+
+        T g, s;
+        double f_value = -f(x);
+        g = -der(x);
+
+        if (!is_finite(f_value))
+            throw error("The objective function generated non-finite outputs");
+        if (!is_finite(g))
+            throw error("The objective function generated non-finite outputs");
+
+        // gap_eps determines how close we have to get to a bound constraint before we
+        // start basically dropping it from the optimization and consider it to be an
+        // active constraint.
+        const double gap_eps = 1e-8;
+
+        double last_alpha = 1;
+        while(stop_strategy.should_continue_search(x, f_value, g))
+        {
+            s = search_strategy.get_next_direction(x, f_value, zero_bounded_variables(gap_eps, g, x, g, x_lower, x_upper));
+            s = gap_step_assign_bounded_variables(gap_eps, s, x, g, x_lower, x_upper);
+
+            double alpha = backtracking_line_search(
+                        negate_function(make_line_search_function(clamp_function(f,x_lower,x_upper), x, s, f_value)),
+                        f_value,
+                        dot(g,s), // compute gradient for the line search
+                        last_alpha, 
+                        search_strategy.get_wolfe_rho(), 
+                        search_strategy.get_max_line_search_iterations());
+
+            // Do a trust region style thing for alpha.  The idea is that if we take a
+            // small step then we are likely to take another small step.  So we reuse the
+            // alpha from the last iteration unless the line search didn't shrink alpha at
+            // all, in that case, we start with a bigger alpha next time.
+            if (alpha == last_alpha)
+                last_alpha = std::min(last_alpha*10,1.0);
+            else
+                last_alpha = alpha;
+
+            // Take the search step indicated by the above line search
+            x = dlib::clamp(x + alpha*s, x_lower, x_upper);
+            g = -der(x);
+
+            // Don't forget to negate the output from the line search since it is  from the
+            // unnegated version of f() 
+            f_value *= -1;
+
+            if (!is_finite(f_value))
+                throw error("The objective function generated non-finite outputs");
+            if (!is_finite(g))
+                throw error("The objective function generated non-finite outputs");
+        }
+
+        return -f_value;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename search_strategy_type,
+        typename stop_strategy_type,
+        typename funct, 
+        typename funct_der, 
+        typename T
+        >
+    double find_max_box_constrained (
+        search_strategy_type search_strategy,
+        stop_strategy_type stop_strategy,
+        const funct& f, 
+        const funct_der& der, 
+        T& x,
+        double x_lower,
+        double x_upper
+    )
+    {
+        // The starting point (i.e. x) must be a column vector.  
+        COMPILE_TIME_ASSERT(T::NC <= 1);
+
+        typedef typename T::type scalar_type;
+        return find_max_box_constrained(search_strategy,
+                                        stop_strategy,
+                                        f,
+                                        der,
+                                        x,
+                                        uniform_matrix<scalar_type>(x.size(),1,x_lower),
+                                        uniform_matrix<scalar_type>(x.size(),1,x_upper) );
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_H_
+
diff --git a/ml/dlib/dlib/optimization/optimization_abstract.h b/ml/dlib/dlib/optimization/optimization_abstract.h
new file mode 100644
index 000000000..f3c42c2b4
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_abstract.h
@@ -0,0 +1,468 @@
+// Copyright (C) 2008  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATIOn_ABSTRACT_
+#ifdef DLIB_OPTIMIZATIOn_ABSTRACT_
+
+#include <cmath>
+#include <limits>
+#include "../matrix/matrix_abstract.h"
+#include "../algs.h"
+#include "optimization_search_strategies_abstract.h"
+#include "optimization_stop_strategies_abstract.h"
+#include "optimization_line_search_abstract.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+//                    Functions that transform other functions  
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename funct
+        >
+    class central_differences;
+    /*!
+        This is a function object that represents the derivative of some other
+        function. 
+
+        Note that if funct is a function of a double then the derivative of 
+        funct is just a double but if funct is a function of a dlib::matrix (i.e. a
+        function of many variables) then its derivative is a gradient vector (a column
+        vector in particular).
+    !*/
+
+    template <
+        typename funct
+        >
+    const central_differences<funct> derivative(
+        const funct& f, 
+        double eps
+    );
+    /*!
+        requires
+            - f == a function that returns a scalar
+            - f must have one of the following forms:
+                - double f(double)
+                - double f(dlib::matrix)  (where the matrix is a column vector)
+                - double f(T, dlib::matrix)  (where the matrix is a column vector.  In 
+                  this case the derivative of f is taken with respect to the second argument.)
+            - eps > 0
+        ensures
+            - returns a function that represents the derivative of the function f.  It
+              is approximated numerically by:
+                  (f(x+eps)-f(x-eps))/(2*eps)
+    !*/
+
+    template <
+        typename funct
+        >
+    const central_differences<funct> derivative(
+        const funct& f
+    );
+    /*!
+        ensures
+            - returns derivative(f, 1e-7)
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename funct, 
+        typename EXP1, 
+        typename EXP2
+        >
+    clamped_function_object<funct,EXP1,EXP2> clamp_function (
+        const funct& f,
+        const matrix_exp<EXP1>& x_lower,
+        const matrix_exp<EXP2>& x_upper 
+    );
+    /*!
+        requires
+            - f == a function that takes a matrix and returns a scalar value.  Moreover, f
+              must be capable of taking in matrices with the same dimensions as x_lower and
+              x_upper.  So f(x_lower) must be a valid expression that evaluates to a scalar
+              value.
+            - x_lower.nr() == x_upper.nr() && x_lower.nc() == x_upper.nc()
+              (i.e. x_lower and x_upper must have the same dimensions)
+            - x_lower and x_upper must contain the same type of elements.
+        ensures
+            - returns a function object that represents the function g(x) where
+              g(x) == f(clamp(x,x_lower,x_upper))
+    !*/
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+//                    Functions that perform unconstrained optimization 
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename search_strategy_type,
+        typename stop_strategy_type,
+        typename funct, 
+        typename funct_der, 
+        typename T
+        >
+    double find_min (
+        search_strategy_type search_strategy,
+        stop_strategy_type stop_strategy,
+        const funct& f, 
+        const funct_der& der, 
+        T& x, 
+        double min_f
+    );
+    /*!
+        requires
+            - search_strategy == an object that defines a search strategy such as one 
+              of the objects from dlib/optimization/optimization_search_strategies_abstract.h
+            - stop_strategy == an object that defines a stop strategy such as one of 
+              the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+            - f(x) must be a valid expression that evaluates to a double
+            - der(x) must be a valid expression that evaluates to the derivative of f() at x.
+            - is_col_vector(x) == true
+        ensures
+            - Performs an unconstrained minimization of the function f() using the given
+              search_strategy and starting from the initial point x.  
+            - The function is optimized until stop_strategy decides that an acceptable 
+              point has been found or f(#x) < min_f.
+            - #x == the value of x that was found to minimize f()
+            - returns f(#x). 
+            - When this function makes calls to f() and der() it always does so by
+              first calling f() and then calling der().  That is, these two functions
+              are always called in pairs with f() being called first and then der()
+              being called second.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename search_strategy_type,
+        typename stop_strategy_type,
+        typename funct, 
+        typename funct_der, 
+        typename T
+        >
+    double find_max (
+        search_strategy_type search_strategy,
+        stop_strategy_type stop_strategy,
+        const funct& f, 
+        const funct_der& der, 
+        T& x, 
+        double max_f
+    );
+    /*!
+        requires
+            - search_strategy == an object that defines a search strategy such as one 
+              of the objects from dlib/optimization/optimization_search_strategies_abstract.h
+            - stop_strategy == an object that defines a stop strategy such as one of 
+              the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+            - f(x) must be a valid expression that evaluates to a double
+            - der(x) must be a valid expression that evaluates to the derivative of f() at x.
+            - is_col_vector(x) == true
+        ensures
+            - Performs an unconstrained maximization of the function f() using the given
+              search_strategy and starting from the initial point x.  
+            - The function is optimized until stop_strategy decides that an acceptable 
+              point has been found or f(#x) > max_f.
+            - #x == the value of x that was found to maximize f()
+            - returns f(#x). 
+            - When this function makes calls to f() and der() it always does so by
+              first calling f() and then calling der().  That is, these two functions
+              are always called in pairs with f() being called first and then der()
+              being called second.
+            - Note that this function solves the maximization problem by converting it 
+              into a minimization problem.  Therefore, the values of f and its derivative
+              reported to the stopping strategy will be negated.  That is, stop_strategy
+              will see -f() and -der().  All this really means is that the status messages
+              from a stopping strategy in verbose mode will display a negated objective
+              value.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename search_strategy_type,
+        typename stop_strategy_type,
+        typename funct,
+        typename T
+        >
+    double find_min_using_approximate_derivatives (
+        search_strategy_type search_strategy,
+        stop_strategy_type stop_strategy,
+        const funct& f,
+        T& x,
+        double min_f,
+        double derivative_eps = 1e-7
+    );
+    /*!
+        requires
+            - search_strategy == an object that defines a search strategy such as one 
+              of the objects from dlib/optimization/optimization_search_strategies_abstract.h
+            - stop_strategy == an object that defines a stop strategy such as one of 
+              the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+            - f(x) must be a valid expression that evaluates to a double
+            - is_col_vector(x) == true
+            - derivative_eps > 0 
+        ensures
+            - Performs an unconstrained minimization of the function f() using the given
+              search_strategy and starting from the initial point x.  
+            - The function is optimized until stop_strategy decides that an acceptable 
+              point has been found or f(#x) < min_f.
+            - #x == the value of x that was found to minimize f()
+            - returns f(#x). 
+            - Uses the dlib::derivative(f,derivative_eps) function to compute gradient
+              information.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename search_strategy_type,
+        typename stop_strategy_type,
+        typename funct,
+        typename T
+        >
+    double find_max_using_approximate_derivatives (
+        search_strategy_type search_strategy,
+        stop_strategy_type stop_strategy,
+        const funct& f,
+        T& x,
+        double max_f,
+        double derivative_eps = 1e-7
+    );
+    /*!
+        requires
+            - search_strategy == an object that defines a search strategy such as one 
+              of the objects from dlib/optimization/optimization_search_strategies_abstract.h
+            - stop_strategy == an object that defines a stop strategy such as one of 
+              the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+            - f(x) must be a valid expression that evaluates to a double
+            - is_col_vector(x) == true
+            - derivative_eps > 0 
+        ensures
+            - Performs an unconstrained maximization of the function f() using the given
+              search_strategy and starting from the initial point x.  
+            - The function is optimized until stop_strategy decides that an acceptable 
+              point has been found or f(#x) > max_f.
+            - #x == the value of x that was found to maximize f()
+            - returns f(#x). 
+            - Uses the dlib::derivative(f,derivative_eps) function to compute gradient
+              information.
+            - Note that this function solves the maximization problem by converting it 
+              into a minimization problem.  Therefore, the values of f and its derivative
+              reported to the stopping strategy will be negated.  That is, stop_strategy
+              will see -f() and -der().  All this really means is that the status messages
+              from a stopping strategy in verbose mode will display a negated objective
+              value.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+//                  Functions that perform box constrained optimization 
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename search_strategy_type,
+        typename stop_strategy_type,
+        typename funct, 
+        typename funct_der, 
+        typename T,
+        typename EXP1,
+        typename EXP2
+        >
+    double find_min_box_constrained (
+        search_strategy_type search_strategy,
+        stop_strategy_type stop_strategy,
+        const funct& f, 
+        const funct_der& der, 
+        T& x,
+        const matrix_exp<EXP1>& x_lower,
+        const matrix_exp<EXP2>& x_upper
+    );
+    /*!
+        requires
+            - search_strategy == an object that defines a search strategy such as one 
+              of the objects from dlib/optimization/optimization_search_strategies_abstract.h
+            - stop_strategy == an object that defines a stop strategy such as one of 
+              the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+            - f(x) must be a valid expression that evaluates to a double
+            - der(x) must be a valid expression that evaluates to the derivative of f() at x.
+            - is_col_vector(x) == true
+            - is_col_vector(x_lower) == true
+            - is_col_vector(x_upper) == true
+            - x.size() == x_lower.size() == x_upper.size()
+              (i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality)
+            - min(x_upper-x_lower) >= 0
+              (i.e. x_upper must contain upper bounds relative to x_lower)
+        ensures
+            - Performs a box constrained minimization of the function f() using the given
+              search_strategy and starting from the initial point x.  That is, we try to
+              find the x value that minimizes f(x) but is also within the box constraints 
+              specified by x_lower and x_upper.  That is, we ensure that #x satisfies: 
+                - min(#x - x_lower) >= 0 && min(x_upper - #x) >= 0
+            - This function uses a backtracking line search along with a gradient projection
+              step to handle the box constraints.
+            - The function is optimized until stop_strategy decides that an acceptable
+              point has been found. 
+            - #x == the value of x that was found to minimize f() within the given box
+              constraints.
+            - returns f(#x). 
+            - The last call to f() will be made with f(#x).  
+            - When calling f() and der(), the input passed to them will always be inside
+              the box constraints defined by x_lower and x_upper.
+            - When calling der(x), it will always be the case that the last call to f() was
+              made with the same x value.  This means that you can reuse any intermediate
+              results from the previous call to f(x) inside der(x) rather than recomputing
+              them inside der(x).
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename search_strategy_type,
+        typename stop_strategy_type,
+        typename funct, 
+        typename funct_der, 
+        typename T
+        >
+    double find_min_box_constrained (
+        search_strategy_type search_strategy,
+        stop_strategy_type stop_strategy,
+        const funct& f, 
+        const funct_der& der, 
+        T& x,
+        const double x_lower,
+        const double x_upper
+    );
+    /*!
+        requires
+            - search_strategy == an object that defines a search strategy such as one 
+              of the objects from dlib/optimization/optimization_search_strategies_abstract.h
+            - stop_strategy == an object that defines a stop strategy such as one of 
+              the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+            - f(x) must be a valid expression that evaluates to a double
+            - der(x) must be a valid expression that evaluates to the derivative of f() at x.
+            - is_col_vector(x) == true
+            - x_lower < x_upper
+        ensures
+            - This function is identical to find_min_box_constrained() as defined above
+              except that it takes x_lower and x_upper as doubles rather than column
+              vectors.  In this case, all variables have the same lower bound of x_lower
+              and similarly have the same upper bound of x_upper.  Therefore, this is just
+              a convenience function for calling find_max_box_constrained() when all
+              variables have the same bound constraints.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename search_strategy_type,
+        typename stop_strategy_type,
+        typename funct, 
+        typename funct_der, 
+        typename T,
+        typename EXP1,
+        typename EXP2
+        >
+    double find_max_box_constrained (
+        search_strategy_type search_strategy,
+        stop_strategy_type stop_strategy,
+        const funct& f, 
+        const funct_der& der, 
+        T& x,
+        const matrix_exp<EXP1>& x_lower,
+        const matrix_exp<EXP2>& x_upper
+    );
+    /*!
+        requires
+            - search_strategy == an object that defines a search strategy such as one 
+              of the objects from dlib/optimization/optimization_search_strategies_abstract.h
+            - stop_strategy == an object that defines a stop strategy such as one of 
+              the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+            - f(x) must be a valid expression that evaluates to a double
+            - der(x) must be a valid expression that evaluates to the derivative of f() at x.
+            - is_col_vector(x) == true
+            - is_col_vector(x_lower) == true
+            - is_col_vector(x_upper) == true
+            - x.size() == x_lower.size() == x_upper.size()
+              (i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality)
+            - min(x_upper-x_lower) >= 0
+              (i.e. x_upper must contain upper bounds relative to x_lower)
+        ensures
+            - Performs a box constrained maximization of the function f() using the given
+              search_strategy and starting from the initial point x.  That is, we try to
+              find the x value that maximizes f(x) but is also within the box constraints 
+              specified by x_lower and x_upper.  That is, we ensure that #x satisfies: 
+                - min(#x - x_lower) >= 0 && min(x_upper - #x) >= 0
+            - This function uses a backtracking line search along with a gradient projection
+              step to handle the box constraints.
+            - The function is optimized until stop_strategy decides that an acceptable
+              point has been found. 
+            - #x == the value of x that was found to maximize f() within the given box
+              constraints.
+            - returns f(#x). 
+            - The last call to f() will be made with f(#x).  
+            - When calling f() and der(), the input passed to them will always be inside
+              the box constraints defined by x_lower and x_upper.
+            - When calling der(x), it will always be the case that the last call to f() was
+              made with the same x value.  This means that you can reuse any intermediate
+              results from the previous call to f(x) inside der(x) rather than recomputing
+              them inside der(x).
+            - Note that this function solves the maximization problem by converting it 
+              into a minimization problem.  Therefore, the values of f and its derivative
+              reported to the stopping strategy will be negated.  That is, stop_strategy
+              will see -f() and -der().  All this really means is that the status messages
+              from a stopping strategy in verbose mode will display a negated objective
+              value.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+    
+    template <
+        typename search_strategy_type,
+        typename stop_strategy_type,
+        typename funct, 
+        typename funct_der, 
+        typename T
+        >
+    double find_max_box_constrained (
+        search_strategy_type search_strategy,
+        stop_strategy_type stop_strategy,
+        const funct& f, 
+        const funct_der& der, 
+        T& x,
+        const double x_lower,
+        const double x_upper
+    );
+    /*!
+        requires
+            - search_strategy == an object that defines a search strategy such as one 
+              of the objects from dlib/optimization/optimization_search_strategies_abstract.h
+            - stop_strategy == an object that defines a stop strategy such as one of 
+              the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+            - f(x) must be a valid expression that evaluates to a double
+            - der(x) must be a valid expression that evaluates to the derivative of f() at x.
+            - is_col_vector(x) == true
+            - x_lower < x_upper
+        ensures
+            - This function is identical to find_max_box_constrained() as defined above
+              except that it takes x_lower and x_upper as doubles rather than column
+              vectors.  In this case, all variables have the same lower bound of x_lower
+              and similarly have the same upper bound of x_upper.  Therefore, this is just
+              a convenience function for calling find_max_box_constrained() when all
+              variables have the same bound constraints.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_ABSTRACT_
+
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_bobyqa.h b/ml/dlib/dlib/optimization/optimization_bobyqa.h
new file mode 100644
index 000000000..6fbc40c06
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_bobyqa.h
@@ -0,0 +1,3423 @@
+// Copyright (C) 2009 M.J.D. Powell, Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATIOn_BOBYQA_Hh_
+#define DLIB_OPTIMIZATIOn_BOBYQA_Hh_
+
+/*
+    The code in this file is derived from Powell's BOBYQA Fortran code.
+    It was created by running f2c on the original Fortran code and then 
+    massaging the resulting C code into what you can see below.
+
+
+    The following paper, published in 2009 by Powell, describes the
+    detailed workings of the BOBYQA algorithm.  
+
+        The BOBYQA algorithm for bound constrained optimization 
+        without derivatives by M.J.D. Powell
+*/
+
+#include <algorithm>
+#include <cmath>
+#include <memory>
+
+#include "../matrix.h"
+#include "optimization_bobyqa_abstract.h"
+#include "optimization.h"
+
+// ----------------------------------------------------------------------------------------
+
+namespace dlib
+{
+
+    class bobyqa_failure : public error {
+    public: bobyqa_failure(const std::string& s):error(s){}
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class bobyqa_implementation
+    {
+        typedef long integer;
+        typedef double doublereal;
+
+    public:
+
+        template <
+            typename funct,
+            typename T, 
+            typename U
+            >
+        double find_min (
+            const funct& f,
+            T& x,
+            long npt,
+            const U& xl_,
+            const U& xu_,
+            const double rhobeg,
+            const double rhoend,
+            const long max_f_evals
+        ) const
+        {
+            const unsigned long n = x.size();
+            const unsigned long w_size = (npt+5)*(npt+n)+3*n*(n+5)/2;
+            std::unique_ptr<doublereal[]> w(new doublereal[w_size]);
+
+            // make these temporary matrices becuse U might be some
+            // kind of matrix_exp that doesn't support taking the address
+            // of the first element.
+            matrix<double,0,1> xl(xl_);
+            matrix<double,0,1> xu(xu_);
+
+            
+            return bobyqa_ (f,
+                            x.size(),
+                            npt,
+                            &x(0),
+                            &xl(0),
+                            &xu(0),
+                            rhobeg,
+                            rhoend,
+                            max_f_evals,
+                            w.get() );
+        }
+
+    private:
+
+
+        template <typename funct>
+        doublereal bobyqa_(
+            const funct& calfun,
+            const integer n, 
+            const integer npt, 
+            doublereal *x, 
+            const doublereal *xl,
+            const doublereal *xu,
+            const doublereal rhobeg,
+            const doublereal rhoend,
+            const integer maxfun,
+            doublereal *w
+        ) const
+        {
+
+            /* System generated locals */
+            integer i__1;
+            doublereal d__1, d__2;
+
+            /* Local variables */
+            integer j, id_, np, iw, igo, ihq, ixb, ixa, ifv, isl, jsl, ipq, ivl, ixn, ixo, ixp, isu, jsu, ndim;
+            doublereal temp, zero;
+            integer ibmat, izmat;
+
+
+            /*     This subroutine seeks the least value of a function of many variables, */
+            /*     by applying a trust region method that forms quadratic models by */
+            /*     interpolation. There is usually some freedom in the interpolation */
+            /*     conditions, which is taken up by minimizing the Frobenius norm of */
+            /*     the change to the second derivative of the model, beginning with the */
+            /*     zero matrix. The values of the variables are constrained by upper and */
+            /*     lower bounds. The arguments of the subroutine are as follows. */
+
+            /*     N must be set to the number of variables and must be at least two. */
+            /*     NPT is the number of interpolation conditions. Its value must be in */
+            /*       the interval [N+2,(N+1)(N+2)/2]. Choices that exceed 2*N+1 are not */
+            /*       recommended. */
+            /*     Initial values of the variables must be set in X(1),X(2),...,X(N). They */
+            /*       will be changed to the values that give the least calculated F. */
+            /*     For I=1,2,...,N, XL(I) and XU(I) must provide the lower and upper */
+            /*       bounds, respectively, on X(I). The construction of quadratic models */
+            /*       requires XL(I) to be strictly less than XU(I) for each I. Further, */
+            /*       the contribution to a model from changes to the I-th variable is */
+            /*       damaged severely by rounding errors if XU(I)-XL(I) is too small. */
+            /*     RHOBEG and RHOEND must be set to the initial and final values of a trust */
+            /*       region radius, so both must be positive with RHOEND no greater than */
+            /*       RHOBEG. Typically, RHOBEG should be about one tenth of the greatest */
+            /*       expected change to a variable, while RHOEND should indicate the */
+            /*       accuracy that is required in the final values of the variables. An */
+            /*       error return occurs if any of the differences XU(I)-XL(I), I=1,...,N, */
+            /*       is less than 2*RHOBEG. */
+            /*     MAXFUN must be set to an upper bound on the number of calls of CALFUN. */
+            /*     The array W will be used for working space. Its length must be at least */
+            /*       (NPT+5)*(NPT+N)+3*N*(N+5)/2. */
+
+            /* Parameter adjustments */
+            --w;
+            --xu;
+            --xl;
+            --x;
+
+            /* Function Body */
+            np = n + 1;
+
+            /*     Return if the value of NPT is unacceptable. */
+            if (npt < n + 2 || npt > (n + 2) * np / 2) {
+                throw bobyqa_failure("Return from BOBYQA because NPT is not in the required interval");
+                //goto L40;
+            }
+
+            /*     Partition the working space array, so that different parts of it can */
+            /*     be treated separately during the calculation of BOBYQB. The partition */
+            /*     requires the first (NPT+2)*(NPT+N)+3*N*(N+5)/2 elements of W plus the */
+            /*     space that is taken by the last array in the argument list of BOBYQB. */
+
+            ndim = npt + n;
+            ixb = 1;
+            ixp = ixb + n;
+            ifv = ixp + n * npt;
+            ixo = ifv + npt;
+            igo = ixo + n;
+            ihq = igo + n;
+            ipq = ihq + n * np / 2;
+            ibmat = ipq + npt;
+            izmat = ibmat + ndim * n;
+            isl = izmat + npt * (npt - np);
+            isu = isl + n;
+            ixn = isu + n;
+            ixa = ixn + n;
+            id_ = ixa + n;
+            ivl = id_ + n;
+            iw = ivl + ndim;
+
+            /*     Return if there is insufficient space between the bounds. Modify the */
+            /*     initial X if necessary in order to avoid conflicts between the bounds */
+            /*     and the construction of the first quadratic model. The lower and upper */
+            /*     bounds on moves from the updated X are set now, in the ISL and ISU */
+            /*     partitions of W, in order to provide useful and exact information about */
+            /*     components of X that become within distance RHOBEG from their bounds. */
+
+            zero = 0.;
+            i__1 = n;
+            for (j = 1; j <= i__1; ++j) {
+                temp = xu[j] - xl[j];
+                if (temp < rhobeg + rhobeg) {
+                    throw bobyqa_failure("Return from BOBYQA because one of the differences in x_lower and x_upper is less than 2*rho_begin");
+                    //goto L40;
+                }
+                jsl = isl + j - 1;
+                jsu = jsl + n;
+                w[jsl] = xl[j] - x[j];
+                w[jsu] = xu[j] - x[j];
+                if (w[jsl] >= -(rhobeg)) {
+                    if (w[jsl] >= zero) {
+                        x[j] = xl[j];
+                        w[jsl] = zero;
+                        w[jsu] = temp;
+                    } else {
+                        x[j] = xl[j] + rhobeg;
+                        w[jsl] = -(rhobeg);
+                        /* Computing MAX */
+                        d__1 = xu[j] - x[j];
+                        w[jsu] = std::max(d__1,rhobeg);
+                    }
+                } else if (w[jsu] <= rhobeg) {
+                    if (w[jsu] <= zero) {
+                        x[j] = xu[j];
+                        w[jsl] = -temp;
+                        w[jsu] = zero;
+                    } else {
+                        x[j] = xu[j] - rhobeg;
+                        /* Computing MIN */
+                        d__1 = xl[j] - x[j], d__2 = -(rhobeg);
+                        w[jsl] = std::min(d__1,d__2);
+                        w[jsu] = rhobeg;
+                    }
+                }
+                /* L30: */
+            }
+
+            /*     Make the call of BOBYQB. */
+
+            return bobyqb_(calfun, n, npt, &x[1], &xl[1], &xu[1], rhobeg, rhoend, maxfun, &w[
+                    ixb], &w[ixp], &w[ifv], &w[ixo], &w[igo], &w[ihq], &w[ipq], &w[
+                    ibmat], &w[izmat], ndim, &w[isl], &w[isu], &w[ixn], &w[ixa], &w[
+                    id_], &w[ivl], &w[iw]);
+            //L40:
+            ;
+        } /* bobyqa_ */
+
+    // ----------------------------------------------------------------------------------------
+
+        template <typename funct>
+        doublereal bobyqb_(
+            const funct& calfun,
+            const integer n,
+            const integer npt,
+            doublereal *x,
+            const doublereal *xl,
+            const doublereal *xu,
+            const doublereal rhobeg,
+            const doublereal rhoend,
+            const integer maxfun,
+            doublereal *xbase, 
+            doublereal *xpt,
+            doublereal *fval,
+            doublereal *xopt,
+            doublereal *gopt,
+            doublereal *hq,
+            doublereal *pq,
+            doublereal *bmat,
+            doublereal *zmat, 
+            const integer ndim,
+            doublereal *sl,
+            doublereal *su, 
+            doublereal *xnew, 
+            doublereal *xalt,
+            doublereal *d__,
+            doublereal *vlag, 
+            doublereal *w
+        ) const
+        {
+            /* System generated locals */
+            integer xpt_dim1, xpt_offset, bmat_dim1, bmat_offset, zmat_dim1, 
+            zmat_offset, i__1, i__2, i__3;
+            doublereal d__1, d__2, d__3, d__4;
+
+            /* Local variables */
+            doublereal f = 0;
+            integer i__, j, k, ih, nf, jj, nh, ip, jp;
+            doublereal dx;
+            integer np;
+            doublereal den = 0, one = 0, ten = 0, dsq = 0, rho = 0, sum = 0, two = 0, diff = 0, half = 0, beta = 0, gisq = 0;
+            integer knew = 0;
+            doublereal temp, suma, sumb, bsum, fopt;
+            integer kopt = 0, nptm;
+            doublereal zero, curv;
+            integer ksav;
+            doublereal gqsq = 0, dist = 0, sumw = 0, sumz = 0, diffa = 0, diffb = 0, diffc = 0, hdiag = 0;
+            integer kbase;
+            doublereal alpha = 0, delta = 0, adelt = 0, denom = 0, fsave = 0, bdtol = 0, delsq = 0;
+            integer nresc, nfsav;
+            doublereal ratio = 0, dnorm = 0, vquad = 0, pqold = 0, tenth = 0;
+            integer itest;
+            doublereal sumpq, scaden;
+            doublereal errbig, cauchy, fracsq, biglsq, densav;
+            doublereal bdtest;
+            doublereal crvmin, frhosq;
+            doublereal distsq;
+            integer ntrits;
+            doublereal xoptsq;
+
+
+
+            /*     The arguments N, NPT, X, XL, XU, RHOBEG, RHOEND, IPRINT and MAXFUN */
+            /*       are identical to the corresponding arguments in SUBROUTINE BOBYQA. */
+            /*     XBASE holds a shift of origin that should reduce the contributions */
+            /*       from rounding errors to values of the model and Lagrange functions. */
+            /*     XPT is a two-dimensional array that holds the coordinates of the */
+            /*       interpolation points relative to XBASE. */
+            /*     FVAL holds the values of F at the interpolation points. */
+            /*     XOPT is set to the displacement from XBASE of the trust region centre. */
+            /*     GOPT holds the gradient of the quadratic model at XBASE+XOPT. */
+            /*     HQ holds the explicit second derivatives of the quadratic model. */
+            /*     PQ contains the parameters of the implicit second derivatives of the */
+            /*       quadratic model. */
+            /*     BMAT holds the last N columns of H. */
+            /*     ZMAT holds the factorization of the leading NPT by NPT submatrix of H, */
+            /*       this factorization being ZMAT times ZMAT^T, which provides both the */
+            /*       correct rank and positive semi-definiteness. */
+            /*     NDIM is the first dimension of BMAT and has the value NPT+N. */
+            /*     SL and SU hold the differences XL-XBASE and XU-XBASE, respectively. */
+            /*       All the components of every XOPT are going to satisfy the bounds */
+            /*       SL(I) .LEQ. XOPT(I) .LEQ. SU(I), with appropriate equalities when */
+            /*       XOPT is on a constraint boundary. */
+            /*     XNEW is chosen by SUBROUTINE TRSBOX or ALTMOV. Usually XBASE+XNEW is the */
+            /*       vector of variables for the next call of CALFUN. XNEW also satisfies */
+            /*       the SL and SU constraints in the way that has just been mentioned. */
+            /*     XALT is an alternative to XNEW, chosen by ALTMOV, that may replace XNEW */
+            /*       in order to increase the denominator in the updating of UPDATE. */
+            /*     D is reserved for a trial step from XOPT, which is usually XNEW-XOPT. */
+            /*     VLAG contains the values of the Lagrange functions at a new point X. */
+            /*       They are part of a product that requires VLAG to be of length NDIM. */
+            /*     W is a one-dimensional array that is used for working space. Its length */
+            /*       must be at least 3*NDIM = 3*(NPT+N). */
+
+            /*     Set some constants. */
+
+            /* Parameter adjustments */
+            zmat_dim1 = npt;
+            zmat_offset = 1 + zmat_dim1;
+            zmat -= zmat_offset;
+            xpt_dim1 = npt;
+            xpt_offset = 1 + xpt_dim1;
+            xpt -= xpt_offset;
+            --x;
+            --xl;
+            --xu;
+            --xbase;
+            --fval;
+            --xopt;
+            --gopt;
+            --hq;
+            --pq;
+            bmat_dim1 = ndim;
+            bmat_offset = 1 + bmat_dim1;
+            bmat -= bmat_offset;
+            --sl;
+            --su;
+            --xnew;
+            --xalt;
+            --d__;
+            --vlag;
+            --w;
+
+            /* Function Body */
+            half = .5;
+            one = 1.;
+            ten = 10.;
+            tenth = .1;
+            two = 2.;
+            zero = 0.;
+            np = n + 1;
+            nptm = npt - np;
+            nh = n * np / 2;
+
+            /*     The call of PRELIM sets the elements of XBASE, XPT, FVAL, GOPT, HQ, PQ, */
+            /*     BMAT and ZMAT for the first iteration, with the corresponding values of */
+            /*     of NF and KOPT, which are the number of calls of CALFUN so far and the */
+            /*     index of the interpolation point at the trust region centre. Then the */
+            /*     initial XOPT is set too. The branch to label 720 occurs if MAXFUN is */
+            /*     less than NPT. GOPT will be updated if KOPT is different from KBASE. */
+
+            prelim_(calfun, n, npt, &x[1], &xl[1], &xu[1], rhobeg, maxfun, &xbase[1], 
+                    &xpt[xpt_offset], &fval[1], &gopt[1], &hq[1], &pq[1], &bmat[bmat_offset], 
+                    &zmat[zmat_offset], ndim, &sl[1], &su[1], nf, kopt);
+            xoptsq = zero;
+            i__1 = n;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                xopt[i__] = xpt[kopt + i__ * xpt_dim1];
+                /* L10: */
+                /* Computing 2nd power */
+                d__1 = xopt[i__];
+                xoptsq += d__1 * d__1;
+            }
+            fsave = fval[1];
+            if (nf < npt) {
+                throw bobyqa_failure("Return from BOBYQA because the objective function has been called max_f_evals times.");
+                //goto L720;
+            }
+            kbase = 1;
+
+            /*     Complete the settings that are required for the iterative procedure. */
+
+            rho = rhobeg;
+            delta = rho;
+            nresc = nf;
+            ntrits = 0;
+            diffa = zero;
+            diffb = zero;
+            itest = 0;
+            nfsav = nf;
+
+            /*     Update GOPT if necessary before the first iteration and after each */
+            /*     call of RESCUE that makes a call of CALFUN. */
+
+L20:
+            if (kopt != kbase) {
+                ih = 0;
+                i__1 = n;
+                for (j = 1; j <= i__1; ++j) {
+                    i__2 = j;
+                    for (i__ = 1; i__ <= i__2; ++i__) {
+                        ++ih;
+                        if (i__ < j) {
+                            gopt[j] += hq[ih] * xopt[i__];
+                        }
+                        /* L30: */
+                        gopt[i__] += hq[ih] * xopt[j];
+                    }
+                }
+                if (nf > npt) {
+                    i__2 = npt;
+                    for (k = 1; k <= i__2; ++k) {
+                        temp = zero;
+                        i__1 = n;
+                        for (j = 1; j <= i__1; ++j) {
+                            /* L40: */
+                            temp += xpt[k + j * xpt_dim1] * xopt[j];
+                        }
+                        temp = pq[k] * temp;
+                        i__1 = n;
+                        for (i__ = 1; i__ <= i__1; ++i__) {
+                            /* L50: */
+                            gopt[i__] += temp * xpt[k + i__ * xpt_dim1];
+                        }
+                    }
+                }
+            }
+
+            /*     Generate the next point in the trust region that provides a small value */
+            /*     of the quadratic model subject to the constraints on the variables. */
+            /*     The integer NTRITS is set to the number "trust region" iterations that */
+            /*     have occurred since the last "alternative" iteration. If the length */
+            /*     of XNEW-XOPT is less than HALF*RHO, however, then there is a branch to */
+            /*     label 650 or 680 with NTRITS=-1, instead of calculating F at XNEW. */
+
+L60:
+            trsbox_(n, npt, &xpt[xpt_offset], &xopt[1], &gopt[1], &hq[1], &pq[1], &sl[1], 
+                    &su[1], delta, &xnew[1], &d__[1], &w[1], &w[np], &w[np + n],
+                    &w[np + (n << 1)], &w[np + n * 3], &dsq, &crvmin);
+            /* Computing MIN */
+            d__1 = delta, d__2 = std::sqrt(dsq);
+            dnorm = std::min(d__1,d__2);
+            if (dnorm < half * rho) {
+                ntrits = -1;
+                /* Computing 2nd power */
+                d__1 = ten * rho;
+                distsq = d__1 * d__1;
+                if (nf <= nfsav + 2) {
+                    goto L650;
+                }
+
+                /*     The following choice between labels 650 and 680 depends on whether or */
+                /*     not our work with the current RHO seems to be complete. Either RHO is */
+                /*     decreased or termination occurs if the errors in the quadratic model at */
+                /*     the last three interpolation points compare favourably with predictions */
+                /*     of likely improvements to the model within distance HALF*RHO of XOPT. */
+
+                /* Computing MAX */
+                d__1 = std::max(diffa,diffb);
+                errbig = std::max(d__1,diffc);
+                frhosq = rho * .125 * rho;
+                if (crvmin > zero && errbig > frhosq * crvmin) {
+                    goto L650;
+                }
+                bdtol = errbig / rho;
+                i__1 = n;
+                for (j = 1; j <= i__1; ++j) {
+                    bdtest = bdtol;
+                    if (xnew[j] == sl[j]) {
+                        bdtest = w[j];
+                    }
+                    if (xnew[j] == su[j]) {
+                        bdtest = -w[j];
+                    }
+                    if (bdtest < bdtol) {
+                        curv = hq[(j + j * j) / 2];
+                        i__2 = npt;
+                        for (k = 1; k <= i__2; ++k) {
+                            /* L70: */
+                            /* Computing 2nd power */
+                            d__1 = xpt[k + j * xpt_dim1];
+                            curv += pq[k] * (d__1 * d__1);
+                        }
+                        bdtest += half * curv * rho;
+                        if (bdtest < bdtol) {
+                            goto L650;
+                        }
+                    }
+                    /* L80: */
+                }
+                goto L680;
+            }
+            ++ntrits;
+
+            /*     Severe cancellation is likely to occur if XOPT is too far from XBASE. */
+            /*     If the following test holds, then XBASE is shifted so that XOPT becomes */
+            /*     zero. The appropriate changes are made to BMAT and to the second */
+            /*     derivatives of the current model, beginning with the changes to BMAT */
+            /*     that do not depend on ZMAT. VLAG is used temporarily for working space. */
+
+L90:
+            if (dsq <= xoptsq * .001) {
+                fracsq = xoptsq * .25;
+                sumpq = zero;
+                i__1 = npt;
+                for (k = 1; k <= i__1; ++k) {
+                    sumpq += pq[k];
+                    sum = -half * xoptsq;
+                    i__2 = n;
+                    for (i__ = 1; i__ <= i__2; ++i__) {
+                        /* L100: */
+                        sum += xpt[k + i__ * xpt_dim1] * xopt[i__];
+                    }
+                    w[npt + k] = sum;
+                    temp = fracsq - half * sum;
+                    i__2 = n;
+                    for (i__ = 1; i__ <= i__2; ++i__) {
+                        w[i__] = bmat[k + i__ * bmat_dim1];
+                        vlag[i__] = sum * xpt[k + i__ * xpt_dim1] + temp * xopt[i__];
+                        ip = npt + i__;
+                        i__3 = i__;
+                        for (j = 1; j <= i__3; ++j) {
+                            /* L110: */
+                            bmat[ip + j * bmat_dim1] = bmat[ip + j * bmat_dim1] + w[
+                                i__] * vlag[j] + vlag[i__] * w[j];
+                        }
+                    }
+                }
+
+                /*     Then the revisions of BMAT that depend on ZMAT are calculated. */
+
+                i__3 = nptm;
+                for (jj = 1; jj <= i__3; ++jj) {
+                    sumz = zero;
+                    sumw = zero;
+                    i__2 = npt;
+                    for (k = 1; k <= i__2; ++k) {
+                        sumz += zmat[k + jj * zmat_dim1];
+                        vlag[k] = w[npt + k] * zmat[k + jj * zmat_dim1];
+                        /* L120: */
+                        sumw += vlag[k];
+                    }
+                    i__2 = n;
+                    for (j = 1; j <= i__2; ++j) {
+                        sum = (fracsq * sumz - half * sumw) * xopt[j];
+                        i__1 = npt;
+                        for (k = 1; k <= i__1; ++k) {
+                            /* L130: */
+                            sum += vlag[k] * xpt[k + j * xpt_dim1];
+                        }
+                        w[j] = sum;
+                        i__1 = npt;
+                        for (k = 1; k <= i__1; ++k) {
+                            /* L140: */
+                            bmat[k + j * bmat_dim1] += sum * zmat[k + jj * zmat_dim1];
+                        }
+                    }
+                    i__1 = n;
+                    for (i__ = 1; i__ <= i__1; ++i__) {
+                        ip = i__ + npt;
+                        temp = w[i__];
+                        i__2 = i__;
+                        for (j = 1; j <= i__2; ++j) {
+                            /* L150: */
+                            bmat[ip + j * bmat_dim1] += temp * w[j];
+                        }
+                    }
+                }
+
+                /*     The following instructions complete the shift, including the changes */
+                /*     to the second derivative parameters of the quadratic model. */
+
+                ih = 0;
+                i__2 = n;
+                for (j = 1; j <= i__2; ++j) {
+                    w[j] = -half * sumpq * xopt[j];
+                    i__1 = npt;
+                    for (k = 1; k <= i__1; ++k) {
+                        w[j] += pq[k] * xpt[k + j * xpt_dim1];
+                        /* L160: */
+                        xpt[k + j * xpt_dim1] -= xopt[j];
+                    }
+                    i__1 = j;
+                    for (i__ = 1; i__ <= i__1; ++i__) {
+                        ++ih;
+                        hq[ih] = hq[ih] + w[i__] * xopt[j] + xopt[i__] * w[j];
+                        /* L170: */
+                        bmat[npt + i__ + j * bmat_dim1] = bmat[npt + j + i__ * 
+                            bmat_dim1];
+                    }
+                }
+                i__1 = n;
+                for (i__ = 1; i__ <= i__1; ++i__) {
+                    xbase[i__] += xopt[i__];
+                    xnew[i__] -= xopt[i__];
+                    sl[i__] -= xopt[i__];
+                    su[i__] -= xopt[i__];
+                    /* L180: */
+                    xopt[i__] = zero;
+                }
+                xoptsq = zero;
+            }
+            if (ntrits == 0) {
+                goto L210;
+            }
+            goto L230;
+
+            /*     XBASE is also moved to XOPT by a call of RESCUE. This calculation is */
+            /*     more expensive than the previous shift, because new matrices BMAT and */
+            /*     ZMAT are generated from scratch, which may include the replacement of */
+            /*     interpolation points whose positions seem to be causing near linear */
+            /*     dependence in the interpolation conditions. Therefore RESCUE is called */
+            /*     only if rounding errors have reduced by at least a factor of two the */
+            /*     denominator of the formula for updating the H matrix. It provides a */
+            /*     useful safeguard, but is not invoked in most applications of BOBYQA. */
+
+L190:
+            nfsav = nf;
+            kbase = kopt;
+            rescue_(calfun, n, npt, &xl[1], &xu[1], maxfun, &xbase[1], &xpt[
+                    xpt_offset], &fval[1], &xopt[1], &gopt[1], &hq[1], &pq[1], &bmat[
+                    bmat_offset], &zmat[zmat_offset], ndim, &sl[1], &su[1], nf, delta, 
+                    kopt, &vlag[1], &w[1], &w[n + np], &w[ndim + np]);
+
+            /*     XOPT is updated now in case the branch below to label 720 is taken. */
+            /*     Any updating of GOPT occurs after the branch below to label 20, which */
+            /*     leads to a trust region iteration as does the branch to label 60. */
+
+            xoptsq = zero;
+            if (kopt != kbase) {
+                i__1 = n;
+                for (i__ = 1; i__ <= i__1; ++i__) {
+                    xopt[i__] = xpt[kopt + i__ * xpt_dim1];
+                    /* L200: */
+                    /* Computing 2nd power */
+                    d__1 = xopt[i__];
+                    xoptsq += d__1 * d__1;
+                }
+            }
+            if (nf < 0) {
+                nf = maxfun;
+                throw bobyqa_failure("Return from BOBYQA because the objective function has been called max_f_evals times.");
+                //goto L720;
+            }
+            nresc = nf;
+            if (nfsav < nf) {
+                nfsav = nf;
+                goto L20;
+            }
+            if (ntrits > 0) {
+                goto L60;
+            }
+
+            /*     Pick two alternative vectors of variables, relative to XBASE, that */
+            /*     are suitable as new positions of the KNEW-th interpolation point. */
+            /*     Firstly, XNEW is set to the point on a line through XOPT and another */
+            /*     interpolation point that minimizes the predicted value of the next */
+            /*     denominator, subject to ||XNEW - XOPT|| .LEQ. ADELT and to the SL */
+            /*     and SU bounds. Secondly, XALT is set to the best feasible point on */
+            /*     a constrained version of the Cauchy step of the KNEW-th Lagrange */
+            /*     function, the corresponding value of the square of this function */
+            /*     being returned in CAUCHY. The choice between these alternatives is */
+            /*     going to be made when the denominator is calculated. */
+
+L210:
+            altmov_(n, npt, &xpt[xpt_offset], &xopt[1], &bmat[bmat_offset], &zmat[zmat_offset], 
+                    ndim, &sl[1], &su[1], kopt, knew, adelt, &xnew[1], 
+                    &xalt[1], alpha, cauchy, &w[1], &w[np], &w[ndim + 1]);
+            i__1 = n;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                /* L220: */
+                d__[i__] = xnew[i__] - xopt[i__];
+            }
+
+            /*     Calculate VLAG and BETA for the current choice of D. The scalar */
+            /*     product of D with XPT(K,.) is going to be held in W(NPT+K) for */
+            /*     use when VQUAD is calculated. */
+
+L230:
+            i__1 = npt;
+            for (k = 1; k <= i__1; ++k) {
+                suma = zero;
+                sumb = zero;
+                sum = zero;
+                i__2 = n;
+                for (j = 1; j <= i__2; ++j) {
+                    suma += xpt[k + j * xpt_dim1] * d__[j];
+                    sumb += xpt[k + j * xpt_dim1] * xopt[j];
+                    /* L240: */
+                    sum += bmat[k + j * bmat_dim1] * d__[j];
+                }
+                w[k] = suma * (half * suma + sumb);
+                vlag[k] = sum;
+                /* L250: */
+                w[npt + k] = suma;
+            }
+            beta = zero;
+            i__1 = nptm;
+            for (jj = 1; jj <= i__1; ++jj) {
+                sum = zero;
+                i__2 = npt;
+                for (k = 1; k <= i__2; ++k) {
+                    /* L260: */
+                    sum += zmat[k + jj * zmat_dim1] * w[k];
+                }
+                beta -= sum * sum;
+                i__2 = npt;
+                for (k = 1; k <= i__2; ++k) {
+                    /* L270: */
+                    vlag[k] += sum * zmat[k + jj * zmat_dim1];
+                }
+            }
+            dsq = zero;
+            bsum = zero;
+            dx = zero;
+            i__2 = n;
+            for (j = 1; j <= i__2; ++j) {
+                /* Computing 2nd power */
+                d__1 = d__[j];
+                dsq += d__1 * d__1;
+                sum = zero;
+                i__1 = npt;
+                for (k = 1; k <= i__1; ++k) {
+                    /* L280: */
+                    sum += w[k] * bmat[k + j * bmat_dim1];
+                }
+                bsum += sum * d__[j];
+                jp = npt + j;
+                i__1 = n;
+                for (i__ = 1; i__ <= i__1; ++i__) {
+                    /* L290: */
+                    sum += bmat[jp + i__ * bmat_dim1] * d__[i__];
+                }
+                vlag[jp] = sum;
+                bsum += sum * d__[j];
+                /* L300: */
+                dx += d__[j] * xopt[j];
+            }
+            beta = dx * dx + dsq * (xoptsq + dx + dx + half * dsq) + beta - bsum;
+            vlag[kopt] += one;
+
+            /*     If NTRITS is zero, the denominator may be increased by replacing */
+            /*     the step D of ALTMOV by a Cauchy step. Then RESCUE may be called if */
+            /*     rounding errors have damaged the chosen denominator. */
+
+            if (ntrits == 0) {
+                /* Computing 2nd power */
+                d__1 = vlag[knew];
+                denom = d__1 * d__1 + alpha * beta;
+                if (denom < cauchy && cauchy > zero) {
+                    i__2 = n;
+                    for (i__ = 1; i__ <= i__2; ++i__) {
+                        xnew[i__] = xalt[i__];
+                        /* L310: */
+                        d__[i__] = xnew[i__] - xopt[i__];
+                    }
+                    cauchy = zero;
+                    goto L230;
+                }
+                /* Computing 2nd power */
+                d__1 = vlag[knew];
+                if (denom <= half * (d__1 * d__1)) {
+                    if (nf > nresc) {
+                        goto L190;
+                    }
+                    throw bobyqa_failure("Return from BOBYQA because of much cancellation in a denominator.");
+                    //goto L720;
+                }
+
+                /*     Alternatively, if NTRITS is positive, then set KNEW to the index of */
+                /*     the next interpolation point to be deleted to make room for a trust */
+                /*     region step. Again RESCUE may be called if rounding errors have damaged */
+                /*     the chosen denominator, which is the reason for attempting to select */
+                /*     KNEW before calculating the next value of the objective function. */
+
+            } else {
+                delsq = delta * delta;
+                scaden = zero;
+                biglsq = zero;
+                knew = 0;
+                i__2 = npt;
+                for (k = 1; k <= i__2; ++k) {
+                    if (k == kopt) {
+                        goto L350;
+                    }
+                    hdiag = zero;
+                    i__1 = nptm;
+                    for (jj = 1; jj <= i__1; ++jj) {
+                        /* L330: */
+                        /* Computing 2nd power */
+                        d__1 = zmat[k + jj * zmat_dim1];
+                        hdiag += d__1 * d__1;
+                    }
+                    /* Computing 2nd power */
+                    d__1 = vlag[k];
+                    den = beta * hdiag + d__1 * d__1;
+                    distsq = zero;
+                    i__1 = n;
+                    for (j = 1; j <= i__1; ++j) {
+                        /* L340: */
+                        /* Computing 2nd power */
+                        d__1 = xpt[k + j * xpt_dim1] - xopt[j];
+                        distsq += d__1 * d__1;
+                    }
+                    /* Computing MAX */
+                    /* Computing 2nd power */
+                    d__3 = distsq / delsq;
+                    d__1 = one, d__2 = d__3 * d__3;
+                    temp = std::max(d__1,d__2);
+                    if (temp * den > scaden) {
+                        scaden = temp * den;
+                        knew = k;
+                        denom = den;
+                    }
+                    /* Computing MAX */
+                    /* Computing 2nd power */
+                    d__3 = vlag[k];
+                    d__1 = biglsq, d__2 = temp * (d__3 * d__3);
+                    biglsq = std::max(d__1,d__2);
+L350:
+                    ;
+                }
+                if (scaden <= half * biglsq) {
+                    if (nf > nresc) {
+                        goto L190;
+                    }
+                    throw bobyqa_failure("Return from BOBYQA because of much cancellation in a denominator.");
+                    //goto L720;
+                }
+            }
+
+            /*     Put the variables for the next calculation of the objective function */
+            /*       in XNEW, with any adjustments for the bounds. */
+
+
+            /*     Calculate the value of the objective function at XBASE+XNEW, unless */
+            /*       the limit on the number of calculations of F has been reached. */
+
+L360:
+            i__2 = n;
+            for (i__ = 1; i__ <= i__2; ++i__) {
+                /* Computing MIN */
+                /* Computing MAX */
+                d__3 = xl[i__], d__4 = xbase[i__] + xnew[i__];
+                d__1 = std::max(d__3,d__4), d__2 = xu[i__];
+                x[i__] = std::min(d__1,d__2);
+                if (xnew[i__] == sl[i__]) {
+                    x[i__] = xl[i__];
+                }
+                if (xnew[i__] == su[i__]) {
+                    x[i__] = xu[i__];
+                }
+                /* L380: */
+            }
+            if (nf >= maxfun) {
+                throw bobyqa_failure("Return from BOBYQA because the objective function has been called max_f_evals times.");
+                //goto L720;
+            }
+            ++nf;
+            f = calfun(mat(&x[1], n));
+            if (ntrits == -1) {
+                fsave = f;
+                goto L720;
+            }
+
+            /*     Use the quadratic model to predict the change in F due to the step D, */
+            /*       and set DIFF to the error of this prediction. */
+
+            fopt = fval[kopt];
+            vquad = zero;
+            ih = 0;
+            i__2 = n;
+            for (j = 1; j <= i__2; ++j) {
+                vquad += d__[j] * gopt[j];
+                i__1 = j;
+                for (i__ = 1; i__ <= i__1; ++i__) {
+                    ++ih;
+                    temp = d__[i__] * d__[j];
+                    if (i__ == j) {
+                        temp = half * temp;
+                    }
+                    /* L410: */
+                    vquad += hq[ih] * temp;
+                }
+            }
+            i__1 = npt;
+            for (k = 1; k <= i__1; ++k) {
+                /* L420: */
+                /* Computing 2nd power */
+                d__1 = w[npt + k];
+                vquad += half * pq[k] * (d__1 * d__1);
+            }
+            diff = f - fopt - vquad;
+            diffc = diffb;
+            diffb = diffa;
+            diffa = std::abs(diff);
+            if (dnorm > rho) {
+                nfsav = nf;
+            }
+
+            /*     Pick the next value of DELTA after a trust region step. */
+
+            if (ntrits > 0) {
+                if (vquad >= zero) {
+                    throw bobyqa_failure("Return from BOBYQA because a trust region step has failed to reduce Q.");
+                    //goto L720;
+                }
+                ratio = (f - fopt) / vquad;
+                if (ratio <= tenth) {
+                    /* Computing MIN */
+                    d__1 = half * delta;
+                    delta = std::min(d__1,dnorm);
+                } else if (ratio <= .7) {
+                    /* Computing MAX */
+                    d__1 = half * delta;
+                    delta = std::max(d__1,dnorm);
+                } else {
+                    /* Computing MAX */
+                    d__1 = half * delta, d__2 = dnorm + dnorm;
+                    delta = std::max(d__1,d__2);
+                }
+                if (delta <= rho * 1.5) {
+                    delta = rho;
+                }
+
+                /*     Recalculate KNEW and DENOM if the new F is less than FOPT. */
+
+                if (f < fopt) {
+                    ksav = knew;
+                    densav = denom;
+                    delsq = delta * delta;
+                    scaden = zero;
+                    biglsq = zero;
+                    knew = 0;
+                    i__1 = npt;
+                    for (k = 1; k <= i__1; ++k) {
+                        hdiag = zero;
+                        i__2 = nptm;
+                        for (jj = 1; jj <= i__2; ++jj) {
+                            /* L440: */
+                            /* Computing 2nd power */
+                            d__1 = zmat[k + jj * zmat_dim1];
+                            hdiag += d__1 * d__1;
+                        }
+                        /* Computing 2nd power */
+                        d__1 = vlag[k];
+                        den = beta * hdiag + d__1 * d__1;
+                        distsq = zero;
+                        i__2 = n;
+                        for (j = 1; j <= i__2; ++j) {
+                            /* L450: */
+                            /* Computing 2nd power */
+                            d__1 = xpt[k + j * xpt_dim1] - xnew[j];
+                            distsq += d__1 * d__1;
+                        }
+                        /* Computing MAX */
+                        /* Computing 2nd power */
+                        d__3 = distsq / delsq;
+                        d__1 = one, d__2 = d__3 * d__3;
+                        temp = std::max(d__1,d__2);
+                        if (temp * den > scaden) {
+                            scaden = temp * den;
+                            knew = k;
+                            denom = den;
+                        }
+                        /* L460: */
+                        /* Computing MAX */
+                        /* Computing 2nd power */
+                        d__3 = vlag[k];
+                        d__1 = biglsq, d__2 = temp * (d__3 * d__3);
+                        biglsq = std::max(d__1,d__2);
+                    }
+                    if (scaden <= half * biglsq) {
+                        knew = ksav;
+                        denom = densav;
+                    }
+                }
+            }
+
+            /*     Update BMAT and ZMAT, so that the KNEW-th interpolation point can be */
+            /*     moved. Also update the second derivative terms of the model. */
+
+            update_(n, npt, &bmat[bmat_offset], &zmat[zmat_offset], ndim, &vlag[1], 
+                    beta, denom, knew, &w[1]);
+            ih = 0;
+            pqold = pq[knew];
+            pq[knew] = zero;
+            i__1 = n;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                temp = pqold * xpt[knew + i__ * xpt_dim1];
+                i__2 = i__;
+                for (j = 1; j <= i__2; ++j) {
+                    ++ih;
+                    /* L470: */
+                    hq[ih] += temp * xpt[knew + j * xpt_dim1];
+                }
+            }
+            i__2 = nptm;
+            for (jj = 1; jj <= i__2; ++jj) {
+                temp = diff * zmat[knew + jj * zmat_dim1];
+                i__1 = npt;
+                for (k = 1; k <= i__1; ++k) {
+                    /* L480: */
+                    pq[k] += temp * zmat[k + jj * zmat_dim1];
+                }
+            }
+
+            /*     Include the new interpolation point, and make the changes to GOPT at */
+            /*     the old XOPT that are caused by the updating of the quadratic model. */
+
+            fval[knew] = f;
+            i__1 = n;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                xpt[knew + i__ * xpt_dim1] = xnew[i__];
+                /* L490: */
+                w[i__] = bmat[knew + i__ * bmat_dim1];
+            }
+            i__1 = npt;
+            for (k = 1; k <= i__1; ++k) {
+                suma = zero;
+                i__2 = nptm;
+                for (jj = 1; jj <= i__2; ++jj) {
+                    /* L500: */
+                    suma += zmat[knew + jj * zmat_dim1] * zmat[k + jj * zmat_dim1];
+                }
+                sumb = zero;
+                i__2 = n;
+                for (j = 1; j <= i__2; ++j) {
+                    /* L510: */
+                    sumb += xpt[k + j * xpt_dim1] * xopt[j];
+                }
+                temp = suma * sumb;
+                i__2 = n;
+                for (i__ = 1; i__ <= i__2; ++i__) {
+                    /* L520: */
+                    w[i__] += temp * xpt[k + i__ * xpt_dim1];
+                }
+            }
+            i__2 = n;
+            for (i__ = 1; i__ <= i__2; ++i__) {
+                /* L530: */
+                gopt[i__] += diff * w[i__];
+            }
+
+            /*     Update XOPT, GOPT and KOPT if the new calculated F is less than FOPT. */
+
+            if (f < fopt) {
+                kopt = knew;
+                xoptsq = zero;
+                ih = 0;
+                i__2 = n;
+                for (j = 1; j <= i__2; ++j) {
+                    xopt[j] = xnew[j];
+                    /* Computing 2nd power */
+                    d__1 = xopt[j];
+                    xoptsq += d__1 * d__1;
+                    i__1 = j;
+                    for (i__ = 1; i__ <= i__1; ++i__) {
+                        ++ih;
+                        if (i__ < j) {
+                            gopt[j] += hq[ih] * d__[i__];
+                        }
+                        /* L540: */
+                        gopt[i__] += hq[ih] * d__[j];
+                    }
+                }
+                i__1 = npt;
+                for (k = 1; k <= i__1; ++k) {
+                    temp = zero;
+                    i__2 = n;
+                    for (j = 1; j <= i__2; ++j) {
+                        /* L550: */
+                        temp += xpt[k + j * xpt_dim1] * d__[j];
+                    }
+                    temp = pq[k] * temp;
+                    i__2 = n;
+                    for (i__ = 1; i__ <= i__2; ++i__) {
+                        /* L560: */
+                        gopt[i__] += temp * xpt[k + i__ * xpt_dim1];
+                    }
+                }
+            }
+
+            /*     Calculate the parameters of the least Frobenius norm interpolant to */
+            /*     the current data, the gradient of this interpolant at XOPT being put */
+            /*     into VLAG(NPT+I), I=1,2,...,N. */
+
+            if (ntrits > 0) {
+                i__2 = npt;
+                for (k = 1; k <= i__2; ++k) {
+                    vlag[k] = fval[k] - fval[kopt];
+                    /* L570: */
+                    w[k] = zero;
+                }
+                i__2 = nptm;
+                for (j = 1; j <= i__2; ++j) {
+                    sum = zero;
+                    i__1 = npt;
+                    for (k = 1; k <= i__1; ++k) {
+                        /* L580: */
+                        sum += zmat[k + j * zmat_dim1] * vlag[k];
+                    }
+                    i__1 = npt;
+                    for (k = 1; k <= i__1; ++k) {
+                        /* L590: */
+                        w[k] += sum * zmat[k + j * zmat_dim1];
+                    }
+                }
+                i__1 = npt;
+                for (k = 1; k <= i__1; ++k) {
+                    sum = zero;
+                    i__2 = n;
+                    for (j = 1; j <= i__2; ++j) {
+                        /* L600: */
+                        sum += xpt[k + j * xpt_dim1] * xopt[j];
+                    }
+                    w[k + npt] = w[k];
+                    /* L610: */
+                    w[k] = sum * w[k];
+                }
+                gqsq = zero;
+                gisq = zero;
+                i__1 = n;
+                for (i__ = 1; i__ <= i__1; ++i__) {
+                    sum = zero;
+                    i__2 = npt;
+                    for (k = 1; k <= i__2; ++k) {
+                        /* L620: */
+                        sum = sum + bmat[k + i__ * bmat_dim1] * vlag[k] + xpt[k + i__ 
+                            * xpt_dim1] * w[k];
+                    }
+                    if (xopt[i__] == sl[i__]) {
+                        /* Computing MIN */
+                        d__2 = zero, d__3 = gopt[i__];
+                        /* Computing 2nd power */
+                        d__1 = std::min(d__2,d__3);
+                        gqsq += d__1 * d__1;
+                        /* Computing 2nd power */
+                        d__1 = std::min(zero,sum);
+                        gisq += d__1 * d__1;
+                    } else if (xopt[i__] == su[i__]) {
+                        /* Computing MAX */
+                        d__2 = zero, d__3 = gopt[i__];
+                        /* Computing 2nd power */
+                        d__1 = std::max(d__2,d__3);
+                        gqsq += d__1 * d__1;
+                        /* Computing 2nd power */
+                        d__1 = std::max(zero,sum);
+                        gisq += d__1 * d__1;
+                    } else {
+                        /* Computing 2nd power */
+                        d__1 = gopt[i__];
+                        gqsq += d__1 * d__1;
+                        gisq += sum * sum;
+                    }
+                    /* L630: */
+                    vlag[npt + i__] = sum;
+                }
+
+                /*     Test whether to replace the new quadratic model by the least Frobenius */
+                /*     norm interpolant, making the replacement if the test is satisfied. */
+
+                ++itest;
+                if (gqsq < ten * gisq) {
+                    itest = 0;
+                }
+                if (itest >= 3) {
+                    i__1 = std::max(npt,nh);
+                    for (i__ = 1; i__ <= i__1; ++i__) {
+                        if (i__ <= n) {
+                            gopt[i__] = vlag[npt + i__];
+                        }
+                        if (i__ <= npt) {
+                            pq[i__] = w[npt + i__];
+                        }
+                        if (i__ <= nh) {
+                            hq[i__] = zero;
+                        }
+                        itest = 0;
+                        /* L640: */
+                    }
+                }
+            }
+
+            /*     If a trust region step has provided a sufficient decrease in F, then */
+            /*     branch for another trust region calculation. The case NTRITS=0 occurs */
+            /*     when the new interpolation point was reached by an alternative step. */
+
+            if (ntrits == 0) {
+                goto L60;
+            }
+            if (f <= fopt + tenth * vquad) {
+                goto L60;
+            }
+
+            /*     Alternatively, find out if the interpolation points are close enough */
+            /*       to the best point so far. */
+
+            /* Computing MAX */
+            /* Computing 2nd power */
+            d__3 = two * delta;
+            /* Computing 2nd power */
+            d__4 = ten * rho;
+            d__1 = d__3 * d__3, d__2 = d__4 * d__4;
+            distsq = std::max(d__1,d__2);
+L650:
+            knew = 0;
+            i__1 = npt;
+            for (k = 1; k <= i__1; ++k) {
+                sum = zero;
+                i__2 = n;
+                for (j = 1; j <= i__2; ++j) {
+                    /* L660: */
+                    /* Computing 2nd power */
+                    d__1 = xpt[k + j * xpt_dim1] - xopt[j];
+                    sum += d__1 * d__1;
+                }
+                if (sum > distsq) {
+                    knew = k;
+                    distsq = sum;
+                }
+                /* L670: */
+            }
+
+            /*     If KNEW is positive, then ALTMOV finds alternative new positions for */
+            /*     the KNEW-th interpolation point within distance ADELT of XOPT. It is */
+            /*     reached via label 90. Otherwise, there is a branch to label 60 for */
+            /*     another trust region iteration, unless the calculations with the */
+            /*     current RHO are complete. */
+
+            if (knew > 0) {
+                dist = std::sqrt(distsq);
+                if (ntrits == -1) {
+                    /* Computing MIN */
+                    d__1 = tenth * delta, d__2 = half * dist;
+                    delta = std::min(d__1,d__2);
+                    if (delta <= rho * 1.5) {
+                        delta = rho;
+                    }
+                }
+                ntrits = 0;
+                /* Computing MAX */
+                /* Computing MIN */
+                d__2 = tenth * dist;
+                d__1 = std::min(d__2,delta);
+                adelt = std::max(d__1,rho);
+                dsq = adelt * adelt;
+                goto L90;
+            }
+            if (ntrits == -1) {
+                goto L680;
+            }
+            if (ratio > zero) {
+                goto L60;
+            }
+            if (std::max(delta,dnorm) > rho) {
+                goto L60;
+            }
+
+            /*     The calculations with the current value of RHO are complete. Pick the */
+            /*       next values of RHO and DELTA. */
+
+L680:
+            if (rho > rhoend) {
+                delta = half * rho;
+                ratio = rho / rhoend;
+                if (ratio <= 16.) {
+                    rho = rhoend;
+                } else if (ratio <= 250.) {
+                    rho = std::sqrt(ratio) * rhoend;
+                } else {
+                    rho = tenth * rho;
+                }
+                delta = std::max(delta,rho);
+                ntrits = 0;
+                nfsav = nf;
+                goto L60;
+            }
+
+            /*     Return from the calculation, after another Newton-Raphson step, if */
+            /*       it is too short to have been tried before. */
+
+            if (ntrits == -1) {
+                goto L360;
+            }
+L720:
+            if (fval[kopt] <= fsave) {
+                i__1 = n;
+                for (i__ = 1; i__ <= i__1; ++i__) {
+                    /* Computing MIN */
+                    /* Computing MAX */
+                    d__3 = xl[i__], d__4 = xbase[i__] + xopt[i__];
+                    d__1 = std::max(d__3,d__4), d__2 = xu[i__];
+                    x[i__] = std::min(d__1,d__2);
+                    if (xopt[i__] == sl[i__]) {
+                        x[i__] = xl[i__];
+                    }
+                    if (xopt[i__] == su[i__]) {
+                        x[i__] = xu[i__];
+                    }
+                    /* L730: */
+                }
+                f = fval[kopt];
+            }
+
+            return f;
+        } /* bobyqb_ */
+
+    // ----------------------------------------------------------------------------------------
+
+        void altmov_(
+            const integer n,
+            const integer npt,
+            const doublereal *xpt, 
+            const doublereal *xopt,
+            const doublereal *bmat,
+            const doublereal *zmat,
+            const integer ndim, 
+            const doublereal *sl,
+            const doublereal *su,
+            const integer kopt,
+            const integer knew, 
+            const doublereal adelt,
+            doublereal *xnew,
+            doublereal *xalt,
+            doublereal& alpha,
+            doublereal& cauchy,
+            doublereal *glag,
+            doublereal *hcol, 
+            doublereal *w
+        ) const
+        {
+            /* System generated locals */
+            integer xpt_dim1, xpt_offset, bmat_dim1, bmat_offset, zmat_dim1, 
+            zmat_offset, i__1, i__2;
+            doublereal d__1, d__2, d__3, d__4;
+
+
+            /* Local variables */
+            integer i__, j, k;
+            doublereal ha, gw, one, diff, half;
+            integer ilbd, isbd;
+            doublereal slbd;
+            integer iubd;
+            doublereal vlag, subd, temp;
+            integer ksav = 0;
+            doublereal step = 0, zero = 0, curv = 0;
+            integer iflag;
+            doublereal scale = 0, csave = 0, tempa = 0, tempb = 0, tempd = 0, const__ = 0, sumin = 0, 
+                       ggfree = 0;
+            integer ibdsav = 0;
+            doublereal dderiv = 0, bigstp = 0, predsq = 0, presav = 0, distsq = 0, stpsav = 0, wfixsq = 0, wsqsav = 0;
+
+
+            /*     The arguments N, NPT, XPT, XOPT, BMAT, ZMAT, NDIM, SL and SU all have */
+            /*       the same meanings as the corresponding arguments of BOBYQB. */
+            /*     KOPT is the index of the optimal interpolation point. */
+            /*     KNEW is the index of the interpolation point that is going to be moved. */
+            /*     ADELT is the current trust region bound. */
+            /*     XNEW will be set to a suitable new position for the interpolation point */
+            /*       XPT(KNEW,.). Specifically, it satisfies the SL, SU and trust region */
+            /*       bounds and it should provide a large denominator in the next call of */
+            /*       UPDATE. The step XNEW-XOPT from XOPT is restricted to moves along the */
+            /*       straight lines through XOPT and another interpolation point. */
+            /*     XALT also provides a large value of the modulus of the KNEW-th Lagrange */
+            /*       function subject to the constraints that have been mentioned, its main */
+            /*       difference from XNEW being that XALT-XOPT is a constrained version of */
+            /*       the Cauchy step within the trust region. An exception is that XALT is */
+            /*       not calculated if all components of GLAG (see below) are zero. */
+            /*     ALPHA will be set to the KNEW-th diagonal element of the H matrix. */
+            /*     CAUCHY will be set to the square of the KNEW-th Lagrange function at */
+            /*       the step XALT-XOPT from XOPT for the vector XALT that is returned, */
+            /*       except that CAUCHY is set to zero if XALT is not calculated. */
+            /*     GLAG is a working space vector of length N for the gradient of the */
+            /*       KNEW-th Lagrange function at XOPT. */
+            /*     HCOL is a working space vector of length NPT for the second derivative */
+            /*       coefficients of the KNEW-th Lagrange function. */
+            /*     W is a working space vector of length 2N that is going to hold the */
+            /*       constrained Cauchy step from XOPT of the Lagrange function, followed */
+            /*       by the downhill version of XALT when the uphill step is calculated. */
+
+            /*     Set the first NPT components of W to the leading elements of the */
+            /*     KNEW-th column of the H matrix. */
+
+            /* Parameter adjustments */
+            zmat_dim1 = npt;
+            zmat_offset = 1 + zmat_dim1;
+            zmat -= zmat_offset;
+            xpt_dim1 = npt;
+            xpt_offset = 1 + xpt_dim1;
+            xpt -= xpt_offset;
+            --xopt;
+            bmat_dim1 = ndim;
+            bmat_offset = 1 + bmat_dim1;
+            bmat -= bmat_offset;
+            --sl;
+            --su;
+            --xnew;
+            --xalt;
+            --glag;
+            --hcol;
+            --w;
+
+            /* Function Body */
+            half = .5;
+            one = 1.;
+            zero = 0.;
+            const__ = one + std::sqrt(2.);
+            i__1 = npt;
+            for (k = 1; k <= i__1; ++k) {
+                /* L10: */
+                hcol[k] = zero;
+            }
+            i__1 = npt - n - 1;
+            for (j = 1; j <= i__1; ++j) {
+                temp = zmat[knew + j * zmat_dim1];
+                i__2 = npt;
+                for (k = 1; k <= i__2; ++k) {
+                    /* L20: */
+                    hcol[k] += temp * zmat[k + j * zmat_dim1];
+                }
+            }
+            alpha = hcol[knew];
+            ha = half * alpha;
+
+            /*     Calculate the gradient of the KNEW-th Lagrange function at XOPT. */
+
+            i__2 = n;
+            for (i__ = 1; i__ <= i__2; ++i__) {
+                /* L30: */
+                glag[i__] = bmat[knew + i__ * bmat_dim1];
+            }
+            i__2 = npt;
+            for (k = 1; k <= i__2; ++k) {
+                temp = zero;
+                i__1 = n;
+                for (j = 1; j <= i__1; ++j) {
+                    /* L40: */
+                    temp += xpt[k + j * xpt_dim1] * xopt[j];
+                }
+                temp = hcol[k] * temp;
+                i__1 = n;
+                for (i__ = 1; i__ <= i__1; ++i__) {
+                    /* L50: */
+                    glag[i__] += temp * xpt[k + i__ * xpt_dim1];
+                }
+            }
+
+            /*     Search for a large denominator along the straight lines through XOPT */
+            /*     and another interpolation point. SLBD and SUBD will be lower and upper */
+            /*     bounds on the step along each of these lines in turn. PREDSQ will be */
+            /*     set to the square of the predicted denominator for each line. PRESAV */
+            /*     will be set to the largest admissible value of PREDSQ that occurs. */
+
+            presav = zero;
+            i__1 = npt;
+            for (k = 1; k <= i__1; ++k) {
+                if (k == kopt) {
+                    goto L80;
+                }
+                dderiv = zero;
+                distsq = zero;
+                i__2 = n;
+                for (i__ = 1; i__ <= i__2; ++i__) {
+                    temp = xpt[k + i__ * xpt_dim1] - xopt[i__];
+                    dderiv += glag[i__] * temp;
+                    /* L60: */
+                    distsq += temp * temp;
+                }
+                subd = adelt / std::sqrt(distsq);
+                slbd = -subd;
+                ilbd = 0;
+                iubd = 0;
+                sumin = std::min(one,subd);
+
+                /*     Revise SLBD and SUBD if necessary because of the bounds in SL and SU. */
+
+                i__2 = n;
+                for (i__ = 1; i__ <= i__2; ++i__) {
+                    temp = xpt[k + i__ * xpt_dim1] - xopt[i__];
+                    if (temp > zero) {
+                        if (slbd * temp < sl[i__] - xopt[i__]) {
+                            slbd = (sl[i__] - xopt[i__]) / temp;
+                            ilbd = -i__;
+                        }
+                        if (subd * temp > su[i__] - xopt[i__]) {
+                            /* Computing MAX */
+                            d__1 = sumin, d__2 = (su[i__] - xopt[i__]) / temp;
+                            subd = std::max(d__1,d__2);
+                            iubd = i__;
+                        }
+                    } else if (temp < zero) {
+                        if (slbd * temp > su[i__] - xopt[i__]) {
+                            slbd = (su[i__] - xopt[i__]) / temp;
+                            ilbd = i__;
+                        }
+                        if (subd * temp < sl[i__] - xopt[i__]) {
+                            /* Computing MAX */
+                            d__1 = sumin, d__2 = (sl[i__] - xopt[i__]) / temp;
+                            subd = std::max(d__1,d__2);
+                            iubd = -i__;
+                        }
+                    }
+                    /* L70: */
+                }
+
+                /*     Seek a large modulus of the KNEW-th Lagrange function when the index */
+                /*     of the other interpolation point on the line through XOPT is KNEW. */
+
+                if (k == knew) {
+                    diff = dderiv - one;
+                    step = slbd;
+                    vlag = slbd * (dderiv - slbd * diff);
+                    isbd = ilbd;
+                    temp = subd * (dderiv - subd * diff);
+                    if (std::abs(temp) > std::abs(vlag)) {
+                        step = subd;
+                        vlag = temp;
+                        isbd = iubd;
+                    }
+                    tempd = half * dderiv;
+                    tempa = tempd - diff * slbd;
+                    tempb = tempd - diff * subd;
+                    if (tempa * tempb < zero) {
+                        temp = tempd * tempd / diff;
+                        if (std::abs(temp) > std::abs(vlag)) {
+                            step = tempd / diff;
+                            vlag = temp;
+                            isbd = 0;
+                        }
+                    }
+
+                    /*     Search along each of the other lines through XOPT and another point. */
+
+                } else {
+                    step = slbd;
+                    vlag = slbd * (one - slbd);
+                    isbd = ilbd;
+                    temp = subd * (one - subd);
+                    if (std::abs(temp) > std::abs(vlag)) {
+                        step = subd;
+                        vlag = temp;
+                        isbd = iubd;
+                    }
+                    if (subd > half) {
+                        if (std::abs(vlag) < .25) {
+                            step = half;
+                            vlag = .25;
+                            isbd = 0;
+                        }
+                    }
+                    vlag *= dderiv;
+                }
+
+                /*     Calculate PREDSQ for the current line search and maintain PRESAV. */
+
+                temp = step * (one - step) * distsq;
+                predsq = vlag * vlag * (vlag * vlag + ha * temp * temp);
+                if (predsq > presav) {
+                    presav = predsq;
+                    ksav = k;
+                    stpsav = step;
+                    ibdsav = isbd;
+                }
+L80:
+                ;
+            }
+
+            /*     Construct XNEW in a way that satisfies the bound constraints exactly. */
+
+            i__1 = n;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                temp = xopt[i__] + stpsav * (xpt[ksav + i__ * xpt_dim1] - xopt[i__]);
+                /* L90: */
+                /* Computing MAX */
+                /* Computing MIN */
+                d__3 = su[i__];
+                d__1 = sl[i__], d__2 = std::min(d__3,temp);
+                xnew[i__] = std::max(d__1,d__2);
+            }
+            if (ibdsav < 0) {
+                xnew[-ibdsav] = sl[-ibdsav];
+            }
+            if (ibdsav > 0) {
+                xnew[ibdsav] = su[ibdsav];
+            }
+
+            /*     Prepare for the iterative method that assembles the constrained Cauchy */
+            /*     step in W. The sum of squares of the fixed components of W is formed in */
+            /*     WFIXSQ, and the free components of W are set to BIGSTP. */
+
+            bigstp = adelt + adelt;
+            iflag = 0;
+L100:
+            wfixsq = zero;
+            ggfree = zero;
+            i__1 = n;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                w[i__] = zero;
+                /* Computing MIN */
+                d__1 = xopt[i__] - sl[i__], d__2 = glag[i__];
+                tempa = std::min(d__1,d__2);
+                /* Computing MAX */
+                d__1 = xopt[i__] - su[i__], d__2 = glag[i__];
+                tempb = std::max(d__1,d__2);
+                if (tempa > zero || tempb < zero) {
+                    w[i__] = bigstp;
+                    /* Computing 2nd power */
+                    d__1 = glag[i__];
+                    ggfree += d__1 * d__1;
+                }
+                /* L110: */
+            }
+            if (ggfree == zero) {
+                cauchy = zero;
+                goto L200;
+            }
+
+            /*     Investigate whether more components of W can be fixed. */
+
+L120:
+            temp = adelt * adelt - wfixsq;
+            if (temp > zero) {
+                wsqsav = wfixsq;
+                step = std::sqrt(temp / ggfree);
+                ggfree = zero;
+                i__1 = n;
+                for (i__ = 1; i__ <= i__1; ++i__) {
+                    if (w[i__] == bigstp) {
+                        temp = xopt[i__] - step * glag[i__];
+                        if (temp <= sl[i__]) {
+                            w[i__] = sl[i__] - xopt[i__];
+                            /* Computing 2nd power */
+                            d__1 = w[i__];
+                            wfixsq += d__1 * d__1;
+                        } else if (temp >= su[i__]) {
+                            w[i__] = su[i__] - xopt[i__];
+                            /* Computing 2nd power */
+                            d__1 = w[i__];
+                            wfixsq += d__1 * d__1;
+                        } else {
+                            /* Computing 2nd power */
+                            d__1 = glag[i__];
+                            ggfree += d__1 * d__1;
+                        }
+                    }
+                    /* L130: */
+                }
+                if (wfixsq > wsqsav && ggfree > zero) {
+                    goto L120;
+                }
+            }
+
+            /*     Set the remaining free components of W and all components of XALT, */
+            /*     except that W may be scaled later. */
+
+            gw = zero;
+            i__1 = n;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                if (w[i__] == bigstp) {
+                    w[i__] = -step * glag[i__];
+                    /* Computing MAX */
+                    /* Computing MIN */
+                    d__3 = su[i__], d__4 = xopt[i__] + w[i__];
+                    d__1 = sl[i__], d__2 = std::min(d__3,d__4);
+                    xalt[i__] = std::max(d__1,d__2);
+                } else if (w[i__] == zero) {
+                    xalt[i__] = xopt[i__];
+                } else if (glag[i__] > zero) {
+                    xalt[i__] = sl[i__];
+                } else {
+                    xalt[i__] = su[i__];
+                }
+                /* L140: */
+                gw += glag[i__] * w[i__];
+            }
+
+            /*     Set CURV to the curvature of the KNEW-th Lagrange function along W. */
+            /*     Scale W by a factor less than one if that can reduce the modulus of */
+            /*     the Lagrange function at XOPT+W. Set CAUCHY to the final value of */
+            /*     the square of this function. */
+
+            curv = zero;
+            i__1 = npt;
+            for (k = 1; k <= i__1; ++k) {
+                temp = zero;
+                i__2 = n;
+                for (j = 1; j <= i__2; ++j) {
+                    /* L150: */
+                    temp += xpt[k + j * xpt_dim1] * w[j];
+                }
+                /* L160: */
+                curv += hcol[k] * temp * temp;
+            }
+            if (iflag == 1) {
+                curv = -curv;
+            }
+            if (curv > -gw && curv < -const__ * gw) {
+                scale = -gw / curv;
+                i__1 = n;
+                for (i__ = 1; i__ <= i__1; ++i__) {
+                    temp = xopt[i__] + scale * w[i__];
+                    /* L170: */
+                    /* Computing MAX */
+                    /* Computing MIN */
+                    d__3 = su[i__];
+                    d__1 = sl[i__], d__2 = std::min(d__3,temp);
+                    xalt[i__] = std::max(d__1,d__2);
+                }
+                /* Computing 2nd power */
+                d__1 = half * gw * scale;
+                cauchy = d__1 * d__1;
+            } else {
+                /* Computing 2nd power */
+                d__1 = gw + half * curv;
+                cauchy = d__1 * d__1;
+            }
+
+            /*     If IFLAG is zero, then XALT is calculated as before after reversing */
+            /*     the sign of GLAG. Thus two XALT vectors become available. The one that */
+            /*     is chosen is the one that gives the larger value of CAUCHY. */
+
+            if (iflag == 0) {
+                i__1 = n;
+                for (i__ = 1; i__ <= i__1; ++i__) {
+                    glag[i__] = -glag[i__];
+                    /* L180: */
+                    w[n + i__] = xalt[i__];
+                }
+                csave = cauchy;
+                iflag = 1;
+                goto L100;
+            }
+            if (csave > cauchy) {
+                i__1 = n;
+                for (i__ = 1; i__ <= i__1; ++i__) {
+                    /* L190: */
+                    xalt[i__] = w[n + i__];
+                }
+                cauchy = csave;
+            }
+L200:
+            ;
+        } /* altmov_ */
+
+    // ----------------------------------------------------------------------------------------
+
+        template <typename funct>
+        void prelim_(
+            const funct& calfun,
+            const integer n,
+            const integer npt,
+            doublereal *x, 
+            const doublereal *xl,
+            const doublereal *xu,
+            const doublereal rhobeg,
+            const integer maxfun,
+            doublereal *xbase,
+            doublereal *xpt,
+            doublereal *fval,
+            doublereal *gopt,
+            doublereal *hq,
+            doublereal *pq,
+            doublereal *bmat, 
+            doublereal *zmat,
+            const integer ndim,
+            const doublereal *sl,
+            const doublereal *su, 
+            integer& nf, 
+            integer& kopt
+        ) const
+        {
+            /* System generated locals */
+            integer xpt_dim1, xpt_offset, bmat_dim1, bmat_offset, zmat_dim1, 
+            zmat_offset, i__1, i__2;
+            doublereal d__1, d__2, d__3, d__4;
+
+
+            /* Local variables */
+            doublereal f;
+            integer i__, j, k, ih, np, nfm;
+            doublereal one;
+            integer nfx = 0, ipt = 0, jpt = 0;
+            doublereal two = 0, fbeg = 0, diff = 0, half = 0, temp = 0, zero = 0, recip = 0, stepa = 0, stepb = 0;
+            integer itemp;
+            doublereal rhosq;
+
+
+
+            /*     The arguments N, NPT, X, XL, XU, RHOBEG, IPRINT and MAXFUN are the */
+            /*       same as the corresponding arguments in SUBROUTINE BOBYQA. */
+            /*     The arguments XBASE, XPT, FVAL, HQ, PQ, BMAT, ZMAT, NDIM, SL and SU */
+            /*       are the same as the corresponding arguments in BOBYQB, the elements */
+            /*       of SL and SU being set in BOBYQA. */
+            /*     GOPT is usually the gradient of the quadratic model at XOPT+XBASE, but */
+            /*       it is set by PRELIM to the gradient of the quadratic model at XBASE. */
+            /*       If XOPT is nonzero, BOBYQB will change it to its usual value later. */
+            /*     NF is maintaned as the number of calls of CALFUN so far. */
+            /*     KOPT will be such that the least calculated value of F so far is at */
+            /*       the point XPT(KOPT,.)+XBASE in the space of the variables. */
+
+            /*     SUBROUTINE PRELIM sets the elements of XBASE, XPT, FVAL, GOPT, HQ, PQ, */
+            /*     BMAT and ZMAT for the first iteration, and it maintains the values of */
+            /*     NF and KOPT. The vector X is also changed by PRELIM. */
+
+            /*     Set some constants. */
+
+            /* Parameter adjustments */
+            zmat_dim1 = npt;
+            zmat_offset = 1 + zmat_dim1;
+            zmat -= zmat_offset;
+            xpt_dim1 = npt;
+            xpt_offset = 1 + xpt_dim1;
+            xpt -= xpt_offset;
+            --x;
+            --xl;
+            --xu;
+            --xbase;
+            --fval;
+            --gopt;
+            --hq;
+            --pq;
+            bmat_dim1 = ndim;
+            bmat_offset = 1 + bmat_dim1;
+            bmat -= bmat_offset;
+            --sl;
+            --su;
+
+            /* Function Body */
+            half = .5;
+            one = 1.;
+            two = 2.;
+            zero = 0.;
+            rhosq = rhobeg * rhobeg;
+            recip = one / rhosq;
+            np = n + 1;
+
+            /*     Set XBASE to the initial vector of variables, and set the initial */
+            /*     elements of XPT, BMAT, HQ, PQ and ZMAT to zero. */
+
+            i__1 = n;
+            for (j = 1; j <= i__1; ++j) {
+                xbase[j] = x[j];
+                i__2 = npt;
+                for (k = 1; k <= i__2; ++k) {
+                    /* L10: */
+                    xpt[k + j * xpt_dim1] = zero;
+                }
+                i__2 = ndim;
+                for (i__ = 1; i__ <= i__2; ++i__) {
+                    /* L20: */
+                    bmat[i__ + j * bmat_dim1] = zero;
+                }
+            }
+            i__2 = n * np / 2;
+            for (ih = 1; ih <= i__2; ++ih) {
+                /* L30: */
+                hq[ih] = zero;
+            }
+            i__2 = npt;
+            for (k = 1; k <= i__2; ++k) {
+                pq[k] = zero;
+                i__1 = npt - np;
+                for (j = 1; j <= i__1; ++j) {
+                    /* L40: */
+                    zmat[k + j * zmat_dim1] = zero;
+                }
+            }
+
+            /*     Begin the initialization procedure. NF becomes one more than the number */
+            /*     of function values so far. The coordinates of the displacement of the */
+            /*     next initial interpolation point from XBASE are set in XPT(NF+1,.). */
+
+            nf = 0;
+L50:
+            nfm = nf;
+            nfx = nf - n;
+            ++(nf);
+            if (nfm <= n << 1) {
+                if (nfm >= 1 && nfm <= n) {
+                    stepa = rhobeg;
+                    if (su[nfm] == zero) {
+                        stepa = -stepa;
+                    }
+                    xpt[nf + nfm * xpt_dim1] = stepa;
+                } else if (nfm > n) {
+                    stepa = xpt[nf - n + nfx * xpt_dim1];
+                    stepb = -(rhobeg);
+                    if (sl[nfx] == zero) {
+                        /* Computing MIN */
+                        d__1 = two * rhobeg, d__2 = su[nfx];
+                        stepb = std::min(d__1,d__2);
+                    }
+                    if (su[nfx] == zero) {
+                        /* Computing MAX */
+                        d__1 = -two * rhobeg, d__2 = sl[nfx];
+                        stepb = std::max(d__1,d__2);
+                    }
+                    xpt[nf + nfx * xpt_dim1] = stepb;
+                }
+            } else {
+                itemp = (nfm - np) / n;
+                jpt = nfm - itemp * n - n;
+                ipt = jpt + itemp;
+                if (ipt > n) {
+                    itemp = jpt;
+                    jpt = ipt - n;
+                    ipt = itemp;
+                }
+                xpt[nf + ipt * xpt_dim1] = xpt[ipt + 1 + ipt * xpt_dim1];
+                xpt[nf + jpt * xpt_dim1] = xpt[jpt + 1 + jpt * xpt_dim1];
+            }
+
+            /*     Calculate the next value of F. The least function value so far and */
+            /*     its index are required. */
+
+            i__1 = n;
+            for (j = 1; j <= i__1; ++j) {
+                /* Computing MIN */
+                /* Computing MAX */
+                d__3 = xl[j], d__4 = xbase[j] + xpt[nf + j * xpt_dim1];
+                d__1 = std::max(d__3,d__4), d__2 = xu[j];
+                x[j] = std::min(d__1,d__2);
+                if (xpt[nf + j * xpt_dim1] == sl[j]) {
+                    x[j] = xl[j];
+                }
+                if (xpt[nf + j * xpt_dim1] == su[j]) {
+                    x[j] = xu[j];
+                }
+                /* L60: */
+            }
+            f = calfun(mat(&x[1],n));
+            fval[nf] = f;
+            if (nf == 1) {
+                fbeg = f;
+                kopt = 1;
+            } else if (f < fval[kopt]) {
+                kopt = nf;
+            }
+
+            /*     Set the nonzero initial elements of BMAT and the quadratic model in the */
+            /*     cases when NF is at most 2*N+1. If NF exceeds N+1, then the positions */
+            /*     of the NF-th and (NF-N)-th interpolation points may be switched, in */
+            /*     order that the function value at the first of them contributes to the */
+            /*     off-diagonal second derivative terms of the initial quadratic model. */
+
+            if (nf <= (n << 1) + 1) {
+                if (nf >= 2 && nf <= n + 1) {
+                    gopt[nfm] = (f - fbeg) / stepa;
+                    if (npt < nf + n) {
+                        bmat[nfm * bmat_dim1 + 1] = -one / stepa;
+                        bmat[nf + nfm * bmat_dim1] = one / stepa;
+                        bmat[npt + nfm + nfm * bmat_dim1] = -half * rhosq;
+                    }
+                } else if (nf >= n + 2) {
+                    ih = nfx * (nfx + 1) / 2;
+                    temp = (f - fbeg) / stepb;
+                    diff = stepb - stepa;
+                    hq[ih] = two * (temp - gopt[nfx]) / diff;
+                    gopt[nfx] = (gopt[nfx] * stepb - temp * stepa) / diff;
+                    if (stepa * stepb < zero) {
+                        if (f < fval[nf - n]) {
+                            fval[nf] = fval[nf - n];
+                            fval[nf - n] = f;
+                            if (kopt == nf) {
+                                kopt = nf - n;
+                            }
+                            xpt[nf - n + nfx * xpt_dim1] = stepb;
+                            xpt[nf + nfx * xpt_dim1] = stepa;
+                        }
+                    }
+                    bmat[nfx * bmat_dim1 + 1] = -(stepa + stepb) / (stepa * stepb);
+                    bmat[nf + nfx * bmat_dim1] = -half / xpt[nf - n + nfx * 
+                        xpt_dim1];
+                    bmat[nf - n + nfx * bmat_dim1] = -bmat[nfx * bmat_dim1 + 1] - 
+                        bmat[nf + nfx * bmat_dim1];
+                    zmat[nfx * zmat_dim1 + 1] = std::sqrt(two) / (stepa * stepb);
+                    zmat[nf + nfx * zmat_dim1] = std::sqrt(half) / rhosq;
+                    zmat[nf - n + nfx * zmat_dim1] = -zmat[nfx * zmat_dim1 + 1] - 
+                        zmat[nf + nfx * zmat_dim1];
+                }
+
+                /*     Set the off-diagonal second derivatives of the Lagrange functions and */
+                /*     the initial quadratic model. */
+
+            } else {
+                ih = ipt * (ipt - 1) / 2 + jpt;
+                zmat[nfx * zmat_dim1 + 1] = recip;
+                zmat[nf + nfx * zmat_dim1] = recip;
+                zmat[ipt + 1 + nfx * zmat_dim1] = -recip;
+                zmat[jpt + 1 + nfx * zmat_dim1] = -recip;
+                temp = xpt[nf + ipt * xpt_dim1] * xpt[nf + jpt * xpt_dim1];
+                hq[ih] = (fbeg - fval[ipt + 1] - fval[jpt + 1] + f) / temp;
+            }
+            if (nf < npt && nf < maxfun) {
+                goto L50;
+            }
+
+        } /* prelim_ */
+
+    // ----------------------------------------------------------------------------------------
+
+        template <typename funct>
+        void rescue_ (
+            const funct& calfun,
+            const integer n,
+            const integer npt,
+            const doublereal *xl, 
+            const doublereal *xu,
+            const integer maxfun,
+            doublereal *xbase, 
+            doublereal *xpt, 
+            doublereal *fval,
+            doublereal *xopt,
+            doublereal *gopt,
+            doublereal *hq, 
+            doublereal *pq,
+            doublereal *bmat,
+            doublereal *zmat, 
+            const integer ndim,
+            doublereal *sl,
+            doublereal *su,
+            integer& nf, 
+            const doublereal delta,
+            integer& kopt,
+            doublereal *vlag,
+            doublereal * ptsaux,
+            doublereal *ptsid,
+            doublereal *w
+        ) const
+        {
+            /* System generated locals */
+            integer xpt_dim1, xpt_offset, bmat_dim1, bmat_offset, zmat_dim1, 
+            zmat_offset, i__1, i__2, i__3;
+            doublereal d__1, d__2, d__3, d__4;
+
+
+            /* Local variables */
+            doublereal f;
+            integer i__, j, k, ih, jp, ip, iq, np, iw;
+            doublereal xp = 0, xq = 0, den = 0;
+            integer ihp = 0;
+            doublereal one;
+            integer ihq, jpn, kpt;
+            doublereal sum = 0, diff = 0, half = 0, beta = 0;
+            integer kold;
+            doublereal winc;
+            integer nrem, knew;
+            doublereal temp, bsum;
+            integer nptm;
+            doublereal zero = 0, hdiag = 0, fbase = 0, sfrac = 0, denom = 0, vquad = 0, sumpq = 0;
+            doublereal dsqmin, distsq, vlmxsq;
+
+
+
+            /*     The arguments N, NPT, XL, XU, IPRINT, MAXFUN, XBASE, XPT, FVAL, XOPT, */
+            /*       GOPT, HQ, PQ, BMAT, ZMAT, NDIM, SL and SU have the same meanings as */
+            /*       the corresponding arguments of BOBYQB on the entry to RESCUE. */
+            /*     NF is maintained as the number of calls of CALFUN so far, except that */
+            /*       NF is set to -1 if the value of MAXFUN prevents further progress. */
+            /*     KOPT is maintained so that FVAL(KOPT) is the least calculated function */
+            /*       value. Its correct value must be given on entry. It is updated if a */
+            /*       new least function value is found, but the corresponding changes to */
+            /*       XOPT and GOPT have to be made later by the calling program. */
+            /*     DELTA is the current trust region radius. */
+            /*     VLAG is a working space vector that will be used for the values of the */
+            /*       provisional Lagrange functions at each of the interpolation points. */
+            /*       They are part of a product that requires VLAG to be of length NDIM. */
+            /*     PTSAUX is also a working space array. For J=1,2,...,N, PTSAUX(1,J) and */
+            /*       PTSAUX(2,J) specify the two positions of provisional interpolation */
+            /*       points when a nonzero step is taken along e_J (the J-th coordinate */
+            /*       direction) through XBASE+XOPT, as specified below. Usually these */
+            /*       steps have length DELTA, but other lengths are chosen if necessary */
+            /*       in order to satisfy the given bounds on the variables. */
+            /*     PTSID is also a working space array. It has NPT components that denote */
+            /*       provisional new positions of the original interpolation points, in */
+            /*       case changes are needed to restore the linear independence of the */
+            /*       interpolation conditions. The K-th point is a candidate for change */
+            /*       if and only if PTSID(K) is nonzero. In this case let p and q be the */
+            /*       integer parts of PTSID(K) and (PTSID(K)-p) multiplied by N+1. If p */
+            /*       and q are both positive, the step from XBASE+XOPT to the new K-th */
+            /*       interpolation point is PTSAUX(1,p)*e_p + PTSAUX(1,q)*e_q. Otherwise */
+            /*       the step is PTSAUX(1,p)*e_p or PTSAUX(2,q)*e_q in the cases q=0 or */
+            /*       p=0, respectively. */
+            /*     The first NDIM+NPT elements of the array W are used for working space. */
+            /*     The final elements of BMAT and ZMAT are set in a well-conditioned way */
+            /*       to the values that are appropriate for the new interpolation points. */
+            /*     The elements of GOPT, HQ and PQ are also revised to the values that are */
+            /*       appropriate to the final quadratic model. */
+
+            /*     Set some constants. */
+
+            /* Parameter adjustments */
+            zmat_dim1 = npt;
+            zmat_offset = 1 + zmat_dim1;
+            zmat -= zmat_offset;
+            xpt_dim1 = npt;
+            xpt_offset = 1 + xpt_dim1;
+            xpt -= xpt_offset;
+            --xl;
+            --xu;
+            --xbase;
+            --fval;
+            --xopt;
+            --gopt;
+            --hq;
+            --pq;
+            bmat_dim1 = ndim;
+            bmat_offset = 1 + bmat_dim1;
+            bmat -= bmat_offset;
+            --sl;
+            --su;
+            --vlag;
+            ptsaux -= 3;
+            --ptsid;
+            --w;
+
+            /* Function Body */
+            half = .5;
+            one = 1.;
+            zero = 0.;
+            np = n + 1;
+            sfrac = half / (doublereal) np;
+            nptm = npt - np;
+
+            /*     Shift the interpolation points so that XOPT becomes the origin, and set */
+            /*     the elements of ZMAT to zero. The value of SUMPQ is required in the */
+            /*     updating of HQ below. The squares of the distances from XOPT to the */
+            /*     other interpolation points are set at the end of W. Increments of WINC */
+            /*     may be added later to these squares to balance the consideration of */
+            /*     the choice of point that is going to become current. */
+
+            sumpq = zero;
+            winc = zero;
+            i__1 = npt;
+            for (k = 1; k <= i__1; ++k) {
+                distsq = zero;
+                i__2 = n;
+                for (j = 1; j <= i__2; ++j) {
+                    xpt[k + j * xpt_dim1] -= xopt[j];
+                    /* L10: */
+                    /* Computing 2nd power */
+                    d__1 = xpt[k + j * xpt_dim1];
+                    distsq += d__1 * d__1;
+                }
+                sumpq += pq[k];
+                w[ndim + k] = distsq;
+                winc = std::max(winc,distsq);
+                i__2 = nptm;
+                for (j = 1; j <= i__2; ++j) {
+                    /* L20: */
+                    zmat[k + j * zmat_dim1] = zero;
+                }
+            }
+
+            /*     Update HQ so that HQ and PQ define the second derivatives of the model */
+            /*     after XBASE has been shifted to the trust region centre. */
+
+            ih = 0;
+            i__2 = n;
+            for (j = 1; j <= i__2; ++j) {
+                w[j] = half * sumpq * xopt[j];
+                i__1 = npt;
+                for (k = 1; k <= i__1; ++k) {
+                    /* L30: */
+                    w[j] += pq[k] * xpt[k + j * xpt_dim1];
+                }
+                i__1 = j;
+                for (i__ = 1; i__ <= i__1; ++i__) {
+                    ++ih;
+                    /* L40: */
+                    hq[ih] = hq[ih] + w[i__] * xopt[j] + w[j] * xopt[i__];
+                }
+            }
+
+            /*     Shift XBASE, SL, SU and XOPT. Set the elements of BMAT to zero, and */
+            /*     also set the elements of PTSAUX. */
+
+            i__1 = n;
+            for (j = 1; j <= i__1; ++j) {
+                xbase[j] += xopt[j];
+                sl[j] -= xopt[j];
+                su[j] -= xopt[j];
+                xopt[j] = zero;
+                /* Computing MIN */
+                d__1 = delta, d__2 = su[j];
+                ptsaux[(j << 1) + 1] = std::min(d__1,d__2);
+                /* Computing MAX */
+                d__1 = -(delta), d__2 = sl[j];
+                ptsaux[(j << 1) + 2] = std::max(d__1,d__2);
+                if (ptsaux[(j << 1) + 1] + ptsaux[(j << 1) + 2] < zero) {
+                    temp = ptsaux[(j << 1) + 1];
+                    ptsaux[(j << 1) + 1] = ptsaux[(j << 1) + 2];
+                    ptsaux[(j << 1) + 2] = temp;
+                }
+                if ((d__2 = ptsaux[(j << 1) + 2], std::abs(d__2)) < half * (d__1 = ptsaux[(
+                            j << 1) + 1], std::abs(d__1))) {
+                    ptsaux[(j << 1) + 2] = half * ptsaux[(j << 1) + 1];
+                }
+                i__2 = ndim;
+                for (i__ = 1; i__ <= i__2; ++i__) {
+                    /* L50: */
+                    bmat[i__ + j * bmat_dim1] = zero;
+                }
+            }
+            fbase = fval[kopt];
+
+            /*     Set the identifiers of the artificial interpolation points that are */
+            /*     along a coordinate direction from XOPT, and set the corresponding */
+            /*     nonzero elements of BMAT and ZMAT. */
+
+            ptsid[1] = sfrac;
+            i__2 = n;
+            for (j = 1; j <= i__2; ++j) {
+                jp = j + 1;
+                jpn = jp + n;
+                ptsid[jp] = (doublereal) j + sfrac;
+                if (jpn <= npt) {
+                    ptsid[jpn] = (doublereal) j / (doublereal) np + sfrac;
+                    temp = one / (ptsaux[(j << 1) + 1] - ptsaux[(j << 1) + 2]);
+                    bmat[jp + j * bmat_dim1] = -temp + one / ptsaux[(j << 1) + 1];
+                    bmat[jpn + j * bmat_dim1] = temp + one / ptsaux[(j << 1) + 2];
+                    bmat[j * bmat_dim1 + 1] = -bmat[jp + j * bmat_dim1] - bmat[jpn + 
+                        j * bmat_dim1];
+                    zmat[j * zmat_dim1 + 1] = std::sqrt(2.) / (d__1 = ptsaux[(j << 1) + 1] 
+                                                          * ptsaux[(j << 1) + 2], std::abs(d__1));
+                    zmat[jp + j * zmat_dim1] = zmat[j * zmat_dim1 + 1] * ptsaux[(j << 
+                                                                                 1) + 2] * temp;
+                    zmat[jpn + j * zmat_dim1] = -zmat[j * zmat_dim1 + 1] * ptsaux[(j 
+                                                                                   << 1) + 1] * temp;
+                } else {
+                    bmat[j * bmat_dim1 + 1] = -one / ptsaux[(j << 1) + 1];
+                    bmat[jp + j * bmat_dim1] = one / ptsaux[(j << 1) + 1];
+                    /* Computing 2nd power */
+                    d__1 = ptsaux[(j << 1) + 1];
+                    bmat[j + npt + j * bmat_dim1] = -half * (d__1 * d__1);
+                }
+                /* L60: */
+            }
+
+            /*     Set any remaining identifiers with their nonzero elements of ZMAT. */
+
+            if (npt >= n + np) {
+                i__2 = npt;
+                for (k = np << 1; k <= i__2; ++k) {
+                    iw = (integer) (((doublereal) (k - np) - half) / (doublereal) (n)
+                    );
+                    ip = k - np - iw * n;
+                    iq = ip + iw;
+                    if (iq > n) {
+                        iq -= n;
+                    }
+                    ptsid[k] = (doublereal) ip + (doublereal) iq / (doublereal) np + 
+                        sfrac;
+                    temp = one / (ptsaux[(ip << 1) + 1] * ptsaux[(iq << 1) + 1]);
+                    zmat[(k - np) * zmat_dim1 + 1] = temp;
+                    zmat[ip + 1 + (k - np) * zmat_dim1] = -temp;
+                    zmat[iq + 1 + (k - np) * zmat_dim1] = -temp;
+                    /* L70: */
+                    zmat[k + (k - np) * zmat_dim1] = temp;
+                }
+            }
+            nrem = npt;
+            kold = 1;
+            knew = kopt;
+
+            /*     Reorder the provisional points in the way that exchanges PTSID(KOLD) */
+            /*     with PTSID(KNEW). */
+
+L80:
+            i__2 = n;
+            for (j = 1; j <= i__2; ++j) {
+                temp = bmat[kold + j * bmat_dim1];
+                bmat[kold + j * bmat_dim1] = bmat[knew + j * bmat_dim1];
+                /* L90: */
+                bmat[knew + j * bmat_dim1] = temp;
+            }
+            i__2 = nptm;
+            for (j = 1; j <= i__2; ++j) {
+                temp = zmat[kold + j * zmat_dim1];
+                zmat[kold + j * zmat_dim1] = zmat[knew + j * zmat_dim1];
+                /* L100: */
+                zmat[knew + j * zmat_dim1] = temp;
+            }
+            ptsid[kold] = ptsid[knew];
+            ptsid[knew] = zero;
+            w[ndim + knew] = zero;
+            --nrem;
+            if (knew != kopt) {
+                temp = vlag[kold];
+                vlag[kold] = vlag[knew];
+                vlag[knew] = temp;
+
+                /*     Update the BMAT and ZMAT matrices so that the status of the KNEW-th */
+                /*     interpolation point can be changed from provisional to original. The */
+                /*     branch to label 350 occurs if all the original points are reinstated. */
+                /*     The nonnegative values of W(NDIM+K) are required in the search below. */
+
+                update_(n, npt, &bmat[bmat_offset], &zmat[zmat_offset], ndim, &vlag[1], 
+                        beta, denom, knew, &w[1]);
+                if (nrem == 0) {
+                    goto L350;
+                }
+                i__2 = npt;
+                for (k = 1; k <= i__2; ++k) {
+                    /* L110: */
+                    w[ndim + k] = (d__1 = w[ndim + k], std::abs(d__1));
+                }
+            }
+
+            /*     Pick the index KNEW of an original interpolation point that has not */
+            /*     yet replaced one of the provisional interpolation points, giving */
+            /*     attention to the closeness to XOPT and to previous tries with KNEW. */
+
+L120:
+            dsqmin = zero;
+            i__2 = npt;
+            for (k = 1; k <= i__2; ++k) {
+                if (w[ndim + k] > zero) {
+                    if (dsqmin == zero || w[ndim + k] < dsqmin) {
+                        knew = k;
+                        dsqmin = w[ndim + k];
+                    }
+                }
+                /* L130: */
+            }
+            if (dsqmin == zero) {
+                goto L260;
+            }
+
+            /*     Form the W-vector of the chosen original interpolation point. */
+
+            i__2 = n;
+            for (j = 1; j <= i__2; ++j) {
+                /* L140: */
+                w[npt + j] = xpt[knew + j * xpt_dim1];
+            }
+            i__2 = npt;
+            for (k = 1; k <= i__2; ++k) {
+                sum = zero;
+                if (k == kopt) {
+                } else if (ptsid[k] == zero) {
+                    i__1 = n;
+                    for (j = 1; j <= i__1; ++j) {
+                        /* L150: */
+                        sum += w[npt + j] * xpt[k + j * xpt_dim1];
+                    }
+                } else {
+                    ip = (integer) ptsid[k];
+                    if (ip > 0) {
+                        sum = w[npt + ip] * ptsaux[(ip << 1) + 1];
+                    }
+                    iq = (integer) ((doublereal) np * ptsid[k] - (doublereal) (ip * 
+                                                                               np));
+                    if (iq > 0) {
+                        iw = 1;
+                        if (ip == 0) {
+                            iw = 2;
+                        }
+                        sum += w[npt + iq] * ptsaux[iw + (iq << 1)];
+                    }
+                }
+                /* L160: */
+                w[k] = half * sum * sum;
+            }
+
+            /*     Calculate VLAG and BETA for the required updating of the H matrix if */
+            /*     XPT(KNEW,.) is reinstated in the set of interpolation points. */
+
+            i__2 = npt;
+            for (k = 1; k <= i__2; ++k) {
+                sum = zero;
+                i__1 = n;
+                for (j = 1; j <= i__1; ++j) {
+                    /* L170: */
+                    sum += bmat[k + j * bmat_dim1] * w[npt + j];
+                }
+                /* L180: */
+                vlag[k] = sum;
+            }
+            beta = zero;
+            i__2 = nptm;
+            for (j = 1; j <= i__2; ++j) {
+                sum = zero;
+                i__1 = npt;
+                for (k = 1; k <= i__1; ++k) {
+                    /* L190: */
+                    sum += zmat[k + j * zmat_dim1] * w[k];
+                }
+                beta -= sum * sum;
+                i__1 = npt;
+                for (k = 1; k <= i__1; ++k) {
+                    /* L200: */
+                    vlag[k] += sum * zmat[k + j * zmat_dim1];
+                }
+            }
+            bsum = zero;
+            distsq = zero;
+            i__1 = n;
+            for (j = 1; j <= i__1; ++j) {
+                sum = zero;
+                i__2 = npt;
+                for (k = 1; k <= i__2; ++k) {
+                    /* L210: */
+                    sum += bmat[k + j * bmat_dim1] * w[k];
+                }
+                jp = j + npt;
+                bsum += sum * w[jp];
+                i__2 = ndim;
+                for (ip = npt + 1; ip <= i__2; ++ip) {
+                    /* L220: */
+                    sum += bmat[ip + j * bmat_dim1] * w[ip];
+                }
+                bsum += sum * w[jp];
+                vlag[jp] = sum;
+                /* L230: */
+                /* Computing 2nd power */
+                d__1 = xpt[knew + j * xpt_dim1];
+                distsq += d__1 * d__1;
+            }
+            beta = half * distsq * distsq + beta - bsum;
+            vlag[kopt] += one;
+
+            /*     KOLD is set to the index of the provisional interpolation point that is */
+            /*     going to be deleted to make way for the KNEW-th original interpolation */
+            /*     point. The choice of KOLD is governed by the avoidance of a small value */
+            /*     of the denominator in the updating calculation of UPDATE. */
+
+            denom = zero;
+            vlmxsq = zero;
+            i__1 = npt;
+            for (k = 1; k <= i__1; ++k) {
+                if (ptsid[k] != zero) {
+                    hdiag = zero;
+                    i__2 = nptm;
+                    for (j = 1; j <= i__2; ++j) {
+                        /* L240: */
+                        /* Computing 2nd power */
+                        d__1 = zmat[k + j * zmat_dim1];
+                        hdiag += d__1 * d__1;
+                    }
+                    /* Computing 2nd power */
+                    d__1 = vlag[k];
+                    den = beta * hdiag + d__1 * d__1;
+                    if (den > denom) {
+                        kold = k;
+                        denom = den;
+                    }
+                }
+                /* L250: */
+                /* Computing MAX */
+                /* Computing 2nd power */
+                d__3 = vlag[k];
+                d__1 = vlmxsq, d__2 = d__3 * d__3;
+                vlmxsq = std::max(d__1,d__2);
+            }
+            if (denom <= vlmxsq * .01) {
+                w[ndim + knew] = -w[ndim + knew] - winc;
+                goto L120;
+            }
+            goto L80;
+
+            /*     When label 260 is reached, all the final positions of the interpolation */
+            /*     points have been chosen although any changes have not been included yet */
+            /*     in XPT. Also the final BMAT and ZMAT matrices are complete, but, apart */
+            /*     from the shift of XBASE, the updating of the quadratic model remains to */
+            /*     be done. The following cycle through the new interpolation points begins */
+            /*     by putting the new point in XPT(KPT,.) and by setting PQ(KPT) to zero, */
+            /*     except that a RETURN occurs if MAXFUN prohibits another value of F. */
+
+L260:
+            i__1 = npt;
+            for (kpt = 1; kpt <= i__1; ++kpt) {
+                if (ptsid[kpt] == zero) {
+                    goto L340;
+                }
+                if (nf >= maxfun) {
+                    nf = -1;
+                    goto L350;
+                }
+                ih = 0;
+                i__2 = n;
+                for (j = 1; j <= i__2; ++j) {
+                    w[j] = xpt[kpt + j * xpt_dim1];
+                    xpt[kpt + j * xpt_dim1] = zero;
+                    temp = pq[kpt] * w[j];
+                    i__3 = j;
+                    for (i__ = 1; i__ <= i__3; ++i__) {
+                        ++ih;
+                        /* L270: */
+                        hq[ih] += temp * w[i__];
+                    }
+                }
+                pq[kpt] = zero;
+                ip = (integer) ptsid[kpt];
+                iq = (integer) ((doublereal) np * ptsid[kpt] - (doublereal) (ip * np))
+                    ;
+                if (ip > 0) {
+                    xp = ptsaux[(ip << 1) + 1];
+                    xpt[kpt + ip * xpt_dim1] = xp;
+                }
+                if (iq > 0) {
+                    xq = ptsaux[(iq << 1) + 1];
+                    if (ip == 0) {
+                        xq = ptsaux[(iq << 1) + 2];
+                    }
+                    xpt[kpt + iq * xpt_dim1] = xq;
+                }
+
+                /*     Set VQUAD to the value of the current model at the new point. */
+
+                vquad = fbase;
+                if (ip > 0) {
+                    ihp = (ip + ip * ip) / 2;
+                    vquad += xp * (gopt[ip] + half * xp * hq[ihp]);
+                }
+                if (iq > 0) {
+                    ihq = (iq + iq * iq) / 2;
+                    vquad += xq * (gopt[iq] + half * xq * hq[ihq]);
+                    if (ip > 0) {
+                        iw = std::max(ihp,ihq) - (i__3 = ip - iq, std::abs(i__3));
+                        vquad += xp * xq * hq[iw];
+                    }
+                }
+                i__3 = npt;
+                for (k = 1; k <= i__3; ++k) {
+                    temp = zero;
+                    if (ip > 0) {
+                        temp += xp * xpt[k + ip * xpt_dim1];
+                    }
+                    if (iq > 0) {
+                        temp += xq * xpt[k + iq * xpt_dim1];
+                    }
+                    /* L280: */
+                    vquad += half * pq[k] * temp * temp;
+                }
+
+                /*     Calculate F at the new interpolation point, and set DIFF to the factor */
+                /*     that is going to multiply the KPT-th Lagrange function when the model */
+                /*     is updated to provide interpolation to the new function value. */
+
+                i__3 = n;
+                for (i__ = 1; i__ <= i__3; ++i__) {
+                    /* Computing MIN */
+                    /* Computing MAX */
+                    d__3 = xl[i__], d__4 = xbase[i__] + xpt[kpt + i__ * xpt_dim1];
+                    d__1 = std::max(d__3,d__4), d__2 = xu[i__];
+                    w[i__] = std::min(d__1,d__2);
+                    if (xpt[kpt + i__ * xpt_dim1] == sl[i__]) {
+                        w[i__] = xl[i__];
+                    }
+                    if (xpt[kpt + i__ * xpt_dim1] == su[i__]) {
+                        w[i__] = xu[i__];
+                    }
+                    /* L290: */
+                }
+                ++(nf);
+                f = calfun(mat(&w[1],n));
+                fval[kpt] = f;
+                if (f < fval[kopt]) {
+                    kopt = kpt;
+                }
+                diff = f - vquad;
+
+                /*     Update the quadratic model. The RETURN from the subroutine occurs when */
+                /*     all the new interpolation points are included in the model. */
+
+                i__3 = n;
+                for (i__ = 1; i__ <= i__3; ++i__) {
+                    /* L310: */
+                    gopt[i__] += diff * bmat[kpt + i__ * bmat_dim1];
+                }
+                i__3 = npt;
+                for (k = 1; k <= i__3; ++k) {
+                    sum = zero;
+                    i__2 = nptm;
+                    for (j = 1; j <= i__2; ++j) {
+                        /* L320: */
+                        sum += zmat[k + j * zmat_dim1] * zmat[kpt + j * zmat_dim1];
+                    }
+                    temp = diff * sum;
+                    if (ptsid[k] == zero) {
+                        pq[k] += temp;
+                    } else {
+                        ip = (integer) ptsid[k];
+                        iq = (integer) ((doublereal) np * ptsid[k] - (doublereal) (ip 
+                                                                                   * np));
+                        ihq = (iq * iq + iq) / 2;
+                        if (ip == 0) {
+                            /* Computing 2nd power */
+                            d__1 = ptsaux[(iq << 1) + 2];
+                            hq[ihq] += temp * (d__1 * d__1);
+                        } else {
+                            ihp = (ip * ip + ip) / 2;
+                            /* Computing 2nd power */
+                            d__1 = ptsaux[(ip << 1) + 1];
+                            hq[ihp] += temp * (d__1 * d__1);
+                            if (iq > 0) {
+                                /* Computing 2nd power */
+                                d__1 = ptsaux[(iq << 1) + 1];
+                                hq[ihq] += temp * (d__1 * d__1);
+                                iw = std::max(ihp,ihq) - (i__2 = iq - ip, std::abs(i__2));
+                                hq[iw] += temp * ptsaux[(ip << 1) + 1] * ptsaux[(iq <<
+                                                                                 1) + 1];
+                            }
+                        }
+                    }
+                    /* L330: */
+                }
+                ptsid[kpt] = zero;
+L340:
+                ;
+            }
+L350:
+            ;
+        } /* rescue_ */
+
+    // ----------------------------------------------------------------------------------------
+
+        void trsbox_(
+            const integer n,
+            const integer npt,
+            const doublereal *xpt, 
+            const doublereal *xopt,
+            const doublereal *gopt,
+            const doublereal *hq,
+            const doublereal *pq, 
+            const doublereal *sl,
+            const doublereal *su,
+            const doublereal delta,
+            doublereal *xnew, 
+            doublereal *d__,
+            doublereal *gnew,
+            doublereal *xbdi,
+            doublereal *s, 
+            doublereal *hs,
+            doublereal *hred,
+            doublereal *dsq,
+            doublereal *crvmin
+        ) const
+        {
+            /* System generated locals */
+            integer xpt_dim1, xpt_offset, i__1, i__2;
+            doublereal d__1, d__2, d__3, d__4;
+
+            /* Local variables */
+            integer i__, j, k, ih;
+            doublereal ds;
+            integer iu;
+            doublereal dhd, dhs, cth, one, shs, sth, ssq, half, beta, sdec, blen;
+            integer iact = 0, nact = 0;
+            doublereal angt, qred;
+            integer isav;
+            doublereal temp = 0, zero = 0, xsav = 0, xsum = 0, angbd = 0, dredg = 0, sredg = 0;
+            integer iterc;
+            doublereal resid = 0, delsq = 0, ggsav = 0, tempa = 0, tempb = 0,  
+                       redmax = 0, dredsq = 0, redsav = 0, onemin = 0, gredsq = 0, rednew = 0;
+            integer itcsav = 0;
+            doublereal rdprev = 0, rdnext = 0, stplen = 0, stepsq = 0;
+            integer itermax = 0;
+
+
+            /*     The arguments N, NPT, XPT, XOPT, GOPT, HQ, PQ, SL and SU have the same */
+            /*       meanings as the corresponding arguments of BOBYQB. */
+            /*     DELTA is the trust region radius for the present calculation, which */
+            /*       seeks a small value of the quadratic model within distance DELTA of */
+            /*       XOPT subject to the bounds on the variables. */
+            /*     XNEW will be set to a new vector of variables that is approximately */
+            /*       the one that minimizes the quadratic model within the trust region */
+            /*       subject to the SL and SU constraints on the variables. It satisfies */
+            /*       as equations the bounds that become active during the calculation. */
+            /*     D is the calculated trial step from XOPT, generated iteratively from an */
+            /*       initial value of zero. Thus XNEW is XOPT+D after the final iteration. */
+            /*     GNEW holds the gradient of the quadratic model at XOPT+D. It is updated */
+            /*       when D is updated. */
+            /*     XBDI is a working space vector. For I=1,2,...,N, the element XBDI(I) is */
+            /*       set to -1.0, 0.0, or 1.0, the value being nonzero if and only if the */
+            /*       I-th variable has become fixed at a bound, the bound being SL(I) or */
+            /*       SU(I) in the case XBDI(I)=-1.0 or XBDI(I)=1.0, respectively. This */
+            /*       information is accumulated during the construction of XNEW. */
+            /*     The arrays S, HS and HRED are also used for working space. They hold the */
+            /*       current search direction, and the changes in the gradient of Q along S */
+            /*       and the reduced D, respectively, where the reduced D is the same as D, */
+            /*       except that the components of the fixed variables are zero. */
+            /*     DSQ will be set to the square of the length of XNEW-XOPT. */
+            /*     CRVMIN is set to zero if D reaches the trust region boundary. Otherwise */
+            /*       it is set to the least curvature of H that occurs in the conjugate */
+            /*       gradient searches that are not restricted by any constraints. The */
+            /*       value CRVMIN=-1.0D0 is set, however, if all of these searches are */
+            /*       constrained. */
+
+            /*     A version of the truncated conjugate gradient is applied. If a line */
+            /*     search is restricted by a constraint, then the procedure is restarted, */
+            /*     the values of the variables that are at their bounds being fixed. If */
+            /*     the trust region boundary is reached, then further changes may be made */
+            /*     to D, each one being in the two dimensional space that is spanned */
+            /*     by the current D and the gradient of Q at XOPT+D, staying on the trust */
+            /*     region boundary. Termination occurs when the reduction in Q seems to */
+            /*     be close to the greatest reduction that can be achieved. */
+
+            /*     Set some constants. */
+
+            /* Parameter adjustments */
+            xpt_dim1 = npt;
+            xpt_offset = 1 + xpt_dim1;
+            xpt -= xpt_offset;
+            --xopt;
+            --gopt;
+            --hq;
+            --pq;
+            --sl;
+            --su;
+            --xnew;
+            --d__;
+            --gnew;
+            --xbdi;
+            --s;
+            --hs;
+            --hred;
+
+            /* Function Body */
+            half = .5;
+            one = 1.;
+            onemin = -1.;
+            zero = 0.;
+
+            /*     The sign of GOPT(I) gives the sign of the change to the I-th variable */
+            /*     that will reduce Q from its value at XOPT. Thus XBDI(I) shows whether */
+            /*     or not to fix the I-th variable at one of its bounds initially, with */
+            /*     NACT being set to the number of fixed variables. D and GNEW are also */
+            /*     set for the first iteration. DELSQ is the upper bound on the sum of */
+            /*     squares of the free variables. QRED is the reduction in Q so far. */
+
+            iterc = 0;
+            nact = 0;
+            i__1 = n;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                xbdi[i__] = zero;
+                if (xopt[i__] <= sl[i__]) {
+                    if (gopt[i__] >= zero) {
+                        xbdi[i__] = onemin;
+                    }
+                } else if (xopt[i__] >= su[i__]) {
+                    if (gopt[i__] <= zero) {
+                        xbdi[i__] = one;
+                    }
+                }
+                if (xbdi[i__] != zero) {
+                    ++nact;
+                }
+                d__[i__] = zero;
+                /* L10: */
+                gnew[i__] = gopt[i__];
+            }
+            delsq = delta * delta;
+            qred = zero;
+            *crvmin = onemin;
+
+            /*     Set the next search direction of the conjugate gradient method. It is */
+            /*     the steepest descent direction initially and when the iterations are */
+            /*     restarted because a variable has just been fixed by a bound, and of */
+            /*     course the components of the fixed variables are zero. ITERMAX is an */
+            /*     upper bound on the indices of the conjugate gradient iterations. */
+
+L20:
+            beta = zero;
+L30:
+            stepsq = zero;
+            i__1 = n;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                if (xbdi[i__] != zero) {
+                    s[i__] = zero;
+                } else if (beta == zero) {
+                    s[i__] = -gnew[i__];
+                } else {
+                    s[i__] = beta * s[i__] - gnew[i__];
+                }
+                /* L40: */
+                /* Computing 2nd power */
+                d__1 = s[i__];
+                stepsq += d__1 * d__1;
+            }
+            if (stepsq == zero) {
+                goto L190;
+            }
+            if (beta == zero) {
+                gredsq = stepsq;
+                itermax = iterc + n - nact;
+            }
+            if (gredsq * delsq <= qred * 1e-4 * qred) {
+                goto L190;
+            }
+
+            /*     Multiply the search direction by the second derivative matrix of Q and */
+            /*     calculate some scalars for the choice of steplength. Then set BLEN to */
+            /*     the length of the the step to the trust region boundary and STPLEN to */
+            /*     the steplength, ignoring the simple bounds. */
+
+            goto L210;
+L50:
+            resid = delsq;
+            ds = zero;
+            shs = zero;
+            i__1 = n;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                if (xbdi[i__] == zero) {
+                    /* Computing 2nd power */
+                    d__1 = d__[i__];
+                    resid -= d__1 * d__1;
+                    ds += s[i__] * d__[i__];
+                    shs += s[i__] * hs[i__];
+                }
+                /* L60: */
+            }
+            if (resid <= zero) {
+                goto L90;
+            }
+            temp = std::sqrt(stepsq * resid + ds * ds);
+            if (ds < zero) {
+                blen = (temp - ds) / stepsq;
+            } else {
+                blen = resid / (temp + ds);
+            }
+            stplen = blen;
+            if (shs > zero) {
+                /* Computing MIN */
+                d__1 = blen, d__2 = gredsq / shs;
+                stplen = std::min(d__1,d__2);
+            }
+
+            /*     Reduce STPLEN if necessary in order to preserve the simple bounds, */
+            /*     letting IACT be the index of the new constrained variable. */
+
+            iact = 0;
+            i__1 = n;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                if (s[i__] != zero) {
+                    xsum = xopt[i__] + d__[i__];
+                    if (s[i__] > zero) {
+                        temp = (su[i__] - xsum) / s[i__];
+                    } else {
+                        temp = (sl[i__] - xsum) / s[i__];
+                    }
+                    if (temp < stplen) {
+                        stplen = temp;
+                        iact = i__;
+                    }
+                }
+                /* L70: */
+            }
+
+            /*     Update CRVMIN, GNEW and D. Set SDEC to the decrease that occurs in Q. */
+
+            sdec = zero;
+            if (stplen > zero) {
+                ++iterc;
+                temp = shs / stepsq;
+                if (iact == 0 && temp > zero) {
+                    *crvmin = std::min(*crvmin,temp);
+                    if (*crvmin == onemin) {
+                        *crvmin = temp;
+                    }
+                }
+                ggsav = gredsq;
+                gredsq = zero;
+                i__1 = n;
+                for (i__ = 1; i__ <= i__1; ++i__) {
+                    gnew[i__] += stplen * hs[i__];
+                    if (xbdi[i__] == zero) {
+                        /* Computing 2nd power */
+                        d__1 = gnew[i__];
+                        gredsq += d__1 * d__1;
+                    }
+                    /* L80: */
+                    d__[i__] += stplen * s[i__];
+                }
+                /* Computing MAX */
+                d__1 = stplen * (ggsav - half * stplen * shs);
+                sdec = std::max(d__1,zero);
+                qred += sdec;
+            }
+
+            /*     Restart the conjugate gradient method if it has hit a new bound. */
+
+            if (iact > 0) {
+                ++nact;
+                xbdi[iact] = one;
+                if (s[iact] < zero) {
+                    xbdi[iact] = onemin;
+                }
+                /* Computing 2nd power */
+                d__1 = d__[iact];
+                delsq -= d__1 * d__1;
+                if (delsq <= zero) {
+                    goto L90;
+                }
+                goto L20;
+            }
+
+            /*     If STPLEN is less than BLEN, then either apply another conjugate */
+            /*     gradient iteration or RETURN. */
+
+            if (stplen < blen) {
+                if (iterc == itermax) {
+                    goto L190;
+                }
+                if (sdec <= qred * .01) {
+                    goto L190;
+                }
+                beta = gredsq / ggsav;
+                goto L30;
+            }
+L90:
+            *crvmin = zero;
+
+            /*     Prepare for the alternative iteration by calculating some scalars */
+            /*     and by multiplying the reduced D by the second derivative matrix of */
+            /*     Q, where S holds the reduced D in the call of GGMULT. */
+
+L100:
+            if (nact >= n - 1) {
+                goto L190;
+            }
+            dredsq = zero;
+            dredg = zero;
+            gredsq = zero;
+            i__1 = n;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                if (xbdi[i__] == zero) {
+                    /* Computing 2nd power */
+                    d__1 = d__[i__];
+                    dredsq += d__1 * d__1;
+                    dredg += d__[i__] * gnew[i__];
+                    /* Computing 2nd power */
+                    d__1 = gnew[i__];
+                    gredsq += d__1 * d__1;
+                    s[i__] = d__[i__];
+                } else {
+                    s[i__] = zero;
+                }
+                /* L110: */
+            }
+            itcsav = iterc;
+            goto L210;
+
+            /*     Let the search direction S be a linear combination of the reduced D */
+            /*     and the reduced G that is orthogonal to the reduced D. */
+
+L120:
+            ++iterc;
+            temp = gredsq * dredsq - dredg * dredg;
+            if (temp <= qred * 1e-4 * qred) {
+                goto L190;
+            }
+            temp = std::sqrt(temp);
+            i__1 = n;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                if (xbdi[i__] == zero) {
+                    s[i__] = (dredg * d__[i__] - dredsq * gnew[i__]) / temp;
+                } else {
+                    s[i__] = zero;
+                }
+                /* L130: */
+            }
+            sredg = -temp;
+
+            /*     By considering the simple bounds on the variables, calculate an upper */
+            /*     bound on the tangent of half the angle of the alternative iteration, */
+            /*     namely ANGBD, except that, if already a free variable has reached a */
+            /*     bound, there is a branch back to label 100 after fixing that variable. */
+
+            angbd = one;
+            iact = 0;
+            i__1 = n;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                if (xbdi[i__] == zero) {
+                    tempa = xopt[i__] + d__[i__] - sl[i__];
+                    tempb = su[i__] - xopt[i__] - d__[i__];
+                    if (tempa <= zero) {
+                        ++nact;
+                        xbdi[i__] = onemin;
+                        goto L100;
+                    } else if (tempb <= zero) {
+                        ++nact;
+                        xbdi[i__] = one;
+                        goto L100;
+                    }
+                    /* Computing 2nd power */
+                    d__1 = d__[i__];
+                    /* Computing 2nd power */
+                    d__2 = s[i__];
+                    ssq = d__1 * d__1 + d__2 * d__2;
+                    /* Computing 2nd power */
+                    d__1 = xopt[i__] - sl[i__];
+                    temp = ssq - d__1 * d__1;
+                    if (temp > zero) {
+                        temp = std::sqrt(temp) - s[i__];
+                        if (angbd * temp > tempa) {
+                            angbd = tempa / temp;
+                            iact = i__;
+                            xsav = onemin;
+                        }
+                    }
+                    /* Computing 2nd power */
+                    d__1 = su[i__] - xopt[i__];
+                    temp = ssq - d__1 * d__1;
+                    if (temp > zero) {
+                        temp = std::sqrt(temp) + s[i__];
+                        if (angbd * temp > tempb) {
+                            angbd = tempb / temp;
+                            iact = i__;
+                            xsav = one;
+                        }
+                    }
+                }
+                /* L140: */
+            }
+
+            /*     Calculate HHD and some curvatures for the alternative iteration. */
+
+            goto L210;
+L150:
+            shs = zero;
+            dhs = zero;
+            dhd = zero;
+            i__1 = n;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                if (xbdi[i__] == zero) {
+                    shs += s[i__] * hs[i__];
+                    dhs += d__[i__] * hs[i__];
+                    dhd += d__[i__] * hred[i__];
+                }
+                /* L160: */
+            }
+
+            /*     Seek the greatest reduction in Q for a range of equally spaced values */
+            /*     of ANGT in [0,ANGBD], where ANGT is the tangent of half the angle of */
+            /*     the alternative iteration. */
+
+            redmax = zero;
+            isav = 0;
+            redsav = zero;
+            iu = (integer) (angbd * 17. + 3.1);
+            i__1 = iu;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                angt = angbd * (doublereal) i__ / (doublereal) iu;
+                sth = (angt + angt) / (one + angt * angt);
+                temp = shs + angt * (angt * dhd - dhs - dhs);
+                rednew = sth * (angt * dredg - sredg - half * sth * temp);
+                if (rednew > redmax) {
+                    redmax = rednew;
+                    isav = i__;
+                    rdprev = redsav;
+                } else if (i__ == isav + 1) {
+                    rdnext = rednew;
+                }
+                /* L170: */
+                redsav = rednew;
+            }
+
+            /*     Return if the reduction is zero. Otherwise, set the sine and cosine */
+            /*     of the angle of the alternative iteration, and calculate SDEC. */
+
+            if (isav == 0) {
+                goto L190;
+            }
+            if (isav < iu) {
+                temp = (rdnext - rdprev) / (redmax + redmax - rdprev - rdnext);
+                angt = angbd * ((doublereal) isav + half * temp) / (doublereal) iu;
+            }
+            cth = (one - angt * angt) / (one + angt * angt);
+            sth = (angt + angt) / (one + angt * angt);
+            temp = shs + angt * (angt * dhd - dhs - dhs);
+            sdec = sth * (angt * dredg - sredg - half * sth * temp);
+            if (sdec <= zero) {
+                goto L190;
+            }
+
+            /*     Update GNEW, D and HRED. If the angle of the alternative iteration */
+            /*     is restricted by a bound on a free variable, that variable is fixed */
+            /*     at the bound. */
+
+            dredg = zero;
+            gredsq = zero;
+            i__1 = n;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                gnew[i__] = gnew[i__] + (cth - one) * hred[i__] + sth * hs[i__];
+                if (xbdi[i__] == zero) {
+                    d__[i__] = cth * d__[i__] + sth * s[i__];
+                    dredg += d__[i__] * gnew[i__];
+                    /* Computing 2nd power */
+                    d__1 = gnew[i__];
+                    gredsq += d__1 * d__1;
+                }
+                /* L180: */
+                hred[i__] = cth * hred[i__] + sth * hs[i__];
+            }
+            qred += sdec;
+            if (iact > 0 && isav == iu) {
+                ++nact;
+                xbdi[iact] = xsav;
+                goto L100;
+            }
+
+            /*     If SDEC is sufficiently small, then RETURN after setting XNEW to */
+            /*     XOPT+D, giving careful attention to the bounds. */
+
+            if (sdec > qred * .01) {
+                goto L120;
+            }
+L190:
+            *dsq = zero;
+            i__1 = n;
+            for (i__ = 1; i__ <= i__1; ++i__) {
+                /* Computing MAX */
+                /* Computing MIN */
+                d__3 = xopt[i__] + d__[i__], d__4 = su[i__];
+                d__1 = std::min(d__3,d__4), d__2 = sl[i__];
+                xnew[i__] = std::max(d__1,d__2);
+                if (xbdi[i__] == onemin) {
+                    xnew[i__] = sl[i__];
+                }
+                if (xbdi[i__] == one) {
+                    xnew[i__] = su[i__];
+                }
+                d__[i__] = xnew[i__] - xopt[i__];
+                /* L200: */
+                /* Computing 2nd power */
+                d__1 = d__[i__];
+                *dsq += d__1 * d__1;
+            }
+            return;
+            /*     The following instructions multiply the current S-vector by the second */
+            /*     derivative matrix of the quadratic model, putting the product in HS. */
+            /*     They are reached from three different parts of the software above and */
+            /*     they can be regarded as an external subroutine. */
+
+L210:
+            ih = 0;
+            i__1 = n;
+            for (j = 1; j <= i__1; ++j) {
+                hs[j] = zero;
+                i__2 = j;
+                for (i__ = 1; i__ <= i__2; ++i__) {
+                    ++ih;
+                    if (i__ < j) {
+                        hs[j] += hq[ih] * s[i__];
+                    }
+                    /* L220: */
+                    hs[i__] += hq[ih] * s[j];
+                }
+            }
+            i__2 = npt;
+            for (k = 1; k <= i__2; ++k) {
+                if (pq[k] != zero) {
+                    temp = zero;
+                    i__1 = n;
+                    for (j = 1; j <= i__1; ++j) {
+                        /* L230: */
+                        temp += xpt[k + j * xpt_dim1] * s[j];
+                    }
+                    temp *= pq[k];
+                    i__1 = n;
+                    for (i__ = 1; i__ <= i__1; ++i__) {
+                        /* L240: */
+                        hs[i__] += temp * xpt[k + i__ * xpt_dim1];
+                    }
+                }
+                /* L250: */
+            }
+            if (*crvmin != zero) {
+                goto L50;
+            }
+            if (iterc > itcsav) {
+                goto L150;
+            }
+            i__2 = n;
+            for (i__ = 1; i__ <= i__2; ++i__) {
+                /* L260: */
+                hred[i__] = hs[i__];
+            }
+            goto L120;
+        } /* trsbox_ */
+
+    // ----------------------------------------------------------------------------------------
+
+        void update_(
+            const integer n,
+            const integer npt,
+            doublereal *bmat, 
+            doublereal *zmat,
+            const integer ndim,
+            doublereal *vlag,
+            const doublereal beta, 
+            const doublereal denom,
+            const integer knew,
+            doublereal *w
+        ) const
+        {
+            /* System generated locals */
+            integer bmat_dim1, bmat_offset, zmat_dim1, zmat_offset, i__1, i__2;
+            doublereal d__1, d__2, d__3;
+
+            /* Local variables */
+            integer i__, j, k, jp;
+            doublereal one, tau, temp;
+            integer nptm;
+            doublereal zero, alpha, tempa, tempb, ztest;
+
+
+            /*     The arrays BMAT and ZMAT are updated, as required by the new position */
+            /*     of the interpolation point that has the index KNEW. The vector VLAG has */
+            /*     N+NPT components, set on entry to the first NPT and last N components */
+            /*     of the product Hw in equation (4.11) of the Powell (2006) paper on */
+            /*     NEWUOA. Further, BETA is set on entry to the value of the parameter */
+            /*     with that name, and DENOM is set to the denominator of the updating */
+            /*     formula. Elements of ZMAT may be treated as zero if their moduli are */
+            /*     at most ZTEST. The first NDIM elements of W are used for working space. */
+
+            /*     Set some constants. */
+
+            /* Parameter adjustments */
+            zmat_dim1 = npt;
+            zmat_offset = 1 + zmat_dim1;
+            zmat -= zmat_offset;
+            bmat_dim1 = ndim;
+            bmat_offset = 1 + bmat_dim1;
+            bmat -= bmat_offset;
+            --vlag;
+            --w;
+
+            /* Function Body */
+            one = 1.;
+            zero = 0.;
+            nptm = npt - n - 1;
+            ztest = zero;
+            i__1 = npt;
+            for (k = 1; k <= i__1; ++k) {
+                i__2 = nptm;
+                for (j = 1; j <= i__2; ++j) {
+                    /* L10: */
+                    /* Computing MAX */
+                    d__2 = ztest, d__3 = (d__1 = zmat[k + j * zmat_dim1], std::abs(d__1));
+                    ztest = std::max(d__2,d__3);
+                }
+            }
+            ztest *= 1e-20;
+
+            /*     Apply the rotations that put zeros in the KNEW-th row of ZMAT. */
+
+            i__2 = nptm;
+            for (j = 2; j <= i__2; ++j) {
+                if ((d__1 = zmat[knew + j * zmat_dim1], std::abs(d__1)) > ztest) {
+                    /* Computing 2nd power */
+                    d__1 = zmat[knew + zmat_dim1];
+                    /* Computing 2nd power */
+                    d__2 = zmat[knew + j * zmat_dim1];
+                    temp = std::sqrt(d__1 * d__1 + d__2 * d__2);
+                    tempa = zmat[knew + zmat_dim1] / temp;
+                    tempb = zmat[knew + j * zmat_dim1] / temp;
+                    i__1 = npt;
+                    for (i__ = 1; i__ <= i__1; ++i__) {
+                        temp = tempa * zmat[i__ + zmat_dim1] + tempb * zmat[i__ + j * 
+                            zmat_dim1];
+                        zmat[i__ + j * zmat_dim1] = tempa * zmat[i__ + j * zmat_dim1] 
+                            - tempb * zmat[i__ + zmat_dim1];
+                        /* L20: */
+                        zmat[i__ + zmat_dim1] = temp;
+                    }
+                }
+                zmat[knew + j * zmat_dim1] = zero;
+                /* L30: */
+            }
+
+            /*     Put the first NPT components of the KNEW-th column of HLAG into W, */
+            /*     and calculate the parameters of the updating formula. */
+
+            i__2 = npt;
+            for (i__ = 1; i__ <= i__2; ++i__) {
+                w[i__] = zmat[knew + zmat_dim1] * zmat[i__ + zmat_dim1];
+                /* L40: */
+            }
+            alpha = w[knew];
+            tau = vlag[knew];
+            vlag[knew] -= one;
+
+            /*     Complete the updating of ZMAT. */
+
+            temp = std::sqrt(denom);
+            tempb = zmat[knew + zmat_dim1] / temp;
+            tempa = tau / temp;
+            i__2 = npt;
+            for (i__ = 1; i__ <= i__2; ++i__) {
+                /* L50: */
+                zmat[i__ + zmat_dim1] = tempa * zmat[i__ + zmat_dim1] - tempb * vlag[
+                    i__];
+            }
+
+            /*     Finally, update the matrix BMAT. */
+
+            i__2 = n;
+            for (j = 1; j <= i__2; ++j) {
+                jp = npt + j;
+                w[jp] = bmat[knew + j * bmat_dim1];
+                tempa = (alpha * vlag[jp] - tau * w[jp]) / denom;
+                tempb = (-(beta) * w[jp] - tau * vlag[jp]) / denom;
+                i__1 = jp;
+                for (i__ = 1; i__ <= i__1; ++i__) {
+                    bmat[i__ + j * bmat_dim1] = bmat[i__ + j * bmat_dim1] + tempa * 
+                        vlag[i__] + tempb * w[i__];
+                    if (i__ > npt) {
+                        bmat[jp + (i__ - npt) * bmat_dim1] = bmat[i__ + j * 
+                            bmat_dim1];
+                    }
+                    /* L60: */
+                }
+            }
+        } /* update_ */
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename funct,
+        typename T, 
+        typename U
+        >
+    double find_min_bobyqa (
+        const funct& f,
+        T& x,
+        long npt,
+        const U& x_lower,
+        const U& x_upper,
+        const double rho_begin,
+        const double rho_end,
+        const long max_f_evals
+    ) 
+    {
+        // The starting point (i.e. x) must be a column vector.  
+        COMPILE_TIME_ASSERT(T::NC <= 1);
+
+        // check the requirements.  Also split the assert up so that the error message isn't huge.
+        DLIB_CASSERT(is_col_vector(x) && is_col_vector(x_lower) && is_col_vector(x_upper) &&
+                    x.size() == x_lower.size() && x_lower.size() == x_upper.size() &&
+                    x.size() > 1 && max_f_evals > 1,
+            "\tdouble find_min_bobyqa()"
+            << "\n\t Invalid arguments have been given to this function"
+            << "\n\t is_col_vector(x):       " << is_col_vector(x) 
+            << "\n\t is_col_vector(x_lower): " << is_col_vector(x_lower) 
+            << "\n\t is_col_vector(x_upper): " << is_col_vector(x_upper) 
+            << "\n\t x.size():               " << x.size()
+            << "\n\t x_lower.size():         " << x_lower.size()
+            << "\n\t x_upper.size():         " << x_upper.size()
+            << "\n\t max_f_evals:            " << max_f_evals
+        );
+
+        DLIB_CASSERT(x.size() + 2 <= npt && npt <= (x.size()+1)*(x.size()+2)/2 &&
+                    0 < rho_end && rho_end < rho_begin &&
+                    min(x_upper - x_lower) > 2*rho_begin &&
+                    min(x - x_lower) >= 0 && min(x_upper - x) >= 0,
+            "\tdouble find_min_bobyqa()"
+            << "\n\t Invalid arguments have been given to this function"
+            << "\n\t ntp in valid range: " << (x.size() + 2 <= npt && npt <= (x.size()+1)*(x.size()+2)/2)
+            << "\n\t npt:                " << npt 
+            << "\n\t rho_begin:          " << rho_begin 
+            << "\n\t rho_end:            " << rho_end
+            << "\n\t min(x_upper - x_lower) > 2*rho_begin:           " << (min(x_upper - x_lower) > 2*rho_begin)
+            << "\n\t min(x - x_lower) >= 0 && min(x_upper - x) >= 0: " << (min(x - x_lower) >= 0 && min(x_upper - x) >= 0)
+        );
+
+
+        bobyqa_implementation impl;
+        return impl.find_min(f, x, npt, x_lower, x_upper, rho_begin, rho_end, max_f_evals);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename funct,
+        typename T, 
+        typename U
+        >
+    double find_max_bobyqa (
+        const funct& f,
+        T& x,
+        long npt,
+        const U& x_lower,
+        const U& x_upper,
+        const double rho_begin,
+        const double rho_end,
+        const long max_f_evals
+    ) 
+    {
+        // The starting point (i.e. x) must be a column vector.  
+        COMPILE_TIME_ASSERT(T::NC <= 1);
+
+        return -find_min_bobyqa(negate_function(f), x, npt, x_lower, x_upper, rho_begin, rho_end, max_f_evals);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_BOBYQA_Hh_
+
diff --git a/ml/dlib/dlib/optimization/optimization_bobyqa_abstract.h b/ml/dlib/dlib/optimization/optimization_bobyqa_abstract.h
new file mode 100644
index 000000000..46f9436af
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_bobyqa_abstract.h
@@ -0,0 +1,120 @@
+// Copyright (C) 2009  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATIOn_BOBYQA_ABSTRACT_Hh_
+#ifdef DLIB_OPTIMIZATIOn_BOBYQA_ABSTRACT_Hh_
+
+#include "../matrix.h"
+
+// ----------------------------------------------------------------------------------------
+
+/*
+    This file defines the dlib interface to the BOBYQA software developed by M.J.D Powell.
+    BOBYQA is a method for optimizing a function in the absence of derivative information.  
+    Powell described it as a method that seeks the least value of a function of many 
+    variables, by applying a trust region method that forms quadratic models by 
+    interpolation.  There is usually some freedom in the interpolation conditions, 
+    which is taken up by minimizing the Frobenius norm of the change to the second 
+    derivative of the model, beginning with the zero matrix. The values of the variables 
+    are constrained by upper and lower bounds.  
+
+
+    The following paper, published in 2009 by Powell, describes the
+    detailed working of the BOBYQA algorithm.  
+
+        The BOBYQA algorithm for bound constrained optimization 
+        without derivatives by M.J.D. Powell
+*/
+
+// ----------------------------------------------------------------------------------------
+
+namespace dlib
+{
+    class bobyqa_failure : public error;
+    /*!
+        This is the exception class used by the functions defined in this file.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename funct,
+        typename T, 
+        typename U
+        >
+    double find_min_bobyqa (
+        const funct& f,
+        T& x,
+        long npt,
+        const U& x_lower,
+        const U& x_upper,
+        const double rho_begin,
+        const double rho_end,
+        const long max_f_evals
+    );
+    /*!
+        requires
+            - f(x) must be a valid expression that evaluates to a double
+            - is_col_vector(x) == true
+            - is_col_vector(x_lower) == true
+            - is_col_vector(x_upper) == true
+            - x.size() == x_lower.size() == x_upper.size()
+            - x.size() > 1
+            - x.size() + 2 <= npt <= (x.size()+1)*(x.size()+2)/2
+            - 0 < rho_end < rho_begin
+            - min(x_upper - x_lower) > 2*rho_begin
+              (i.e. the lower and upper bounds on each x element must be larger than 2*rho_begin)
+            - min(x - x_lower) >= 0 && min(x_upper - x) >= 0
+              (i.e. the given x should be within the bounds defined by x_lower and x_upper)
+            - max_f_evals > 1
+        ensures
+            - Performs a constrained minimization of the function f() starting from 
+              the initial point x.  
+            - The BOBYQA algorithm uses a number of interpolating points to perform its
+              work.  The npt argument controls how many points get used.  Typically,
+              a good value to use is 2*x.size()+1.
+            - #x == the value of x (within the bounds defined by x_lower and x_upper) that 
+              was found to minimize f().  More precisely:
+                - min(#x - x_lower) >= 0 && min(x_upper - #x) >= 0
+            - returns f(#x). 
+            - rho_begin and rho_end are used as the initial and final values of a trust 
+              region radius.  Typically, rho_begin should be about one tenth of the greatest 
+              expected change to a variable, while rho_end should indicate the accuracy that 
+              is required in the final values of the variables. 
+        throws
+            - bobyqa_failure
+                This exception is thrown if the algorithm is unable to make progress towards
+                solving the problem.  This may occur because the algorithm detects excessive
+                numerical errors or because max_f_evals of f() have occurred without reaching
+                convergence.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename funct,
+        typename T, 
+        typename U
+        >
+    double find_max_bobyqa (
+        const funct& f,
+        T& x,
+        long npt,
+        const U& x_lower,
+        const U& x_upper,
+        const double rho_begin,
+        const double rho_end,
+        const long max_f_evals
+    );
+    /*!
+        This function is identical to the find_min_bobyqa() routine defined above
+        except that it negates the f() function before performing optimization.  
+        Thus this function will attempt to find the maximizer of f() rather than 
+        the minimizer.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_BOBYQA_ABSTRACT_Hh_
+
diff --git a/ml/dlib/dlib/optimization/optimization_least_squares.h b/ml/dlib/dlib/optimization/optimization_least_squares.h
new file mode 100644
index 000000000..6d12a919d
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_least_squares.h
@@ -0,0 +1,345 @@
+// Copyright (C) 2010  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATION_LEAST_SQuARES_H_h_
+#define DLIB_OPTIMIZATION_LEAST_SQuARES_H_h_
+
+#include "../matrix.h"
+#include "optimization_trust_region.h"
+#include "optimization_least_squares_abstract.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename column_vector_type,
+        typename funct_type,
+        typename funct_der_type,
+        typename vector_type
+        >
+    class least_squares_function_model 
+    {
+    public:
+        least_squares_function_model (
+            const funct_type& f_,
+            const funct_der_type& der_,
+            const vector_type& list_
+        ) : f(f_), der(der_), list(list_) 
+        {
+            S = 0;
+            last_f = 0;
+            last_f2 = 0;
+
+            r.set_size(list.size(),1);
+        }
+
+        const funct_type& f;
+        const funct_der_type& der;
+        const vector_type& list;
+
+        typedef typename column_vector_type::type type;
+        typedef typename column_vector_type::mem_manager_type mem_manager_type;
+        typedef typename column_vector_type::layout_type layout_type;
+        const static long NR = column_vector_type::NR;
+
+        typedef column_vector_type column_vector;
+        typedef matrix<type,NR,NR,mem_manager_type,layout_type> general_matrix;
+
+
+        type operator() ( 
+            const column_vector& x
+        ) const
+        {
+            type result = 0;
+            for (long i = 0; i < list.size(); ++i)
+            {
+                const type temp = f(list(i), x);
+                // save the residual for later
+                r(i) = temp;
+                result += temp*temp;
+            }
+
+            last_f = 0.5*result;
+            return 0.5*result;
+        }
+
+        void get_derivative_and_hessian (
+            const column_vector& x,
+            column_vector& d,
+            general_matrix& h
+        ) const
+        {
+            J.set_size(list.size(), x.size());
+
+            // compute the Jacobian
+            for (long i = 0; i < list.size(); ++i)
+            {
+                set_rowm(J,i) = trans(der(list(i), x));
+            }
+
+            // Compute the Levenberg-Marquardt gradient and hessian
+            d = trans(J)*r;
+            h = trans(J)*J;
+
+            if (S.size() == 0)
+            {
+                S.set_size(x.size(), x.size());
+                S = 0;
+            }
+
+            // If this isn't the first iteration then check if using
+            // a quasi-newton update helps things.
+            if (last_r.size() != 0)
+            {
+
+                s = x - last_x;
+                y = d - last_d;
+                yy = d - trans(last_J)*r;
+
+                const type ys = trans(y)*s;
+                vtemp = yy - S*s;
+                const type temp2 = std::abs(trans(s)*S*s);
+                type scale = (temp2 != 0) ? std::min<type>(1, std::abs(dot(s,yy))/temp2)  :  1;
+
+                if (ys != 0)
+                    S = scale*S + (vtemp*trans(y) + y*trans(vtemp))/(ys) - dot(vtemp,s)/ys/ys*y*trans(y);
+                else
+                    S *= scale;
+
+                // check how well both the models fit the last change we saw in f()
+                const type measured_delta = last_f2 - last_f;
+                s = -s;
+                const type h_predicted_delta = 0.5*trans(s)*h*s + trans(d)*s;
+                const type s_predicted_delta = 0.5*trans(s)*(h+S)*s + trans(d)*s;
+
+                const type h_error = std::abs((h_predicted_delta/measured_delta) - 1);
+                const type s_error = std::abs((s_predicted_delta/measured_delta) - 1);
+
+                if (s_error < h_error && h_error > 0.01)
+                {
+                    h += make_symmetric(S);
+                }
+                else if (s_error > 10)
+                {
+                    S = 0;
+                }
+
+                // put r into last_r
+                r.swap(last_r);
+            }
+            else
+            {
+                // put r into last_r
+                last_r = r;
+            }
+
+            J.swap(last_J);
+            last_x = x;
+            last_d = d;
+
+            last_f2 = last_f;
+        }
+
+        mutable type last_f;   // value of function we saw in last operator()
+        mutable type last_f2;  // value of last_f we saw in get_derivative_and_hessian()
+        mutable matrix<type,0,1,mem_manager_type,layout_type> r;
+        mutable column_vector vtemp;
+        mutable column_vector s, y, yy;
+
+        mutable general_matrix S;
+        mutable column_vector last_x;
+        mutable column_vector last_d;
+        mutable matrix<type,0,1,mem_manager_type,layout_type> last_r;
+        mutable matrix<type,0,NR,mem_manager_type,layout_type> last_J;
+        mutable matrix<type,0,NR,mem_manager_type,layout_type> J;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename column_vector_type,
+        typename funct_type,
+        typename funct_der_type,
+        typename vector_type
+        >
+    least_squares_function_model<column_vector_type,funct_type,funct_der_type,vector_type> least_squares_model (
+        const funct_type& f,
+        const funct_der_type& der,
+        const vector_type& list
+    )
+    {
+        return least_squares_function_model<column_vector_type,funct_type,funct_der_type,vector_type>(f,der,list);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename stop_strategy_type,
+        typename funct_type,
+        typename funct_der_type,
+        typename vector_type,
+        typename T
+        >
+    double solve_least_squares (
+        stop_strategy_type stop_strategy,
+        const funct_type& f,
+        const funct_der_type& der,
+        const vector_type& list,
+        T& x, 
+        double radius = 1
+    )
+    {
+        // The starting point (i.e. x) must be a column vector.  
+        COMPILE_TIME_ASSERT(T::NC <= 1);
+
+        // make sure requires clause is not broken
+        DLIB_ASSERT(is_vector(mat(list)) && list.size() > 0 && 
+                    is_col_vector(x) && radius > 0,
+            "\t double solve_least_squares()"
+            << "\n\t invalid arguments were given to this function"
+            << "\n\t is_vector(list):  " << is_vector(mat(list)) 
+            << "\n\t list.size():      " << list.size() 
+            << "\n\t is_col_vector(x): " << is_col_vector(x) 
+            << "\n\t radius:           " << radius
+            );
+
+        return find_min_trust_region(stop_strategy,
+                                     least_squares_model<T>(f, der, mat(list)), 
+                                     x, 
+                                     radius);
+    }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename column_vector_type,
+        typename funct_type,
+        typename funct_der_type,
+        typename vector_type
+        >
+    class least_squares_lm_function_model 
+    {
+    public:
+        least_squares_lm_function_model (
+            const funct_type& f_,
+            const funct_der_type& der_,
+            const vector_type& list_
+        ) : f(f_), der(der_), list(list_) 
+        {
+            r.set_size(list.size(),1);
+        }
+
+        const funct_type& f;
+        const funct_der_type& der;
+        const vector_type& list;
+
+        typedef typename column_vector_type::type type;
+        typedef typename column_vector_type::mem_manager_type mem_manager_type;
+        typedef typename column_vector_type::layout_type layout_type;
+        const static long NR = column_vector_type::NR;
+
+        typedef column_vector_type column_vector;
+        typedef matrix<type,NR,NR,mem_manager_type,layout_type> general_matrix;
+
+        mutable matrix<type,0,1,mem_manager_type,layout_type> r;
+        mutable column_vector vtemp;
+
+        type operator() ( 
+            const column_vector& x
+        ) const
+        {
+            type result = 0;
+            for (long i = 0; i < list.size(); ++i)
+            {
+                const type temp = f(list(i), x);
+                // save the residual for later
+                r(i) = temp;
+                result += temp*temp;
+            }
+
+            return 0.5*result;
+        }
+
+        void get_derivative_and_hessian (
+            const column_vector& x,
+            column_vector& d,
+            general_matrix& h
+        ) const
+        {
+            d = 0;
+            h = 0;
+            for (long i = 0; i < list.size(); ++i)
+            {
+                vtemp = der(list(i), x); 
+                d += r(i)*vtemp;
+                h += vtemp*trans(vtemp);
+            }
+        }
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename column_vector_type,
+        typename funct_type,
+        typename funct_der_type,
+        typename vector_type
+        >
+    least_squares_lm_function_model<column_vector_type,funct_type,funct_der_type,vector_type> least_squares_lm_model (
+        const funct_type& f,
+        const funct_der_type& der,
+        const vector_type& list
+    )
+    {
+        return least_squares_lm_function_model<column_vector_type,funct_type,funct_der_type,vector_type>(f,der,list);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename stop_strategy_type,
+        typename funct_type,
+        typename funct_der_type,
+        typename vector_type,
+        typename T
+        >
+    double solve_least_squares_lm (
+        stop_strategy_type stop_strategy,
+        const funct_type& f,
+        const funct_der_type& der,
+        const vector_type& list,
+        T& x, 
+        double radius = 1
+    )
+    {
+        // The starting point (i.e. x) must be a column vector.  
+        COMPILE_TIME_ASSERT(T::NC <= 1);
+
+        // make sure requires clause is not broken
+        DLIB_ASSERT(is_vector(mat(list)) && list.size() > 0 && 
+                    is_col_vector(x) && radius > 0,
+            "\t double solve_least_squares_lm()"
+            << "\n\t invalid arguments were given to this function"
+            << "\n\t is_vector(list):  " << is_vector(mat(list)) 
+            << "\n\t list.size():      " << list.size() 
+            << "\n\t is_col_vector(x): " << is_col_vector(x) 
+            << "\n\t radius:           " << radius
+            );
+
+        return find_min_trust_region(stop_strategy,
+                                     least_squares_lm_model<T>(f, der, mat(list)), 
+                                     x, 
+                                     radius);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATION_LEAST_SQuARES_H_h_
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_least_squares_abstract.h b/ml/dlib/dlib/optimization/optimization_least_squares_abstract.h
new file mode 100644
index 000000000..aaffb2221
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_least_squares_abstract.h
@@ -0,0 +1,112 @@
+// Copyright (C) 2010  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATIOn_LEAST_SQUARES_ABSTRACT_
+#ifdef DLIB_OPTIMIZATIOn_LEAST_SQUARES_ABSTRACT_
+
+#include "../matrix/matrix_abstract.h"
+#include "optimization_trust_region_abstract.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename stop_strategy_type,
+        typename funct_type,
+        typename funct_der_type,
+        typename vector_type,
+        typename T
+        >
+    double solve_least_squares (
+        stop_strategy_type stop_strategy,
+        const funct_type& f,
+        const funct_der_type& der,
+        const vector_type& list,
+        T& x, 
+        double radius = 1
+    );
+    /*!
+        requires
+            - stop_strategy == an object that defines a stop strategy such as one of 
+              the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+            - list == a matrix or something convertible to a matrix via mat()
+              such as a std::vector.
+            - is_vector(list) == true
+            - list.size() > 0
+            - is_col_vector(x) == true
+            - radius > 0
+            - for all valid i:
+                - f(list(i),x) must be a valid expression that evaluates to a floating point value.
+                - der(list(i),x) must be a valid expression that evaluates to the derivative of f(list(i),x) 
+                  with respect to x. This derivative must take the form of a column vector.
+        ensures
+            - This function performs an unconstrained minimization of the least squares
+              function g(x) defined by:
+                - g(x) = sum over all i: 0.5*pow( f(list(i),x), 2 )
+            - This method combines the Levenberg-Marquardt method with a quasi-newton method
+              for approximating the second order terms of the hessian and is appropriate for
+              large residual problems (i.e. problems where the f() function isn't driven to 0).  
+              In particular, it uses the method of Dennis, Gay, and Welsch as described in 
+              Numerical Optimization by Nocedal and Wright (second edition).
+            - Since this is a trust region algorithm, the radius parameter defines the initial 
+              size of the trust region.  
+            - The function is optimized until stop_strategy decides that an acceptable 
+              point has been found or the trust region subproblem fails to make progress.
+            - #x == the value of x that was found to minimize g()
+            - returns g(#x). 
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename stop_strategy_type,
+        typename funct_type,
+        typename funct_der_type,
+        typename vector_type,
+        typename T
+        >
+    double solve_least_squares_lm (
+        stop_strategy_type stop_strategy,
+        const funct_type& f,
+        const funct_der_type& der,
+        const vector_type& list,
+        T& x, 
+        double radius = 1
+    );
+    /*!
+        requires
+            - stop_strategy == an object that defines a stop strategy such as one of 
+              the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+            - list == a matrix or something convertible to a matrix via mat()
+              such as a std::vector.
+            - is_vector(list) == true
+            - list.size() > 0
+            - is_col_vector(x) == true
+            - radius > 0
+            - for all valid i:
+                - f(list(i),x) must be a valid expression that evaluates to a floating point value.
+                - der(list(i),x) must be a valid expression that evaluates to the derivative of f(list(i),x) 
+                  with respect to x.  This derivative must take the form of a column vector.
+        ensures
+            - This function performs an unconstrained minimization of the least squares
+              function g(x) defined by:
+                - g(x) = sum over all i: 0.5*pow( f(list(i),x), 2 )
+            - This method implements a plain Levenberg-Marquardt approach for approximating
+              the hessian of g().  Therefore, it is most appropriate for small residual problems
+              (i.e. problems where f() goes to 0 at the solution).
+            - Since this is a trust region algorithm, the radius parameter defines the initial 
+              size of the trust region.  
+            - The function is optimized until stop_strategy decides that an acceptable 
+              point has been found or the trust region subproblem fails to make progress.
+            - #x == the value of x that was found to minimize g()
+            - returns g(#x). 
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_LEAST_SQUARES_ABSTRACT_
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_line_search.h b/ml/dlib/dlib/optimization/optimization_line_search.h
new file mode 100644
index 000000000..a91e3df84
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_line_search.h
@@ -0,0 +1,888 @@
+// Copyright (C) 2008  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATIOn_LINE_SEARCH_H_
+#define DLIB_OPTIMIZATIOn_LINE_SEARCH_H_
+
+#include <cmath>
+#include <limits>
+#include "../matrix.h"
+#include "../algs.h"
+#include "optimization_line_search_abstract.h"
+#include <utility>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename funct, typename T>
+    class line_search_funct 
+    {
+    public:
+        line_search_funct(const funct& f_, const T& start_, const T& direction_) 
+            : f(f_),start(start_), direction(direction_), matrix_r(0), scalar_r(0)
+        {}
+
+        line_search_funct(const funct& f_, const T& start_, const T& direction_, T& r) 
+            : f(f_),start(start_), direction(direction_), matrix_r(&r), scalar_r(0)
+        {}
+
+        line_search_funct(const funct& f_, const T& start_, const T& direction_, double& r) 
+            : f(f_),start(start_), direction(direction_), matrix_r(0), scalar_r(&r)
+        {}
+
+        double operator()(const double& x) const
+        {
+            return get_value(f(start + x*direction));
+        }
+
+    private:
+
+        double get_value (const double& r) const
+        {
+            // save a copy of this value for later
+            if (scalar_r)
+                *scalar_r = r;
+
+            return r;
+        }
+
+        template <typename U>
+        double get_value (const U& r) const
+        {
+            // U should be a matrix type
+            COMPILE_TIME_ASSERT(is_matrix<U>::value);
+
+            // save a copy of this value for later
+            if (matrix_r)
+                *matrix_r = r;
+
+            return dot(r,direction);
+        }
+
+        const funct& f;
+        const T& start;
+        const T& direction;
+        T* matrix_r;
+        double* scalar_r;
+    };
+
+    template <typename funct, typename T>
+    const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction) 
+    { 
+        COMPILE_TIME_ASSERT(is_matrix<T>::value);
+        DLIB_ASSERT (
+            is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(),
+            "\tline_search_funct make_line_search_function(f,start,direction)"
+            << "\n\tYou have to supply column vectors to this function"
+            << "\n\tstart.nc():     " << start.nc()
+            << "\n\tdirection.nc(): " << direction.nc()
+            << "\n\tstart.nr():     " << start.nr()
+            << "\n\tdirection.nr(): " << direction.nr()
+        );
+        return line_search_funct<funct,T>(f,start,direction); 
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename funct, typename T>
+    const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction, double& f_out) 
+    { 
+        COMPILE_TIME_ASSERT(is_matrix<T>::value);
+        DLIB_ASSERT (
+            is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(),
+            "\tline_search_funct make_line_search_function(f,start,direction)"
+            << "\n\tYou have to supply column vectors to this function"
+            << "\n\tstart.nc():     " << start.nc()
+            << "\n\tdirection.nc(): " << direction.nc()
+            << "\n\tstart.nr():     " << start.nr()
+            << "\n\tdirection.nr(): " << direction.nr()
+        );
+        return line_search_funct<funct,T>(f,start,direction, f_out); 
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename funct, typename T>
+    const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction, T& grad_out) 
+    { 
+        COMPILE_TIME_ASSERT(is_matrix<T>::value);
+        DLIB_ASSERT (
+            is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(),
+            "\tline_search_funct make_line_search_function(f,start,direction)"
+            << "\n\tYou have to supply column vectors to this function"
+            << "\n\tstart.nc():     " << start.nc()
+            << "\n\tdirection.nc(): " << direction.nc()
+            << "\n\tstart.nr():     " << start.nr()
+            << "\n\tdirection.nr(): " << direction.nr()
+        );
+        return line_search_funct<funct,T>(f,start,direction,grad_out); 
+    }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    inline double poly_min_extrap (
+        double f0,
+        double d0,
+        double f1,
+        double d1,
+        double limit = 1
+    )
+    {
+        const double n = 3*(f1 - f0) - 2*d0 - d1;
+        const double e = d0 + d1 - 2*(f1 - f0);
+
+
+        // find the minimum of the derivative of the polynomial
+
+        double temp = std::max(n*n - 3*e*d0,0.0);
+
+        if (temp < 0)
+            return 0.5;
+
+        temp = std::sqrt(temp);
+
+        if (std::abs(e) <= std::numeric_limits<double>::epsilon())
+            return 0.5;
+
+        // figure out the two possible min values
+        double x1 = (temp - n)/(3*e);
+        double x2 = -(temp + n)/(3*e);
+
+        // compute the value of the interpolating polynomial at these two points
+        double y1 = f0 + d0*x1 + n*x1*x1 + e*x1*x1*x1;
+        double y2 = f0 + d0*x2 + n*x2*x2 + e*x2*x2*x2;
+
+        // pick the best point
+        double x;
+        if (y1 < y2)
+            x = x1;
+        else
+            x = x2;
+
+        // now make sure the minimum is within the allowed range of [0,limit] 
+        return put_in_range(0,limit,x);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline double poly_min_extrap (
+        double f0,
+        double d0,
+        double f1
+    )
+    {
+        const double temp = 2*(f1 - f0 - d0);
+        if (std::abs(temp) <= d0*std::numeric_limits<double>::epsilon())
+            return 0.5;
+
+        const double alpha = -d0/temp;
+
+        // now make sure the minimum is within the allowed range of (0,1) 
+        return put_in_range(0,1,alpha);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline double poly_min_extrap (
+        double f0,
+        double d0,
+        double x1,
+        double f_x1,
+        double x2,
+        double f_x2
+    )
+    {
+        DLIB_ASSERT(0 < x1 && x1 < x2,"Invalid inputs were given to this function");
+        // The contents of this function follow the equations described on page 58 of the
+        // book Numerical Optimization by Nocedal and Wright, second edition.
+        matrix<double,2,2> m;
+        matrix<double,2,1> v;
+
+        const double aa2 = x2*x2;
+        const double aa1 = x1*x1;
+        m =  aa2,       -aa1,
+            -aa2*x2, aa1*x1;   
+        v = f_x1 - f0 - d0*x1,
+            f_x2 - f0 - d0*x2;
+
+
+        double temp = aa2*aa1*(x1-x2);
+
+        // just take a guess if this happens
+        if (temp == 0 || std::fpclassify(temp) == FP_SUBNORMAL)
+        {
+            return x1/2.0;
+        }
+
+        matrix<double,2,1> temp2;
+        temp2 = m*v/temp;
+        const double a = temp2(0);
+        const double b = temp2(1);
+
+        temp = b*b - 3*a*d0;
+        if (temp < 0 || a == 0)
+        {
+            // This is probably a line so just pick the lowest point
+            if (f0 < f_x2)
+                return 0;
+            else
+                return x2;
+        }
+        temp = (-b + std::sqrt(temp))/(3*a);
+        return put_in_range(0, x2, temp);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline double lagrange_poly_min_extrap (
+        double p1, 
+        double p2,
+        double p3,
+        double f1,
+        double f2,
+        double f3
+    )
+    {
+        DLIB_ASSERT(p1 < p2 && p2 < p3 && f1 >= f2 && f2 <= f3,
+                     "   p1: " << p1 
+                     << "   p2: " << p2 
+                     << "   p3: " << p3  
+                     << "   f1: " << f1 
+                     << "   f2: " << f2 
+                     << "   f3: " << f3);
+
+        // This formula is out of the book Nonlinear Optimization by Andrzej Ruszczynski.  See section 5.2.
+        double temp1 =    f1*(p3*p3 - p2*p2) + f2*(p1*p1 - p3*p3) + f3*(p2*p2 - p1*p1);
+        double temp2 = 2*(f1*(p3 - p2)       + f2*(p1 - p3)       + f3*(p2 - p1) );
+
+        if (temp2 == 0)
+        {
+            return p2;
+        }
+
+        const double result = temp1/temp2;
+
+        // do a final sanity check to make sure the result is in the right range
+        if (p1 <= result && result <= p3)
+        {
+            return result;
+        }
+        else
+        {
+            return std::min(std::max(p1,result),p3);
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename funct, 
+        typename funct_der
+        >
+    double line_search (
+        const funct& f, 
+        const double f0,
+        const funct_der& der, 
+        const double d0,
+        double rho, 
+        double sigma, 
+        double min_f,
+        unsigned long max_iter 
+    )
+    {
+        DLIB_ASSERT (
+            0 < rho && rho < sigma && sigma < 1 && max_iter > 0,
+            "\tdouble line_search()"
+            << "\n\tYou have given invalid arguments to this function"
+            << "\n\t sigma:    " << sigma
+            << "\n\t rho:      " << rho 
+            << "\n\t max_iter: " << max_iter 
+        );
+
+        // The bracketing phase of this function is implemented according to block 2.6.2 from
+        // the book Practical Methods of Optimization by R. Fletcher.   The sectioning 
+        // phase is an implementation of 2.6.4 from the same book.
+
+        // 1 <= tau1a < tau1b. Controls the alpha jump size during the bracketing phase of
+        // the search.
+        const double tau1a = 1.4;
+        const double tau1b = 9;
+
+        // it must be the case that 0 < tau2 < tau3 <= 1/2 for the algorithm to function
+        // correctly but the specific values of tau2 and tau3 aren't super important.
+        const double tau2 = 1.0/10.0;
+        const double tau3 = 1.0/2.0;
+
+
+        // Stop right away and return a step size of 0 if the gradient is 0 at the starting point
+        if (std::abs(d0) <= std::abs(f0)*std::numeric_limits<double>::epsilon())
+            return 0;
+
+        // Stop right away if the current value is good enough according to min_f
+        if (f0 <= min_f)
+            return 0;
+
+        // Figure out a reasonable upper bound on how large alpha can get.
+        const double mu = (min_f-f0)/(rho*d0);
+
+
+        double alpha = 1;
+        if (mu < 0)
+            alpha = -alpha;
+        alpha = put_in_range(0, 0.65*mu, alpha);
+
+
+        double last_alpha = 0;
+        double last_val = f0;
+        double last_val_der = d0;
+
+        // The bracketing stage will find a range of points [a,b]
+        // that contains a reasonable solution to the line search
+        double a, b;
+
+        // These variables will hold the values and derivatives of f(a) and f(b)
+        double a_val, b_val, a_val_der, b_val_der;
+
+        // This thresh value represents the Wolfe curvature condition
+        const double thresh = std::abs(sigma*d0);
+
+        unsigned long itr = 0;
+        // do the bracketing stage to find the bracket range [a,b]
+        while (true)
+        {
+            ++itr;
+            const double val = f(alpha);
+            const double val_der = der(alpha);
+
+            // we are done with the line search since we found a value smaller
+            // than the minimum f value
+            if (val <= min_f)
+                return alpha;
+
+            if (val > f0 + rho*alpha*d0 || val >= last_val)
+            {
+                a_val = last_val;
+                a_val_der = last_val_der;
+                b_val = val;
+                b_val_der = val_der;
+
+                a = last_alpha;
+                b = alpha;
+                break;
+            }
+
+            if (std::abs(val_der) <= thresh)
+                return alpha;
+
+            // if we are stuck not making progress then quit with the current alpha
+            if (last_alpha == alpha || itr >= max_iter)
+                return alpha;
+
+            if (val_der >= 0)
+            {
+                a_val = val;
+                a_val_der = val_der;
+                b_val = last_val;
+                b_val_der = last_val_der;
+
+                a = alpha;
+                b = last_alpha;
+                break;
+            }
+
+
+
+            const double temp = alpha;
+            // Pick a larger range [first, last].  We will pick the next alpha in that
+            // range.
+            double first, last;
+            if (mu > 0)
+            {
+                first = std::min(mu, alpha + tau1a*(alpha - last_alpha));
+                last  = std::min(mu, alpha + tau1b*(alpha - last_alpha));
+            }
+            else
+            {
+                first = std::max(mu, alpha + tau1a*(alpha - last_alpha));
+                last  = std::max(mu, alpha + tau1b*(alpha - last_alpha));
+            }
+            
+
+
+            // pick a point between first and last by doing some kind of interpolation
+            if (last_alpha < alpha)
+                alpha = last_alpha + (alpha-last_alpha)*poly_min_extrap(last_val, last_val_der, val, val_der, 1e10);
+            else
+                alpha = alpha + (last_alpha-alpha)*poly_min_extrap(val, val_der, last_val, last_val_der, 1e10);
+
+            alpha = put_in_range(first,last,alpha);
+
+            last_alpha = temp;
+
+            last_val = val;
+            last_val_der = val_der;
+
+        }
+
+
+        // Now do the sectioning phase from 2.6.4
+        while (true)
+        {
+            ++itr;
+            double first = a + tau2*(b-a);
+            double last = b - tau3*(b-a);
+
+            // use interpolation to pick alpha between first and last
+            alpha = a + (b-a)*poly_min_extrap(a_val, a_val_der, b_val, b_val_der);
+            alpha = put_in_range(first,last,alpha);
+
+            const double val = f(alpha);
+            const double val_der = der(alpha);
+
+            // we are done with the line search since we found a value smaller
+            // than the minimum f value or we ran out of iterations.
+            if (val <= min_f || itr >= max_iter)
+                return alpha;
+
+            // stop if the interval gets so small that it isn't shrinking any more due to rounding error 
+            if (a == first || b == last)
+            {
+                return b;
+            }
+
+            // If alpha has basically become zero then just stop.  Think of it like this,
+            // if we take the largest possible alpha step will the objective function
+            // change at all?  If not then there isn't any point looking for a better
+            // alpha.
+            const double max_possible_alpha = std::max(std::abs(a),std::abs(b));
+            if (std::abs(max_possible_alpha*d0) <= std::abs(f0)*std::numeric_limits<double>::epsilon())
+                return alpha;
+
+
+            if (val > f0 + rho*alpha*d0 || val >= a_val)
+            {
+                b = alpha;
+                b_val = val;
+                b_val_der = val_der;
+            }
+            else
+            {
+                if (std::abs(val_der) <= thresh)
+                    return alpha;
+
+                if ( (b-a)*val_der >= 0)
+                {
+                    b = a;
+                    b_val = a_val;
+                    b_val_der = a_val_der;
+                }
+
+                a = alpha;
+                a_val = val;
+                a_val_der = val_der;
+            }
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename funct
+        >
+    double backtracking_line_search (
+        const funct& f, 
+        double f0,
+        double d0,
+        double alpha,
+        double rho, 
+        unsigned long max_iter 
+    )
+    {
+        DLIB_ASSERT (
+            0 < rho && rho < 1 && max_iter > 0,
+            "\tdouble backtracking_line_search()"
+            << "\n\tYou have given invalid arguments to this function"
+            << "\n\t rho:      " << rho 
+            << "\n\t max_iter: " << max_iter 
+        );
+
+        // make sure alpha is going in the right direction.  That is, it should be opposite
+        // the direction of the gradient.
+        if ((d0 > 0 && alpha > 0) ||
+            (d0 < 0 && alpha < 0))
+        {
+            alpha *= -1;
+        }
+
+        bool have_prev_alpha = false;
+        double prev_alpha = 0;
+        double prev_val = 0;
+        unsigned long iter = 0;
+        while (true)
+        {
+            ++iter;
+            const double val = f(alpha);
+            if (val <= f0 + alpha*rho*d0 || iter >= max_iter)
+            {
+                return alpha;
+            }
+            else
+            {
+                // Interpolate a new alpha.  We also make sure the step by which we
+                // reduce alpha is not super small.
+                double step;
+                if (!have_prev_alpha)
+                {
+                    if (d0 < 0)
+                        step = alpha*put_in_range(0.1,0.9, poly_min_extrap(f0, d0, val));
+                    else
+                        step = alpha*put_in_range(0.1,0.9, poly_min_extrap(f0, -d0, val));
+                    have_prev_alpha = true;
+                }
+                else
+                {
+                    if (d0 < 0)
+                        step = put_in_range(0.1*alpha,0.9*alpha, poly_min_extrap(f0, d0, alpha, val, prev_alpha, prev_val));
+                    else
+                        step = put_in_range(0.1*alpha,0.9*alpha, -poly_min_extrap(f0, -d0, -alpha, val, -prev_alpha, prev_val));
+                }
+
+                prev_alpha = alpha;
+                prev_val = val;
+
+                alpha = step;
+            }
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    class optimize_single_variable_failure : public error {
+    public: optimize_single_variable_failure(const std::string& s):error(s){}
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename funct>
+    double find_min_single_variable (
+        const funct& f,
+        double& starting_point,
+        const double begin = -1e200,
+        const double end = 1e200,
+        const double eps = 1e-3,
+        const long max_iter = 100,
+        const double initial_search_radius = 1
+    )
+    {
+        DLIB_CASSERT( eps > 0 &&
+                      max_iter > 1 &&
+                      begin <= starting_point && starting_point <= end && 
+                      initial_search_radius > 0,
+                      "eps: " << eps
+                      << "\n max_iter: "<< max_iter 
+                      << "\n begin: "<< begin 
+                      << "\n end:   "<< end 
+                      << "\n starting_point: "<< starting_point 
+                      << "\n initial_search_radius: "<< initial_search_radius 
+        );
+
+        double search_radius = initial_search_radius;
+
+        double p1=0, p2=0, p3=0, f1=0, f2=0, f3=0;
+        long f_evals = 1;
+
+        if (begin == end)
+        {
+            return f(starting_point);
+        }
+
+        using std::abs;
+        using std::min;
+        using std::max;
+
+        // find three bracketing points such that f1 > f2 < f3.   Do this by generating a sequence
+        // of points expanding away from 0.   Also note that, in the following code, it is always the
+        // case that p1 < p2 < p3.
+
+
+
+        // The first thing we do is get a starting set of 3 points that are inside the [begin,end] bounds
+        p1 = max(starting_point-search_radius, begin);
+        p3 = min(starting_point+search_radius, end);
+        f1 = f(p1);
+        f3 = f(p3);
+
+        if (starting_point == p1 || starting_point == p3)
+        {
+            p2 = (p1+p3)/2;
+            f2 = f(p2);
+        }
+        else
+        {
+            p2 = starting_point;
+            f2 = f(starting_point);
+        }
+
+        f_evals += 2;
+
+        // Now we have 3 points on the function.  Start looking for a bracketing set such that
+        // f1 > f2 < f3 is the case.
+        while ( !(f1 > f2 && f2 < f3))
+        {
+            // check for hitting max_iter or if the interval is now too small
+            if (f_evals >= max_iter)
+            {
+                throw optimize_single_variable_failure(
+                    "The max number of iterations of single variable optimization have been reached\n"
+                    "without converging.");
+            }
+            if (p3-p1 < eps)
+            {
+                if (f1 < min(f2,f3)) 
+                {
+                    starting_point = p1;
+                    return f1;
+                }
+
+                if (f2 < min(f1,f3)) 
+                {
+                    starting_point = p2;
+                    return f2;
+                }
+
+                starting_point = p3;
+                return f3;
+            }
+            
+            // If the left most points are identical in function value then expand out the
+            // left a bit, unless it's already at bound or we would drop that left most
+            // point anyway because it's bad.
+            if (f1==f2 && f1<f3 && p1!=begin)
+            {
+                p1 = max(p1 - search_radius, begin);
+                f1 = f(p1);
+                ++f_evals;
+                search_radius *= 2;
+                continue;
+            }
+            if (f2==f3 && f3<f1 && p3!=end)
+            {
+                p3 = min(p3 + search_radius, end);
+                f3 = f(p3);
+                ++f_evals;
+                search_radius *= 2;
+                continue;
+            }
+
+
+            // if f1 is small then take a step to the left
+            if (f1 <= f3)
+            { 
+                // check if the minimum is butting up against the bounds and if so then pick
+                // a point between p1 and p2 in the hopes that shrinking the interval will
+                // be a good thing to do.  Or if p1 and p2 aren't differentiated then try and
+                // get them to obtain different values.
+                if (p1 == begin || (f1 == f2 && (end-begin) < search_radius ))
+                {
+                    p3 = p2;
+                    f3 = f2;
+
+                    p2 = (p1+p2)/2.0;
+                    f2 = f(p2);
+                }
+                else
+                {
+                    // pick a new point to the left of our current bracket
+                    p3 = p2;
+                    f3 = f2;
+
+                    p2 = p1;
+                    f2 = f1;
+
+                    p1 = max(p1 - search_radius, begin);
+                    f1 = f(p1);
+
+                    search_radius *= 2;
+                }
+
+            }
+            // otherwise f3 is small and we should take a step to the right
+            else 
+            {
+                // check if the minimum is butting up against the bounds and if so then pick
+                // a point between p2 and p3 in the hopes that shrinking the interval will
+                // be a good thing to do.  Or if p2 and p3 aren't differentiated then try and
+                // get them to obtain different values.
+                if (p3 == end || (f2 == f3 && (end-begin) < search_radius))
+                {
+                    p1 = p2;
+                    f1 = f2;
+
+                    p2 = (p3+p2)/2.0;
+                    f2 = f(p2);
+                }
+                else
+                {
+                    // pick a new point to the right of our current bracket
+                    p1 = p2;
+                    f1 = f2;
+
+                    p2 = p3;
+                    f2 = f3;
+
+                    p3 = min(p3 + search_radius, end);
+                    f3 = f(p3);
+
+                    search_radius *= 2;
+                }
+            }
+
+            ++f_evals;
+        }
+
+
+        // Loop until we have done the max allowable number of iterations or
+        // the bracketing window is smaller than eps.
+        // Within this loop we maintain the invariant that: f1 > f2 < f3 and p1 < p2 < p3
+        const double tau = 0.1;
+        while( f_evals < max_iter && p3-p1 > eps)
+        {
+            double p_min = lagrange_poly_min_extrap(p1,p2,p3, f1,f2,f3);
+
+
+            // make sure p_min isn't too close to the three points we already have
+            if (p_min < p2)
+            {
+                const double min_dist = (p2-p1)*tau;
+                if (abs(p1-p_min) < min_dist) 
+                {
+                    p_min = p1 + min_dist;
+                }
+                else if (abs(p2-p_min) < min_dist)
+                {
+                    p_min = p2 - min_dist;
+                }
+            }
+            else
+            {
+                const double min_dist = (p3-p2)*tau;
+                if (abs(p2-p_min) < min_dist) 
+                {
+                    p_min = p2 + min_dist;
+                }
+                else if (abs(p3-p_min) < min_dist)
+                {
+                    p_min = p3 - min_dist;
+                }
+            }
+
+            // make sure one side of the bracket isn't super huge compared to the other
+            // side.  If it is then contract it.
+            const double bracket_ratio = abs(p1-p2)/abs(p2-p3);
+            if ( !( bracket_ratio < 10 && bracket_ratio > 0.1) )
+            {
+                // Force p_min to be on a reasonable side.  But only if lagrange_poly_min_extrap()
+                // didn't put it on a good side already.
+                if (bracket_ratio > 1 && p_min > p2)
+                    p_min = (p1+p2)/2;
+                else if (p_min < p2)
+                    p_min = (p2+p3)/2;
+            }
+
+
+            const double f_min = f(p_min);
+
+
+            // Remove one of the endpoints of our bracket depending on where the new point falls.
+            if (p_min < p2)
+            {
+                if (f1 > f_min && f_min < f2)
+                {
+                    p3 = p2;
+                    f3 = f2;
+                    p2 = p_min;
+                    f2 = f_min;
+                }
+                else
+                {
+                    p1 = p_min;
+                    f1 = f_min;
+                }
+            }
+            else
+            {
+                if (f2 > f_min && f_min < f3)
+                {
+                    p1 = p2;
+                    f1 = f2;
+                    p2 = p_min;
+                    f2 = f_min;
+                }
+                else
+                {
+                    p3 = p_min;
+                    f3 = f_min;
+                }
+            }
+
+
+            ++f_evals;
+        }
+
+        if (f_evals >= max_iter)
+        {
+            throw optimize_single_variable_failure(
+                "The max number of iterations of single variable optimization have been reached\n"
+                "without converging.");
+        }
+
+        starting_point = p2;
+        return f2;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename funct>
+    class negate_function_object 
+    {
+    public:
+        negate_function_object(const funct& f_) : f(f_){}
+
+        template <typename T>
+        double operator()(const T& x) const
+        {
+            return -f(x);
+        }
+
+    private:
+        const funct& f;
+    };
+
+    template <typename funct>
+    const negate_function_object<funct> negate_function(const funct& f) { return negate_function_object<funct>(f); }
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename funct>
+    double find_max_single_variable (
+        const funct& f,
+        double& starting_point,
+        const double begin = -1e200,
+        const double end = 1e200,
+        const double eps = 1e-3,
+        const long max_iter = 100,
+        const double initial_search_radius = 1
+    )
+    {
+        return -find_min_single_variable(negate_function(f), starting_point, begin, end, eps, max_iter, initial_search_radius);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_LINE_SEARCH_H_
+
diff --git a/ml/dlib/dlib/optimization/optimization_line_search_abstract.h b/ml/dlib/dlib/optimization/optimization_line_search_abstract.h
new file mode 100644
index 000000000..2aa221da4
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_line_search_abstract.h
@@ -0,0 +1,361 @@
+// Copyright (C) 2008  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATIOn_ABSTRACT_
+#ifdef DLIB_OPTIMIZATIOn_ABSTRACT_
+
+#include <cmath>
+#include <limits>
+#include "../matrix/matrix_abstract.h"
+#include "../algs.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename funct, 
+        typename T
+        >
+    class line_search_funct; 
+    /*!
+        This object is a function object that represents a line search function.
+
+        Moreover, it represents a function with the signature:
+            double l(double x)
+    !*/
+
+    template <
+        typename funct, 
+        typename T
+        >
+    const line_search_funct<funct,T> make_line_search_function (
+        const funct& f, 
+        const T& start, 
+        const T& direction
+    ); 
+    /*!
+        requires
+            - is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size() 
+              (i.e. start and direction should be column vectors of the same size)
+            - f must return either a double or a column vector the same length as start
+            - f(start + 1.5*direction) should be a valid expression
+        ensures
+            - if (f returns a double) then
+                - returns a line search function that computes l(x) == f(start + x*direction)
+            - else
+                - returns a line search function that computes l(x) == dot(f(start + x*direction),direction).
+                  That is, we assume f is the derivative of some other function and that what
+                  f returns is a gradient vector. 
+                  So the following two expressions both create the derivative of l(x): 
+                    - derivative(make_line_search_function(funct,start,direction))
+                    - make_line_search_function(derivative(funct),start,direction)
+    !*/
+
+    template <
+        typename funct, 
+        typename T
+        >
+    const line_search_funct<funct,T> make_line_search_function (
+        const funct& f, 
+        const T& start, 
+        const T& direction,
+        double& f_out
+    ); 
+    /*!
+        This function is identical to the above three argument version of make_line_search_function() 
+        except that, if f() outputs a double, every time f() is evaluated its output is also stored 
+        into f_out.
+    !*/
+
+    template <
+        typename funct, 
+        typename T
+        >
+    const line_search_funct<funct,T> make_line_search_function (
+        const funct& f, 
+        const T& start, 
+        const T& direction,
+        T& gradient_out
+    ); 
+    /*!
+        This function is identical to the above three argument version of make_line_search_function() 
+        except that, if f() outputs a column vector, every time f() is evaluated its output is also 
+        stored into gradient_out.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    inline double poly_min_extrap (
+        double f0,
+        double d0,
+        double f1,
+        double d1,
+        double limit = 1
+    );
+    /*!
+        ensures
+            - let c(x) be a 3rd degree polynomial such that:
+                - c(0) == f0
+                - c(1) == f1
+                - derivative of c(x) at x==0 is d0
+                - derivative of c(x) at x==1 is d1
+            - returns the point in the range [0,limit] that minimizes the polynomial c(x) 
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    inline double poly_min_extrap (
+        double f0,
+        double d0,
+        double f1
+    );
+    /*!
+        ensures
+            - let c(x) be a 2nd degree polynomial such that:
+                - c(0) == f0
+                - c(1) == f1
+                - derivative of c(x) at x==0 is d0
+            - returns the point in the range [0,1] that minimizes the polynomial c(x) 
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    inline double poly_min_extrap (
+        double f0,
+        double d0,
+        double x1,
+        double f_x1,
+        double x2,
+        double f_x2
+    )
+    /*!
+        requires
+            - 0 < x1 < x2
+        ensures
+            - let f(x) be a 3rd degree polynomial such that:
+                - f(0) == f0
+                - derivative of f(x) at x==0 is d0
+                - f(x1) == f_x1
+                - f(x2) == f_x2
+            - returns the point in the range [0,x2] that minimizes the polynomial f(x) 
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    inline double lagrange_poly_min_extrap (
+        double p1, 
+        double p2,
+        double p3,
+        double f1,
+        double f2,
+        double f3
+    );
+    /*!
+        requires
+            - f1 >= f2 <= f3
+            - p1 < p2 < p3
+        ensures
+            - let c(x) be the second order Lagrange polynomial that interpolates the
+              points p1, p2, and p3 where c(p1)==f1, c(p2)==f2, and c(p3)==f3
+            - this function returns the point in the range [p1,p3] that minimizes 
+              the polynomial c(x) 
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename funct, 
+        typename funct_der
+        >
+    double line_search (
+        const funct& f, 
+        const double f0,
+        const funct_der& der, 
+        const double d0,
+        double rho, 
+        double sigma, 
+        double min_f,
+        unsigned long max_iter
+    )
+    /*!
+        requires
+            - 0 < rho < sigma < 1
+            - f and der are scalar functions of scalars
+              (e.g. line_search_funct objects)
+            - der is the derivative of f
+            - f0 == f(0)
+            - d0 == der(0)
+            - max_iter > 0
+        ensures
+            - Performs a line search and uses the strong Wolfe conditions to decide when
+              the search can stop.  
+                - rho == the parameter of the Wolfe sufficient decrease condition
+                - sigma == the parameter of the Wolfe curvature condition
+                - max_iter == the maximum number of iterations allowable.  After this
+                  many evaluations of f() line_search() is guaranteed to terminate.
+            - returns a value alpha such that f(alpha) is significantly closer to 
+              the minimum of f than f(0).
+            - It is assumed that the minimum possible value of f(x) is min_f.  So if
+              an alpha is found such that f(alpha) <= min_f then the search stops
+              immediately.
+            - This function is also optimized for the case where der(0) is negative.  I.e.
+              positive values of the argument to f() should be in a descent direction.  
+            - When this function makes calls to f() and der() it always does so by
+              first calling f() and then calling der().  That is, these two functions
+              are always called in pairs with f() being called first and then der()
+              being called second.
+    !*/
+
+    /*
+        A good discussion of the Wolfe conditions and line search algorithms in 
+        general can be found in the book Practical Methods of Optimization by R. Fletcher
+        and also in the more recent book Numerical Optimization by Nocedal and Wright.
+    */
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename funct
+        >
+    double backtracking_line_search (
+        const funct& f, 
+        double f0,
+        double d0,
+        double alpha,
+        double rho, 
+        unsigned long max_iter 
+    );
+    /*!
+        requires
+            - 0 < rho < 1
+            - f is a scalar function of scalars 
+              (e.g. a line_search_funct object)
+            - f0 == f(0)
+            - d0 == the derivative of f() at f(0). 
+            - max_iter > 0
+        ensures
+            - Performs a backtracking line search and uses the Armijo sufficient decrease
+              rule to decide when the search can stop.
+                - rho == the parameter of the sufficient decrease condition. 
+                - max_iter == the maximum number of iterations allowable.  After this many
+                  evaluations of f() backtracking_line_search() is guaranteed to terminate.
+            - The line search starts with the input alpha value and then backtracks until
+              it finds a good enough alpha value.  Once found, it returns the alpha value
+              such that f(alpha) is significantly closer to the minimum of f than f(0).
+            - The returned value of alpha will always be the last value of alpha which was
+              passed to f().  That is, it will always be the case that the last call to f()
+              made by backtracking_line_search() was f(alpha) where alpha is the return
+              value from this function.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename funct
+        >
+    const negate_function_object<funct> negate_function(
+        const funct& f
+    );
+    /*!
+        requires
+            - f == a function that returns a scalar
+        ensures
+            - returns a function that represents the negation of f.  That is,
+              the returned function object represents g(x) == -f(x)
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    class optimize_single_variable_failure : public error;
+    /*!
+        This is the exception class used by the functions defined below.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename funct
+        >
+    double find_min_single_variable (
+        const funct& f,
+        double& starting_point,
+        const double begin = -1e200,
+        const double end = 1e200,
+        const double eps = 1e-3,
+        const long max_iter = 100,
+        const double initial_search_radius = 1
+    )
+    /*!
+        requires
+            - eps > 0
+            - max_iter > 1
+            - begin <= starting_point <= end
+            - f must be a function of a double that returns a double
+              (e.g. f(starting_point) should be a valid expression that evaluates to a double)
+            - initial_search_radius > 0
+        ensures
+            - Finds a point P such that:
+                - P is a local minimum of the function f().   
+                - begin <= P <= end
+            - Evaluates f() no more than max_iter times
+            - Stops searching when the window around the minimum point is smaller than eps.
+              The search will begin with the given starting_point and expand out to the
+              left and right by initial_search_radius sized steps.  So if you think the
+              minimum is likely to be found within X distance from the starting_point then
+              set initial_search_radius to X.
+            - #starting_point == P
+            - returns f(P)
+        throws
+            - optimize_single_variable_failure 
+                This exception is thrown if max_iter iterations are performed without 
+                determining the min point to the requested accuracy of eps.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename funct
+        >
+    double find_max_single_variable (
+        const funct& f,
+        double& starting_point,
+        const double begin = -1e200,
+        const double end = 1e200,
+        const double eps = 1e-3,
+        const long max_iter = 100,
+        const double initial_search_radius = 1
+    )
+    /*!
+        requires
+            - eps > 0
+            - max_iter > 1
+            - begin <= starting_point <= end
+            - f must be a function of a double that returns a double
+              (e.g. f(starting_point) should be a valid expression that evaluates to a double)
+            - initial_search_radius > 0
+        ensures
+            - Finds a point P such that:
+                - P is a local maximum of the function f().   
+                - begin <= P <= end
+            - Evaluates f() no more than max_iter times
+            - Stops searching when the window around the maximum point is smaller than eps.
+              The search will begin with the given starting_point and expand out to the
+              left and right by initial_search_radius sized steps.  So if you think the
+              maximum is likely to be found within X distance from the starting_point then
+              set initial_search_radius to X.
+            - #starting_point == P
+            - returns f(P)
+        throws
+            - optimize_single_variable_failure 
+                This exception is thrown if max_iter iterations are performed without 
+                determining the max point to the requested accuracy of eps.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_ABSTRACT_
+
diff --git a/ml/dlib/dlib/optimization/optimization_oca.h b/ml/dlib/dlib/optimization/optimization_oca.h
new file mode 100644
index 000000000..4ca9cd7ae
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_oca.h
@@ -0,0 +1,407 @@
+// Copyright (C) 2010  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATIoN_OCA_Hh_
+#define DLIB_OPTIMIZATIoN_OCA_Hh_
+
+#include "optimization_oca_abstract.h"
+
+#include "../matrix.h"
+#include "optimization_solve_qp_using_smo.h"
+#include <vector>
+#include "../sequence.h"
+
+// ----------------------------------------------------------------------------------------
+
+namespace dlib
+{
+    template <typename matrix_type>
+    class oca_problem
+    {
+    public:
+        typedef typename matrix_type::type scalar_type;
+
+        virtual ~oca_problem() {}
+
+        virtual bool risk_has_lower_bound (
+            scalar_type& 
+        ) const { return false; }
+
+        virtual bool optimization_status (
+            scalar_type ,
+            scalar_type ,
+            scalar_type ,
+            scalar_type ,
+            unsigned long,
+            unsigned long
+        ) const = 0;
+
+        virtual scalar_type get_c (
+        ) const = 0;
+
+        virtual long get_num_dimensions (
+        ) const = 0;
+
+        virtual void get_risk (
+            matrix_type& current_solution,
+            scalar_type& risk_value,
+            matrix_type& risk_subgradient
+        ) const = 0;
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class oca
+    {
+    public:
+
+        oca () 
+        {
+            sub_eps = 1e-2;
+            sub_max_iter = 50000;
+
+            inactive_thresh = 20;
+        }
+
+        void set_subproblem_epsilon (
+            double eps_
+        ) { sub_eps = eps_; }
+
+        double get_subproblem_epsilon (
+        ) const { return sub_eps; }
+
+        void set_subproblem_max_iterations (
+            unsigned long sub_max_iter_
+        ) 
+        { 
+            // make sure requires clause is not broken
+            DLIB_ASSERT(sub_max_iter_ > 0,
+                "\t void oca::set_subproblem_max_iterations"
+                << "\n\t max iterations must be greater than 0"
+                << "\n\t sub_max_iter_: " << sub_max_iter_
+                << "\n\t this: " << this
+                );
+
+            sub_max_iter = sub_max_iter_; 
+        }
+
+        unsigned long get_subproblem_max_iterations (
+        ) const { return sub_max_iter; }
+
+        void set_inactive_plane_threshold (
+            unsigned long inactive_thresh_
+        ) 
+        { 
+            // make sure requires clause is not broken
+            DLIB_ASSERT(inactive_thresh_ > 0,
+                "\t void oca::set_inactive_plane_threshold"
+                << "\n\t inactive threshold must be greater than 0"
+                << "\n\t inactive_thresh_: " << inactive_thresh_
+                << "\n\t this: " << this
+                );
+
+            inactive_thresh = inactive_thresh_; 
+        }
+
+        unsigned long get_inactive_plane_threshold (
+        ) const { return inactive_thresh; }
+
+        template <
+            typename matrix_type
+            >
+        typename matrix_type::type operator() (
+            const oca_problem<matrix_type>& problem,
+            matrix_type& w,
+            unsigned long num_nonnegative = 0,
+            unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max()
+        ) const
+        {
+            matrix_type empty_prior;
+            return oca_impl(problem, w, empty_prior, false, num_nonnegative, force_weight_to_1, 0);
+        }
+
+        template <
+            typename matrix_type
+            >
+        typename matrix_type::type solve_with_elastic_net (
+            const oca_problem<matrix_type>& problem,
+            matrix_type& w,
+            double lasso_lambda,
+            unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max()
+        ) const
+        {
+            matrix_type empty_prior;
+            return oca_impl(problem, w, empty_prior, false, 0, force_weight_to_1, lasso_lambda);
+        }
+
+        template <
+            typename matrix_type
+            >
+        typename matrix_type::type operator() (
+            const oca_problem<matrix_type>& problem,
+            matrix_type& w,
+            const matrix_type& prior
+        ) const
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(is_col_vector(prior) && prior.size() == problem.get_num_dimensions(),
+                "\t scalar_type oca::operator()"
+                << "\n\t The prior vector does not have the correct dimensions."
+                << "\n\t is_col_vector(prior):         " << is_col_vector(prior) 
+                << "\n\t prior.size():                 " << prior.size() 
+                << "\n\t problem.get_num_dimensions(): " << problem.get_num_dimensions() 
+                << "\n\t this:                         " << this
+                );
+            // disable the force weight to 1 option for this mode.  We also disable the
+            // non-negative constraints.
+            unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max();
+            return oca_impl(problem, w, prior, true, 0, force_weight_to_1, 0);
+        }
+
+    private:
+
+        template <
+            typename matrix_type
+            >
+        typename matrix_type::type oca_impl (
+            const oca_problem<matrix_type>& problem,
+            matrix_type& w,
+            const matrix_type& prior,
+            bool have_prior,
+            unsigned long num_nonnegative,
+            unsigned long force_weight_to_1,
+            const double lasso_lambda
+        ) const
+        {
+            const unsigned long num_dims = problem.get_num_dimensions();
+
+            // make sure requires clause is not broken
+            DLIB_ASSERT(problem.get_c() > 0 &&
+                        problem.get_num_dimensions() > 0 && 
+                        0 <= lasso_lambda && lasso_lambda < 1,
+                "\t scalar_type oca::operator()"
+                << "\n\t The oca_problem is invalid"
+                << "\n\t problem.get_c():              " << problem.get_c() 
+                << "\n\t problem.get_num_dimensions(): " << num_dims 
+                << "\n\t lasso_lambda:                 " << lasso_lambda 
+                << "\n\t this: " << this
+                );
+            if (have_prior)
+            {
+                DLIB_ASSERT(lasso_lambda == 0, "Solver doesn't support using a prior with lasso.");
+                DLIB_ASSERT(num_nonnegative == 0, "Solver doesn't support using a prior with non-negative constraints.");
+            }
+            else if (lasso_lambda != 0)
+            {
+                DLIB_ASSERT(num_nonnegative == 0, "Solver doesn't support using lasso with non-negative constraints.");
+            }
+
+            const double ridge_lambda = 1-lasso_lambda;
+
+            if (num_nonnegative > num_dims)
+                num_nonnegative = num_dims;
+
+            typedef typename matrix_type::type scalar_type;
+            typedef typename matrix_type::layout_type layout_type;
+            typedef typename matrix_type::mem_manager_type mem_manager_type;
+            typedef matrix_type vect_type;
+
+            const scalar_type C = problem.get_c();
+
+            typename sequence<vect_type>::kernel_2a planes;
+            std::vector<scalar_type> bs, miss_count;
+
+            vect_type new_plane, alpha, btemp;
+
+            w.set_size(num_dims, 1);
+            w = 0;
+
+            // The current objective value.  Note also that w always contains 
+            // the current solution.
+            scalar_type cur_obj = std::numeric_limits<scalar_type>::max();
+
+            // This will hold the cutting plane objective value.  This value is
+            // a lower bound on the true optimal objective value.
+            scalar_type cp_obj = 0;
+
+            matrix<scalar_type,0,0,mem_manager_type, layout_type> K, Ktmp;
+            matrix<scalar_type,0,1,mem_manager_type, layout_type> lambda, d;
+            if (lasso_lambda != 0)
+                d.set_size(num_dims);
+            else
+                d.set_size(num_nonnegative);
+            d = lasso_lambda*ones_matrix(d);
+
+            scalar_type R_lower_bound;
+            if (problem.risk_has_lower_bound(R_lower_bound))
+            {
+                // The flat lower bounding plane is always good to have if we know
+                // what it is.
+                bs.push_back(R_lower_bound);
+                new_plane = zeros_matrix(w);
+                planes.add(0, new_plane);
+                alpha = uniform_matrix<scalar_type>(1,1, C);
+                miss_count.push_back(0);
+
+                K.set_size(1,1);
+                K(0,0) = 0;
+            }
+
+            const double prior_norm = have_prior ?  0.5*dot(prior,prior) : 0;
+
+            unsigned long counter = 0;
+            while (true)
+            {
+
+                // add the next cutting plane
+                scalar_type cur_risk;
+                if (force_weight_to_1 < (unsigned long)w.size())
+                    w(force_weight_to_1) = 1;
+
+                problem.get_risk(w, cur_risk, new_plane);
+
+                if (force_weight_to_1 < (unsigned long)w.size())
+                {
+                    // We basically arrange for the w(force_weight_to_1) element and all
+                    // subsequent elements of w to not be involved in the optimization at
+                    // all.  An easy way to do this is to just make sure the elements of w
+                    // corresponding elements in the subgradient are always set to zero
+                    // while we run the cutting plane algorithm.  The only time
+                    // w(force_weight_to_1) is 1 is when we pass it to the oca_problem.
+                    set_rowm(w, range(force_weight_to_1, w.size()-1)) = 0;
+                    set_rowm(new_plane, range(force_weight_to_1, new_plane.size()-1)) = 0;
+                }
+
+                if (have_prior)
+                    bs.push_back(cur_risk - dot(w,new_plane) + dot(prior,new_plane));
+                else
+                    bs.push_back(cur_risk - dot(w,new_plane));
+                planes.add(planes.size(), new_plane);
+                miss_count.push_back(0);
+
+                // If alpha is empty then initialize it (we must always have sum(alpha) == C).  
+                // But otherwise, just append a zero.
+                if (alpha.size() == 0)
+                    alpha = uniform_matrix<scalar_type>(1,1, C);
+                else
+                    alpha = join_cols(alpha,zeros_matrix<scalar_type>(1,1));
+
+                const scalar_type wnorm = 0.5*ridge_lambda*trans(w)*w + lasso_lambda*sum(abs(w));
+                const double prior_part = have_prior? dot(w,prior) : 0;
+                cur_obj = wnorm + C*cur_risk + prior_norm-prior_part;
+
+                // report current status
+                const scalar_type risk_gap = cur_risk - (cp_obj-wnorm+prior_part-prior_norm)/C;
+                if (counter > 0 && problem.optimization_status(cur_obj, cur_obj - cp_obj, 
+                                                               cur_risk, risk_gap, planes.size(), counter))
+                {
+                    break;
+                }
+
+                // compute kernel matrix for all the planes
+                K.swap(Ktmp);
+                K.set_size(planes.size(), planes.size());
+                // copy over the old K matrix
+                set_subm(K, 0,0, Ktmp.nr(), Ktmp.nc()) = Ktmp;
+
+                // now add the new row and column to K
+                for (unsigned long c = 0; c < planes.size(); ++c)
+                {
+                    K(c, Ktmp.nc()) = dot(planes[c], planes[planes.size()-1]);
+                    K(Ktmp.nc(), c) = K(c,Ktmp.nc());
+                }
+
+
+                // solve the cutting plane subproblem for the next w.   We solve it to an
+                // accuracy that is related to how big the error gap is.  Also, we multiply
+                // by ridge_lambda because the objective function for the QP we solve was
+                // implicitly scaled by ridge_lambda.  That is, we want to ask the QP
+                // solver to solve the problem until the duality gap is 0.1 times smaller
+                // than what it is now.  So the factor of ridge_lambda is necessary to make
+                // this happen. 
+                scalar_type eps = std::min<scalar_type>(sub_eps, 0.1*ridge_lambda*(cur_obj-cp_obj));
+                // just a sanity check
+                if (eps < 1e-16)
+                    eps = 1e-16;
+                // Note that we warm start this optimization by using the alpha from the last
+                // iteration as the starting point.
+                if (lasso_lambda != 0)
+                {
+                    // copy planes into a matrix so we can call solve_qp4_using_smo()
+                    matrix<scalar_type,0,0,mem_manager_type, layout_type> planes_mat(num_dims,planes.size());
+                    for (unsigned long i = 0; i < planes.size(); ++i)
+                        set_colm(planes_mat,i) = planes[i];
+
+                    btemp = ridge_lambda*mat(bs) - trans(planes_mat)*d;
+                    solve_qp4_using_smo(planes_mat, K, btemp, d, alpha, lambda, eps, sub_max_iter, (scalar_type)(2*lasso_lambda)); 
+                }
+                else if (num_nonnegative != 0)
+                {
+                    // copy planes into a matrix so we can call solve_qp4_using_smo()
+                    matrix<scalar_type,0,0,mem_manager_type, layout_type> planes_mat(num_nonnegative,planes.size());
+                    for (unsigned long i = 0; i < planes.size(); ++i)
+                        set_colm(planes_mat,i) = colm(planes[i],0,num_nonnegative);
+
+                    solve_qp4_using_smo(planes_mat, K, mat(bs), d, alpha, lambda, eps, sub_max_iter); 
+                }
+                else
+                {
+                    solve_qp_using_smo(K, mat(bs), alpha, eps, sub_max_iter); 
+                }
+
+                // construct the w that minimized the subproblem.
+                w = -alpha(0)*planes[0];
+                for (unsigned long i = 1; i < planes.size(); ++i)
+                    w -= alpha(i)*planes[i];
+                if (lasso_lambda != 0)
+                    w = (lambda-d+w)/ridge_lambda;
+                else if (num_nonnegative != 0) // threshold the first num_nonnegative w elements if necessary.
+                    set_rowm(w,range(0,num_nonnegative-1)) = lowerbound(rowm(w,range(0,num_nonnegative-1)),0);
+
+                for (long i = 0; i < alpha.size(); ++i)
+                {
+                    if (alpha(i) != 0)
+                        miss_count[i] = 0;
+                    else
+                        miss_count[i] += 1;
+                }
+
+                // Compute the lower bound on the true objective given to us by the cutting 
+                // plane subproblem.
+                cp_obj = -0.5*ridge_lambda*trans(w)*w + trans(alpha)*mat(bs);
+                if (have_prior)
+                    w += prior;
+
+                // If it has been a while since a cutting plane was an active constraint then
+                // we should throw it away.
+                while (max(mat(miss_count)) >= inactive_thresh)
+                {
+                    const long idx = index_of_max(mat(miss_count));
+                    bs.erase(bs.begin()+idx);
+                    miss_count.erase(miss_count.begin()+idx);
+                    K = removerc(K, idx, idx);
+                    alpha = remove_row(alpha,idx);
+                    planes.remove(idx, new_plane);
+                }
+
+                ++counter;
+            }
+
+            if (force_weight_to_1 < (unsigned long)w.size())
+                w(force_weight_to_1) = 1;
+
+            return cur_obj;
+        }
+
+        double sub_eps;
+
+        unsigned long sub_max_iter;
+
+        unsigned long inactive_thresh;
+    };
+}
+
+// ----------------------------------------------------------------------------------------
+
+#endif // DLIB_OPTIMIZATIoN_OCA_Hh_
+
diff --git a/ml/dlib/dlib/optimization/optimization_oca_abstract.h b/ml/dlib/dlib/optimization/optimization_oca_abstract.h
new file mode 100644
index 000000000..859dbdcfc
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_oca_abstract.h
@@ -0,0 +1,334 @@
+// Copyright (C) 2010  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATION_OCA_ABsTRACT_Hh_
+#ifdef DLIB_OPTIMIZATION_OCA_ABsTRACT_Hh_
+
+// ----------------------------------------------------------------------------------------
+
+namespace dlib
+{
+    template <typename matrix_type>
+    class oca_problem
+    {
+        /*!
+            REQUIREMENTS ON matrix_type
+                - matrix_type == a dlib::matrix capable of storing column vectors
+
+            WHAT THIS OBJECT REPRESENTS
+                This object is the interface used to define the optimization 
+                problems solved by the oca optimizer defined later in this file.
+
+                OCA solves optimization problems with the following form:
+                    Minimize: f(w) == 0.5*length_squared(w) + C*R(w)
+
+                    Where R(w) is a user-supplied convex function and C > 0.  Optionally,
+                    there can also be non-negativity constraints on some or all of the 
+                    elements of w.
+
+                Or it can alternatively solve:
+                    Minimize: f(w) == 0.5*length_squared(w-prior) + C*R(w)
+
+                    Where prior is a user supplied vector and R(w) has the same
+                    interpretation as above.
+                       
+                Or it can use the elastic net regularizer:
+                    Minimize: f(w) == 0.5*(1-lasso_lambda)*length_squared(w) + lasso_lambda*sum(abs(w)) + C*R(w)
+
+                    Where lasso_lambda is a number in the range [0, 1) and controls
+                    trade-off between doing L1 and L2 regularization.  R(w) has the same
+                    interpretation as above.
+                       
+
+                Note that the stopping condition must be provided by the user
+                in the form of the optimization_status() function.
+        !*/
+
+    public:
+
+        typedef typename matrix_type::type scalar_type;
+
+        virtual ~oca_problem() {}
+
+        virtual bool risk_has_lower_bound (
+            scalar_type& lower_bound
+        ) const { return false; }
+        /*!
+            ensures
+                - if (R(w) >= a constant for all values of w) then
+                    - returns true
+                    - #lower_bound == the constant that lower bounds R(w)
+                - else
+                    - returns false
+        !*/
+
+        virtual bool optimization_status (
+            scalar_type current_objective_value,
+            scalar_type current_error_gap,
+            scalar_type current_risk_value,
+            scalar_type current_risk_gap,
+            unsigned long num_cutting_planes,
+            unsigned long num_iterations
+        ) const = 0;
+        /*!
+            requires
+                - This function is called by the OCA optimizer each iteration.  
+                - current_objective_value == the current value of the objective function f(w)
+                - current_error_gap == The bound on how much lower the objective function
+                  can drop before we reach the optimal point.  At the optimal solution the
+                  error gap is equal to 0.
+                - current_risk_value == the current value of the R(w) term of the objective function.
+                - current_risk_gap == the bound on how much lower the risk term can go.  At the optimal
+                  solution the risk gap is zero.
+                - num_cutting_planes == the number of cutting planes the algorithm is currently using.
+                - num_iterations == A count of the total number of iterations that have executed
+                  since we started running the optimization.
+            ensures
+                - If it is appropriate to terminate the optimization then this function returns true
+                  and false otherwise.
+        !*/
+
+        virtual scalar_type get_c (
+        ) const = 0;
+        /*!
+            ensures
+                - returns the C parameter
+        !*/
+
+        virtual long get_num_dimensions (
+        ) const = 0;
+        /*!
+            ensures
+                - returns the number of free variables in this optimization problem
+        !*/
+
+        virtual void get_risk (
+            matrix_type& current_solution,
+            scalar_type& risk_value,
+            matrix_type& risk_subgradient
+        ) const = 0;
+        /*!
+            requires
+                - is_col_vector(current_solution) == true
+                - current_solution.size() == get_num_dimensions()
+            ensures
+                - #current_solution will be set to one of the following:
+                    - current_solution (i.e. it won't be modified at all)
+                    - The result of a line search passing through current_solution.  
+                - #risk_value == R(#current_solution) 
+                - #risk_subgradient == an element of the subgradient of R() at the 
+                  point #current_solution
+                - Note that #risk_value and #risk_subgradient are NOT multiplied by get_c()
+        !*/
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class oca
+    {
+        /*!
+            INITIAL VALUE
+                - get_subproblem_epsilon() == 1e-2
+                - get_subproblem_max_iterations() == 50000
+                - get_inactive_plane_threshold() == 20
+
+            WHAT THIS OBJECT REPRESENTS
+                This object is a tool for solving the optimization problem defined above
+                by the oca_problem abstract class.  
+
+                For reference, OCA solves optimization problems with the following form:
+                    Minimize: f(w) == 0.5*length_squared(w) + C*R(w)
+
+                    Where R(w) is a user-supplied convex function and C > 0.  Optionally,
+                    this object can also add non-negativity constraints to some or all
+                    of the elements of w.
+
+                Or it can alternatively solve:
+                    Minimize: f(w) == 0.5*length_squared(w-prior) + C*R(w)
+
+                    Where prior is a user supplied vector and R(w) has the same
+                    interpretation as above.
+                       
+                Or it can use the elastic net regularizer:
+                    Minimize: f(w) == 0.5*(1-lasso_lambda)*length_squared(w) + lasso_lambda*sum(abs(w)) + C*R(w)
+
+                    Where lasso_lambda is a number in the range [0, 1) and controls
+                    trade-off between doing L1 and L2 regularization.  R(w) has the same
+                    interpretation as above.
+                       
+
+                For a detailed discussion you should consult the following papers
+                from the Journal of Machine Learning Research:
+                    Optimized Cutting Plane Algorithm for Large-Scale Risk Minimization
+                        Vojtech Franc, Soren Sonnenburg; 10(Oct):2157--2192, 2009. 
+
+                    Bundle Methods for Regularized Risk Minimization
+                        Choon Hui Teo, S.V.N. Vishwanthan, Alex J. Smola, Quoc V. Le; 11(Jan):311-365, 2010. 
+        !*/
+    public:
+
+        oca (
+        ); 
+        /*!
+            ensures
+                - this object is properly initialized
+        !*/
+
+        template <
+            typename matrix_type
+            >
+        typename matrix_type::type operator() (
+            const oca_problem<matrix_type>& problem,
+            matrix_type& w,
+            unsigned long num_nonnegative = 0,
+            unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max()
+        ) const;
+        /*!
+            requires
+                - problem.get_c() > 0
+                - problem.get_num_dimensions() > 0
+            ensures
+                - solves the given oca problem and stores the solution in #w.  In particular,
+                  this function solves:
+                    Minimize: f(w) == 0.5*length_squared(w) + C*R(w)
+                - The optimization algorithm runs until problem.optimization_status() 
+                  indicates it is time to stop.
+                - returns the objective value at the solution #w
+                - if (num_nonnegative != 0) then
+                    - Adds the constraint that #w(i) >= 0 for all i < num_nonnegative.
+                      That is, the first num_nonnegative elements of #w will always be
+                      non-negative.  This includes the copies of w passed to get_risk()
+                      in the form of the current_solution vector as well as the final
+                      output of this function.
+                - if (force_weight_to_1 < problem.get_num_dimensions()) then
+                    - The optimizer enforces the following constraints:
+                        - #w(force_weight_to_1) == 1
+                        - for all i > force_weight_to_1:
+                            - #w(i) == 0 
+                        - That is, the element in the weight vector at the index indicated
+                          by force_weight_to_1 will have a value of 1 upon completion of
+                          this function, while all subsequent elements of w will have
+                          values of 0.
+        !*/
+
+        template <
+            typename matrix_type
+            >
+        typename matrix_type::type operator() (
+            const oca_problem<matrix_type>& problem,
+            matrix_type& w,
+            const matrix_type& prior
+        ) const;
+        /*!
+            requires
+                - problem.get_c() > 0
+                - problem.get_num_dimensions() > 0
+                - is_col_vector(prior) == true
+                - prior.size() == problem.get_num_dimensions()
+            ensures
+                - solves the given oca problem and stores the solution in #w.
+                - In this mode, we solve a version of the problem with a different
+                  regularizer.  In particular, this function solves:
+                    Minimize: f(w) == 0.5*length_squared(w-prior) + C*R(w)
+                - The optimization algorithm runs until problem.optimization_status() 
+                  indicates it is time to stop.
+                - returns the objective value at the solution #w
+        !*/
+
+        template <
+            typename matrix_type
+            >
+        typename matrix_type::type solve_with_elastic_net (
+            const oca_problem<matrix_type>& problem,
+            matrix_type& w,
+            scalar_type lasso_lambda,
+            unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max()
+        ) const;
+        /*!
+            requires
+                - problem.get_c() > 0
+                - problem.get_num_dimensions() > 0
+                - 0 <= lasso_lambda < 1
+            ensures
+                - Solves the given oca problem and stores the solution in #w, but uses an
+                  elastic net regularizer instead of the normal L2 regularizer.  In
+                  particular, this function solves:
+                    Minimize: f(w) == 0.5*(1-lasso_lambda)*length_squared(w) + lasso_lambda*sum(abs(w)) + C*R(w)
+                - The optimization algorithm runs until problem.optimization_status() 
+                  indicates it is time to stop.
+                - returns the objective value at the solution #w
+                - if (force_weight_to_1 < problem.get_num_dimensions()) then
+                    - The optimizer enforces the following constraints:
+                        - #w(force_weight_to_1) == 1
+                        - for all i > force_weight_to_1:
+                            - #w(i) == 0 
+                        - That is, the element in the weight vector at the index indicated
+                          by force_weight_to_1 will have a value of 1 upon completion of
+                          this function, while all subsequent elements of w will have
+                          values of 0.
+        !*/
+
+        void set_subproblem_epsilon (
+            double eps
+        ); 
+        /*!
+            requires
+                - eps > 0
+            ensures
+                - #get_subproblem_epsilon() == eps 
+        !*/
+
+        double get_subproblem_epsilon (
+        ) const; 
+        /*!
+            ensures
+                - returns the accuracy used in solving the quadratic programming
+                  subproblem that is part of the overall OCA algorithm.
+        !*/
+
+        void set_subproblem_max_iterations (
+            unsigned long sub_max_iter
+        ); 
+        /*!
+            requires
+                - sub_max_iter > 0
+            ensures
+                - #get_subproblem_max_iterations() == sub_max_iter
+        !*/
+
+        unsigned long get_subproblem_max_iterations (
+        ) const; 
+        /*!
+            ensures
+                - returns the maximum number of iterations this object will perform
+                  while attempting to solve each quadratic programming subproblem.
+        !*/
+
+        void set_inactive_plane_threshold (
+            unsigned long inactive_thresh
+        ); 
+        /*!
+            requires
+                - inactive_thresh > 0
+            ensures
+                - #get_inactive_plane_threshold() == inactive_thresh
+        !*/
+
+        unsigned long get_inactive_plane_threshold (
+        ) const; 
+        /*!
+            ensures
+                - As OCA runs it builds up a set of cutting planes.  Typically
+                  cutting planes become inactive after a certain point and can then
+                  be removed.  This function returns the number of iterations of
+                  inactivity required before a cutting plane is removed.
+        !*/
+
+    };
+}
+
+// ----------------------------------------------------------------------------------------
+
+#endif // DLIB_OPTIMIZATION_OCA_ABsTRACT_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_search_strategies.h b/ml/dlib/dlib/optimization/optimization_search_strategies.h
new file mode 100644
index 000000000..eda637fa1
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_search_strategies.h
@@ -0,0 +1,324 @@
+// Copyright (C) 2008  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_
+#define DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_
+
+#include <cmath>
+#include <limits>
+#include "../matrix.h"
+#include "../algs.h"
+#include "optimization_search_strategies_abstract.h"
+#include "../sequence.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class cg_search_strategy
+    {
+    public:
+        cg_search_strategy() : been_used(false) {}
+
+        double get_wolfe_rho (
+        ) const { return 0.001; }
+
+        double get_wolfe_sigma (
+        ) const { return 0.01; }
+
+        unsigned long get_max_line_search_iterations (
+        ) const { return 100; }
+
+        template <typename T>
+        const matrix<double,0,1>& get_next_direction (
+            const T& ,
+            const double ,
+            const T& funct_derivative
+        )
+        {
+            if (been_used == false)
+            {
+                been_used = true;
+                prev_direction = -funct_derivative;
+            }
+            else
+            {
+                // Use the Polak-Ribiere (4.1.12) conjugate gradient described by Fletcher on page 83
+                const double temp = trans(prev_derivative)*prev_derivative;
+                // If this value hits zero then just use the direction of steepest descent.
+                if (std::abs(temp) < std::numeric_limits<double>::epsilon())
+                {
+                    prev_derivative = funct_derivative;
+                    prev_direction = -funct_derivative;
+                    return prev_direction;
+                }
+
+                double b = trans(funct_derivative-prev_derivative)*funct_derivative/(temp);
+                prev_direction = -funct_derivative + b*prev_direction;
+
+            }
+
+            prev_derivative = funct_derivative;
+            return prev_direction;
+        }
+
+    private:
+        bool been_used;
+        matrix<double,0,1> prev_derivative;
+        matrix<double,0,1> prev_direction;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class bfgs_search_strategy
+    {
+    public:
+        bfgs_search_strategy() : been_used(false), been_used_twice(false) {}
+
+        double get_wolfe_rho (
+        ) const { return 0.01; }
+
+        double get_wolfe_sigma (
+        ) const { return 0.9; }
+
+        unsigned long get_max_line_search_iterations (
+        ) const { return 100; }
+
+        template <typename T>
+        const matrix<double,0,1>& get_next_direction (
+            const T& x,
+            const double ,
+            const T& funct_derivative
+        )
+        {
+            if (been_used == false)
+            {
+                been_used = true;
+                H = identity_matrix<double>(x.size());
+            }
+            else
+            {
+                // update H with the BFGS formula from (3.2.12) on page 55 of Fletcher 
+                delta = (x-prev_x); 
+                gamma = funct_derivative-prev_derivative;
+
+                double dg = dot(delta,gamma);
+
+                // Try to set the initial value of the H matrix to something reasonable if we are still
+                // in the early stages of figuring out what it is.  This formula below is what is suggested
+                // in the book Numerical Optimization by Nocedal and Wright in the chapter on Quasi-Newton methods.
+                if (been_used_twice == false)
+                {
+                    double gg = trans(gamma)*gamma;
+                    if (std::abs(gg) > std::numeric_limits<double>::epsilon())
+                    {
+                        const double temp = put_in_range(0.01, 100, dg/gg);
+                        H = diagm(uniform_matrix<double>(x.size(),1, temp));
+                        been_used_twice = true;
+                    }
+                }
+
+                Hg = H*gamma;
+                gH = trans(trans(gamma)*H);
+                double gHg = trans(gamma)*H*gamma;
+                if (gHg < std::numeric_limits<double>::infinity() && dg < std::numeric_limits<double>::infinity() &&
+                    dg != 0)
+                {
+                    H += (1 + gHg/dg)*delta*trans(delta)/(dg) - (delta*trans(gH) + Hg*trans(delta))/(dg);
+                }
+                else
+                {
+                    H = identity_matrix<double>(H.nr());
+                    been_used_twice = false;
+                }
+            }
+
+            prev_x = x;
+            prev_direction = -H*funct_derivative;
+            prev_derivative = funct_derivative;
+            return prev_direction;
+        }
+
+    private:
+        bool been_used;
+        bool been_used_twice;
+        matrix<double,0,1> prev_x;
+        matrix<double,0,1> prev_derivative;
+        matrix<double,0,1> prev_direction;
+        matrix<double> H;
+        matrix<double,0,1> delta, gamma, Hg, gH;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class lbfgs_search_strategy
+    {
+    public:
+        explicit lbfgs_search_strategy(unsigned long max_size_) : max_size(max_size_), been_used(false) 
+        {
+            DLIB_ASSERT (
+                max_size > 0,
+                "\t lbfgs_search_strategy(max_size)"
+                << "\n\t max_size can't be zero"
+            );
+        }
+
+        lbfgs_search_strategy(const lbfgs_search_strategy& item) 
+        {
+            max_size = item.max_size;
+            been_used = item.been_used;
+            prev_x = item.prev_x;
+            prev_derivative = item.prev_derivative;
+            prev_direction = item.prev_direction;
+            alpha = item.alpha;
+            dh_temp = item.dh_temp;
+        }
+
+        double get_wolfe_rho (
+        ) const { return 0.01; }
+
+        double get_wolfe_sigma (
+        ) const { return 0.9; }
+
+        unsigned long get_max_line_search_iterations (
+        ) const { return 100; }
+
+        template <typename T>
+        const matrix<double,0,1>& get_next_direction (
+            const T& x,
+            const double ,
+            const T& funct_derivative
+        )
+        {
+            prev_direction = -funct_derivative;
+
+            if (been_used == false)
+            {
+                been_used = true;
+            }
+            else
+            {
+                // add an element into the stored data sequence
+                dh_temp.s = x - prev_x;
+                dh_temp.y = funct_derivative - prev_derivative;
+                double temp = dot(dh_temp.s, dh_temp.y);
+                // only accept this bit of data if temp isn't zero
+                if (std::abs(temp) > std::numeric_limits<double>::epsilon())
+                {
+                    dh_temp.rho = 1/temp;
+                    data.add(data.size(), dh_temp);
+                }
+                else
+                {
+                    data.clear();
+                }
+
+                if (data.size() > 0)
+                {
+                    // This block of code is from algorithm 7.4 in the Nocedal book.
+
+                    alpha.resize(data.size());
+                    for (unsigned long i = data.size()-1; i < data.size(); --i)
+                    {
+                        alpha[i] = data[i].rho*dot(data[i].s, prev_direction);
+                        prev_direction -= alpha[i]*data[i].y;
+                    }
+
+                    // Take a guess at what the first H matrix should be.  This formula below is what is suggested
+                    // in the book Numerical Optimization by Nocedal and Wright in the chapter on Large Scale 
+                    // Unconstrained Optimization (in the L-BFGS section).
+                    double H_0 = 1.0/data[data.size()-1].rho/dot(data[data.size()-1].y, data[data.size()-1].y);
+                    H_0 = put_in_range(0.001, 1000.0, H_0);
+                    prev_direction *= H_0;
+
+                    for (unsigned long i = 0; i < data.size(); ++i)
+                    {
+                        double beta = data[i].rho*dot(data[i].y, prev_direction);
+                        prev_direction += data[i].s * (alpha[i] - beta);
+                    }
+                }
+
+            }
+
+            if (data.size() > max_size)
+            {
+                // remove the oldest element in the data sequence
+                data.remove(0, dh_temp);
+            }
+
+            prev_x = x;
+            prev_derivative = funct_derivative;
+            return prev_direction;
+        }
+
+    private:
+
+        struct data_helper
+        {
+            matrix<double,0,1> s;
+            matrix<double,0,1> y;
+            double rho;
+
+            friend void swap(data_helper& a, data_helper& b)
+            {
+                a.s.swap(b.s);
+                a.y.swap(b.y);
+                std::swap(a.rho, b.rho);
+            }
+        };
+        sequence<data_helper>::kernel_2a data;
+
+        unsigned long max_size;
+        bool been_used;
+        matrix<double,0,1> prev_x;
+        matrix<double,0,1> prev_derivative;
+        matrix<double,0,1> prev_direction;
+        std::vector<double> alpha;
+
+        data_helper dh_temp;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename hessian_funct>
+    class newton_search_strategy_obj
+    {
+    public:
+        explicit newton_search_strategy_obj(
+            const hessian_funct& hess
+        ) : hessian(hess) {}
+
+        double get_wolfe_rho (
+        ) const { return 0.01; }
+
+        double get_wolfe_sigma (
+        ) const { return 0.9; }
+
+        unsigned long get_max_line_search_iterations (
+        ) const { return 100; }
+
+        template <typename T>
+        const matrix<double,0,1> get_next_direction (
+            const T& x,
+            const double ,
+            const T& funct_derivative
+        )
+        {
+            return -inv(hessian(x))*funct_derivative;
+        }
+
+    private:
+        hessian_funct hessian;
+    };
+
+    template <typename hessian_funct>
+    newton_search_strategy_obj<hessian_funct> newton_search_strategy (
+        hessian_funct hessian
+    ) { return newton_search_strategy_obj<hessian_funct>(hessian); }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_
+
diff --git a/ml/dlib/dlib/optimization/optimization_search_strategies_abstract.h b/ml/dlib/dlib/optimization/optimization_search_strategies_abstract.h
new file mode 100644
index 000000000..44b894bfc
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_search_strategies_abstract.h
@@ -0,0 +1,330 @@
+// Copyright (C) 2008  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_ABSTRACT_
+#ifdef DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_ABSTRACT_
+
+#include <cmath>
+#include <limits>
+#include "../matrix/matrix_abstract.h"
+#include "../algs.h"
+
+
+namespace dlib
+{
+    /*
+        A good discussion of the search strategies in this file can be found in the 
+        following book:  Numerical Optimization by Nocedal and Wright.
+    */
+
+// ----------------------------------------------------------------------------------------
+
+    class cg_search_strategy
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object represents a strategy for determining which direction
+                a line search should be carried out along.  This particular object
+                is an implementation of the Polak-Ribiere conjugate gradient method
+                for determining this direction.
+
+                This method uses an amount of memory that is linear in the number
+                of variables to be optimized.  So it is capable of handling problems
+                with a very large number of variables.  However, it is generally
+                not as good as the L-BFGS algorithm (which is defined below in
+                the lbfgs_search_strategy class).
+        !*/
+
+    public:
+        cg_search_strategy(
+        );
+        /*!
+            ensures
+                - This object is properly initialized and ready to generate
+                  search directions.
+        !*/
+
+        double get_wolfe_rho (
+        ) const;
+        /*!
+            ensures
+                - returns the value of the Wolfe rho parameter that should be used when 
+                  this search strategy is used with the line_search() function.
+        !*/
+
+        double get_wolfe_sigma (
+        ) const; 
+        /*!
+            ensures
+                - returns the value of the Wolfe sigma parameter that should be used when 
+                  this search strategy is used with the line_search() function.
+        !*/
+
+        unsigned long get_max_line_search_iterations (
+        ) const; 
+        /*!
+            ensures
+                - returns the value of the max iterations parameter that should be used when 
+                  this search strategy is used with the line_search() function.
+        !*/
+
+        template <typename T>
+        const matrix<double,0,1>& get_next_direction (
+            const T& x,
+            const double funct_value,
+            const T& funct_derivative
+        );
+        /*!
+            requires
+                - this function is only called once per search iteration
+                - for some objective function f():
+                    - x == the search point for the current iteration
+                    - funct_value == f(x)
+                    - funct_derivative == derivative(f)(x)
+            ensures
+                - Assuming that a line search is going to be conducted starting from the point x,
+                  this function returns the direction in which the search should proceed.
+        !*/
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class bfgs_search_strategy
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object represents a strategy for determining which direction
+                a line search should be carried out along.  This particular object
+                is an implementation of the BFGS quasi-newton method for determining 
+                this direction.
+
+                This method uses an amount of memory that is quadratic in the number
+                of variables to be optimized.  It is generally very effective but 
+                if your problem has a very large number of variables then it isn't 
+                appropriate.  Instead You should try the lbfgs_search_strategy.
+        !*/
+
+    public:
+        bfgs_search_strategy(
+        );
+        /*!
+            ensures
+                - This object is properly initialized and ready to generate
+                  search directions.
+        !*/
+
+        double get_wolfe_rho (
+        ) const;
+        /*!
+            ensures
+                - returns the value of the Wolfe rho parameter that should be used when 
+                  this search strategy is used with the line_search() function.
+        !*/
+
+        double get_wolfe_sigma (
+        ) const; 
+        /*!
+            ensures
+                - returns the value of the Wolfe sigma parameter that should be used when 
+                  this search strategy is used with the line_search() function.
+        !*/
+
+        unsigned long get_max_line_search_iterations (
+        ) const; 
+        /*!
+            ensures
+                - returns the value of the max iterations parameter that should be used when 
+                  this search strategy is used with the line_search() function.
+        !*/
+
+        template <typename T>
+        const matrix<double,0,1>& get_next_direction (
+            const T& x,
+            const double funct_value,
+            const T& funct_derivative
+        );
+        /*!
+            requires
+                - this function is only called once per search iteration
+                - for some objective function f():
+                    - x == the search point for the current iteration
+                    - funct_value == f(x)
+                    - funct_derivative == derivative(f)(x)
+            ensures
+                - Assuming that a line search is going to be conducted starting from the point x,
+                  this function returns the direction in which the search should proceed.
+        !*/
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class lbfgs_search_strategy
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object represents a strategy for determining which direction
+                a line search should be carried out along.  This particular object
+                is an implementation of the L-BFGS quasi-newton method for determining 
+                this direction.
+
+                This method uses an amount of memory that is linear in the number
+                of variables to be optimized.  This makes it an excellent method 
+                to use when an optimization problem has a large number of variables.
+        !*/
+    public:
+        explicit lbfgs_search_strategy(
+            unsigned long max_size
+        ); 
+        /*!
+            requires
+                - max_size > 0
+            ensures
+                - This object is properly initialized and ready to generate
+                  search directions.
+                - L-BFGS works by remembering a certain number of position and gradient 
+                  pairs.  It uses this remembered information to compute search directions.
+                  The max_size argument determines how many of these pairs will be remembered.
+                  Typically, using between 3 and 30 pairs performs well for many problems.
+        !*/
+
+        double get_wolfe_rho (
+        ) const;
+        /*!
+            ensures
+                - returns the value of the Wolfe rho parameter that should be used when 
+                  this search strategy is used with the line_search() function.
+        !*/
+
+        double get_wolfe_sigma (
+        ) const; 
+        /*!
+            ensures
+                - returns the value of the Wolfe sigma parameter that should be used when 
+                  this search strategy is used with the line_search() function.
+        !*/
+
+        unsigned long get_max_line_search_iterations (
+        ) const; 
+        /*!
+            ensures
+                - returns the value of the max iterations parameter that should be used when 
+                  this search strategy is used with the line_search() function.
+        !*/
+
+        template <typename T>
+        const matrix<double,0,1>& get_next_direction (
+            const T& x,
+            const double funct_value,
+            const T& funct_derivative
+        );
+        /*!
+            requires
+                - this function is only called once per search iteration
+                - for some objective function f():
+                    - x == the search point for the current iteration
+                    - funct_value == f(x)
+                    - funct_derivative == derivative(f)(x)
+            ensures
+                - Assuming that a line search is going to be conducted starting from the point x,
+                  this function returns the direction in which the search should proceed.
+        !*/
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename hessian_funct
+        >
+    class newton_search_strategy_obj
+    {
+        /*!
+            REQUIREMENTS ON hessian_funct
+                Objects of hessian_funct type must be function objects which
+                take a single argument and return a dlib::matrix of doubles.  The
+                single argument must be a dlib::matrix capable of representing
+                column vectors of doubles.  hessian_funct must also be copy 
+                constructable.  
+
+            WHAT THIS OBJECT REPRESENTS
+                This object represents a strategy for determining which direction
+                a line search should be carried out along.  This particular object
+                is an implementation of the newton method for determining this 
+                direction.  That is, it uses the following formula to determine
+                the direction:
+                    search_direction = -inv(hessian(x))*derivative
+        !*/
+    public:
+        explicit newton_search_strategy_obj(
+            const hessian_funct& hess
+        ); 
+        /*!
+            ensures
+                - This object is properly initialized and ready to generate
+                  search directions.
+                - hess will be used by this object to generate the needed hessian
+                  matrices every time get_next_direction() is called.
+        !*/
+
+        double get_wolfe_rho (
+        ) const;
+        /*!
+            ensures
+                - returns the value of the Wolfe rho parameter that should be used when 
+                  this search strategy is used with the line_search() function.
+        !*/
+
+        double get_wolfe_sigma (
+        ) const; 
+        /*!
+            ensures
+                - returns the value of the Wolfe sigma parameter that should be used when 
+                  this search strategy is used with the line_search() function.
+        !*/
+
+        unsigned long get_max_line_search_iterations (
+        ) const; 
+        /*!
+            ensures
+                - returns the value of the max iterations parameter that should be used when 
+                  this search strategy is used with the line_search() function.
+        !*/
+
+        template <typename T>
+        const matrix<double,0,1> get_next_direction (
+            const T& x,
+            const double funct_value,
+            const T& funct_derivative
+        );
+        /*!
+            requires
+                - for some objective function f():
+                    - x == the search point for the current iteration
+                    - funct_value == f(x)
+                    - funct_derivative == derivative(f)(x)
+            ensures
+                - Assuming that a line search is going to be conducted starting from the 
+                  point x, this function returns the direction in which the search should 
+                  proceed.
+                - In particular, the search direction will be given by:
+                    - search_direction = -inv(hessian(x))*funct_derivative
+        !*/
+
+    };
+
+    template <typename hessian_funct>
+    newton_search_strategy_obj<hessian_funct> newton_search_strategy (
+        hessian_funct hessian
+    ) { return newton_search_strategy_obj<hessian_funct>(hessian); }
+    /*!
+        ensures
+            - constructs and returns a newton_search_strategy_obj.  
+              This function is just a helper to make the syntax for creating
+              these objects a little simpler.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_ABSTRACT_
+
diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo.h b/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo.h
new file mode 100644
index 000000000..88cad0cf3
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo.h
@@ -0,0 +1,468 @@
+// Copyright (C) 2007  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_SOLVE_QP2_USING_SMo_Hh_
+#define DLIB_SOLVE_QP2_USING_SMo_Hh_
+
+#include "optimization_solve_qp2_using_smo_abstract.h"
+#include <cmath>
+#include <limits>
+#include <sstream>
+#include "../matrix.h"
+#include "../algs.h"
+
+
+namespace dlib 
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class invalid_nu_error : public dlib::error 
+    { 
+    public: 
+        invalid_nu_error(const std::string& msg, double nu_) : dlib::error(msg), nu(nu_) {};
+        const double nu;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename T
+        >
+    typename T::type maximum_nu_impl (
+        const T& y
+    )
+    {
+        typedef typename T::type scalar_type;
+        // make sure requires clause is not broken
+        DLIB_ASSERT(y.size() > 1 && is_col_vector(y),
+            "\ttypedef T::type maximum_nu(y)"
+            << "\n\ty should be a column vector with more than one entry"
+            << "\n\ty.nr(): " << y.nr() 
+            << "\n\ty.nc(): " << y.nc() 
+            );
+
+        long pos_count = 0;
+        long neg_count = 0;
+        for (long r = 0; r < y.nr(); ++r)
+        {
+            if (y(r) == 1.0)
+            {
+                ++pos_count;
+            }
+            else if (y(r) == -1.0)
+            {
+                ++neg_count;
+            }
+            else
+            {
+                // make sure requires clause is not broken
+                DLIB_ASSERT(y(r) == -1.0 || y(r) == 1.0,
+                       "\ttypedef T::type maximum_nu(y)"
+                       << "\n\ty should contain only 1 and 0 entries"
+                       << "\n\tr:    " << r 
+                       << "\n\ty(r): " << y(r) 
+                );
+            }
+        }
+        return static_cast<scalar_type>(2.0*(scalar_type)std::min(pos_count,neg_count)/(scalar_type)y.nr());
+    }
+
+    template <
+        typename T
+        >
+    typename T::type maximum_nu (
+        const T& y
+    )
+    {
+        return maximum_nu_impl(mat(y));
+    }
+
+    template <
+        typename T
+        >
+    typename T::value_type maximum_nu (
+        const T& y
+    )
+    {
+        return maximum_nu_impl(mat(y));
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename matrix_type
+        >
+    class solve_qp2_using_smo
+    {
+    public:
+        typedef typename matrix_type::mem_manager_type mem_manager_type;
+        typedef typename matrix_type::type scalar_type;
+        typedef typename matrix_type::layout_type layout_type;
+        typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix;
+        typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix;
+
+
+        template <
+            typename EXP1,
+            typename EXP2,
+            long NR
+            >
+        unsigned long operator() ( 
+            const matrix_exp<EXP1>& Q,
+            const matrix_exp<EXP2>& y,
+            const scalar_type nu,
+            matrix<scalar_type,NR,1,mem_manager_type, layout_type>& alpha,
+            scalar_type eps
+        ) 
+        {
+            DLIB_ASSERT(Q.nr() == Q.nc() && y.size() == Q.nr() && y.size() > 1 && is_col_vector(y) &&
+                        sum((y == +1) + (y == -1)) == y.size() &&
+                        0 < nu && nu <= 1 &&
+                        eps > 0,
+                "\t void solve_qp2_using_smo::operator()"
+                << "\n\t invalid arguments were given to this function"
+                << "\n\t Q.nr():                     " << Q.nr() 
+                << "\n\t Q.nc():                     " << Q.nc() 
+                << "\n\t is_col_vector(y):           " << is_col_vector(y) 
+                << "\n\t y.size():                   " << y.size() 
+                << "\n\t sum((y == +1) + (y == -1)): " << sum((y == +1) + (y == -1)) 
+                << "\n\t nu:                         " << nu 
+                << "\n\t eps:                        " << eps 
+                );
+
+            alpha.set_size(Q.nr(),1);
+            df.set_size(Q.nr());
+
+            // now initialize alpha
+            set_initial_alpha(y, nu, alpha);
+
+            const scalar_type tau = 1e-12;
+
+            typedef typename colm_exp<EXP1>::type col_type;
+
+            set_all_elements(df, 0);
+            // initialize df.  Compute df = Q*alpha
+            for (long r = 0; r < df.nr(); ++r)
+            {
+                if (alpha(r) != 0)
+                {
+                    df += alpha(r)*matrix_cast<scalar_type>(colm(Q,r));
+                }
+            }
+
+            unsigned long count = 0;
+
+            // now perform the actual optimization of alpha
+            long i=0, j=0;
+            while (find_working_group(y,alpha,Q,df,tau,eps,i,j))
+            {
+                ++count;
+                const scalar_type old_alpha_i = alpha(i);
+                const scalar_type old_alpha_j = alpha(j);
+
+                optimize_working_pair(alpha,Q,df,tau,i,j);
+
+                // update the df vector now that we have modified alpha(i) and alpha(j)
+                scalar_type delta_alpha_i = alpha(i) - old_alpha_i;
+                scalar_type delta_alpha_j = alpha(j) - old_alpha_j;
+
+                col_type Q_i = colm(Q,i);
+                col_type Q_j = colm(Q,j);
+                for(long k = 0; k < df.nr(); ++k)
+                    df(k) += Q_i(k)*delta_alpha_i + Q_j(k)*delta_alpha_j;
+            }
+
+            return count;
+        }
+
+        const column_matrix& get_gradient (
+        ) const { return df; }
+
+    private:
+
+    // -------------------------------------------------------------------------------------
+
+        template <
+            typename scalar_type,
+            typename scalar_vector_type,
+            typename scalar_vector_type2
+            >
+        inline void set_initial_alpha (
+            const scalar_vector_type& y,
+            const scalar_type nu,
+            scalar_vector_type2& alpha
+        ) const
+        {
+            set_all_elements(alpha,0);
+            const scalar_type l = y.nr();
+            scalar_type temp = nu*l/2;
+            long num = (long)std::floor(temp);
+            long num_total = (long)std::ceil(temp);
+
+            bool has_slack = false;
+            int count = 0;
+            for (int i = 0; i < alpha.nr(); ++i)
+            {
+                if (y(i) == 1)
+                {
+                    if (count < num)
+                    {
+                        ++count;
+                        alpha(i) = 1;
+                    }
+                    else 
+                    {
+                        has_slack = true;
+                        if (num_total > num)
+                        {
+                            ++count;
+                            alpha(i) = temp - std::floor(temp);
+                        }
+                        break;
+                    }
+                }
+            }
+
+            if (count != num_total || has_slack == false)
+            {
+                std::ostringstream sout;
+                sout << "Invalid nu of " << nu << ".  It is required that: 0 < nu < " << 2*(scalar_type)count/y.nr();
+                throw invalid_nu_error(sout.str(),nu);
+            }
+
+            has_slack = false;
+            count = 0;
+            for (int i = 0; i < alpha.nr(); ++i)
+            {
+                if (y(i) == -1)
+                {
+                    if (count < num)
+                    {
+                        ++count;
+                        alpha(i) = 1;
+                    }
+                    else 
+                    {
+                        has_slack = true;
+                        if (num_total > num)
+                        {
+                            ++count;
+                            alpha(i) = temp - std::floor(temp);
+                        }
+                        break;
+                    }
+                }
+            }
+
+            if (count != num_total || has_slack == false)
+            {
+                std::ostringstream sout;
+                sout << "Invalid nu of " << nu << ".  It is required that: 0 < nu < " << 2*(scalar_type)count/y.nr();
+                throw invalid_nu_error(sout.str(),nu);
+            }
+        }
+
+    // ------------------------------------------------------------------------------------
+
+        template <
+            typename scalar_vector_type,
+            typename scalar_type,
+            typename EXP,
+            typename U, typename V
+            >
+        inline bool find_working_group (
+            const V& y,
+            const U& alpha,
+            const matrix_exp<EXP>& Q,
+            const scalar_vector_type& df,
+            const scalar_type tau,
+            const scalar_type eps,
+            long& i_out,
+            long& j_out
+        ) const
+        {
+            using namespace std;
+            long ip = 0;
+            long jp = 0;
+            long in = 0;
+            long jn = 0;
+
+
+            typedef typename colm_exp<EXP>::type col_type;
+            typedef typename diag_exp<EXP>::type diag_type;
+
+            scalar_type ip_val = -numeric_limits<scalar_type>::infinity();
+            scalar_type jp_val = numeric_limits<scalar_type>::infinity();
+            scalar_type in_val = -numeric_limits<scalar_type>::infinity();
+            scalar_type jn_val = numeric_limits<scalar_type>::infinity();
+
+            // loop over the alphas and find the maximum ip and in indices.
+            for (long i = 0; i < alpha.nr(); ++i)
+            {
+                if (y(i) == 1)
+                {
+                    if (alpha(i) < 1.0)
+                    {
+                        if (-df(i) > ip_val)
+                        {
+                            ip_val = -df(i);
+                            ip = i;
+                        }
+                    }
+                }
+                else
+                {
+                    if (alpha(i) > 0.0)
+                    {
+                        if (df(i) > in_val)
+                        {
+                            in_val = df(i);
+                            in = i;
+                        }
+                    }
+                }
+            }
+
+            scalar_type Mp = numeric_limits<scalar_type>::infinity();
+            scalar_type Mn = numeric_limits<scalar_type>::infinity();
+
+            // Pick out the columns and diagonal of Q we need below.  Doing
+            // it this way is faster if Q is actually a symmetric_matrix_cache
+            // object.
+            col_type Q_ip = colm(Q,ip);
+            col_type Q_in = colm(Q,in);
+            diag_type Q_diag = diag(Q);
+
+
+
+            // now we need to find the minimum jp and jn indices
+            for (long j = 0; j < alpha.nr(); ++j)
+            {
+                if (y(j) == 1)
+                {
+                    if (alpha(j) > 0.0)
+                    {
+                        scalar_type b = ip_val + df(j);
+                        if (-df(j) < Mp)
+                            Mp = -df(j);
+
+                        if (b > 0)
+                        {
+                            scalar_type a = Q_ip(ip) + Q_diag(j) - 2*Q_ip(j); 
+                            if (a <= 0)
+                                a = tau;
+                            scalar_type temp = -b*b/a;
+                            if (temp < jp_val)
+                            {
+                                jp_val = temp;
+                                jp = j;
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    if (alpha(j) < 1.0)
+                    {
+                        scalar_type b = in_val - df(j);
+                        if (df(j) < Mn)
+                            Mn = df(j);
+
+                        if (b > 0)
+                        {
+                            scalar_type a = Q_in(in) + Q_diag(j) - 2*Q_in(j); 
+                            if (a <= 0)
+                                a = tau;
+                            scalar_type temp = -b*b/a;
+                            if (temp < jn_val)
+                            {
+                                jn_val = temp;
+                                jn = j;
+                            }
+                        }
+                    }
+                }
+            }
+
+            // if we are at the optimal point then return false so the caller knows
+            // to stop optimizing
+            if (std::max(ip_val - Mp, in_val - Mn) < eps)
+                return false;
+
+            if (jp_val < jn_val)
+            {
+                i_out = ip;
+                j_out = jp;
+            }
+            else
+            {
+                i_out = in;
+                j_out = jn;
+            }
+
+            return true;
+        }
+
+    // ------------------------------------------------------------------------------------
+
+        template <
+            typename EXP,
+            typename T, typename U
+            >
+        inline void optimize_working_pair (
+            T& alpha,
+            const matrix_exp<EXP>& Q,
+            const U& df,
+            const scalar_type tau,
+            const long i,
+            const long j
+        ) const
+        {
+            scalar_type quad_coef = Q(i,i)+Q(j,j)-2*Q(j,i);
+            if (quad_coef <= 0)
+                quad_coef = tau;
+            scalar_type delta = (df(i)-df(j))/quad_coef;
+            scalar_type sum = alpha(i) + alpha(j);
+            alpha(i) -= delta;
+            alpha(j) += delta;
+
+            if(sum > 1)
+            {
+                if(alpha(i) > 1)
+                {
+                    alpha(i) = 1;
+                    alpha(j) = sum - 1;
+                }
+                else if(alpha(j) > 1)
+                {
+                    alpha(j) = 1;
+                    alpha(i) = sum - 1;
+                }
+            }
+            else
+            {
+                if(alpha(j) < 0)
+                {
+                    alpha(j) = 0;
+                    alpha(i) = sum;
+                }
+                else if(alpha(i) < 0)
+                {
+                    alpha(i) = 0;
+                    alpha(j) = sum;
+                }
+            }
+        }
+
+    // ------------------------------------------------------------------------------------
+
+        column_matrix df; // gradient of f(alpha)
+    };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_SOLVE_QP2_USING_SMo_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo_abstract.h b/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo_abstract.h
new file mode 100644
index 000000000..962541c25
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_solve_qp2_using_smo_abstract.h
@@ -0,0 +1,150 @@
+// Copyright (C) 2007  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATION_SOLVE_QP2_USING_SMO_ABSTRACT_H_
+#ifdef DLIB_OPTIMIZATION_SOLVE_QP2_USING_SMO_ABSTRACT_H_
+
+#include "../matrix/matrix_abstract.h"
+#include "../algs.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class invalid_nu_error : public dlib::error 
+    { 
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object is an exception class used to indicate that a
+                value of nu given to the solve_qp2_using_smo object is incompatible 
+                with the constraints of the quadratic program.
+
+                this->nu will be set to the invalid value of nu used.
+        !*/
+
+    public: 
+        invalid_nu_error(const std::string& msg, double nu_) : dlib::error(msg), nu(nu_) {};
+        const double nu;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename T
+        >
+    typename T::type maximum_nu (
+        const T& y
+    );
+    /*!
+        requires
+            - T == a matrix object or an object convertible to a matrix via mat()
+            - is_col_vector(y) == true
+            - y.size() > 1
+            - sum((y == +1) + (y == -1)) == y.size()
+              (i.e. all elements of y must be equal to +1 or -1)
+        ensures
+            - returns the maximum valid nu that can be used with solve_qp2_using_smo and
+              the given y vector.
+              (i.e. 2.0*min(sum(y == +1), sum(y == -1))/y.size())
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename matrix_type
+        >
+    class solve_qp2_using_smo
+    {
+        /*!
+            REQUIREMENTS ON matrix_type
+                Must be some type of dlib::matrix.
+
+            WHAT THIS OBJECT REPRESENTS
+                This object is a tool for solving the following quadratic programming
+                problem using the sequential minimal optimization algorithm:  
+
+                  Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha 
+                  subject to the following constraints:
+                    - sum(alpha) == nu*y.size() 
+                    - 0 <= min(alpha) && max(alpha) <= 1 
+                    - trans(y)*alpha == 0
+
+                  Where all elements of y must be equal to +1 or -1 and f is convex.  
+                  This means that Q should be symmetric and positive-semidefinite.
+                
+                
+                This object implements the strategy used by the LIBSVM tool.  The following papers
+                can be consulted for additional details:
+                    - Chang and Lin, Training {nu}-Support Vector Classifiers: Theory and Algorithms
+                    - Chih-Chung Chang and Chih-Jen Lin, LIBSVM : a library for support vector 
+                      machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm
+        !*/
+
+    public:
+
+        typedef typename matrix_type::mem_manager_type mem_manager_type;
+        typedef typename matrix_type::type scalar_type;
+        typedef typename matrix_type::layout_type layout_type;
+        typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix;
+        typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix;
+
+        template <
+            typename EXP1,
+            typename EXP2,
+            long NR
+            >
+        unsigned long operator() ( 
+            const matrix_exp<EXP1>& Q,
+            const matrix_exp<EXP2>& y,
+            const scalar_type nu,
+            matrix<scalar_type,NR,1,mem_manager_type, layout_type>& alpha,
+            scalar_type eps
+        );
+        /*!
+            requires
+                - Q.nr() == Q.nc()
+                - is_col_vector(y) == true
+                - y.size() == Q.nr()
+                - y.size() > 1
+                - sum((y == +1) + (y == -1)) == y.size()
+                  (i.e. all elements of y must be equal to +1 or -1)
+                - alpha must be capable of representing a vector of size y.size() elements
+                - 0 < nu <= 1
+                - eps > 0
+            ensures
+                - This function solves the quadratic program defined in this class's main comment.
+                - The solution to the quadratic program will be stored in #alpha.
+                - #alpha.size() == y.size()
+                - This function uses an implementation of the sequential minimal optimization 
+                  algorithm.  It runs until the KKT violation is less than eps.  So eps controls 
+                  how accurate the solution is and smaller values result in better solutions.
+                  (a reasonable eps is usually about 1e-3)
+                - #get_gradient() == Q*(#alpha)
+                  (i.e. stores the gradient of f() at #alpha in get_gradient())
+                - returns the number of iterations performed.  
+            throws
+                - invalid_nu_error
+                  This exception is thrown if nu >= maximum_nu(y).  
+                  (some values of nu cause the constraints to become impossible to satisfy. 
+                  If this is detected then an exception is thrown).
+        !*/
+
+        const column_matrix& get_gradient (
+        ) const;
+        /*!
+            ensures
+                - returns the gradient vector at the solution of the last problem solved
+                  by this object.  If no problem has been solved then returns an empty
+                  vector.
+        !*/
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATION_SOLVE_QP2_USING_SMO_ABSTRACT_H_
+
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo.h b/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo.h
new file mode 100644
index 000000000..617ecc408
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo.h
@@ -0,0 +1,455 @@
+// Copyright (C) 2007  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_SOLVE_QP3_USING_SMo_Hh_
+#define DLIB_SOLVE_QP3_USING_SMo_Hh_
+
+#include "optimization_solve_qp3_using_smo_abstract.h"
+#include <cmath>
+#include <limits>
+#include <sstream>
+#include "../matrix.h"
+#include "../algs.h"
+
+
+namespace dlib 
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class invalid_qp3_error : public dlib::error 
+    { 
+
+    public: 
+        invalid_qp3_error(
+            const std::string& msg, 
+            double B_,
+            double Cp_,
+            double Cn_
+        ) : 
+            dlib::error(msg), 
+            B(B_),
+            Cp(Cp_),
+            Cn(Cn_)
+        {};
+
+        const double B;
+        const double Cp;
+        const double Cn;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename matrix_type
+        >
+    class solve_qp3_using_smo
+    {
+    public:
+        typedef typename matrix_type::mem_manager_type mem_manager_type;
+        typedef typename matrix_type::type scalar_type;
+        typedef typename matrix_type::layout_type layout_type;
+        typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix;
+        typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix;
+
+
+        template <
+            typename EXP1,
+            typename EXP2,
+            typename EXP3,
+            long NR
+            >
+        unsigned long operator() ( 
+            const matrix_exp<EXP1>& Q,
+            const matrix_exp<EXP2>& p,
+            const matrix_exp<EXP3>& y,
+            const scalar_type B,
+            const scalar_type Cp,
+            const scalar_type Cn,
+            matrix<scalar_type,NR,1,mem_manager_type, layout_type>& alpha,
+            scalar_type eps
+        ) 
+        {
+            DLIB_ASSERT(Q.nr() == Q.nc() && y.size() == Q.nr() && p.size() == y.size() && 
+                        y.size() > 0 && is_col_vector(y) && is_col_vector(p) &&
+                        sum((y == +1) + (y == -1)) == y.size() &&
+                        Cp > 0 && Cn > 0 &&
+                        eps > 0,
+                "\t void solve_qp3_using_smo::operator()"
+                << "\n\t invalid arguments were given to this function"
+                << "\n\t Q.nr():                     " << Q.nr() 
+                << "\n\t Q.nc():                     " << Q.nc() 
+                << "\n\t is_col_vector(p):           " << is_col_vector(p) 
+                << "\n\t p.size():                   " << p.size() 
+                << "\n\t is_col_vector(y):           " << is_col_vector(y) 
+                << "\n\t y.size():                   " << y.size() 
+                << "\n\t sum((y == +1) + (y == -1)): " << sum((y == +1) + (y == -1)) 
+                << "\n\t Cp:                         " << Cp
+                << "\n\t Cn:                         " << Cn
+                << "\n\t eps:                        " << eps 
+                );
+
+
+
+            set_initial_alpha(y, B, Cp, Cn, alpha);
+
+
+            const scalar_type tau = 1e-12;
+
+            typedef typename colm_exp<EXP1>::type col_type;
+
+            // initialize df.  Compute df = Q*alpha + p
+            df = p;
+            for (long r = 0; r < df.nr(); ++r)
+            {
+                if (alpha(r) != 0)
+                {
+                    df += alpha(r)*matrix_cast<scalar_type>(colm(Q,r));
+                }
+            }
+
+            unsigned long count = 0;
+            // now perform the actual optimization of alpha
+            long i=0, j=0;
+            while (find_working_group(y,alpha,Q,df,Cp,Cn,tau,eps,i,j))
+            {
+                ++count;
+                const scalar_type old_alpha_i = alpha(i);
+                const scalar_type old_alpha_j = alpha(j);
+
+                optimize_working_pair(alpha,Q,y,df,tau,i,j, Cp, Cn );
+
+                // update the df vector now that we have modified alpha(i) and alpha(j)
+                scalar_type delta_alpha_i = alpha(i) - old_alpha_i;
+                scalar_type delta_alpha_j = alpha(j) - old_alpha_j;
+
+                col_type Q_i = colm(Q,i);
+                col_type Q_j = colm(Q,j);
+                for(long k = 0; k < df.nr(); ++k)
+                    df(k) += Q_i(k)*delta_alpha_i + Q_j(k)*delta_alpha_j;
+            }
+
+            return count;
+        }
+
+        const column_matrix& get_gradient (
+        ) const { return df; }
+
+    private:
+
+    // -------------------------------------------------------------------------------------
+
+        template <
+            typename scalar_type,
+            typename scalar_vector_type,
+            typename scalar_vector_type2
+            >
+        inline void set_initial_alpha (
+            const scalar_vector_type& y,
+            const scalar_type B,
+            const scalar_type Cp,
+            const scalar_type Cn,
+            scalar_vector_type2& alpha
+        ) const
+        {
+            alpha.set_size(y.size());
+
+            set_all_elements(alpha,0);
+
+            // It's easy in the B == 0 case
+            if (B == 0)
+                return;
+
+            const scalar_type C = (B > 0)?  Cp : Cn;
+
+            scalar_type temp = std::abs(B)/C;
+            long num = (long)std::floor(temp);
+            long num_total = (long)std::ceil(temp);
+
+            const scalar_type B_sign = (B > 0)? 1 : -1;
+
+            long count = 0;
+            for (long i = 0; i < alpha.nr(); ++i)
+            {
+                if (y(i) == B_sign)
+                {
+                    if (count < num)
+                    {
+                        ++count;
+                        alpha(i) = C;
+                    }
+                    else 
+                    {
+                        if (count < num_total)
+                        {
+                            ++count;
+                            alpha(i) = C*(temp - std::floor(temp));
+                        }
+                        break;
+                    }
+                }
+            }
+
+            if (count != num_total)
+            {
+                std::ostringstream sout;
+                sout << "Invalid QP3 constraint parameters of B: " << B << ", Cp: " << Cp << ", Cn: "<< Cn;
+                throw invalid_qp3_error(sout.str(),B,Cp,Cn);
+            }
+        }
+
+    // ------------------------------------------------------------------------------------
+
+        template <
+            typename scalar_vector_type,
+            typename scalar_type,
+            typename EXP,
+            typename U, typename V
+            >
+        inline bool find_working_group (
+            const V& y,
+            const U& alpha,
+            const matrix_exp<EXP>& Q,
+            const scalar_vector_type& df,
+            const scalar_type Cp,
+            const scalar_type Cn,
+            const scalar_type tau,
+            const scalar_type eps,
+            long& i_out,
+            long& j_out
+        ) const
+        {
+            using namespace std;
+
+            long ip = 0;
+            long jp = 0;
+
+
+            typedef typename colm_exp<EXP>::type col_type;
+            typedef typename diag_exp<EXP>::type diag_type;
+
+            scalar_type ip_val = -numeric_limits<scalar_type>::infinity();
+            scalar_type jp_val = numeric_limits<scalar_type>::infinity();
+
+            // loop over the alphas and find the maximum ip and in indices.
+            for (long i = 0; i < alpha.nr(); ++i)
+            {
+                if (y(i) == 1)
+                {
+                    if (alpha(i) < Cp)
+                    {
+                        if (-df(i) > ip_val)
+                        {
+                            ip_val = -df(i);
+                            ip = i;
+                        }
+                    }
+                }
+                else
+                {
+                    if (alpha(i) > 0.0)
+                    {
+                        if (df(i) > ip_val)
+                        {
+                            ip_val = df(i);
+                            ip = i;
+                        }
+                    }
+                }
+            }
+
+            scalar_type Mp = -numeric_limits<scalar_type>::infinity();
+
+            // Pick out the column and diagonal of Q we need below.  Doing
+            // it this way is faster if Q is actually a symmetric_matrix_cache
+            // object.
+            col_type Q_ip = colm(Q,ip);
+            diag_type Q_diag = diag(Q);
+
+
+
+            // now we need to find the minimum jp indices
+            for (long j = 0; j < alpha.nr(); ++j)
+            {
+                if (y(j) == 1)
+                {
+                    if (alpha(j) > 0.0)
+                    {
+                        scalar_type b = ip_val + df(j);
+                        if (df(j) > Mp)
+                            Mp = df(j);
+
+                        if (b > 0)
+                        {
+                            scalar_type a = Q_ip(ip) + Q_diag(j) - 2*y(ip)*Q_ip(j); 
+                            if (a <= 0)
+                                a = tau;
+                            scalar_type temp = -b*b/a;
+                            if (temp < jp_val)
+                            {
+                                jp_val = temp;
+                                jp = j;
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    if (alpha(j) < Cn)
+                    {
+                        scalar_type b = ip_val - df(j);
+                        if (-df(j) > Mp)
+                            Mp = -df(j);
+
+                        if (b > 0)
+                        {
+                            scalar_type a = Q_ip(ip) + Q_diag(j) + 2*y(ip)*Q_ip(j); 
+                            if (a <= 0)
+                                a = tau;
+                            scalar_type temp = -b*b/a;
+                            if (temp < jp_val)
+                            {
+                                jp_val = temp;
+                                jp = j;
+                            }
+                        }
+                    }
+                }
+            }
+
+            // if we are at the optimal point then return false so the caller knows
+            // to stop optimizing
+            if (Mp + ip_val < eps)
+                return false;
+
+
+            i_out = ip;
+            j_out = jp;
+
+            return true;
+        }
+
+    // ------------------------------------------------------------------------------------
+
+        template <
+            typename EXP,
+            typename EXP2,
+            typename T, typename U
+            >
+        inline void optimize_working_pair (
+            T& alpha,
+            const matrix_exp<EXP>& Q,
+            const matrix_exp<EXP2>& y,
+            const U& df,
+            const scalar_type& tau,
+            const long i,
+            const long j,
+            const scalar_type& Cp,
+            const scalar_type& Cn
+        ) const
+        {
+            const scalar_type Ci = (y(i) > 0 )? Cp : Cn;
+            const scalar_type Cj = (y(j) > 0 )? Cp : Cn;
+
+            if (y(i) != y(j))
+            {
+                scalar_type quad_coef = Q(i,i)+Q(j,j)+2*Q(j,i);
+                if (quad_coef <= 0)
+                    quad_coef = tau;
+                scalar_type delta = (-df(i)-df(j))/quad_coef;
+                scalar_type diff = alpha(i) - alpha(j);
+                alpha(i) += delta;
+                alpha(j) += delta;
+
+                if (diff > 0)
+                {
+                    if (alpha(j) < 0)
+                    {
+                        alpha(j) = 0;
+                        alpha(i) = diff;
+                    }
+                }
+                else
+                {
+                    if (alpha(i) < 0)
+                    {
+                        alpha(i) = 0;
+                        alpha(j) = -diff;
+                    }
+                }
+
+                if (diff > Ci - Cj)
+                {
+                    if (alpha(i) > Ci)
+                    {
+                        alpha(i) = Ci;
+                        alpha(j) = Ci - diff;
+                    }
+                }
+                else
+                {
+                    if (alpha(j) > Cj)
+                    {
+                        alpha(j) = Cj;
+                        alpha(i) = Cj + diff;
+                    }
+                }
+            }
+            else
+            {
+                scalar_type quad_coef = Q(i,i)+Q(j,j)-2*Q(j,i);
+                if (quad_coef <= 0)
+                    quad_coef = tau;
+                scalar_type delta = (df(i)-df(j))/quad_coef;
+                scalar_type sum = alpha(i) + alpha(j);
+                alpha(i) -= delta;
+                alpha(j) += delta;
+
+                if(sum > Ci)
+                {
+                    if(alpha(i) > Ci)
+                    {
+                        alpha(i) = Ci;
+                        alpha(j) = sum - Ci;
+                    }
+                }
+                else
+                {
+                    if(alpha(j) < 0)
+                    {
+                        alpha(j) = 0;
+                        alpha(i) = sum;
+                    }
+                }
+
+                if(sum > Cj)
+                {
+                    if(alpha(j) > Cj)
+                    {
+                        alpha(j) = Cj;
+                        alpha(i) = sum - Cj;
+                    }
+                }
+                else
+                {
+                    if(alpha(i) < 0)
+                    {
+                        alpha(i) = 0;
+                        alpha(j) = sum;
+                    }
+                }
+
+            }
+        }
+
+    // ------------------------------------------------------------------------------------
+
+        column_matrix df; // gradient of f(alpha)
+    };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_SOLVE_QP3_USING_SMo_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo_abstract.h b/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo_abstract.h
new file mode 100644
index 000000000..8efd7215b
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_solve_qp3_using_smo_abstract.h
@@ -0,0 +1,139 @@
+// Copyright (C) 2010  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATION_SOLVE_QP3_USING_SMO_ABSTRACT_H_
+#ifdef DLIB_OPTIMIZATION_SOLVE_QP3_USING_SMO_ABSTRACT_H_
+
+#include "../matrix/matrix_abstract.h"
+#include "../algs.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class invalid_qp3_error : public dlib::error 
+    { 
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object is an exception class used to indicate that the values 
+                of B, Cp, and Cn given to the solve_qp3_using_smo object are incompatible 
+                with the constraints of the quadratic program.
+
+                this->B, this->Cp, and this->Cn will be set to the invalid values used.
+        !*/
+
+    public: 
+        invalid_qp3_error( const std::string& msg, double B_, double Cp_, double Cn_) : 
+            dlib::error(msg), B(B_), Cp(Cp_), Cn(Cn_) {};
+
+        const double B;
+        const double Cp;
+        const double Cn;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename matrix_type
+        >
+    class solve_qp3_using_smo
+    {
+        /*!
+            REQUIREMENTS ON matrix_type
+                Must be some type of dlib::matrix.
+
+            WHAT THIS OBJECT REPRESENTS
+                This object is a tool for solving the following quadratic programming
+                problem using the sequential minimal optimization algorithm:  
+
+                  Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha + trans(p)*alpha
+                  subject to the following constraints:
+                    - for all i such that y(i) == +1:  0 <= alpha(i) <= Cp 
+                    - for all i such that y(i) == -1:  0 <= alpha(i) <= Cn 
+                    - trans(y)*alpha == B 
+
+                  Where all elements of y must be equal to +1 or -1 and f is convex.  
+                  This means that Q should be symmetric and positive-semidefinite.
+                
+                
+                This object implements the strategy used by the LIBSVM tool.  The following papers
+                can be consulted for additional details:
+                    - Chih-Chung Chang and Chih-Jen Lin, LIBSVM : a library for support vector 
+                      machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm
+                    - Working Set Selection Using Second Order Information for Training Support Vector Machines by
+                      Fan, Chen, and Lin.  In the Journal of Machine Learning Research 2005.
+        !*/
+
+    public:
+
+        typedef typename matrix_type::mem_manager_type mem_manager_type;
+        typedef typename matrix_type::type scalar_type;
+        typedef typename matrix_type::layout_type layout_type;
+        typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix;
+        typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix;
+
+        template <
+            typename EXP1,
+            typename EXP2,
+            typename EXP3,
+            long NR
+            >
+        unsigned long operator() ( 
+            const matrix_exp<EXP1>& Q,
+            const matrix_exp<EXP2>& p,
+            const matrix_exp<EXP3>& y,
+            const scalar_type B,
+            const scalar_type Cp,
+            const scalar_type Cn,
+            matrix<scalar_type,NR,1,mem_manager_type, layout_type>& alpha,
+            scalar_type eps
+        );
+        /*!
+            requires
+                - Q.nr() == Q.nc()
+                - is_col_vector(y) == true
+                - is_col_vector(p) == true
+                - p.size() == y.size() == Q.nr()
+                - y.size() > 0
+                - sum((y == +1) + (y == -1)) == y.size()
+                  (i.e. all elements of y must be equal to +1 or -1)
+                - alpha must be capable of representing a vector of size y.size() elements
+                - Cp > 0
+                - Cn > 0
+                - eps > 0
+            ensures
+                - This function solves the quadratic program defined in this class's main comment.
+                - The solution to the quadratic program will be stored in #alpha.
+                - #alpha.size() == y.size()
+                - This function uses an implementation of the sequential minimal optimization 
+                  algorithm.  It runs until the KKT violation is less than eps.  So eps controls 
+                  how accurate the solution is and smaller values result in better solutions.
+                  (a reasonable eps is usually about 1e-3)
+                - #get_gradient() == Q*(#alpha)
+                  (i.e. stores the gradient of f() at #alpha in get_gradient())
+                - returns the number of iterations performed.  
+            throws
+                - invalid_qp3_error
+                  This exception is thrown if the given parameters cause the constraints
+                  of the quadratic programming problem to be impossible to satisfy. 
+        !*/
+
+        const column_matrix& get_gradient (
+        ) const;
+        /*!
+            ensures
+                - returns the gradient vector at the solution of the last problem solved
+                  by this object.  If no problem has been solved then returns an empty
+                  vector.
+        !*/
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATION_SOLVE_QP3_USING_SMO_ABSTRACT_H_
+
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo.h b/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo.h
new file mode 100644
index 000000000..b9cc74df3
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo.h
@@ -0,0 +1,937 @@
+// Copyright (C) 2010  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_Hh_
+#define DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_Hh_
+
+#include "optimization_solve_qp_using_smo_abstract.h"
+#include "../matrix.h"
+#include <map>
+#include "../unordered_pair.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    /*
+        The algorithm defined in the solve_qp_using_smo() function below can be
+        derived by using an important theorem from the theory of constrained optimization.
+        This theorem tells us that any optimal point of a constrained function must
+        satisfy what are called the KKT conditions (also sometimes called just the KT 
+        conditions, especially in older literature).  A very good book to consult 
+        regarding this topic is Practical Methods of Optimization (second edition) by 
+        R. Fletcher.  Below I will try to explain the general idea of how this is 
+        applied.
+
+        Let e == ones_matrix(alpha.size(),1)
+
+        First, note that the function below solves the following quadratic program.  
+            Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha - trans(alpha)*b
+            subject to the following constraints:
+                - trans(e)*alpha == C (i.e. the sum of alpha values doesn't change)
+                - min(alpha) >= 0 (i.e. all alpha values are nonnegative)
+            Where f is convex.  This means that Q should be positive-semidefinite.
+
+
+        To get from this problem formulation to the algorithm below we have to 
+        consider the KKT conditions.  They tell us that any solution to the above
+        problem must satisfy the following 5 conditions:
+            1. trans(e)*alpha == C
+            2. min(alpha) >= 0
+
+            3. Let L(alpha, x, y) == f(alpha) - trans(x)*alpha - y*(trans(e)*alpha - C)
+               Where x is a vector of length alpha.size() and y is a single scalar.
+               Then the derivative of L with respect to alpha must == 0
+               So we get the following as our 3rd condition:
+               f'(alpha) - x - y*e == 0
+
+            4. min(x) >= 0 (i.e. all x values are nonnegative)
+            5. pointwise_multiply(x, alpha) == 0
+               (i.e. only one member of each x(i) and alpha(i) pair can be non-zero)
+        
+        
+        From 3 we can easily obtain this rule:
+            for all i: f'(alpha)(i) - x(i) == y
+
+        If we then consider 4 and 5 we see that we can infer that the following
+        must also be the case:
+            - if (alpha(i) > 0) then
+                - x(i) == 0
+                - f'(alpha)(i) == y
+            - else
+                - x(i) == some nonnegative number
+                - f'(alpha)(i) >= y
+
+        
+        The important thing to take away is the final rule.  It tells us that at the
+        optimal solution all elements of the gradient of f have the same value if 
+        their corresponding alpha is non-zero.  It also tells us that all the other
+        gradient values are bigger than y.  We can use this information to help us
+        pick which alpha variables to optimize at each iteration. 
+    */
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename EXP1,
+        typename EXP2,
+        typename T, long NR, long NC, typename MM, typename L
+        >
+    unsigned long solve_qp_using_smo ( 
+        const matrix_exp<EXP1>& Q,
+        const matrix_exp<EXP2>& b,
+        matrix<T,NR,NC,MM,L>& alpha,
+        T eps,
+        unsigned long max_iter
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT(Q.nr() == Q.nc() &&
+                     is_col_vector(b) &&
+                     is_col_vector(alpha) &&
+                     b.size() == alpha.size() &&
+                     b.size() == Q.nr() &&
+                     alpha.size() > 0 &&
+                     min(alpha) >= 0 &&
+                     eps > 0 &&
+                     max_iter > 0,
+                     "\t unsigned long solve_qp_using_smo()"
+                     << "\n\t Invalid arguments were given to this function"
+                     << "\n\t Q.nr():               " << Q.nr()
+                     << "\n\t Q.nc():               " << Q.nc()
+                     << "\n\t is_col_vector(b):     " << is_col_vector(b)
+                     << "\n\t is_col_vector(alpha): " << is_col_vector(alpha)
+                     << "\n\t b.size():             " << b.size() 
+                     << "\n\t alpha.size():         " << alpha.size() 
+                     << "\n\t Q.nr():               " << Q.nr() 
+                     << "\n\t min(alpha):           " << min(alpha) 
+                     << "\n\t eps:                  " << eps 
+                     << "\n\t max_iter:             " << max_iter 
+        );
+
+        const T C = sum(alpha);
+
+        // Compute f'(alpha) (i.e. the gradient of f(alpha)) for the current alpha.  
+        matrix<T,NR,NC,MM,L> df = Q*alpha - b;
+
+        const T tau = 1000*std::numeric_limits<T>::epsilon();
+
+        T big, little;
+        unsigned long iter = 0;
+        for (; iter < max_iter; ++iter)
+        {
+            // Find the two elements of df that satisfy the following:
+            //    - little_idx == index_of_min(df)
+            //    - big_idx   == the index of the largest element in df such that alpha(big_idx) > 0
+            // These two indices will tell us which two alpha values are most in violation of the KKT 
+            // optimality conditions.  
+            big = -std::numeric_limits<T>::max();
+            long big_idx = 0;
+            little = std::numeric_limits<T>::max();
+            long little_idx = 0;
+            for (long i = 0; i < df.nr(); ++i)
+            {
+                if (df(i) > big && alpha(i) > 0)
+                {
+                    big = df(i);
+                    big_idx = i;
+                }
+                if (df(i) < little)
+                {
+                    little = df(i);
+                    little_idx = i;
+                }
+            }
+
+            // Check if the KKT conditions are still violated and stop if so.  
+            //if (alpha(little_idx) > 0 && (big - little) < eps)
+            //    break;
+
+            // Check how big the duality gap is and stop when it goes below eps.  
+            // The duality gap is the gap between the objective value of the function
+            // we are optimizing and the value of its primal form.  This value is always 
+            // greater than or equal to the distance to the optimum solution so it is a 
+            // good way to decide if we should stop.   See the book referenced above for 
+            // more information.  In particular, see the part about the Wolfe Dual.
+            if (trans(alpha)*df - C*little < eps)
+                break;
+
+
+            // Save these values, we will need them later.
+            const T old_alpha_big = alpha(big_idx);
+            const T old_alpha_little = alpha(little_idx);
+
+
+            // Now optimize the two variables we just picked. 
+            T quad_coef = Q(big_idx,big_idx) + Q(little_idx,little_idx) - 2*Q(big_idx, little_idx);
+            if (quad_coef <= tau)
+                quad_coef = tau;
+            const T delta = (big - little)/quad_coef;
+            alpha(big_idx) -= delta;
+            alpha(little_idx) += delta;
+
+            // Make sure alpha stays feasible.  That is, make sure the updated alpha doesn't
+            // violate the non-negativity constraint.  
+            if (alpha(big_idx) < 0)
+            {
+                // Since an alpha can't be negative we will just set it to 0 and shift all the
+                // weight to the other alpha.
+                alpha(big_idx) = 0;
+                alpha(little_idx) = old_alpha_big + old_alpha_little;
+            }
+
+            // Every 300 iterations
+            if ((iter%300) == 299)
+            {
+                // Perform this form of the update every so often because doing so can help
+                // avoid the buildup of numerical errors you get with the alternate update
+                // below.
+                df = Q*alpha - b;
+            }
+            else
+            {
+                // Now update the gradient. We will perform the equivalent of: df = Q*alpha - b;
+                const T delta_alpha_big   = alpha(big_idx) - old_alpha_big;
+                const T delta_alpha_little = alpha(little_idx) - old_alpha_little;
+
+                for(long k = 0; k < df.nr(); ++k)
+                    df(k) += Q(big_idx,k)*delta_alpha_big + Q(little_idx,k)*delta_alpha_little;;
+            }
+        }
+
+        /*
+        using namespace std;
+        cout << "SMO: " << endl;
+        cout << "   duality gap: "<< trans(alpha)*df - C*min(df) << endl;
+        cout << "   KKT gap:     "<< big-little << endl;
+        cout << "   iter:        "<< iter+1 << endl;
+        cout << "   eps:         "<< eps << endl;
+        */
+
+        return iter+1;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename EXP1,
+        typename EXP2,
+        typename EXP3,
+        typename EXP4,
+        typename T, long NR, long NC, typename MM, typename L,
+        long NR2, long NC2
+        >
+    unsigned long solve_qp4_using_smo ( 
+        const matrix_exp<EXP1>& A,
+        const matrix_exp<EXP2>& Q,
+        const matrix_exp<EXP3>& b,
+        const matrix_exp<EXP4>& d,
+        matrix<T,NR,NC,MM,L>& alpha,
+        matrix<T,NR2,NC2,MM,L>& lambda,
+        T eps,
+        unsigned long max_iter,
+        T max_lambda = std::numeric_limits<T>::infinity()
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT(A.nc() == alpha.size() &&
+                     Q.nr() == Q.nc() &&
+                     is_col_vector(b) &&
+                     is_col_vector(alpha) &&
+                     b.size() == alpha.size() &&
+                     b.size() == Q.nr() &&
+                     alpha.size() > 0 &&
+                     min(alpha) >= 0 &&
+                     eps > 0 &&
+                     max_iter > 0,
+                     "\t void solve_qp4_using_smo()"
+                     << "\n\t Invalid arguments were given to this function"
+                     << "\n\t A.nc():               " << A.nc()
+                     << "\n\t Q.nr():               " << Q.nr()
+                     << "\n\t Q.nc():               " << Q.nc()
+                     << "\n\t is_col_vector(b):     " << is_col_vector(b)
+                     << "\n\t is_col_vector(alpha): " << is_col_vector(alpha)
+                     << "\n\t b.size():             " << b.size() 
+                     << "\n\t alpha.size():         " << alpha.size() 
+                     << "\n\t Q.nr():               " << Q.nr() 
+                     << "\n\t min(alpha):           " << min(alpha) 
+                     << "\n\t eps:                  " << eps 
+                     << "\n\t max_iter:             " << max_iter 
+        );
+        DLIB_ASSERT(is_col_vector(d) == true &&
+                     max_lambda >= 0 &&
+                     d.size() == A.nr(),
+                     "\t void solve_qp4_using_smo()"
+                     << "\n\t Invalid arguments were given to this function"
+                     << "\n\t A.nr():     " << A.nr()
+                     << "\n\t d.size():   " << d.size()
+                     << "\n\t max_lambda: " << max_lambda
+        );
+
+        const T C = sum(alpha);
+
+        /*
+            For this optimization problem, it is the case that the optimal
+            value of lambda is given by a simple closed form expression if we
+            know the optimal alpha.  So what we will do is to just optimize 
+            alpha and every now and then we will update lambda with its optimal
+            value.  Therefore, we use essentially the same method as the
+            solve_qp_using_smo() routine.  
+        */
+
+        const bool d_is_zero = d==zeros_matrix(d);
+
+        // compute optimal lambda for current alpha
+        if (d_is_zero)
+            lambda = A*alpha;
+        else
+            lambda = A*alpha + d;
+        lambda = dlib::clamp(lambda, 0, max_lambda);
+
+        // Compute f'(alpha) (i.e. the gradient of f(alpha) with respect to alpha) for the current alpha.  
+        matrix<T,NR,NC,MM,L> df = Q*alpha - b - trans(A)*lambda;
+
+        const T tau = 1000*std::numeric_limits<T>::epsilon();
+
+        T big, little;
+        unsigned long iter = 0;
+        for (; iter < max_iter; ++iter)
+        {
+            // Find the two elements of df that satisfy the following:
+            //    - little_idx == index_of_min(df)
+            //    - big_idx   == the index of the largest element in df such that alpha(big_idx) > 0
+            // These two indices will tell us which two alpha values are most in violation of the KKT 
+            // optimality conditions.  
+            big = -std::numeric_limits<T>::max();
+            long big_idx = 0;
+            little = std::numeric_limits<T>::max();
+            long little_idx = 0;
+            for (long i = 0; i < df.nr(); ++i)
+            {
+                if (df(i) > big && alpha(i) > 0)
+                {
+                    big = df(i);
+                    big_idx = i;
+                }
+                if (df(i) < little)
+                {
+                    little = df(i);
+                    little_idx = i;
+                }
+            }
+
+            // Check how big the duality gap is and stop when it goes below eps.  
+            // The duality gap is the gap between the objective value of the function
+            // we are optimizing and the value of its primal form.  This value is always 
+            // greater than or equal to the distance to the optimum solution so it is a 
+            // good way to decide if we should stop.   
+            if (trans(alpha)*df - C*little < eps)
+            {
+                // compute optimal lambda and recheck the duality gap to make
+                // sure we have really converged.
+                if (d_is_zero)
+                    lambda = A*alpha;
+                else
+                    lambda = A*alpha + d;
+                lambda = dlib::clamp(lambda, 0, max_lambda);
+                df = Q*alpha - b - trans(A)*lambda;
+
+                if (trans(alpha)*df - C*min(df) < eps)
+                    break;
+                else
+                    continue;
+            }
+
+
+            // Save these values, we will need them later.
+            const T old_alpha_big = alpha(big_idx);
+            const T old_alpha_little = alpha(little_idx);
+
+
+            // Now optimize the two variables we just picked. 
+            T quad_coef = Q(big_idx,big_idx) + Q(little_idx,little_idx) - 2*Q(big_idx, little_idx);
+            if (quad_coef <= tau)
+                quad_coef = tau;
+            const T delta = (big - little)/quad_coef;
+            alpha(big_idx) -= delta;
+            alpha(little_idx) += delta;
+
+            // Make sure alpha stays feasible.  That is, make sure the updated alpha doesn't
+            // violate the non-negativity constraint.  
+            if (alpha(big_idx) < 0)
+            {
+                // Since an alpha can't be negative we will just set it to 0 and shift all the
+                // weight to the other alpha.
+                alpha(big_idx) = 0;
+                alpha(little_idx) = old_alpha_big + old_alpha_little;
+            }
+
+
+            // Every 300 iterations
+            if ((iter%300) == 299)
+            {
+                // compute the optimal lambda for the current alpha
+                if (d_is_zero)
+                    lambda = A*alpha;
+                else
+                    lambda = A*alpha + d;
+                lambda = dlib::clamp(lambda, 0, max_lambda);
+
+                // Perform this form of the update every so often because doing so can help
+                // avoid the buildup of numerical errors you get with the alternate update
+                // below.
+                df = Q*alpha - b - trans(A)*lambda;
+            }
+            else
+            {
+                // Now update the gradient. We will perform the equivalent of: df = Q*alpha - b;
+                const T delta_alpha_big   = alpha(big_idx) - old_alpha_big;
+                const T delta_alpha_little = alpha(little_idx) - old_alpha_little;
+
+                for(long k = 0; k < df.nr(); ++k)
+                    df(k) += Q(big_idx,k)*delta_alpha_big + Q(little_idx,k)*delta_alpha_little;;
+            }
+        }
+
+        /*
+        using namespace std;
+        cout << "SMO: " << endl;
+        cout << "   duality gap: "<< trans(alpha)*df - C*min(df) << endl;
+        cout << "   KKT gap:     "<< big-little << endl;
+        cout << "   iter:        "<< iter+1 << endl;
+        cout << "   eps:         "<< eps << endl;
+        */
+
+
+        return iter+1;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename EXP1,
+        typename EXP2,
+        typename T, long NR, long NC, typename MM, typename L
+        >
+    unsigned long solve_qp_box_constrained ( 
+        const matrix_exp<EXP1>& Q,
+        const matrix_exp<EXP2>& b,
+        matrix<T,NR,NC,MM,L>& alpha,
+        const matrix<T,NR,NC,MM,L>& lower,
+        const matrix<T,NR,NC,MM,L>& upper,
+        T eps,
+        unsigned long max_iter
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT(Q.nr() == Q.nc() &&
+                     alpha.size() == lower.size() &&
+                     alpha.size() == upper.size() &&
+                     is_col_vector(b) &&
+                     is_col_vector(alpha) &&
+                     is_col_vector(lower) &&
+                     is_col_vector(upper) &&
+                     b.size() == alpha.size() &&
+                     b.size() == Q.nr() &&
+                     alpha.size() > 0 &&
+                     0 <= min(alpha-lower) &&
+                     0 <= max(upper-alpha) &&
+                     eps > 0 &&
+                     max_iter > 0,
+                     "\t unsigned long solve_qp_box_constrained()"
+                     << "\n\t Invalid arguments were given to this function"
+                     << "\n\t Q.nr():               " << Q.nr()
+                     << "\n\t Q.nc():               " << Q.nc()
+                     << "\n\t is_col_vector(b):     " << is_col_vector(b)
+                     << "\n\t is_col_vector(alpha): " << is_col_vector(alpha)
+                     << "\n\t is_col_vector(lower): " << is_col_vector(lower)
+                     << "\n\t is_col_vector(upper): " << is_col_vector(upper)
+                     << "\n\t b.size():             " << b.size() 
+                     << "\n\t alpha.size():         " << alpha.size() 
+                     << "\n\t lower.size():         " << lower.size() 
+                     << "\n\t upper.size():         " << upper.size() 
+                     << "\n\t Q.nr():               " << Q.nr() 
+                     << "\n\t min(alpha-lower):     " << min(alpha-lower) 
+                     << "\n\t max(upper-alpha):     " << max(upper-alpha) 
+                     << "\n\t eps:                  " << eps 
+                     << "\n\t max_iter:             " << max_iter 
+        );
+
+
+        // Compute f'(alpha) (i.e. the gradient of f(alpha)) for the current alpha.  
+        matrix<T,NR,NC,MM,L> df = Q*alpha + b;
+        matrix<T,NR,NC,MM,L> QQ = reciprocal_max(diag(Q));
+
+        // First we use a coordinate descent method to initialize alpha. 
+        double max_df = 0;
+        for (long iter = 0; iter < alpha.size()*2; ++iter)
+        {
+            max_df = 0;
+            long best_r =0;
+            // find the best alpha to optimize.
+            for (long r = 0; r < Q.nr(); ++r)
+            {
+                if (alpha(r) <= lower(r) && df(r) > 0)
+                    ;//alpha(r) = lower(r);
+                else if (alpha(r) >= upper(r) && df(r) < 0)
+                    ;//alpha(r) = upper(r);
+                else if (std::abs(df(r)) > max_df)
+                {
+                    best_r = r;
+                    max_df = std::abs(df(r));
+                }
+            }
+
+            // now optimize alpha(best_r)
+            const long r = best_r;
+            const T old_alpha = alpha(r);
+            alpha(r) = -(df(r)-Q(r,r)*alpha(r))*QQ(r);
+            if (alpha(r) < lower(r))
+                alpha(r) = lower(r);
+            else if (alpha(r) > upper(r))
+                alpha(r) = upper(r);
+
+            const T delta = old_alpha-alpha(r);
+
+            // Now update the gradient. We will perform the equivalent of: df = Q*alpha + b;
+            for(long k = 0; k < df.nr(); ++k)
+                df(k) -= Q(r,k)*delta;
+        }
+        //cout << "max_df: " << max_df << endl;
+        //cout << "objective value: " << 0.5*trans(alpha)*Q*alpha + trans(b)*alpha << endl;
+
+
+
+        // Now do the main iteration block of this solver.  The coordinate descent method
+        // we used above can improve the objective rapidly in the beginning.  However,
+        // Nesterov's method has more rapid convergence once it gets going so this is what
+        // we use for the main iteration.
+        matrix<T,NR,NC,MM,L> v, v_old; 
+        v = alpha;
+        // We need to get an upper bound on the Lipschitz constant for this QP. Since that
+        // is just the max eigenvalue of Q we can do it using Gershgorin disks.
+        const T lipschitz_bound = max(diag(Q) + (sum_cols(abs(Q)) - abs(diag(Q))));
+        double lambda = 0;
+        unsigned long iter;
+        for (iter = 0; iter < max_iter; ++iter)
+        {
+            const double next_lambda = (1 + std::sqrt(1+4*lambda*lambda))/2;
+            const double gamma = (1-lambda)/next_lambda;
+            lambda = next_lambda;
+
+            v_old = v;
+
+            df = Q*alpha + b;
+            // now take a projected gradient step using Nesterov's method.
+            v = clamp(alpha - 1.0/lipschitz_bound * df, lower, upper);
+            alpha = dlib::clamp((1-gamma)*v + gamma*v_old, lower, upper);
+
+
+            // check for convergence every 10 iterations
+            if (iter%10 == 0)
+            {
+                max_df = 0;
+                for (long r = 0; r < Q.nr(); ++r)
+                {
+                    if (alpha(r) <= lower(r) && df(r) > 0)
+                        ;//alpha(r) = lower(r);
+                    else if (alpha(r) >= upper(r) && df(r) < 0)
+                        ;//alpha(r) = upper(r);
+                    else if (std::abs(df(r)) > max_df)
+                        max_df = std::abs(df(r));
+                }
+                if (max_df < eps)
+                    break;
+            }
+        }
+
+        //cout << "max_df: " << max_df << endl;
+        //cout << "objective value: " << 0.5*trans(alpha)*Q*alpha + trans(b)*alpha << endl;
+        return iter+1;
+    }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    namespace impl
+    {
+        // Check if each vector in Q_offdiag is actually a constant times the 1s vector.
+        template <
+            typename T, long NR, long NC, typename MM, typename L
+            >
+        bool has_uniform_offdiag_vectors(
+            const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag
+        )
+        {
+            for (auto& x : Q_offdiag)
+            {
+                auto ref = x.second(0);
+                for (auto& y : x.second)
+                    if (ref != y)
+                        return false;
+            }
+            return true;
+        }
+
+        template <
+            typename T, long NR, long NC, typename MM, typename L
+            >
+        matrix<T,0,0,MM,L> compact_offdiag(
+            const size_t& num_blocks,
+            const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag
+        )
+        {
+            matrix<T,0,0,MM,L> temp;
+            // we can only compact the offdiag information if they are uniform vectors
+            if (!has_uniform_offdiag_vectors(Q_offdiag))
+                return temp;
+
+            temp.set_size(num_blocks, num_blocks);
+            temp = 0;
+
+            for (auto& x : Q_offdiag)
+            {
+                long r = x.first.first;
+                long c = x.first.second;
+                temp(r,c) = x.second(0);
+                temp(c,r) = x.second(0);
+            }
+
+            return temp;
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename T, long NR, long NC, typename MM, typename L
+        >
+    unsigned long solve_qp_box_constrained_blockdiag ( 
+        const std::vector<matrix<T,NR,NR,MM,L>>& Q_blocks,
+        const std::vector<matrix<T,NR,NC,MM,L>>& bs,
+        const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag,
+        std::vector<matrix<T,NR,NC,MM,L>>& alphas,
+        const std::vector<matrix<T,NR,NC,MM,L>>& lowers,
+        const std::vector<matrix<T,NR,NC,MM,L>>& uppers,
+        T eps,
+        unsigned long max_iter
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_CASSERT(Q_blocks.size() > 0);
+        DLIB_CASSERT(Q_blocks.size() == bs.size() && 
+                     Q_blocks.size() == alphas.size() &&
+                     Q_blocks.size() == lowers.size() &&
+                     Q_blocks.size() == uppers.size(),
+                   "Q_blocks.size():  "<< Q_blocks.size() << "\n" <<
+                   "bs.size():        "<< bs.size() << "\n" <<
+                   "alphas.size():    "<< alphas.size() << "\n" <<
+                   "lowers.size():    "<< lowers.size() << "\n" <<
+                   "uppers.size():    "<< uppers.size() << "\n"
+                   );
+        for (auto& Q : Q_blocks)
+        {
+            DLIB_CASSERT(Q.nr() == Q.nc(), "All the matrices in Q_blocks have the same dimensions.");
+            DLIB_CASSERT(Q.size() > 0, "All the matrices in Q_blocks must be non-empty and have the same dimensions.");
+            DLIB_CASSERT(Q.nr() == Q_blocks[0].nr() && Q.nc() == Q_blocks[0].nc(), "All the matrices in Q_blocks have the same dimensions.");
+        }
+#ifdef ENABLE_ASSERTS
+        for (size_t i = 0; i < alphas.size(); ++i)
+        {
+            DLIB_CASSERT(is_col_vector(bs[i]) && bs[i].size() == Q_blocks[0].nr(),
+                "is_col_vector(bs["<<i<<"]): " << is_col_vector(bs[i]) << "\n" <<
+                "bs["<<i<<"].size():         " << bs[i].size() << "\n" <<
+                "Q_blocks[0].nr():           " << Q_blocks[0].nr());
+
+            for (auto& Qoffdiag : Q_offdiag)
+            {
+                auto& Q_offdiag_element = Qoffdiag.second;
+                long r = Qoffdiag.first.first;
+                long c = Qoffdiag.first.second;
+                DLIB_CASSERT(is_col_vector(Q_offdiag_element) && Q_offdiag_element.size() == Q_blocks[0].nr(),
+                    "is_col_vector(Q_offdiag["<<r<<","<<c<<"]): " << is_col_vector(Q_offdiag_element) << "\n" <<
+                    "Q_offdiag["<<r<<","<<c<<"].size():         " << Q_offdiag_element.size() << "\n" <<
+                    "Q_blocks[0].nr():                  " << Q_blocks[0].nr());
+            }
+
+            DLIB_CASSERT(is_col_vector(alphas[i]) && alphas[i].size() == Q_blocks[0].nr(),
+                "is_col_vector(alphas["<<i<<"]): " << is_col_vector(alphas[i]) << "\n" <<
+                "alphas["<<i<<"].size():         " << alphas[i].size() << "\n" <<
+                "Q_blocks[0].nr():               " << Q_blocks[0].nr());
+
+            DLIB_CASSERT(is_col_vector(lowers[i]) && lowers[i].size() == Q_blocks[0].nr(),
+                "is_col_vector(lowers["<<i<<"]): " << is_col_vector(lowers[i]) << "\n" <<
+                "lowers["<<i<<"].size():         " << lowers[i].size() << "\n" <<
+                "Q_blocks[0].nr():               " << Q_blocks[0].nr());
+
+            DLIB_CASSERT(is_col_vector(uppers[i]) && uppers[i].size() == Q_blocks[0].nr(),
+                "is_col_vector(uppers["<<i<<"]): " << is_col_vector(uppers[i]) << "\n" <<
+                "uppers["<<i<<"].size():         " << uppers[i].size() << "\n" <<
+                "Q_blocks[0].nr():               " << Q_blocks[0].nr());
+
+            DLIB_CASSERT(0 <= min(alphas[i]-lowers[i]), "min(alphas["<<i<<"]-lowers["<<i<<"]): " << min(alphas[i]-lowers[i]));
+            DLIB_CASSERT(0 <= max(uppers[i]-alphas[i]), "max(uppers["<<i<<"]-alphas["<<i<<"]): " << max(uppers[i]-alphas[i]));
+        }
+        DLIB_CASSERT(eps > 0 && max_iter > 0, "eps: " << eps << "\nmax_iter: "<< max_iter);
+#endif // ENABLE_ASSERTS
+
+
+        const auto offdiag_compact = impl::compact_offdiag(Q_blocks.size(), Q_offdiag);
+        matrix<T,0,0,MM,L> temp, alphas_compact;
+
+        // Compute f'(alpha) (i.e. the gradient of f(alpha)) for the current alpha.  
+        std::vector<matrix<T,NR,NC,MM,L>> df;// = Q*alpha + b;
+        auto compute_df = [&]()
+        {
+            df.resize(Q_blocks.size());
+            for (size_t i = 0; i < df.size(); ++i)
+                df[i] = Q_blocks[i]*alphas[i] + bs[i];
+
+
+            // Don't forget to include the Q_offdiag terms in the computation.  Note that
+            // we have two options for how we can compute this part.  If Q_offdiag is
+            // uniform and can be compacted into a simple matrix and there are a lot of off
+            // diagonal entries then it's faster to do it as a matrix multiply.  Otherwise
+            // we do the more general computation.
+            if (offdiag_compact.size() != 0 && Q_offdiag.size() > Q_blocks.size()*5)
+            {
+                // Do it as a matrix multiply (with a bit of data shuffling)
+                alphas_compact.set_size(alphas[0].size(), offdiag_compact.nr());
+                for (long c = 0; c < alphas_compact.nc(); ++c)
+                    set_colm(alphas_compact,c) = alphas[c];
+                temp = alphas_compact*offdiag_compact;
+                for (size_t i = 0; i < df.size(); ++i)
+                    df[i] += colm(temp,i);
+            }
+            else
+            {
+                // Do the fully general computation that allows for non-uniform values in
+                // the off diagonal vectors.
+                for (auto& p : Q_offdiag)
+                {
+                    long r = p.first.first;
+                    long c = p.first.second;
+                    df[r] += pointwise_multiply(p.second, alphas[c]);
+                    if (r != c)
+                        df[c] += pointwise_multiply(p.second, alphas[r]);
+                }
+            }
+        };
+        compute_df();
+
+
+
+        std::vector<matrix<T,NR,NC,MM,L>> Q_diag, Q_ggd;
+        std::vector<matrix<T,NR,NC,MM,L>> QQ;// = reciprocal_max(diag(Q));
+        QQ.resize(Q_blocks.size());
+        Q_diag.resize(Q_blocks.size());
+        Q_ggd.resize(Q_blocks.size());
+
+        // We need to get an upper bound on the Lipschitz constant for this QP. Since that
+        // is just the max eigenvalue of Q we can do it using Gershgorin disks.
+        //const T lipschitz_bound = max(diag(Q) + (sum_cols(abs(Q)) - abs(diag(Q))));
+        for (size_t i = 0; i < QQ.size(); ++i)
+        {
+            auto f = Q_offdiag.find(make_unordered_pair(i,i));
+            if (f != Q_offdiag.end())
+                Q_diag[i] = diag(Q_blocks[i]) + f->second;
+            else
+                Q_diag[i] = diag(Q_blocks[i]);
+            QQ[i] = reciprocal_max(Q_diag[i]);
+
+            Q_ggd[i] = Q_diag[i] + (sum_cols(abs(Q_blocks[i]))-abs(diag(Q_blocks[i])));
+        }
+        for (auto& p : Q_offdiag)
+        {
+            long r = p.first.first;
+            long c = p.first.second;
+            if (r != c)
+            {
+                Q_ggd[r] += abs(p.second);
+                Q_ggd[c] += abs(p.second);
+            }
+        }
+        T lipschitz_bound = -std::numeric_limits<T>::infinity();
+        for (auto& x : Q_ggd)
+            lipschitz_bound = std::max(lipschitz_bound, max(x));
+
+
+        const long num_variables = alphas.size()*alphas[0].size();
+
+        // First we use a coordinate descent method to initialize alpha. 
+        double max_df = 0;
+        for (long iter = 0; iter < num_variables*2; ++iter)
+        {
+            max_df = 0;
+            long best_r =0;
+            size_t best_r2 =0;
+            // find the best alpha to optimize.
+            for (size_t r2 = 0; r2 < alphas.size(); ++r2) 
+            {
+                auto& alpha = alphas[r2];
+                auto& df_ = df[r2];
+                auto& lower = lowers[r2];
+                auto& upper = uppers[r2];
+                for (long r = 0; r < alpha.nr(); ++r)
+                {
+                    if (alpha(r) <= lower(r) && df_(r) > 0)
+                        ;//alpha(r) = lower(r);
+                    else if (alpha(r) >= upper(r) && df_(r) < 0)
+                        ;//alpha(r) = upper(r);
+                    else if (std::abs(df_(r)) > max_df)
+                    {
+                        best_r = r;
+                        best_r2 = r2;
+                        max_df = std::abs(df_(r));
+                    }
+                }
+            }
+
+            // now optimize alphas[best_r2](best_r)
+            const long r = best_r;
+            auto& alpha = alphas[best_r2];
+            auto& lower = lowers[best_r2];
+            auto& upper = uppers[best_r2];
+            auto& df_ = df[best_r2];
+            const T old_alpha = alpha(r);
+            alpha(r) = -(df_(r)-Q_diag[best_r2](r)*alpha(r))*QQ[best_r2](r);
+            if (alpha(r) < lower(r))
+                alpha(r) = lower(r);
+            else if (alpha(r) > upper(r))
+                alpha(r) = upper(r);
+
+            const T delta = old_alpha-alpha(r);
+
+            // Now update the gradient. We will perform the equivalent of: df = Q*alpha +
+            // b; except we only need to compute one column of the matrix multiply because
+            // only one element of alpha changed.
+            auto& Q = Q_blocks[best_r2];
+            for(long k = 0; k < df_.nr(); ++k)
+                df_(k) -= Q(r,k)*delta;
+            for(size_t j = 0; j < Q_blocks.size(); ++j)
+            {
+                auto f = Q_offdiag.find(make_unordered_pair(best_r2, j));
+                if (f != Q_offdiag.end())
+                    df[j](r) -= f->second(r)*delta;
+            }
+        }
+
+
+
+
+        std::vector<matrix<T,NR,NC,MM,L>> v(alphas), v_old(alphas.size());
+        double lambda = 0;
+        unsigned long iter;
+        // Now do the main iteration block of this solver.  The coordinate descent method
+        // we used above can improve the objective rapidly in the beginning.  However,
+        // Nesterov's method has more rapid convergence once it gets going so this is what
+        // we use for the main iteration.
+        for (iter = 0; iter < max_iter; ++iter)
+        {
+            const double next_lambda = (1 + std::sqrt(1+4*lambda*lambda))/2;
+            const double gamma = (1-lambda)/next_lambda;
+            lambda = next_lambda;
+
+            v_old.swap(v);
+
+            //df = Q*alpha + b;
+            compute_df();
+
+            // now take a projected gradient step using Nesterov's method.
+            for (size_t j = 0; j < alphas.size(); ++j)
+            {
+                v[j] = clamp(alphas[j] - 1.0/lipschitz_bound * df[j], lowers[j], uppers[j]);
+                alphas[j] = clamp((1-gamma)*v[j] + gamma*v_old[j], lowers[j], uppers[j]);
+            }
+
+
+            // check for convergence every 10 iterations
+            if (iter%10 == 0)
+            {
+                max_df = 0;
+                for (size_t r2 = 0; r2 < alphas.size(); ++r2) 
+                {
+                    auto& alpha = alphas[r2];
+                    auto& df_ = df[r2];
+                    auto& lower = lowers[r2];
+                    auto& upper = uppers[r2];
+                    for (long r = 0; r < alpha.nr(); ++r)
+                    {
+                        if (alpha(r) <= lower(r) && df_(r) > 0)
+                            ;//alpha(r) = lower(r);
+                        else if (alpha(r) >= upper(r) && df_(r) < 0)
+                            ;//alpha(r) = upper(r);
+                        else if (std::abs(df_(r)) > max_df)
+                            max_df = std::abs(df_(r));
+                    }
+                }
+                if (max_df < eps)
+                    break;
+            }
+        }
+
+        return iter+1;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename EXP1,
+        typename EXP2,
+        typename T, long NRa, long NRb
+        >
+    unsigned long find_gap_between_convex_hulls (
+        const matrix_exp<EXP1>& A,
+        const matrix_exp<EXP2>& B,
+        matrix<T,NRa,1>& cA,
+        matrix<T,NRb,1>& cB,
+        const double eps,
+        const unsigned long max_iter = 1000
+    )
+    {
+        DLIB_CASSERT(A.size() != 0);
+        DLIB_CASSERT(B.size() != 0);
+        DLIB_CASSERT(A.nr() == B.nr(), "The dimensionality of the points in both convex hull sets must match");
+        DLIB_CASSERT(eps > 0);
+        DLIB_CASSERT(max_iter > 0);
+
+        cA.set_size(A.nc());
+        cB.set_size(B.nc());
+
+        // initialize to the centroids of A and B respectively.
+        cA = 1.0/cA.size();
+        cB = 1.0/cB.size();
+
+
+        matrix<T> AA, BB, AB, ABb, ABa;
+
+        AA = trans(A)*A;
+        BB = trans(B)*B;
+        AB = trans(A)*B;
+
+        unsigned long iter = 0;
+        for (iter = 0; iter < max_iter; ++iter)
+        {
+            // find the convex combination of A that is nearest to B*cB
+            ABb = AB*cB;
+            const auto smo_iter1 = solve_qp_using_smo(AA, ABb, cA, eps, cA.size());
+
+            // now find the convex combination of B that is nearest to A*cA
+            ABa = trans(AB)*cA;
+            const auto smo_iter2 = solve_qp_using_smo(BB, ABa, cB, eps, cB.size());
+
+            // stop if the QP solvers failed to improve 
+            if (smo_iter1 == 1 && smo_iter2 == 1)
+                break;
+        }
+
+
+        return iter+1;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_Hh_
+
diff --git a/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo_abstract.h b/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo_abstract.h
new file mode 100644
index 000000000..5e7d5ec3f
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_solve_qp_using_smo_abstract.h
@@ -0,0 +1,282 @@
+// Copyright (C) 2010  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_ABSTRACT_Hh_
+#ifdef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_ABSTRACT_Hh_
+
+#include "../matrix.h"
+#include <map>
+#include "../unordered_pair.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename EXP1,
+        typename EXP2,
+        typename T, long NR, long NC, typename MM, typename L
+        >
+    unsigned long solve_qp_using_smo ( 
+        const matrix_exp<EXP1>& Q,
+        const matrix_exp<EXP2>& b,
+        matrix<T,NR,NC,MM,L>& alpha,
+        T eps,
+        unsigned long max_iter
+    );
+    /*!
+        requires
+            - Q.nr() == Q.nc()
+            - is_col_vector(b) == true
+            - is_col_vector(alpha) == true
+            - b.size() == alpha.size() == Q.nr()
+            - alpha.size() > 0
+            - min(alpha) >= 0
+            - eps > 0
+            - max_iter > 0
+        ensures
+            - Let C == sum(alpha) (i.e. C is the sum of the alpha values you 
+              supply to this function)
+            - This function solves the following quadratic program:
+                Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha - trans(alpha)*b
+                subject to the following constraints:
+                    - sum(alpha) == C (i.e. the sum of alpha values doesn't change)
+                    - min(alpha) >= 0 (i.e. all alpha values are nonnegative)
+                Where f is convex.  This means that Q should be positive-semidefinite.
+            - The solution to the above QP will be stored in #alpha.
+            - This function uses a simple implementation of the sequential minimal
+              optimization algorithm.  It starts the algorithm with the given alpha
+              and it works on the problem until the duality gap (i.e. how far away
+              we are from the optimum solution) is less than eps.  So eps controls 
+              how accurate the solution is and smaller values result in better solutions.
+            - At most max_iter iterations of optimization will be performed.  
+            - returns the number of iterations performed.  If this method fails to
+              converge to eps accuracy then the number returned will be max_iter+1.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename EXP1,
+        typename EXP2,
+        typename EXP3,
+        typename T, long NR, long NC, typename MM, typename L,
+        long NR2, long NC2
+        >
+    unsigned long solve_qp4_using_smo ( 
+        const matrix_exp<EXP1>& A,
+        const matrix_exp<EXP2>& Q,
+        const matrix_exp<EXP3>& b,
+        const matrix_exp<EXP4>& d,
+        matrix<T,NR,NC,MM,L>& alpha,
+        matrix<T,NR2,NC2,MM,L>& lambda,
+        T eps,
+        unsigned long max_iter,
+        T max_lambda = std::numeric_limits<T>::infinity()
+    );
+    /*!
+        requires
+            - A.nc() == alpha.size()
+            - Q.nr() == Q.nc()
+            - is_col_vector(b) == true
+            - is_col_vector(d) == true
+            - is_col_vector(alpha) == true
+            - b.size() == alpha.size() == Q.nr()
+            - d.size() == A.nr()
+            - alpha.size() > 0
+            - min(alpha) >= 0
+            - eps > 0
+            - max_iter > 0
+            - max_lambda >= 0
+        ensures
+            - Let C == sum(alpha) (i.e. C is the sum of the alpha values you 
+              supply to this function)
+            - This function solves the following quadratic program:
+                Minimize: f(alpha,lambda) == 0.5*trans(alpha)*Q*alpha - trans(alpha)*b + 
+                                             0.5*trans(lambda)*lambda - trans(lambda)*A*alpha - trans(lambda)*d
+                subject to the following constraints:
+                    - sum(alpha)  == C (i.e. the sum of alpha values doesn't change)
+                    - min(alpha)  >= 0 (i.e. all alpha values are nonnegative)
+                    - min(lambda) >= 0 (i.e. all lambda values are nonnegative)
+                    - max(lambda) <= max_lambda (i.e. all lambda values are less than max_lambda)
+                Where f is convex.  This means that Q should be positive-semidefinite.
+            - If you don't want an upper limit on lambda then max_lambda can be set to
+              infinity.
+            - The solution to the above QP will be stored in #alpha and #lambda.  
+            - This function uses a simple implementation of the sequential minimal
+              optimization algorithm.  It starts the algorithm with the given alpha
+              and it works on the problem until the duality gap (i.e. how far away
+              we are from the optimum solution) is less than eps.  So eps controls 
+              how accurate the solution is and smaller values result in better solutions.
+              The initial value of lambda is ignored since the optimal lambda can be
+              obtained via a simple closed form expression given alpha.
+            - At most max_iter iterations of optimization will be performed.  
+            - returns the number of iterations performed.  If this method fails to
+              converge to eps accuracy then the number returned will be max_iter+1.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename EXP1,
+        typename EXP2,
+        typename T, long NR, long NC, typename MM, typename L
+        >
+    unsigned long solve_qp_box_constrained ( 
+        const matrix_exp<EXP1>& Q,
+        const matrix_exp<EXP2>& b,
+        matrix<T,NR,NC,MM,L>& alpha,
+        const matrix<T,NR,NC,MM,L>& lower,
+        const matrix<T,NR,NC,MM,L>& upper,
+        T eps,
+        unsigned long max_iter
+    );
+    /*!
+        requires
+            - Q.nr() == Q.nc()
+            - alpha.size() == lower.size() == upper.size()
+            - is_col_vector(b) == true
+            - is_col_vector(alpha) == true
+            - is_col_vector(lower) == true
+            - is_col_vector(upper) == true
+            - b.size() == alpha.size() == Q.nr()
+            - alpha.size() > 0
+            - 0 <= min(alpha-lower)
+            - 0 <= max(upper-alpha)
+            - eps > 0
+            - max_iter > 0
+        ensures
+            - This function solves the following quadratic program:
+                Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha + trans(b)*alpha 
+                subject to the following box constraints on alpha:
+                    - 0 <= min(alpha-lower)
+                    - 0 <= max(upper-alpha)
+                Where f is convex.  This means that Q should be positive-semidefinite.
+            - The solution to the above QP will be stored in #alpha.
+            - This function uses a combination of a SMO algorithm along with Nesterov's
+              method as the main iteration of the solver.  It starts the algorithm with the
+              given alpha and it works on the problem until the derivative of f(alpha) is
+              smaller than eps for each element of alpha or the alpha value is at a box
+              constraint.  So eps controls how accurate the solution is and smaller values
+              result in better solutions.
+            - At most max_iter iterations of optimization will be performed.  
+            - returns the number of iterations performed.  If this method fails to
+              converge to eps accuracy then the number returned will be max_iter+1.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename T, long NR, long NC, typename MM, typename L
+        >
+    unsigned long solve_qp_box_constrained_blockdiag ( 
+        const std::vector<matrix<T,NR,NR,MM,L>>& Q_blocks,
+        const std::vector<matrix<T,NR,NC,MM,L>>& bs,
+        const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag,
+        std::vector<matrix<T,NR,NC,MM,L>>& alphas,
+        const std::vector<matrix<T,NR,NC,MM,L>>& lowers,
+        const std::vector<matrix<T,NR,NC,MM,L>>& uppers,
+        T eps,
+        unsigned long max_iter
+    );
+    /*!
+        requires
+            - Q_blocks.size() > 0
+            - Q_blocks.size() == bs.size() == alphas.size() == lowers.size() == uppers.size()
+            - All the matrices in Q_blocks have the same dimensions.  Moreover, they are
+              non-empty square matrices.
+            - All the matrices in bs, Q_offdiag, alphas, lowers, and uppers have the same
+              dimensions.  Moreover, they are all column vectors.
+            - Q_blocks[0].nr() == alphas[0].size()
+              (i.e. the dimensionality of the column vectors in alphas must match the
+              dimensionality of the square matrices in Q_blocks.)
+            - for all valid i:
+                - 0 <= min(alphas[i]-lowers[i])
+                - 0 <= max(uppers[i]-alphas[i])
+            - eps > 0
+            - max_iter > 0
+        ensures
+            - This function solves the same QP as solve_qp_box_constrained(), except it is
+              optimized for the case where the Q matrix has a certain sparsity structure.
+              To be precise:
+                - Let Q1 be a block diagonal matrix with the elements of Q_blocks placed
+                  along its diagonal, and in the order contained in Q_blocks.  
+                - Let Q2 be a matrix with the same size as Q1, except instead of being block diagonal, it
+                  is block structured into Q_blocks.nr() by Q_blocks.nc() blocks.  If we let (r,c) be the
+                  coordinate of each block then each block contains the matrix
+                  diagm(Q_offdiag[make_unordered_pair(r,c)]) or the zero matrix if Q_offdiag has no entry
+                  for the coordinate (r,c).
+                - Let Q == Q1+Q2
+                - Let b == the concatenation of all the vectors in bs into one big vector.
+                - Let alpha == the concatenation of all the vectors in alphas into one big vector.
+                - Let lower == the concatenation of all the vectors in lowers into one big vector.
+                - Let upper == the concatenation of all the vectors in uppers into one big vector.
+                - Then this function solves the following quadratic program:
+                    Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha + trans(b)*alpha 
+                    subject to the following box constraints on alpha:
+                        - 0 <= min(alpha-lower)
+                        - 0 <= max(upper-alpha)
+                    Where f is convex.  This means that Q should be positive-semidefinite.
+                - More specifically, this function is identical to
+                  solve_qp_box_constrained(Q, b, alpha, lower, upper, eps, max_iter),
+                  except that it runs faster since it avoids unnecessary computation by
+                  taking advantage of the sparsity structure in the QP.
+            - The solution to the above QP will be stored in #alphas.
+            - This function uses a combination of a SMO algorithm along with Nesterov's
+              method as the main iteration of the solver.  It starts the algorithm with the
+              given alpha and it works on the problem until the derivative of f(alpha) is
+              smaller than eps for each element of alpha or the alpha value is at a box
+              constraint.  So eps controls how accurate the solution is and smaller values
+              result in better solutions.
+            - At most max_iter iterations of optimization will be performed.  
+            - returns the number of iterations performed.  If this method fails to
+              converge to eps accuracy then the number returned will be max_iter+1.
+    !*/
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename EXP1,
+        typename EXP2,
+        typename T, long NRa, long NRb
+        >
+    unsigned long find_gap_between_convex_hulls (
+        const matrix_exp<EXP1>& A,
+        const matrix_exp<EXP2>& B,
+        matrix<T,NRa,1>& cA,
+        matrix<T,NRb,1>& cB,
+        const double eps,
+        const unsigned long max_iter = 1000
+    );
+    /*!
+        requires
+            - A.nr() == B.nr()
+            - A.size() != 0
+            - B.size() != 0
+            - eps > 0
+            - max_iter > 0
+        ensures
+            - If you think of A and B as sets of column vectors, then we can identify the
+              convex sets hullA and hullB, which are the convex hulls of A and B
+              respectively.  This function finds the pair of points in hullA and hullB that
+              are nearest to each other.  To be precise, this function solves the following
+              quadratic program:
+                Minimize: f(cA,cB) == length_squared(A*cA - B*cB) 
+                subject to the following constraints on cA and cB:
+                    - is_col_vector(cA) == true && cA.size() == A.nc()
+                    - is_col_vector(cB) == true && cB.size() == B.nc()
+                    - sum(cA) == 1 && min(cA) >= 0
+                    - sum(cB) == 1 && min(cB) >= 0
+            - This function uses an iterative block coordinate descent algorithm to solve
+              the QP.  It runs until either max_iter iterations have been performed or the
+              QP is solved to at least eps accuracy.
+            - returns the number of iterations performed.  If this method fails to
+              converge to eps accuracy then the number returned will be max_iter+1.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_ABSTRACT_Hh_
+
+
diff --git a/ml/dlib/dlib/optimization/optimization_stop_strategies.h b/ml/dlib/dlib/optimization/optimization_stop_strategies.h
new file mode 100644
index 000000000..a0243cacf
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_stop_strategies.h
@@ -0,0 +1,173 @@
+// Copyright (C) 2008  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATIOn_STOP_STRATEGIES_H_
+#define DLIB_OPTIMIZATIOn_STOP_STRATEGIES_H_
+
+#include <cmath>
+#include <limits>
+#include "../matrix.h"
+#include "../algs.h"
+#include "optimization_stop_strategies_abstract.h"
+#include <iostream>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class objective_delta_stop_strategy
+    {
+    public:
+        explicit objective_delta_stop_strategy (
+            double min_delta = 1e-7
+        ) : _verbose(false), _been_used(false), _min_delta(min_delta), _max_iter(0), _cur_iter(0), _prev_funct_value(0) 
+        {
+            DLIB_ASSERT (
+                min_delta >= 0,
+                "\t objective_delta_stop_strategy(min_delta)"
+                << "\n\t min_delta can't be negative"
+                << "\n\t min_delta: " << min_delta
+            );
+        }
+
+        objective_delta_stop_strategy (
+            double min_delta,
+            unsigned long max_iter
+        ) : _verbose(false), _been_used(false), _min_delta(min_delta), _max_iter(max_iter), _cur_iter(0), _prev_funct_value(0) 
+        {
+            DLIB_ASSERT (
+                min_delta >= 0 && max_iter > 0,
+                "\t objective_delta_stop_strategy(min_delta, max_iter)"
+                << "\n\t min_delta can't be negative and max_iter can't be 0"
+                << "\n\t min_delta: " << min_delta
+                << "\n\t max_iter:  " << max_iter 
+            );
+        }
+
+        objective_delta_stop_strategy& be_verbose( 
+        ) 
+        {
+            _verbose = true;
+            return *this;
+        }
+
+        template <typename T>
+        bool should_continue_search (
+            const T& ,
+            const double funct_value,
+            const T& 
+        ) 
+        {
+            if (_verbose)
+            {
+                using namespace std;
+                cout << "iteration: " << _cur_iter << "   objective: " << funct_value << endl;
+            }
+
+            ++_cur_iter;
+            if (_been_used)
+            {
+                // Check if we have hit the max allowable number of iterations.  (but only
+                // check if _max_iter is enabled (i.e. not 0)).
+                if (_max_iter != 0 && _cur_iter > _max_iter)
+                    return false;
+
+                // check if the function change was too small
+                if (std::abs(funct_value - _prev_funct_value) < _min_delta)
+                    return false;
+            }
+
+            _been_used = true;
+            _prev_funct_value = funct_value;
+            return true;
+        }
+
+    private:
+        bool _verbose;
+
+        bool _been_used;
+        double _min_delta;
+        unsigned long _max_iter;
+        unsigned long _cur_iter;
+        double _prev_funct_value;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class gradient_norm_stop_strategy
+    {
+    public:
+        explicit gradient_norm_stop_strategy (
+            double min_norm = 1e-7
+        ) : _verbose(false), _min_norm(min_norm), _max_iter(0), _cur_iter(0) 
+        {
+            DLIB_ASSERT (
+                min_norm >= 0,
+                "\t gradient_norm_stop_strategy(min_norm)"
+                << "\n\t min_norm can't be negative"
+                << "\n\t min_norm: " << min_norm
+            );
+        }
+
+        gradient_norm_stop_strategy (
+            double min_norm,
+            unsigned long max_iter
+        ) : _verbose(false), _min_norm(min_norm), _max_iter(max_iter), _cur_iter(0) 
+        {
+            DLIB_ASSERT (
+                min_norm >= 0 && max_iter > 0,
+                "\t gradient_norm_stop_strategy(min_norm, max_iter)"
+                << "\n\t min_norm can't be negative and max_iter can't be 0"
+                << "\n\t min_norm: " << min_norm
+                << "\n\t max_iter:  " << max_iter 
+            );
+        }
+
+        gradient_norm_stop_strategy& be_verbose( 
+        ) 
+        {
+            _verbose = true;
+            return *this;
+        }
+
+        template <typename T>
+        bool should_continue_search (
+            const T& ,
+            const double funct_value,
+            const T& funct_derivative
+        ) 
+        {
+            if (_verbose)
+            {
+                using namespace std;
+                cout << "iteration: " << _cur_iter << "   objective: " << funct_value << "   gradient norm: " << length(funct_derivative) << endl;
+            }
+
+            ++_cur_iter;
+
+            // Check if we have hit the max allowable number of iterations.  (but only
+            // check if _max_iter is enabled (i.e. not 0)).
+            if (_max_iter != 0 && _cur_iter > _max_iter)
+                return false;
+
+            // check if the gradient norm is too small 
+            if (length(funct_derivative) < _min_norm)
+                return false;
+
+            return true;
+        }
+
+    private:
+        bool _verbose;
+
+        double _min_norm;
+        unsigned long _max_iter;
+        unsigned long _cur_iter;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_STOP_STRATEGIES_H_
+
diff --git a/ml/dlib/dlib/optimization/optimization_stop_strategies_abstract.h b/ml/dlib/dlib/optimization/optimization_stop_strategies_abstract.h
new file mode 100644
index 000000000..6a999f8d9
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_stop_strategies_abstract.h
@@ -0,0 +1,157 @@
+// Copyright (C) 2008  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATIOn_STOP_STRATEGIES_ABSTRACT_
+#ifdef DLIB_OPTIMIZATIOn_STOP_STRATEGIES_ABSTRACT_
+
+#include <cmath>
+#include <limits>
+#include "../matrix/matrix_abstract.h"
+#include "../algs.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class objective_delta_stop_strategy
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object represents a strategy for deciding if an optimization
+                algorithm should terminate.   This particular object looks at the 
+                change in the objective function from one iteration to the next and 
+                bases its decision on how large this change is.  If the change
+                is below a user given threshold then the search stops.
+        !*/
+
+    public:
+        explicit objective_delta_stop_strategy (
+            double min_delta = 1e-7
+        ); 
+        /*!
+            requires
+                - min_delta >= 0
+            ensures
+                - This stop strategy object will only consider a search to be complete
+                  if a change in an objective function from one iteration to the next
+                  is less than min_delta.
+        !*/
+
+        objective_delta_stop_strategy (
+            double min_delta,
+            unsigned long max_iter
+        );
+        /*!
+            requires
+                - min_delta >= 0
+                - max_iter > 0
+            ensures
+                - This stop strategy object will only consider a search to be complete
+                  if a change in an objective function from one iteration to the next
+                  is less than min_delta or more than max_iter iterations has been
+                  executed.
+        !*/
+
+        objective_delta_stop_strategy& be_verbose( 
+        );
+        /*!
+            ensures
+                - causes this object to print status messages to standard out 
+                  every time should_continue_search() is called.
+                - returns *this
+        !*/
+
+        template <typename T>
+        bool should_continue_search (
+            const T& x,
+            const double funct_value,
+            const T& funct_derivative
+        );
+        /*!
+            requires
+                - this function is only called once per search iteration
+                - for some objective function f():
+                    - x == the search point for the current iteration
+                    - funct_value == f(x)
+                    - funct_derivative == derivative(f)(x)
+            ensures
+                - returns true if the point x doest not satisfy the stopping condition and
+                  false otherwise.
+        !*/
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class gradient_norm_stop_strategy
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object represents a strategy for deciding if an optimization
+                algorithm should terminate.   This particular object looks at the 
+                norm (i.e. the length) of the current gradient vector and stops 
+                if it is smaller than a user given threshold.  
+        !*/
+
+    public:
+        explicit gradient_norm_stop_strategy (
+            double min_norm = 1e-7
+        ); 
+        /*!
+            requires
+                - min_norm >= 0
+            ensures
+                - This stop strategy object will only consider a search to be complete
+                  if the current gradient norm is less than min_norm
+        !*/
+
+        gradient_norm_stop_strategy (
+            double min_norm,
+            unsigned long max_iter
+        );
+        /*!
+            requires
+                - min_norm >= 0
+                - max_iter > 0
+            ensures
+                - This stop strategy object will only consider a search to be complete
+                  if the current gradient norm is less than min_norm or more than 
+                  max_iter iterations has been executed.
+        !*/
+
+        gradient_norm_stop_strategy& be_verbose( 
+        );
+        /*!
+            ensures
+                - causes this object to print status messages to standard out 
+                  every time should_continue_search() is called.
+                - returns *this
+        !*/
+
+        template <typename T>
+        bool should_continue_search (
+            const T& x,
+            const double funct_value,
+            const T& funct_derivative
+        );
+        /*!
+            requires
+                - this function is only called once per search iteration
+                - for some objective function f():
+                    - x == the search point for the current iteration
+                    - funct_value == f(x)
+                    - funct_derivative == derivative(f)(x)
+            ensures
+                - returns true if the point x doest not satisfy the stopping condition and
+                  false otherwise.
+        !*/
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATIOn_STOP_STRATEGIES_ABSTRACT_
+
diff --git a/ml/dlib/dlib/optimization/optimization_trust_region.h b/ml/dlib/dlib/optimization/optimization_trust_region.h
new file mode 100644
index 000000000..5f0ad897f
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_trust_region.h
@@ -0,0 +1,564 @@
+// Copyright (C) 2010  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_OPTIMIZATION_TRUST_REGIoN_Hh_
+#define DLIB_OPTIMIZATION_TRUST_REGIoN_Hh_
+
+#include "../matrix.h"
+#include "optimization_trust_region_abstract.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename EXP1,
+        typename EXP2,
+        typename T, long NR, long NC, typename MM, typename L
+        >
+    unsigned long solve_trust_region_subproblem ( 
+        const matrix_exp<EXP1>& B,
+        const matrix_exp<EXP2>& g,
+        const typename EXP1::type radius,
+        matrix<T,NR,NC,MM,L>& p,
+        double eps,
+        unsigned long max_iter
+    )
+    {
+        /*
+            This is an implementation of algorithm 4.3(Trust Region Subproblem)
+            from the book Numerical Optimization by Nocedal and Wright.  Some of
+            the details are also from Practical Methods of Optimization by Fletcher.
+        */
+
+        // make sure requires clause is not broken
+        DLIB_ASSERT(B.nr() == B.nc() && is_col_vector(g) && g.size() == B.nr(),
+            "\t unsigned long solve_trust_region_subproblem()"
+            << "\n\t invalid arguments were given to this function"
+            << "\n\t B.nr():            " << B.nr()
+            << "\n\t B.nc():            " << B.nc()
+            << "\n\t is_col_vector(g):  " << is_col_vector(g) 
+            << "\n\t g.size():          " << g.size() 
+            );
+        DLIB_ASSERT(radius > 0 && eps > 0 && max_iter > 0,
+            "\t unsigned long solve_trust_region_subproblem()"
+            << "\n\t invalid arguments were given to this function"
+            << "\n\t radius:   " << radius
+            << "\n\t eps:      " << eps 
+            << "\n\t max_iter: " << max_iter 
+            );
+
+
+        const_temp_matrix<EXP1> BB(B);
+        const_temp_matrix<EXP2> gg(g);
+
+        p.set_size(g.nr(),g.nc());
+        p = 0;
+
+
+        const T numeric_eps = max(diag(abs(BB)))*std::numeric_limits<T>::epsilon();
+
+        matrix<T,EXP1::NR,EXP2::NR,MM,L> R;
+
+        T lambda = 0;
+
+        // We need to put a bracket around lambda.  It can't go below 0.  We
+        // can get an upper bound using Gershgorin disks.  
+        // This number is a lower bound on the eigenvalues in BB
+        const T BB_min_eigenvalue = min(diag(BB) - (sum_cols(abs(BB)) - abs(diag(BB))));
+
+        const T g_norm = length(gg);
+
+        T lambda_min = 0;
+        T lambda_max = put_in_range(0, 
+                                    std::numeric_limits<T>::max(), 
+                                    g_norm/radius - BB_min_eigenvalue);
+
+
+        // If we can tell that the minimum is at 0 then don't do anything.  Just return the answer. 
+        if (g_norm < numeric_eps && BB_min_eigenvalue > numeric_eps)
+        {
+            return 0;
+        }
+
+
+        // how much lambda has changed recently
+        T lambda_delta = 0;
+
+        for (unsigned long i = 0; i < max_iter; ++i)
+        {
+            R = chol(BB + lambda*identity_matrix<T>(BB.nr()));
+
+            // if the cholesky decomposition doesn't exist. 
+            if (R(R.nr()-1, R.nc()-1) <= 0)
+            {
+                // If B is indefinite and g is equal to 0 then we should
+                // quit this loop and go right to the eigenvalue decomposition method.
+                if (g_norm <= numeric_eps)
+                    break;
+
+                // narrow the bracket on lambda.  Obviously the current lambda is
+                // too small.
+                lambda_min = lambda;
+
+                // jump towards the max value.  Eventually there will
+                // be a lambda that results in a cholesky decomposition.
+                const T alpha = 0.10;
+                lambda = (1-alpha)*lambda + alpha*lambda_max;
+                continue;
+            }
+
+            using namespace blas_bindings;
+
+            p = -gg;
+            // Solve RR'*p = -g for p.
+            // Solve R*q = -g for q where q = R'*p.
+            if (R.nr() == 2)
+            {
+                p(0) = p(0)/R(0,0);
+                p(1) = (p(1)-R(1,0)*p(0))/R(1,1);
+            }
+            else
+            {
+                triangular_solver(CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, R, p);
+            }
+            const T q_norm = length(p);
+
+            // Solve R'*p = q for p.
+            if (R.nr() == 2)
+            {
+                p(1) = p(1)/R(1,1);
+                p(0) = (p(0)-R(1,0)*p(1))/R(0,0);
+            }
+            else
+            {
+                triangular_solver(CblasLeft, CblasLower, CblasTrans, CblasNonUnit, R, p);
+            }
+            const T p_norm = length(p);
+
+            // check if we are done.  
+            if (lambda == 0)
+            {
+                if (p_norm < radius)
+                {
+                    // i will always be 0 in this case.  So we return 1.
+                    return i+1;
+                }
+            }
+            else
+            {
+                // if we are close enough to the solution then terminate
+                if (std::abs(p_norm - radius)/radius < eps)
+                    return i+1;
+            }
+
+            // shrink our bracket on lambda
+            if (p_norm < radius)
+                lambda_max = lambda;
+            else
+                lambda_min = lambda;
+
+
+            if (p_norm <= radius*std::numeric_limits<T>::epsilon())
+            {
+                const T alpha = 0.01;
+                lambda = (1-alpha)*lambda_min + alpha*lambda_max;
+                continue;
+            }
+
+            const T old_lambda = lambda;
+
+            // figure out which lambda to try next
+            lambda = lambda + std::pow(q_norm/p_norm,2)*(p_norm - radius)/radius;
+
+            // make sure the chosen lambda is within our bracket (but not exactly at either end).
+            const T gap = (lambda_max-lambda_min)*0.01;
+            lambda = put_in_range(lambda_min+gap, lambda_max-gap, lambda);
+
+            // Keep track of how much lambda is thrashing around inside the search bracket.  If it
+            // keeps moving around a whole lot then cut the search bracket in half.
+            lambda_delta += std::abs(lambda - old_lambda);
+            if (lambda_delta > 3*(lambda_max-lambda_min))
+            {
+                lambda = (lambda_min+lambda_max)/2;
+                lambda_delta = 0;
+            }
+        } // end for loop
+
+
+        // We are probably in the "hard case".   Use an eigenvalue decomposition to sort things out.
+        // Either that or the eps was just set too tight and really we are already done.
+        eigenvalue_decomposition<EXP1> ed(make_symmetric(BB));
+
+        matrix<T,NR,NC,MM,L> ev = ed.get_real_eigenvalues();
+        const long min_eig_idx = index_of_min(ev);
+
+
+        ev -= min(ev);
+        // zero out any values which are basically zero
+        ev = pointwise_multiply(ev, ev > max(abs(ev))*std::numeric_limits<T>::epsilon());
+        ev = reciprocal(ev);
+
+
+        // figure out part of what p should be assuming we are in the hard case.
+        matrix<T,NR,NC,MM,L> p_hard;
+        p_hard = trans(ed.get_pseudo_v())*gg;
+        p_hard = diagm(ev)*p_hard;
+        p_hard = ed.get_pseudo_v()*p_hard;
+
+
+        // If we really are in the hard case then this if will be true.  Otherwise, the p
+        // we found in the "easy case" loop up top is the best answer.
+        if (length(p_hard) < radius && length(p_hard) >= length(p))
+        {
+            // adjust the length of p_hard by adding a component along the eigenvector associated with
+            // the smallest eigenvalue.  We want to make it the case that length(p) == radius.
+            const T tau = std::sqrt(radius*radius - length_squared(p_hard));
+            p = p_hard + tau*colm(ed.get_pseudo_v(),min_eig_idx);
+
+
+            // if we have to do an eigenvalue decomposition then say we did all the iterations
+            return max_iter;
+        }
+
+        // if we get this far it means we didn't converge to eps accuracy. 
+        return max_iter+1;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    namespace impl
+    {
+        template <
+            typename EXP1,
+            typename EXP2,
+            typename EXP3
+            >
+        bool bounds_violated (
+            const matrix_exp<EXP1>& v,
+            const matrix_exp<EXP2>& l,
+            const matrix_exp<EXP3>& u
+        )
+        {
+            DLIB_ASSERT(v.nr() == l.nr() && v.nr() == u.nr());
+            DLIB_ASSERT(v.nc() == l.nc() && v.nc() == u.nc());
+            for (long r = 0; r < v.nr(); ++r)
+            {
+                for (long c = 0; c < v.nc(); c++)
+                {
+                    if (!(l(r,c) <= v(r,c) && v(r,c) <= u(r,c)))
+                        return true;
+                }
+            }
+            return false;
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename EXP1,
+        typename EXP2,
+        typename T, long NR, long NC, typename MM, typename L,
+        typename EXP3
+        >
+    void solve_trust_region_subproblem_bounded ( 
+        const matrix_exp<EXP1>& B_,
+        const matrix_exp<EXP2>& g_,
+        const typename EXP1::type radius_,
+        matrix<T,NR,NC,MM,L>& p_,
+        double eps,
+        unsigned long max_iter,
+        const matrix_exp<EXP3>& lower_,
+        const matrix_exp<EXP3>& upper_
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT(B_.nr() == B_.nc() && is_col_vector(g_) && g_.size() == B_.nr(),
+            "\t unsigned long solve_trust_region_subproblem_bounded()"
+            << "\n\t invalid arguments were given to this function"
+            << "\n\t B_.nr():            " << B_.nr()
+            << "\n\t B_.nc():            " << B_.nc()
+            << "\n\t is_col_vector(g_):  " << is_col_vector(g_) 
+            << "\n\t g_.size():          " << g_.size() 
+            );
+        DLIB_ASSERT(radius_ > 0 && eps > 0 && max_iter > 0,
+            "\t unsigned long solve_trust_region_subproblem_bounded()"
+            << "\n\t invalid arguments were given to this function"
+            << "\n\t radius_:   " << radius_
+            << "\n\t eps:      " << eps 
+            << "\n\t max_iter: " << max_iter 
+            );
+        DLIB_ASSERT(is_col_vector(lower_) && lower_.size() == g_.size());
+        DLIB_ASSERT(is_col_vector(upper_) && upper_.size() == g_.size());
+        DLIB_ASSERT(max(upper_-lower_) >= 0);
+        // make sure the problem is feasible.  That is, there should be a point inside the
+        // lower and upper bounds that has a norm <= radius_
+        DLIB_ASSERT(length(clamp(zeros_matrix(lower_),lower_,upper_)) <= radius_, 
+            "The lower and upper bounds are incompatible with the radius since there is no point within the bounds with a norm less than the radius.");
+
+        // We are going to solve this by greedily finding the most violated bound constraint,
+        // locking that variable to its constrained value, removing it from the problem,
+        // and then resolving.  We do that until no more constraint violations are present.
+
+        solve_trust_region_subproblem(B_,g_,radius_,p_,eps,max_iter);
+
+
+        // just stop here if all the bounds are satisfied.
+        if (!impl::bounds_violated(p_, lower_, upper_))
+            return;
+
+        matrix<double> B = matrix_cast<double>(B_);
+        matrix<double,0,1> g = matrix_cast<double>(g_);
+        double radius = radius_;
+        matrix<double,0,1> p = matrix_cast<double>(p_);
+        matrix<double,0,1> lower = matrix_cast<double>(lower_);
+        matrix<double,0,1> upper = matrix_cast<double>(upper_);
+
+        // keep a table that tells us how to map any reduced QP back to the original QP
+        std::vector<long> idxs(g.size());
+        for (size_t i = 0; i < idxs.size(); ++i)
+            idxs[i] = i;
+
+
+        // while we haven't found a p that satisfies the bounds constraints
+        while(impl::bounds_violated(p, lower, upper) )
+        {
+            // Find the most violated variable and fix its value to a constant (the bound
+            // value).
+            long most_violated_idx = 0;
+            double max_violation = 0;
+            double bounded_value = 0;
+            for (long i = 0; i < lower.size(); ++i)
+            {
+                if (!(lower(i) <= p(i) && p(i) <= upper(i)))
+                {
+                    if (lower(i)-p(i) > max_violation)
+                    {
+                        max_violation = lower(i)-p(i);
+                        most_violated_idx = i;
+                        bounded_value = lower(i);
+                    }
+                    else if (p(i)-upper(i) > max_violation)
+                    {
+                        max_violation = p(i)-upper(i);
+                        most_violated_idx = i;
+                        bounded_value = upper(i);
+                    }
+                }
+            }
+
+            // assign this variable to its final value.
+            p_(idxs[most_violated_idx]) = bounded_value;
+
+
+            // now reduce the QP by removing the variable p_(idxs[most_violated_idx]).  
+            idxs.erase(idxs.begin()+most_violated_idx);
+            // we are out of variables to remove since everything is at bounds.
+            if (idxs.size() == 0)
+                break;
+
+            lower = remove_row(lower,most_violated_idx);
+            upper = remove_row(upper,most_violated_idx);
+            g += colm(B,most_violated_idx)*bounded_value;
+            g = remove_row(g,most_violated_idx);
+            p = remove_row(p,most_violated_idx);
+            B = removerc(B,most_violated_idx, most_violated_idx);
+
+            // Removing a variable changes the radius, so we have to subtract the bounded
+            // value from the radius so as to not change the effective radius for the whole
+            // problem.
+            double squared_radius = radius*radius - bounded_value*bounded_value;
+            if (squared_radius <= 0)
+            {
+                p = 0;
+                break;
+            }
+            radius = std::sqrt(squared_radius);
+
+
+            solve_trust_region_subproblem(B,g,radius,p,eps,max_iter);
+        }
+
+        // assign the non-bound-constrained variables to their final values
+        for (size_t i = 0; i < idxs.size(); ++i)
+            p_(idxs[i]) = p(i);
+
+    }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename stop_strategy_type,
+        typename funct_model
+        >
+    double find_min_trust_region (
+        stop_strategy_type stop_strategy,
+        const funct_model& model, 
+        typename funct_model::column_vector& x, 
+        double radius = 1
+    )
+    {
+        /*
+            This is an implementation of algorithm 4.1(Trust Region)
+            from the book Numerical Optimization by Nocedal and Wright.  
+        */
+
+        // make sure requires clause is not broken
+        DLIB_ASSERT(is_col_vector(x) && radius > 0,
+            "\t double find_min_trust_region()"
+            << "\n\t invalid arguments were given to this function"
+            << "\n\t is_col_vector(x): " << is_col_vector(x) 
+            << "\n\t radius:           " << radius
+            );
+
+        const double initial_radius = radius;
+
+        typedef typename funct_model::column_vector T;
+        typedef typename T::type type;
+
+        typename funct_model::general_matrix h;
+        typename funct_model::column_vector g, p, d;
+        type f_value = model(x);
+
+        model.get_derivative_and_hessian(x,g,h);
+
+        DLIB_ASSERT(is_finite(x), "The objective function generated non-finite outputs");
+        DLIB_ASSERT(is_finite(g), "The objective function generated non-finite outputs");
+        DLIB_ASSERT(is_finite(h), "The objective function generated non-finite outputs");
+
+        // Sometimes the loop below won't modify x because the trust region step failed.
+        // This bool tells us when we are in that case.
+        bool stale_x = false;
+
+        while(stale_x || stop_strategy.should_continue_search(x, f_value, g))
+        {
+            const unsigned long iter = solve_trust_region_subproblem(h,
+                                                                     g,
+                                                                     radius,
+                                                                     p, 
+                                                                     0.1, 
+                                                                     20);
+
+
+            const type new_f_value = model(x+p);
+            const type predicted_improvement = -0.5*trans(p)*h*p - trans(g)*p;
+            const type measured_improvement = (f_value - new_f_value);
+
+            // If the sub-problem can't find a way to improve then stop.  This only happens when p is essentially 0.
+            if (std::abs(predicted_improvement) <= std::abs(measured_improvement)*std::numeric_limits<type>::epsilon())
+                break;
+
+            // predicted_improvement shouldn't be negative but it might be if something went
+            // wrong in the trust region solver.  So put abs() here to guard against that.  This
+            // way the sign of rho is determined only by the sign of measured_improvement.
+            const type rho = measured_improvement/std::abs(predicted_improvement);
+
+
+            if (!is_finite(rho))
+                break;
+            
+            if (rho < 0.25)
+            {
+                radius *= 0.25;
+
+                // something has gone horribly wrong if the radius has shrunk to zero.  So just
+                // give up if that happens.
+                if (radius <= initial_radius*std::numeric_limits<double>::epsilon())
+                    break;
+            }
+            else
+            {
+                // if rho > 0.75 and we are being checked by the radius 
+                if (rho > 0.75 && iter > 1)
+                {
+                    radius = std::min<type>(1000,  2*radius);
+                }
+            }
+
+            if (rho > 0)
+            {
+                x = x + p;
+                f_value = new_f_value;
+                model.get_derivative_and_hessian(x,g,h);
+                stale_x = false;
+            }
+            else
+            {
+                stale_x = true;
+            }
+
+            DLIB_ASSERT(is_finite(x), "The objective function generated non-finite outputs");
+            DLIB_ASSERT(is_finite(g), "The objective function generated non-finite outputs");
+            DLIB_ASSERT(is_finite(h), "The objective function generated non-finite outputs");
+        }
+
+        return f_value;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename funct_model>
+    struct negate_tr_model 
+    {
+        negate_tr_model( const funct_model& m) : model(m) {}
+
+        const funct_model& model;
+
+        typedef typename funct_model::column_vector column_vector;
+        typedef typename funct_model::general_matrix general_matrix;
+
+        template <typename T>
+        typename T::type operator() (const T& x) const
+        {
+            return -model(x);
+        }
+
+        template <typename T, typename U>
+        void get_derivative_and_hessian (
+            const T& x,
+            T& d,
+            U& h
+        ) const 
+        {
+            model.get_derivative_and_hessian(x,d,h);
+            d = -d;
+            h = -h;
+        }
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename stop_strategy_type,
+        typename funct_model
+        >
+    double find_max_trust_region (
+        stop_strategy_type stop_strategy,
+        const funct_model& model, 
+        typename funct_model::column_vector& x, 
+        double radius = 1
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT(is_col_vector(x) && radius > 0,
+            "\t double find_max_trust_region()"
+            << "\n\t invalid arguments were given to this function"
+            << "\n\t is_col_vector(x): " << is_col_vector(x) 
+            << "\n\t radius:           " << radius
+            );
+
+        return -find_min_trust_region(stop_strategy,
+                                      negate_tr_model<funct_model>(model),
+                                      x,
+                                      radius);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATION_TRUST_REGIoN_Hh_
+
diff --git a/ml/dlib/dlib/optimization/optimization_trust_region_abstract.h b/ml/dlib/dlib/optimization/optimization_trust_region_abstract.h
new file mode 100644
index 000000000..303ee746d
--- /dev/null
+++ b/ml/dlib/dlib/optimization/optimization_trust_region_abstract.h
@@ -0,0 +1,233 @@
+// Copyright (C) 2010  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_OPTIMIZATION_TRUST_REGIoN_H_ABSTRACTh_
+#ifdef DLIB_OPTIMIZATION_TRUST_REGIoN_H_ABSTRACTh_
+
+#include "../matrix/matrix_abstract.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename EXP1,
+        typename EXP2,
+        typename T, long NR, long NC, typename MM, typename L
+        >
+    unsigned long solve_trust_region_subproblem ( 
+        const matrix_exp<EXP1>& B,
+        const matrix_exp<EXP2>& g,
+        const typename EXP1::type radius,
+        matrix<T,NR,NC,MM,L>& p,
+        double eps,
+        unsigned long max_iter
+    );
+    /*!
+        requires
+            - B == trans(B)
+              (i.e. B should be a symmetric matrix)
+            - B.nr() == B.nc()
+            - is_col_vector(g) == true
+            - g.size() == B.nr()
+            - p is capable of containing a column vector the size of g
+              (i.e. p = g; should be a legal expression)
+            - radius > 0
+            - eps > 0
+            - max_iter > 0
+        ensures
+            - This function solves the following optimization problem:
+                Minimize: f(p) == 0.5*trans(p)*B*p + trans(g)*p
+                subject to the following constraint:
+                    - length(p) <= radius
+            - returns the number of iterations performed.  If this method fails to converge
+              to eps accuracy then the number returned will be max_iter+1.
+            - if (this function didn't terminate due to hitting the max_iter iteration limit) then
+                - if this function returns 0 or 1 then we are not hitting the radius bound Otherwise, 
+                  the radius constraint is active and std::abs(length(#p)-radius)/radius <= eps.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename EXP1,
+        typename EXP2,
+        typename T, long NR, long NC, typename MM, typename L,
+        typename EXP3
+        >
+    void solve_trust_region_subproblem_bounded ( 
+        const matrix_exp<EXP1>& B,
+        const matrix_exp<EXP2>& g,
+        const typename EXP1::type radius,
+        matrix<T,NR,NC,MM,L>& p,
+        double eps,
+        unsigned long max_iter,
+        const matrix_exp<EXP3>& lower,
+        const matrix_exp<EXP3>& upper
+    );
+    /*!
+        requires
+            - B == trans(B)
+              (i.e. B should be a symmetric matrix)
+            - B.nr() == B.nc()
+            - is_col_vector(g) == true
+            - is_col_vector(lower) == true
+            - is_col_vector(upper) == true
+            - g.size() == B.nr()
+            - lower.size() == B.nr()
+            - upper.size() == B.nr()
+            - p is capable of containing a column vector the size of g
+              (i.e. p = g; should be a legal expression)
+            - radius > 0
+            - eps > 0
+            - max_iter > 0
+            - min(upper-lower) >= 0
+            - length(clamp(zeros_matrix(lower),lower,upper)) <= radius
+              (i.e. the lower and upper bounds can't exclude all points with the desired radius.)
+        ensures
+            - This function solves the following optimization problem:
+                Minimize: f(p) == 0.5*trans(p)*B*p + trans(g)*p
+                subject to the following constraints:
+                    - length(p) <= radius
+                    - lower(i) <= p(i) <= upper(i), for all i
+            - Solves the problem to eps accuracy.  We do this by greedily finding the most
+              violated bound constraint, locking that variable to its constrained value, removing
+              it from the problem, and then resolving.  We do that until no more constraint
+              violations are present.  Each time we just call solve_trust_region_subproblem() 
+              to get the solution and pass eps and max_iter directly to these calls to
+              solve_trust_region_subproblem().
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    class function_model 
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object defines the interface for a function model
+                used by the trust-region optimizers defined below.
+
+                In particular, this object represents a function f() and
+                its associated derivative and hessian.
+
+        !*/
+
+    public:
+
+        // Define the type used to represent column vectors
+        typedef matrix<double,0,1> column_vector;
+        // Define the type used to represent the hessian matrix
+        typedef matrix<double> general_matrix;
+
+        double operator() ( 
+            const column_vector& x
+        ) const;
+        /*!
+            ensures
+                - returns f(x)
+                  (i.e. evaluates this model at the given point and returns the value)
+        !*/
+
+        void get_derivative_and_hessian (
+            const column_vector& x,
+            column_vector& d,
+            general_matrix& h
+        ) const;
+        /*!
+            ensures
+                - #d == the derivative of f() at x
+                - #h == the hessian matrix of f() at x
+                - is_col_vector(#d) == true
+                - #d.size() == x.size()
+                - #h.nr() == #h.nc() == x.size()
+                - #h == trans(#h)
+        !*/
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename stop_strategy_type,
+        typename funct_model
+        >
+    double find_min_trust_region (
+        stop_strategy_type stop_strategy,
+        const funct_model& model, 
+        typename funct_model::column_vector& x, 
+        double radius = 1
+    );
+    /*!
+        requires
+            - stop_strategy == an object that defines a stop strategy such as one of 
+              the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+            - is_col_vector(x) == true
+            - radius > 0
+            - model must be an object with an interface as defined by the function_model
+              example object shown above.
+        ensures
+            - Performs an unconstrained minimization of the function defined by model 
+              starting from the initial point x.  This function uses a trust region
+              algorithm to perform the minimization.  The radius parameter defines
+              the initial size of the trust region.
+            - The function is optimized until stop_strategy decides that an acceptable 
+              point has been found or the trust region subproblem fails to make progress.
+            - #x == the value of x that was found to minimize model()
+            - returns model(#x). 
+            - When this function makes calls to model.get_derivative_and_hessian() it always 
+              does so by first calling model() and then calling model.get_derivative_and_hessian().  
+              That is, any call to model.get_derivative_and_hessian(val) will always be 
+              preceded by a call to model(val) with the same value.  This way you can reuse 
+              any redundant computations performed by model() and model.get_derivative_and_hessian()
+              as appropriate.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename stop_strategy_type,
+        typename funct_model
+        >
+    double find_max_trust_region (
+        stop_strategy_type stop_strategy,
+        const funct_model& model, 
+        typename funct_model::column_vector& x, 
+        double radius = 1
+    );
+    /*!
+        requires
+            - stop_strategy == an object that defines a stop strategy such as one of 
+              the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+            - is_col_vector(x) == true
+            - radius > 0
+            - model must be an object with an interface as defined by the function_model
+              example object shown above.
+        ensures
+            - Performs an unconstrained maximization of the function defined by model 
+              starting from the initial point x.  This function uses a trust region
+              algorithm to perform the maximization.  The radius parameter defines
+              the initial size of the trust region.
+            - The function is optimized until stop_strategy decides that an acceptable 
+              point has been found or the trust region subproblem fails to make progress.
+            - #x == the value of x that was found to maximize model()
+            - returns model(#x). 
+            - When this function makes calls to model.get_derivative_and_hessian() it always 
+              does so by first calling model() and then calling model.get_derivative_and_hessian().  
+              That is, any call to model.get_derivative_and_hessian(val) will always be 
+              preceded by a call to model(val) with the same value.  This way you can reuse 
+              any redundant computations performed by model() and model.get_derivative_and_hessian()
+              as appropriate.
+            - Note that this function solves the maximization problem by converting it 
+              into a minimization problem.  Therefore, the values of model() and its derivative
+              reported to the stopping strategy will be negated.  That is, stop_strategy
+              will see -model() and -derivative.  All this really means is that the status 
+              messages from a stopping strategy in verbose mode will display a negated objective
+              value.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_OPTIMIZATION_TRUST_REGIoN_H_ABSTRACTh_
+
+