diff options
Diffstat (limited to 'ml/dlib/dlib/optimization/optimization_line_search.h')
-rw-r--r-- | ml/dlib/dlib/optimization/optimization_line_search.h | 888 |
1 files changed, 888 insertions, 0 deletions
diff --git a/ml/dlib/dlib/optimization/optimization_line_search.h b/ml/dlib/dlib/optimization/optimization_line_search.h new file mode 100644 index 00000000..a91e3df8 --- /dev/null +++ b/ml/dlib/dlib/optimization/optimization_line_search.h @@ -0,0 +1,888 @@ +// Copyright (C) 2008 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_OPTIMIZATIOn_LINE_SEARCH_H_ +#define DLIB_OPTIMIZATIOn_LINE_SEARCH_H_ + +#include <cmath> +#include <limits> +#include "../matrix.h" +#include "../algs.h" +#include "optimization_line_search_abstract.h" +#include <utility> + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template <typename funct, typename T> + class line_search_funct + { + public: + line_search_funct(const funct& f_, const T& start_, const T& direction_) + : f(f_),start(start_), direction(direction_), matrix_r(0), scalar_r(0) + {} + + line_search_funct(const funct& f_, const T& start_, const T& direction_, T& r) + : f(f_),start(start_), direction(direction_), matrix_r(&r), scalar_r(0) + {} + + line_search_funct(const funct& f_, const T& start_, const T& direction_, double& r) + : f(f_),start(start_), direction(direction_), matrix_r(0), scalar_r(&r) + {} + + double operator()(const double& x) const + { + return get_value(f(start + x*direction)); + } + + private: + + double get_value (const double& r) const + { + // save a copy of this value for later + if (scalar_r) + *scalar_r = r; + + return r; + } + + template <typename U> + double get_value (const U& r) const + { + // U should be a matrix type + COMPILE_TIME_ASSERT(is_matrix<U>::value); + + // save a copy of this value for later + if (matrix_r) + *matrix_r = r; + + return dot(r,direction); + } + + const funct& f; + const T& start; + const T& direction; + T* matrix_r; + double* scalar_r; + }; + + template <typename funct, typename T> + const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction) + { + COMPILE_TIME_ASSERT(is_matrix<T>::value); + DLIB_ASSERT ( + is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(), + "\tline_search_funct make_line_search_function(f,start,direction)" + << "\n\tYou have to supply column vectors to this function" + << "\n\tstart.nc(): " << start.nc() + << "\n\tdirection.nc(): " << direction.nc() + << "\n\tstart.nr(): " << start.nr() + << "\n\tdirection.nr(): " << direction.nr() + ); + return line_search_funct<funct,T>(f,start,direction); + } + +// ---------------------------------------------------------------------------------------- + + template <typename funct, typename T> + const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction, double& f_out) + { + COMPILE_TIME_ASSERT(is_matrix<T>::value); + DLIB_ASSERT ( + is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(), + "\tline_search_funct make_line_search_function(f,start,direction)" + << "\n\tYou have to supply column vectors to this function" + << "\n\tstart.nc(): " << start.nc() + << "\n\tdirection.nc(): " << direction.nc() + << "\n\tstart.nr(): " << start.nr() + << "\n\tdirection.nr(): " << direction.nr() + ); + return line_search_funct<funct,T>(f,start,direction, f_out); + } + +// ---------------------------------------------------------------------------------------- + + template <typename funct, typename T> + const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction, T& grad_out) + { + COMPILE_TIME_ASSERT(is_matrix<T>::value); + DLIB_ASSERT ( + is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(), + "\tline_search_funct make_line_search_function(f,start,direction)" + << "\n\tYou have to supply column vectors to this function" + << "\n\tstart.nc(): " << start.nc() + << "\n\tdirection.nc(): " << direction.nc() + << "\n\tstart.nr(): " << start.nr() + << "\n\tdirection.nr(): " << direction.nr() + ); + return line_search_funct<funct,T>(f,start,direction,grad_out); + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + inline double poly_min_extrap ( + double f0, + double d0, + double f1, + double d1, + double limit = 1 + ) + { + const double n = 3*(f1 - f0) - 2*d0 - d1; + const double e = d0 + d1 - 2*(f1 - f0); + + + // find the minimum of the derivative of the polynomial + + double temp = std::max(n*n - 3*e*d0,0.0); + + if (temp < 0) + return 0.5; + + temp = std::sqrt(temp); + + if (std::abs(e) <= std::numeric_limits<double>::epsilon()) + return 0.5; + + // figure out the two possible min values + double x1 = (temp - n)/(3*e); + double x2 = -(temp + n)/(3*e); + + // compute the value of the interpolating polynomial at these two points + double y1 = f0 + d0*x1 + n*x1*x1 + e*x1*x1*x1; + double y2 = f0 + d0*x2 + n*x2*x2 + e*x2*x2*x2; + + // pick the best point + double x; + if (y1 < y2) + x = x1; + else + x = x2; + + // now make sure the minimum is within the allowed range of [0,limit] + return put_in_range(0,limit,x); + } + +// ---------------------------------------------------------------------------------------- + + inline double poly_min_extrap ( + double f0, + double d0, + double f1 + ) + { + const double temp = 2*(f1 - f0 - d0); + if (std::abs(temp) <= d0*std::numeric_limits<double>::epsilon()) + return 0.5; + + const double alpha = -d0/temp; + + // now make sure the minimum is within the allowed range of (0,1) + return put_in_range(0,1,alpha); + } + +// ---------------------------------------------------------------------------------------- + + inline double poly_min_extrap ( + double f0, + double d0, + double x1, + double f_x1, + double x2, + double f_x2 + ) + { + DLIB_ASSERT(0 < x1 && x1 < x2,"Invalid inputs were given to this function"); + // The contents of this function follow the equations described on page 58 of the + // book Numerical Optimization by Nocedal and Wright, second edition. + matrix<double,2,2> m; + matrix<double,2,1> v; + + const double aa2 = x2*x2; + const double aa1 = x1*x1; + m = aa2, -aa1, + -aa2*x2, aa1*x1; + v = f_x1 - f0 - d0*x1, + f_x2 - f0 - d0*x2; + + + double temp = aa2*aa1*(x1-x2); + + // just take a guess if this happens + if (temp == 0 || std::fpclassify(temp) == FP_SUBNORMAL) + { + return x1/2.0; + } + + matrix<double,2,1> temp2; + temp2 = m*v/temp; + const double a = temp2(0); + const double b = temp2(1); + + temp = b*b - 3*a*d0; + if (temp < 0 || a == 0) + { + // This is probably a line so just pick the lowest point + if (f0 < f_x2) + return 0; + else + return x2; + } + temp = (-b + std::sqrt(temp))/(3*a); + return put_in_range(0, x2, temp); + } + +// ---------------------------------------------------------------------------------------- + + inline double lagrange_poly_min_extrap ( + double p1, + double p2, + double p3, + double f1, + double f2, + double f3 + ) + { + DLIB_ASSERT(p1 < p2 && p2 < p3 && f1 >= f2 && f2 <= f3, + " p1: " << p1 + << " p2: " << p2 + << " p3: " << p3 + << " f1: " << f1 + << " f2: " << f2 + << " f3: " << f3); + + // This formula is out of the book Nonlinear Optimization by Andrzej Ruszczynski. See section 5.2. + double temp1 = f1*(p3*p3 - p2*p2) + f2*(p1*p1 - p3*p3) + f3*(p2*p2 - p1*p1); + double temp2 = 2*(f1*(p3 - p2) + f2*(p1 - p3) + f3*(p2 - p1) ); + + if (temp2 == 0) + { + return p2; + } + + const double result = temp1/temp2; + + // do a final sanity check to make sure the result is in the right range + if (p1 <= result && result <= p3) + { + return result; + } + else + { + return std::min(std::max(p1,result),p3); + } + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct, + typename funct_der + > + double line_search ( + const funct& f, + const double f0, + const funct_der& der, + const double d0, + double rho, + double sigma, + double min_f, + unsigned long max_iter + ) + { + DLIB_ASSERT ( + 0 < rho && rho < sigma && sigma < 1 && max_iter > 0, + "\tdouble line_search()" + << "\n\tYou have given invalid arguments to this function" + << "\n\t sigma: " << sigma + << "\n\t rho: " << rho + << "\n\t max_iter: " << max_iter + ); + + // The bracketing phase of this function is implemented according to block 2.6.2 from + // the book Practical Methods of Optimization by R. Fletcher. The sectioning + // phase is an implementation of 2.6.4 from the same book. + + // 1 <= tau1a < tau1b. Controls the alpha jump size during the bracketing phase of + // the search. + const double tau1a = 1.4; + const double tau1b = 9; + + // it must be the case that 0 < tau2 < tau3 <= 1/2 for the algorithm to function + // correctly but the specific values of tau2 and tau3 aren't super important. + const double tau2 = 1.0/10.0; + const double tau3 = 1.0/2.0; + + + // Stop right away and return a step size of 0 if the gradient is 0 at the starting point + if (std::abs(d0) <= std::abs(f0)*std::numeric_limits<double>::epsilon()) + return 0; + + // Stop right away if the current value is good enough according to min_f + if (f0 <= min_f) + return 0; + + // Figure out a reasonable upper bound on how large alpha can get. + const double mu = (min_f-f0)/(rho*d0); + + + double alpha = 1; + if (mu < 0) + alpha = -alpha; + alpha = put_in_range(0, 0.65*mu, alpha); + + + double last_alpha = 0; + double last_val = f0; + double last_val_der = d0; + + // The bracketing stage will find a range of points [a,b] + // that contains a reasonable solution to the line search + double a, b; + + // These variables will hold the values and derivatives of f(a) and f(b) + double a_val, b_val, a_val_der, b_val_der; + + // This thresh value represents the Wolfe curvature condition + const double thresh = std::abs(sigma*d0); + + unsigned long itr = 0; + // do the bracketing stage to find the bracket range [a,b] + while (true) + { + ++itr; + const double val = f(alpha); + const double val_der = der(alpha); + + // we are done with the line search since we found a value smaller + // than the minimum f value + if (val <= min_f) + return alpha; + + if (val > f0 + rho*alpha*d0 || val >= last_val) + { + a_val = last_val; + a_val_der = last_val_der; + b_val = val; + b_val_der = val_der; + + a = last_alpha; + b = alpha; + break; + } + + if (std::abs(val_der) <= thresh) + return alpha; + + // if we are stuck not making progress then quit with the current alpha + if (last_alpha == alpha || itr >= max_iter) + return alpha; + + if (val_der >= 0) + { + a_val = val; + a_val_der = val_der; + b_val = last_val; + b_val_der = last_val_der; + + a = alpha; + b = last_alpha; + break; + } + + + + const double temp = alpha; + // Pick a larger range [first, last]. We will pick the next alpha in that + // range. + double first, last; + if (mu > 0) + { + first = std::min(mu, alpha + tau1a*(alpha - last_alpha)); + last = std::min(mu, alpha + tau1b*(alpha - last_alpha)); + } + else + { + first = std::max(mu, alpha + tau1a*(alpha - last_alpha)); + last = std::max(mu, alpha + tau1b*(alpha - last_alpha)); + } + + + + // pick a point between first and last by doing some kind of interpolation + if (last_alpha < alpha) + alpha = last_alpha + (alpha-last_alpha)*poly_min_extrap(last_val, last_val_der, val, val_der, 1e10); + else + alpha = alpha + (last_alpha-alpha)*poly_min_extrap(val, val_der, last_val, last_val_der, 1e10); + + alpha = put_in_range(first,last,alpha); + + last_alpha = temp; + + last_val = val; + last_val_der = val_der; + + } + + + // Now do the sectioning phase from 2.6.4 + while (true) + { + ++itr; + double first = a + tau2*(b-a); + double last = b - tau3*(b-a); + + // use interpolation to pick alpha between first and last + alpha = a + (b-a)*poly_min_extrap(a_val, a_val_der, b_val, b_val_der); + alpha = put_in_range(first,last,alpha); + + const double val = f(alpha); + const double val_der = der(alpha); + + // we are done with the line search since we found a value smaller + // than the minimum f value or we ran out of iterations. + if (val <= min_f || itr >= max_iter) + return alpha; + + // stop if the interval gets so small that it isn't shrinking any more due to rounding error + if (a == first || b == last) + { + return b; + } + + // If alpha has basically become zero then just stop. Think of it like this, + // if we take the largest possible alpha step will the objective function + // change at all? If not then there isn't any point looking for a better + // alpha. + const double max_possible_alpha = std::max(std::abs(a),std::abs(b)); + if (std::abs(max_possible_alpha*d0) <= std::abs(f0)*std::numeric_limits<double>::epsilon()) + return alpha; + + + if (val > f0 + rho*alpha*d0 || val >= a_val) + { + b = alpha; + b_val = val; + b_val_der = val_der; + } + else + { + if (std::abs(val_der) <= thresh) + return alpha; + + if ( (b-a)*val_der >= 0) + { + b = a; + b_val = a_val; + b_val_der = a_val_der; + } + + a = alpha; + a_val = val; + a_val_der = val_der; + } + } + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + double backtracking_line_search ( + const funct& f, + double f0, + double d0, + double alpha, + double rho, + unsigned long max_iter + ) + { + DLIB_ASSERT ( + 0 < rho && rho < 1 && max_iter > 0, + "\tdouble backtracking_line_search()" + << "\n\tYou have given invalid arguments to this function" + << "\n\t rho: " << rho + << "\n\t max_iter: " << max_iter + ); + + // make sure alpha is going in the right direction. That is, it should be opposite + // the direction of the gradient. + if ((d0 > 0 && alpha > 0) || + (d0 < 0 && alpha < 0)) + { + alpha *= -1; + } + + bool have_prev_alpha = false; + double prev_alpha = 0; + double prev_val = 0; + unsigned long iter = 0; + while (true) + { + ++iter; + const double val = f(alpha); + if (val <= f0 + alpha*rho*d0 || iter >= max_iter) + { + return alpha; + } + else + { + // Interpolate a new alpha. We also make sure the step by which we + // reduce alpha is not super small. + double step; + if (!have_prev_alpha) + { + if (d0 < 0) + step = alpha*put_in_range(0.1,0.9, poly_min_extrap(f0, d0, val)); + else + step = alpha*put_in_range(0.1,0.9, poly_min_extrap(f0, -d0, val)); + have_prev_alpha = true; + } + else + { + if (d0 < 0) + step = put_in_range(0.1*alpha,0.9*alpha, poly_min_extrap(f0, d0, alpha, val, prev_alpha, prev_val)); + else + step = put_in_range(0.1*alpha,0.9*alpha, -poly_min_extrap(f0, -d0, -alpha, val, -prev_alpha, prev_val)); + } + + prev_alpha = alpha; + prev_val = val; + + alpha = step; + } + } + } + +// ---------------------------------------------------------------------------------------- + + class optimize_single_variable_failure : public error { + public: optimize_single_variable_failure(const std::string& s):error(s){} + }; + +// ---------------------------------------------------------------------------------------- + + template <typename funct> + double find_min_single_variable ( + const funct& f, + double& starting_point, + const double begin = -1e200, + const double end = 1e200, + const double eps = 1e-3, + const long max_iter = 100, + const double initial_search_radius = 1 + ) + { + DLIB_CASSERT( eps > 0 && + max_iter > 1 && + begin <= starting_point && starting_point <= end && + initial_search_radius > 0, + "eps: " << eps + << "\n max_iter: "<< max_iter + << "\n begin: "<< begin + << "\n end: "<< end + << "\n starting_point: "<< starting_point + << "\n initial_search_radius: "<< initial_search_radius + ); + + double search_radius = initial_search_radius; + + double p1=0, p2=0, p3=0, f1=0, f2=0, f3=0; + long f_evals = 1; + + if (begin == end) + { + return f(starting_point); + } + + using std::abs; + using std::min; + using std::max; + + // find three bracketing points such that f1 > f2 < f3. Do this by generating a sequence + // of points expanding away from 0. Also note that, in the following code, it is always the + // case that p1 < p2 < p3. + + + + // The first thing we do is get a starting set of 3 points that are inside the [begin,end] bounds + p1 = max(starting_point-search_radius, begin); + p3 = min(starting_point+search_radius, end); + f1 = f(p1); + f3 = f(p3); + + if (starting_point == p1 || starting_point == p3) + { + p2 = (p1+p3)/2; + f2 = f(p2); + } + else + { + p2 = starting_point; + f2 = f(starting_point); + } + + f_evals += 2; + + // Now we have 3 points on the function. Start looking for a bracketing set such that + // f1 > f2 < f3 is the case. + while ( !(f1 > f2 && f2 < f3)) + { + // check for hitting max_iter or if the interval is now too small + if (f_evals >= max_iter) + { + throw optimize_single_variable_failure( + "The max number of iterations of single variable optimization have been reached\n" + "without converging."); + } + if (p3-p1 < eps) + { + if (f1 < min(f2,f3)) + { + starting_point = p1; + return f1; + } + + if (f2 < min(f1,f3)) + { + starting_point = p2; + return f2; + } + + starting_point = p3; + return f3; + } + + // If the left most points are identical in function value then expand out the + // left a bit, unless it's already at bound or we would drop that left most + // point anyway because it's bad. + if (f1==f2 && f1<f3 && p1!=begin) + { + p1 = max(p1 - search_radius, begin); + f1 = f(p1); + ++f_evals; + search_radius *= 2; + continue; + } + if (f2==f3 && f3<f1 && p3!=end) + { + p3 = min(p3 + search_radius, end); + f3 = f(p3); + ++f_evals; + search_radius *= 2; + continue; + } + + + // if f1 is small then take a step to the left + if (f1 <= f3) + { + // check if the minimum is butting up against the bounds and if so then pick + // a point between p1 and p2 in the hopes that shrinking the interval will + // be a good thing to do. Or if p1 and p2 aren't differentiated then try and + // get them to obtain different values. + if (p1 == begin || (f1 == f2 && (end-begin) < search_radius )) + { + p3 = p2; + f3 = f2; + + p2 = (p1+p2)/2.0; + f2 = f(p2); + } + else + { + // pick a new point to the left of our current bracket + p3 = p2; + f3 = f2; + + p2 = p1; + f2 = f1; + + p1 = max(p1 - search_radius, begin); + f1 = f(p1); + + search_radius *= 2; + } + + } + // otherwise f3 is small and we should take a step to the right + else + { + // check if the minimum is butting up against the bounds and if so then pick + // a point between p2 and p3 in the hopes that shrinking the interval will + // be a good thing to do. Or if p2 and p3 aren't differentiated then try and + // get them to obtain different values. + if (p3 == end || (f2 == f3 && (end-begin) < search_radius)) + { + p1 = p2; + f1 = f2; + + p2 = (p3+p2)/2.0; + f2 = f(p2); + } + else + { + // pick a new point to the right of our current bracket + p1 = p2; + f1 = f2; + + p2 = p3; + f2 = f3; + + p3 = min(p3 + search_radius, end); + f3 = f(p3); + + search_radius *= 2; + } + } + + ++f_evals; + } + + + // Loop until we have done the max allowable number of iterations or + // the bracketing window is smaller than eps. + // Within this loop we maintain the invariant that: f1 > f2 < f3 and p1 < p2 < p3 + const double tau = 0.1; + while( f_evals < max_iter && p3-p1 > eps) + { + double p_min = lagrange_poly_min_extrap(p1,p2,p3, f1,f2,f3); + + + // make sure p_min isn't too close to the three points we already have + if (p_min < p2) + { + const double min_dist = (p2-p1)*tau; + if (abs(p1-p_min) < min_dist) + { + p_min = p1 + min_dist; + } + else if (abs(p2-p_min) < min_dist) + { + p_min = p2 - min_dist; + } + } + else + { + const double min_dist = (p3-p2)*tau; + if (abs(p2-p_min) < min_dist) + { + p_min = p2 + min_dist; + } + else if (abs(p3-p_min) < min_dist) + { + p_min = p3 - min_dist; + } + } + + // make sure one side of the bracket isn't super huge compared to the other + // side. If it is then contract it. + const double bracket_ratio = abs(p1-p2)/abs(p2-p3); + if ( !( bracket_ratio < 10 && bracket_ratio > 0.1) ) + { + // Force p_min to be on a reasonable side. But only if lagrange_poly_min_extrap() + // didn't put it on a good side already. + if (bracket_ratio > 1 && p_min > p2) + p_min = (p1+p2)/2; + else if (p_min < p2) + p_min = (p2+p3)/2; + } + + + const double f_min = f(p_min); + + + // Remove one of the endpoints of our bracket depending on where the new point falls. + if (p_min < p2) + { + if (f1 > f_min && f_min < f2) + { + p3 = p2; + f3 = f2; + p2 = p_min; + f2 = f_min; + } + else + { + p1 = p_min; + f1 = f_min; + } + } + else + { + if (f2 > f_min && f_min < f3) + { + p1 = p2; + f1 = f2; + p2 = p_min; + f2 = f_min; + } + else + { + p3 = p_min; + f3 = f_min; + } + } + + + ++f_evals; + } + + if (f_evals >= max_iter) + { + throw optimize_single_variable_failure( + "The max number of iterations of single variable optimization have been reached\n" + "without converging."); + } + + starting_point = p2; + return f2; + } + +// ---------------------------------------------------------------------------------------- + + template <typename funct> + class negate_function_object + { + public: + negate_function_object(const funct& f_) : f(f_){} + + template <typename T> + double operator()(const T& x) const + { + return -f(x); + } + + private: + const funct& f; + }; + + template <typename funct> + const negate_function_object<funct> negate_function(const funct& f) { return negate_function_object<funct>(f); } + +// ---------------------------------------------------------------------------------------- + + template <typename funct> + double find_max_single_variable ( + const funct& f, + double& starting_point, + const double begin = -1e200, + const double end = 1e200, + const double eps = 1e-3, + const long max_iter = 100, + const double initial_search_radius = 1 + ) + { + return -find_min_single_variable(negate_function(f), starting_point, begin, end, eps, max_iter, initial_search_radius); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_OPTIMIZATIOn_LINE_SEARCH_H_ + |