diff options
Diffstat (limited to 'ml/dlib/dlib/global_optimization')
7 files changed, 3297 insertions, 0 deletions
diff --git a/ml/dlib/dlib/global_optimization/find_max_global.h b/ml/dlib/dlib/global_optimization/find_max_global.h new file mode 100644 index 000000000..5356129f5 --- /dev/null +++ b/ml/dlib/dlib/global_optimization/find_max_global.h @@ -0,0 +1,511 @@ +// Copyright (C) 2017 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_FiND_GLOBAL_MAXIMUM_hH_ +#define DLIB_FiND_GLOBAL_MAXIMUM_hH_ + +#include "find_max_global_abstract.h" +#include "global_function_search.h" +#include "../metaprogramming.h" +#include <utility> +#include <chrono> + +namespace dlib +{ + namespace gopt_impl + { + + // ---------------------------------------------------------------------------------------- + + class disable_decay_to_scalar + { + const matrix<double,0,1>& a; + public: + disable_decay_to_scalar(const matrix<double,0,1>& a) : a(a){} + operator const matrix<double,0,1>&() const { return a;} + }; + + + template <typename T, size_t... indices> + auto _cwv ( + T&& f, + const matrix<double,0,1>& a, + compile_time_integer_list<indices...> + ) -> decltype(f(a(indices-1)...)) + { + DLIB_CASSERT(a.size() == sizeof...(indices), + "You invoked dlib::call_function_and_expand_args(f,a) but the number of arguments expected by f() doesn't match the size of 'a'. " + << "Expected " << sizeof...(indices) << " arguments but got " << a.size() << "." + ); + return f(a(indices-1)...); + } + + // Visual studio, as of November 2017, doesn't support C++11 and can't compile this code. + // So we write the terrible garbage in the #else for visual studio. When Visual Studio supports C++11 I'll update this #ifdef to use the C++11 code. 
+#ifndef _MSC_VER + template <size_t max_unpack> + struct call_function_and_expand_args + { + template <typename T> + static auto go(T&& f, const matrix<double,0,1>& a) -> decltype(_cwv(std::forward<T>(f),a,typename make_compile_time_integer_range<max_unpack>::type())) + { + return _cwv(std::forward<T>(f),a,typename make_compile_time_integer_range<max_unpack>::type()); + } + + template <typename T> + static auto go(T&& f, const matrix<double,0,1>& a) -> decltype(call_function_and_expand_args<max_unpack-1>::template go(std::forward<T>(f),a)) + { + return call_function_and_expand_args<max_unpack-1>::go(std::forward<T>(f),a); + } + }; + + template <> + struct call_function_and_expand_args<0> + { + template <typename T> + static auto go(T&& f, const matrix<double,0,1>& a) -> decltype(f(disable_decay_to_scalar(a))) + { + return f(disable_decay_to_scalar(a)); + } + }; +#else + template <size_t max_unpack> + struct call_function_and_expand_args + { +template <typename T> static auto go(T&& f, const matrix<double, 0, 1>& a) -> decltype(f(disable_decay_to_scalar(a))) {return f(disable_decay_to_scalar(a)); } +template <typename T> static auto go(T&& f, const matrix<double, 0, 1>& a) -> decltype(f(a(0))) { DLIB_CASSERT(a.size() == 1); return f(a(0)); } +template <typename T> static auto go(T&& f, const matrix<double, 0, 1>& a) -> decltype(f(a(0),a(1))) { DLIB_CASSERT(a.size() == 2); return f(a(0),a(1)); } +template <typename T> static auto go(T&& f, const matrix<double, 0, 1>& a) -> decltype(f(a(0), a(1), a(2))) { DLIB_CASSERT(a.size() == 3); return f(a(0), a(1),a(2)); } +template <typename T> static auto go(T&& f, const matrix<double, 0, 1>& a) -> decltype(f(a(0), a(1), a(2), a(3))) { DLIB_CASSERT(a.size() == 4); return f(a(0), a(1), a(2), a(3)); } +template <typename T> static auto go(T&& f, const matrix<double, 0, 1>& a) -> decltype(f(a(0), a(1), a(2), a(3), a(4))) { DLIB_CASSERT(a.size() == 5); return f(a(0), a(1), a(2), a(3), a(4)); } +template <typename T> static auto 
go(T&& f, const matrix<double, 0, 1>& a) -> decltype(f(a(0), a(1), a(2), a(3), a(4), a(5))) { DLIB_CASSERT(a.size() == 6); return f(a(0), a(1), a(2), a(3), a(4), a(5)); } +template <typename T> static auto go(T&& f, const matrix<double, 0, 1>& a) -> decltype(f(a(0), a(1), a(2), a(3), a(4), a(5), a(6))) { DLIB_CASSERT(a.size() == 7); return f(a(0), a(1), a(2), a(3), a(4), a(5), a(6)); } + }; +#endif + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + template <typename T> + auto call_function_and_expand_args( + T&& f, + const matrix<double,0,1>& a + ) -> decltype(gopt_impl::call_function_and_expand_args<40>::go(f,a)) + { + // unpack up to 40 parameters when calling f() + return gopt_impl::call_function_and_expand_args<40>::go(std::forward<T>(f),a); + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + struct max_function_calls + { + max_function_calls() = default; + explicit max_function_calls(size_t max_calls) : max_calls(max_calls) {} + size_t max_calls = std::numeric_limits<size_t>::max(); + }; + +// ---------------------------------------------------------------------------------------- + + const auto FOREVER = std::chrono::hours(24*356*290); // 290 years + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template < + typename funct + > + std::pair<size_t,function_evaluation> find_max_global ( + std::vector<funct>& functions, + std::vector<function_spec> specs, + const max_function_calls num, + const std::chrono::nanoseconds max_runtime, + double solver_epsilon, + double ymult + ) + { + // Decide which parameters should be searched on a log scale. 
Basically, it's + // common for machine learning models to have parameters that should be searched on + // a log scale (e.g. SVM C). These parameters are usually identifiable because + // they have bounds like [1e-5 1e10], that is, they span a very large range of + // magnitudes from really small to really big. So there we are going to check for + // that and if we find parameters with that kind of bound constraints we will + // transform them to a log scale automatically. + std::vector<std::vector<bool>> log_scale(specs.size()); + for (size_t i = 0; i < specs.size(); ++i) + { + for (long j = 0; j < specs[i].lower.size(); ++j) + { + if (!specs[i].is_integer_variable[j] && specs[i].lower(j) > 0 && specs[i].upper(j)/specs[i].lower(j) >= 1000) + { + log_scale[i].push_back(true); + specs[i].lower(j) = std::log(specs[i].lower(j)); + specs[i].upper(j) = std::log(specs[i].upper(j)); + } + else + { + log_scale[i].push_back(false); + } + } + } + + global_function_search opt(specs); + opt.set_solver_epsilon(solver_epsilon); + + const auto time_to_stop = std::chrono::steady_clock::now() + max_runtime; + + // Now run the main solver loop. + for (size_t i = 0; i < num.max_calls && std::chrono::steady_clock::now() < time_to_stop; ++i) + { + auto next = opt.get_next_x(); + matrix<double,0,1> x = next.x(); + // Undo any log-scaling that was applied to the variables before we pass them + // to the functions being optimized. + for (long j = 0; j < x.size(); ++j) + { + if (log_scale[next.function_idx()][j]) + x(j) = std::exp(x(j)); + } + double y = ymult*call_function_and_expand_args(functions[next.function_idx()], x); + next.set(y); + } + + + matrix<double,0,1> x; + double y; + size_t function_idx; + opt.get_best_function_eval(x,y,function_idx); + // Undo any log-scaling that was applied to the variables before we output them. 
+ for (long j = 0; j < x.size(); ++j) + { + if (log_scale[function_idx][j]) + x(j) = std::exp(x(j)); + } + return std::make_pair(function_idx, function_evaluation(x,y/ymult)); + } + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + std::pair<size_t,function_evaluation> find_max_global ( + std::vector<funct>& functions, + std::vector<function_spec> specs, + const max_function_calls num, + const std::chrono::nanoseconds max_runtime = FOREVER, + double solver_epsilon = 0 + ) + { + return impl::find_max_global(functions, std::move(specs), num, max_runtime, solver_epsilon, +1); + } + + template < + typename funct + > + std::pair<size_t,function_evaluation> find_min_global ( + std::vector<funct>& functions, + std::vector<function_spec> specs, + const max_function_calls num, + const std::chrono::nanoseconds max_runtime = FOREVER, + double solver_epsilon = 0 + ) + { + return impl::find_max_global(functions, std::move(specs), num, max_runtime, solver_epsilon, -1); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const std::vector<bool>& is_integer_variable, + const max_function_calls num, + const std::chrono::nanoseconds max_runtime = FOREVER, + double solver_epsilon = 0 + ) + { + std::vector<funct> functions(1,std::move(f)); + std::vector<function_spec> specs(1, function_spec(bound1, bound2, is_integer_variable)); + return find_max_global(functions, std::move(specs), num, max_runtime, solver_epsilon).second; + } + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const std::vector<bool>& is_integer_variable, + const max_function_calls num, + const std::chrono::nanoseconds 
max_runtime = FOREVER, + double solver_epsilon = 0 + ) + { + std::vector<funct> functions(1,std::move(f)); + std::vector<function_spec> specs(1, function_spec(bound1, bound2, is_integer_variable)); + return find_min_global(functions, std::move(specs), num, max_runtime, solver_epsilon).second; + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const std::vector<bool>& is_integer_variable, + const max_function_calls num, + double solver_epsilon + ) + { + return find_max_global(std::move(f), bound1, bound2, is_integer_variable, num, FOREVER, solver_epsilon); + } + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const std::vector<bool>& is_integer_variable, + const max_function_calls num, + double solver_epsilon + ) + { + return find_min_global(std::move(f), bound1, bound2, is_integer_variable, num, FOREVER, solver_epsilon); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const max_function_calls num, + const std::chrono::nanoseconds max_runtime = FOREVER, + double solver_epsilon = 0 + ) + { + return find_max_global(std::move(f), bound1, bound2, std::vector<bool>(bound1.size(),false), num, max_runtime, solver_epsilon); + } + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const max_function_calls num, + const std::chrono::nanoseconds max_runtime = FOREVER, + double solver_epsilon = 0 + ) + { + return find_min_global(std::move(f), bound1, 
bound2, std::vector<bool>(bound1.size(),false), num, max_runtime, solver_epsilon); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const max_function_calls num, + double solver_epsilon + ) + { + return find_max_global(std::move(f), bound1, bound2, std::vector<bool>(bound1.size(),false), num, FOREVER, solver_epsilon); + } + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const max_function_calls num, + double solver_epsilon + ) + { + return find_min_global(std::move(f), bound1, bound2, std::vector<bool>(bound1.size(),false), num, FOREVER, solver_epsilon); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const double bound1, + const double bound2, + const max_function_calls num, + const std::chrono::nanoseconds max_runtime = FOREVER, + double solver_epsilon = 0 + ) + { + return find_max_global(std::move(f), matrix<double,0,1>({bound1}), matrix<double,0,1>({bound2}), num, max_runtime, solver_epsilon); + } + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const double bound1, + const double bound2, + const max_function_calls num, + const std::chrono::nanoseconds max_runtime = FOREVER, + double solver_epsilon = 0 + ) + { + return find_min_global(std::move(f), matrix<double,0,1>({bound1}), matrix<double,0,1>({bound2}), num, max_runtime, solver_epsilon); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const double bound1, + const double 
bound2, + const max_function_calls num, + double solver_epsilon + ) + { + return find_max_global(std::move(f), matrix<double,0,1>({bound1}), matrix<double,0,1>({bound2}), num, FOREVER, solver_epsilon); + } + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const double bound1, + const double bound2, + const max_function_calls num, + double solver_epsilon + ) + { + return find_min_global(std::move(f), matrix<double,0,1>({bound1}), matrix<double,0,1>({bound2}), num, FOREVER, solver_epsilon); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const std::chrono::nanoseconds max_runtime, + double solver_epsilon = 0 + ) + { + return find_max_global(std::move(f), bound1, bound2, max_function_calls(), max_runtime, solver_epsilon); + } + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const std::chrono::nanoseconds max_runtime, + double solver_epsilon = 0 + ) + { + return find_min_global(std::move(f), bound1, bound2, max_function_calls(), max_runtime, solver_epsilon); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const double bound1, + const double bound2, + const std::chrono::nanoseconds max_runtime, + double solver_epsilon = 0 + ) + { + return find_max_global(std::move(f), bound1, bound2, max_function_calls(), max_runtime, solver_epsilon); + } + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const double bound1, + const double bound2, + const std::chrono::nanoseconds max_runtime, + double solver_epsilon = 0 + ) + { + return 
find_min_global(std::move(f), bound1, bound2, max_function_calls(), max_runtime, solver_epsilon); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const std::vector<bool>& is_integer_variable, + const std::chrono::nanoseconds max_runtime, + double solver_epsilon = 0 + ) + { + return find_max_global(std::move(f), bound1, bound2, is_integer_variable, max_function_calls(), max_runtime, solver_epsilon); + } + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const std::vector<bool>& is_integer_variable, + const std::chrono::nanoseconds max_runtime, + double solver_epsilon = 0 + ) + { + return find_min_global(std::move(f), bound1, bound2, is_integer_variable, max_function_calls(), max_runtime, solver_epsilon); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_FiND_GLOBAL_MAXIMUM_hH_ + diff --git a/ml/dlib/dlib/global_optimization/find_max_global_abstract.h b/ml/dlib/dlib/global_optimization/find_max_global_abstract.h new file mode 100644 index 000000000..4be62b154 --- /dev/null +++ b/ml/dlib/dlib/global_optimization/find_max_global_abstract.h @@ -0,0 +1,496 @@ +// Copyright (C) 2017 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
+#undef DLIB_FiND_GLOBAL_MAXIMUM_ABSTRACT_hH_ +#ifdef DLIB_FiND_GLOBAL_MAXIMUM_ABSTRACT_hH_ + +#include "upper_bound_function_abstract.h" +#include "global_function_search_abstract.h" +#include "../metaprogramming.h" +#include "../matrix.h" +#include <utility> +#include <chrono> + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename T + > + auto call_function_and_expand_args( + T&& f, + const matrix<double,0,1>& args + ) -> decltype(f(args or args expanded out as discussed below)); + /*! + requires + - f is a function object with one of the following signatures: + auto f(matrix<double,0,1>) + auto f(double) + auto f(double,double) + auto f(double,double,double) + ... + auto f(double,double,...,double) // up to 40 double arguments + - if (f() explicitly expands its arguments) then + - args.size() == the number of arguments taken by f. + ensures + - This function invokes f() with the given arguments and returns the result. + However, the signature of f() is allowed to vary. In particular, if f() + takes a matrix<double,0,1> as a single argument then this function simply + calls f(args). However, if f() takes double arguments then args is expanded + appropriately, i.e. it calls one of the following as appropriate: + f(args(0)) + f(args(0),args(1)) + ... + f(args(0),args(1),...,args(N)) + and the result of f() is returned. + !*/ + +// ---------------------------------------------------------------------------------------- + + struct max_function_calls + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a simple typed integer class used to strongly type the "max number + of function calls" argument to find_max_global() and find_min_global(). 
+ + !*/ + + max_function_calls() = default; + + explicit max_function_calls(size_t max_calls) : max_calls(max_calls) {} + + size_t max_calls = std::numeric_limits<size_t>::max(); + }; + +// ---------------------------------------------------------------------------------------- + + const auto FOREVER = std::chrono::hours(24*356*290); // 290 years, basically forever + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + std::pair<size_t,function_evaluation> find_max_global ( + std::vector<funct>& functions, + const std::vector<function_spec>& specs, + const max_function_calls num, + const std::chrono::nanoseconds max_runtime = FOREVER, + double solver_epsilon = 0 + ); + /*! + requires + - functions.size() != 0 + - functions.size() == specs.size() + - solver_epsilon >= 0 + - for all valid i: + - functions[i] is a real valued multi-variate function object. Moreover, + it must be callable via an expression of the form: + call_function_and_expand_args(functions[i], specs.lower). This means + function[i] should have a signature like one of the following: + double f(matrix<double,0,1>) + double f(double) + double f(double,double) + etc. + - The range of inputs defined by specs[i] must be valid inputs to + functions[i]. + ensures + - This function performs global optimization on the set of given functions. + The goal is to maximize the following objective function: + max_{i,x_i}: functions[i](x_i) + subject to the constraints on x_i defined by specs[i]. + Once found, the return value of find_max_global() is: + make_pair(i, function_evaluation(x_i,functions[i](x_i))). + That is, we search for the settings of i and x that return the largest output + and return those settings. + - The search is performed using the global_function_search object. See its + documentation for details of the algorithm. + - We set the global_function_search::get_solver_epsilon() parameter to + solver_epsilon. 
Therefore, the search will only attempt to find a global + maximizer to at most solver_epsilon accuracy. Once a local maximizer is + found to that accuracy the search will focus entirely on finding other maxima + elsewhere rather than on further improving the current local optima found so + far. That is, once a local maxima is identified to about solver_epsilon + accuracy, the algorithm will spend all its time exploring the functions to + find other local maxima to investigate. An epsilon of 0 means it will keep + solving until it reaches full floating point precision. Larger values will + cause it to switch to pure global exploration sooner and therefore might be + more effective if your objective function has many local maxima and you don't + care about a super high precision solution. + - find_max_global() runs until one of the following is true: + - The total number of calls to the provided functions is == num.max_calls + - More than max_runtime time has elapsed since the start of this function. + - Any variables that satisfy the following conditions are optimized on a log-scale: + - The lower bound on the variable is > 0 + - The ratio of the upper bound to lower bound is >= 1000 + - The variable is not an integer variable + We do this because it's common to optimize machine learning models that have + parameters with bounds in a range such as [1e-5 to 1e10] (e.g. the SVM C + parameter) and it's much more appropriate to optimize these kinds of + variables on a log scale. So we transform them by applying std::log() to + them and then undo the transform via std::exp() before invoking the function + being optimized. Therefore, this transformation is invisible to the user + supplied functions. In most cases, it improves the efficiency of the + optimizer. 
+ !*/ + + template < + typename funct + > + std::pair<size_t,function_evaluation> find_min_global ( + std::vector<funct>& functions, + const std::vector<function_spec>& specs, + const max_function_calls num, + const std::chrono::nanoseconds max_runtime = FOREVER, + double solver_epsilon = 0 + ); + /*! + This function is identical to the find_max_global() defined immediately above, + except that we perform minimization rather than maximization. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const std::vector<bool>& is_integer_variable, + const max_function_calls num, + const std::chrono::nanoseconds max_runtime = FOREVER, + double solver_epsilon = 0 + ); + /*! + requires + - bound1.size() == bound2.size() == is_integer_variable.size() + - for all valid i: bound1(i) != bound2(i) + - solver_epsilon >= 0 + - f() is a real valued multi-variate function object. Moreover, it must be + callable via an expression of the form: call_function_and_expand_args(f, + bound1). This means f() should have a signature like one of the following: + double f(matrix<double,0,1>) + double f(double) + double f(double,double) + etc. + - The range of inputs defined by function_spec(bound1,bound2,is_integer_variable) + must be valid inputs to f(). + ensures + - This function performs global optimization on the given f() function. + The goal is to maximize the following objective function: + f(x) + subject to the constraints on x defined by function_spec(bound1,bound2,is_integer_variable). + Once found, the return value of find_max_global() is: + function_evaluation(x,f(x))). + That is, we search for the setting of x that returns the largest output and + return that setting. + - The search is performed using the global_function_search object. 
See its + documentation for details of the algorithm. + - We set the global_function_search::get_solver_epsilon() parameter to + solver_epsilon. Therefore, the search will only attempt to find a global + maximizer to at most solver_epsilon accuracy. Once a local maximizer is + found to that accuracy the search will focus entirely on finding other maxima + elsewhere rather than on further improving the current local optima found so + far. That is, once a local maxima is identified to about solver_epsilon + accuracy, the algorithm will spend all its time exploring the function to + find other local maxima to investigate. An epsilon of 0 means it will keep + solving until it reaches full floating point precision. Larger values will + cause it to switch to pure global exploration sooner and therefore might be + more effective if your objective function has many local maxima and you don't + care about a super high precision solution. + - find_max_global() runs until one of the following is true: + - The total number of calls to f() is == num.max_calls + - More than max_runtime time has elapsed since the start of this function. + - Any variables that satisfy the following conditions are optimized on a log-scale: + - The lower bound on the variable is > 0 + - The ratio of the upper bound to lower bound is >= 1000 + - The variable is not an integer variable + We do this because it's common to optimize machine learning models that have + parameters with bounds in a range such as [1e-5 to 1e10] (e.g. the SVM C + parameter) and it's much more appropriate to optimize these kinds of + variables on a log scale. So we transform them by applying std::log() to + them and then undo the transform via std::exp() before invoking the function + being optimized. Therefore, this transformation is invisible to the user + supplied functions. In most cases, it improves the efficiency of the + optimizer. 
+ !*/ + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const std::vector<bool>& is_integer_variable, + const max_function_calls num, + const std::chrono::nanoseconds max_runtime = FOREVER, + double solver_epsilon = 0 + ); + /*! + This function is identical to the find_max_global() defined immediately above, + except that we perform minimization rather than maximization. + !*/ + +// ---------------------------------------------------------------------------------------- +// The following functions are just convenient overloads for calling the above defined +// find_max_global() and find_min_global() routines. +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const std::vector<bool>& is_integer_variable, + const max_function_calls num, + double solver_epsilon + ) + { + return find_max_global(std::move(f), bound1, bound2, std::vector<bool>(bound1.size(),false), num, FOREVER, solver_epsilon); + } + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const std::vector<bool>& is_integer_variable, + const max_function_calls num, + double solver_epsilon + ) + { + return find_min_global(std::move(f), bound1, bound2, std::vector<bool>(bound1.size(),false), num, FOREVER, solver_epsilon); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const max_function_calls num, + const std::chrono::nanoseconds max_runtime = FOREVER, + double solver_epsilon = 0 + ) + 
{ + return find_max_global(std::move(f), bound1, bound2, std::vector<bool>(bound1.size(),false), num, max_runtime, solver_epsilon); + } + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const max_function_calls num, + const std::chrono::nanoseconds max_runtime = FOREVER, + double solver_epsilon = 0 + ) + { + return find_min_global(std::move(f), bound1, bound2, std::vector<bool>(bound1.size(),false), num, max_runtime, solver_epsilon); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const max_function_calls num, + double solver_epsilon + ) + { + return find_max_global(std::move(f), bound1, bound2, std::vector<bool>(bound1.size(),false), num, FOREVER, solver_epsilon); + } + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const max_function_calls num, + double solver_epsilon + ) + { + return find_min_global(std::move(f), bound1, bound2, std::vector<bool>(bound1.size(),false), num, FOREVER, solver_epsilon); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const double bound1, + const double bound2, + const max_function_calls num, + const std::chrono::nanoseconds max_runtime = FOREVER, + double solver_epsilon = 0 + ) + { + return find_max_global(std::move(f), matrix<double,0,1>({bound1}), matrix<double,0,1>({bound2}), num, max_runtime, solver_epsilon); + } + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const double bound1, + const double bound2, + const 
max_function_calls num, + const std::chrono::nanoseconds max_runtime = FOREVER, + double solver_epsilon = 0 + ) + { + return find_min_global(std::move(f), matrix<double,0,1>({bound1}), matrix<double,0,1>({bound2}), num, max_runtime, solver_epsilon); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const double bound1, + const double bound2, + const max_function_calls num, + double solver_epsilon + ) + { + return find_max_global(std::move(f), matrix<double,0,1>({bound1}), matrix<double,0,1>({bound2}), num, FOREVER, solver_epsilon); + } + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const double bound1, + const double bound2, + const max_function_calls num, + double solver_epsilon + ) + { + return find_min_global(std::move(f), matrix<double,0,1>({bound1}), matrix<double,0,1>({bound2}), num, FOREVER, solver_epsilon); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const std::chrono::nanoseconds max_runtime, + double solver_epsilon = 0 + ) + { + return find_max_global(std::move(f), bound1, bound2, max_function_calls(), max_runtime, solver_epsilon); + } + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const std::chrono::nanoseconds max_runtime, + double solver_epsilon = 0 + ) + { + return find_min_global(std::move(f), bound1, bound2, max_function_calls(), max_runtime, solver_epsilon); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const 
double bound1, + const double bound2, + const std::chrono::nanoseconds max_runtime, + double solver_epsilon = 0 + ) + { + return find_max_global(std::move(f), bound1, bound2, max_function_calls(), max_runtime, solver_epsilon); + } + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const double bound1, + const double bound2, + const std::chrono::nanoseconds max_runtime, + double solver_epsilon = 0 + ) + { + return find_min_global(std::move(f), bound1, bound2, max_function_calls(), max_runtime, solver_epsilon); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename funct + > + function_evaluation find_max_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const std::vector<bool>& is_integer_variable, + const std::chrono::nanoseconds max_runtime, + double solver_epsilon = 0 + ) + { + return find_max_global(std::move(f), bound1, bound2, is_integer_variable, max_function_calls(), max_runtime, solver_epsilon); + } + + template < + typename funct + > + function_evaluation find_min_global ( + funct f, + const matrix<double,0,1>& bound1, + const matrix<double,0,1>& bound2, + const std::vector<bool>& is_integer_variable, + const std::chrono::nanoseconds max_runtime, + double solver_epsilon = 0 + ) + { + return find_min_global(std::move(f), bound1, bound2, is_integer_variable, max_function_calls(), max_runtime, solver_epsilon); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_FiND_GLOBAL_MAXIMUM_ABSTRACT_hH_ + + diff --git a/ml/dlib/dlib/global_optimization/global_function_search.cpp b/ml/dlib/dlib/global_optimization/global_function_search.cpp new file mode 100644 index 000000000..fada289a4 --- /dev/null +++ b/ml/dlib/dlib/global_optimization/global_function_search.cpp @@ -0,0 +1,942 @@ + +#include "global_function_search.h" +#include 
"upper_bound_function.h" +#include "../optimization.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + namespace qopt_impl + { + void fit_quadratic_to_points_mse( + const matrix<double>& X, + const matrix<double,0,1>& Y, + matrix<double>& H, + matrix<double,0,1>& g, + double& c + ) + { + DLIB_CASSERT(X.size() > 0); + DLIB_CASSERT(X.nc() == Y.size()); + DLIB_CASSERT(X.nc() >= (X.nr()+1)*(X.nr()+2)/2); + + const long dims = X.nr(); + const long M = X.nc(); + + matrix<double> W((X.nr()+1)*(X.nr()+2)/2, M); + + set_subm(W, 0,0, dims, M) = X; + set_subm(W, dims,0, 1, M) = 1; + for (long c = 0; c < X.nc(); ++c) + { + long wr = dims+1; + for (long r = 0; r < X.nr(); ++r) + { + for (long r2 = r; r2 < X.nr(); ++r2) + { + W(wr,c) = X(r,c)*X(r2,c); + if (r2 == r) + W(wr,c) *= 0.5; + ++wr; + } + } + } + + matrix<double,0,1> z = pinv(trans(W))*Y; + + c = z(dims); + g = rowm(z, range(0,dims-1)); + + H.set_size(dims,dims); + + long wr = dims+1; + for (long r = 0; r < X.nr(); ++r) + { + for (long r2 = r; r2 < X.nr(); ++r2) + { + H(r,r2) = H(r2,r) = z(wr++); + } + } + } + + // ---------------------------------------------------------------------------------------- + + void fit_quadratic_to_points( + const matrix<double>& X, + const matrix<double,0,1>& Y, + matrix<double>& H, + matrix<double,0,1>& g, + double& c + ) + /*! + requires + - X.size() > 0 + - X.nc() == Y.size() + - X.nr()+1 <= X.nc() + ensures + - This function finds a quadratic function, Q(x), that interpolates the + given set of points. If there aren't enough points to uniquely define + Q(x) then the Q(x) that fits the given points with the minimum Frobenius + norm hessian matrix is selected. 
+ - To be precise: + - Let: Q(x) == 0.5*trans(x)*H*x + trans(x)*g + c + - Then this function finds H, g, and c that minimizes the following: + sum(squared(H)) + such that: + Q(colm(X,i)) == Y(i), for all valid i + - If there are more points than necessary to constrain Q then the Q + that best interpolates the function in the mean squared sense is + found. + !*/ + { + DLIB_CASSERT(X.size() > 0); + DLIB_CASSERT(X.nc() == Y.size()); + DLIB_CASSERT(X.nr()+1 <= X.nc()); + + + if (X.nc() >= (X.nr()+1)*(X.nr()+2)/2) + { + fit_quadratic_to_points_mse(X,Y,H,g,c); + return; + } + + + const long dims = X.nr(); + const long M = X.nc(); + + /* + Our implementation uses the equations 3.9 - 3.12 from the paper: + The NEWUOA software for unconstrained optimization without derivatives + By M.J.D. Powell, 40th Workshop on Large Scale Nonlinear Optimization (Erice, Italy, 2004) + */ + + matrix<double> W(M + dims + 1, M + dims + 1); + + set_subm(W, 0, 0, M, M) = 0.5*squared(tmp(trans(X)*X)); + set_subm(W, 0, M, M, 1) = 1; + set_subm(W, M, 0, 1, M) = 1; + set_subm(W, M, M, dims+1, dims+1) = 0; + set_subm(W, 0, M+1, X.nc(), X.nr()) = trans(X); + set_subm(W, M+1, 0, X.nr(), X.nc()) = X; + + + const matrix<double,0,1> r = join_cols(Y, zeros_matrix<double>(dims+1,1)); + + //matrix<double,0,1> z = pinv(W)*r; + lu_decomposition<decltype(W)> lu(W); + matrix<double,0,1> z = lu.solve(r); + //if (lu.is_singular()) std::cout << "WARNING, THE W MATRIX IS SINGULAR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" 
<< std::endl; + + matrix<double,0,1> lambda = rowm(z, range(0,M-1)); + + c = z(M); + g = rowm(z, range(M+1,z.size()-1)); + H = X*diagm(lambda)*trans(X); + } + + // ---------------------------------------------------------------------------------------- + + struct quad_interp_result + { + quad_interp_result() = default; + + template <typename EXP> + quad_interp_result( + const matrix_exp<EXP>& best_x, + double predicted_improvement + ) : best_x(best_x), predicted_improvement(predicted_improvement) {} + + matrix<double,0,1> best_x; + double predicted_improvement = std::numeric_limits<double>::quiet_NaN(); + }; + + // ---------------------------------------------------------------------------------------- + + quad_interp_result find_max_quadraticly_interpolated_vector ( + const matrix<double,0,1>& anchor, + const double radius, + const std::vector<matrix<double,0,1>>& x, + const std::vector<double>& y, + const matrix<double,0,1>& lower, + const matrix<double,0,1>& upper + ) + { + DLIB_CASSERT(x.size() == y.size()); + DLIB_CASSERT(x.size() > 0); + for (size_t i = 0; i < x.size(); ++i) + DLIB_CASSERT(anchor.size() == x[i].size()); + + const long x_size = static_cast<long>(x.size()); + DLIB_CASSERT(anchor.size()+1 <= x_size && x_size <= (anchor.size()+1)*(anchor.size()+2)/2); + + + matrix<double> X(anchor.size(), x.size()); + matrix<double,0,1> Y(x.size()); + for (size_t i = 0; i < x.size(); ++i) + { + set_colm(X,i) = x[i] - anchor; + Y(i) = y[i]; + } + + matrix<double> H; + matrix<double,0,1> g; + double c; + + fit_quadratic_to_points(X, Y, H, g, c); + + matrix<double,0,1> p; + + solve_trust_region_subproblem_bounded(-H,-g, radius, p, 0.001, 500, lower-anchor, upper-anchor); + + // ensure we never move more than radius from the anchor. This might happen if the + // trust region subproblem isn't solved accurately enough. 
+ if (length(p) >= radius) + p *= radius/length(p); + + + double predicted_improvement = 0.5*trans(p)*H*p + trans(p)*g; + return quad_interp_result{clamp(anchor+p,lower,upper), predicted_improvement}; + } + + // ---------------------------------------------------------------------------------------- + + quad_interp_result pick_next_sample_using_trust_region ( + const std::vector<function_evaluation>& samples, + double& radius, + const matrix<double,0,1>& lower, + const matrix<double,0,1>& upper, + const std::vector<bool>& is_integer_variable + ) + { + DLIB_CASSERT(samples.size() > 0); + // We don't use the QP to optimize integer variables. Instead, we just fix them at + // their best observed value and use the QP to optimize the real variables. So the + // number of dimensions, as far as the QP is concerned, is the number of non-integer + // variables. + size_t dims = 0; + for (auto is_int : is_integer_variable) + { + if (!is_int) + ++dims; + } + + DLIB_CASSERT(samples.size() >= dims+1); + + // Use enough points to fill out a quadratic model or the max available if we don't + // have quite enough. + const long N = std::min(samples.size(), (dims+1)*(dims+2)/2); + + + // first find the best sample; + double best_val = -1e300; + matrix<double,0,1> best_x; + for (auto& v : samples) + { + if (v.y > best_val) + { + best_val = v.y; + best_x = v.x; + } + } + + // if there are only integer variables then there isn't really anything to do. So just + // return the best_x and say there is no improvement. 
+ if (dims == 0) + return quad_interp_result(best_x, 0); + + matrix<long,0,1> active_dims(dims); + long j = 0; + for (size_t i = 0; i < is_integer_variable.size(); ++i) + { + if (!is_integer_variable[i]) + active_dims(j++) = i; + } + + // now find the N-1 nearest neighbors of best_x + std::vector<std::pair<double,size_t>> distances; + for (size_t i = 0; i < samples.size(); ++i) + distances.emplace_back(length(best_x-samples[i].x), i); + std::sort(distances.begin(), distances.end()); + distances.resize(N); + + std::vector<matrix<double,0,1>> x; + std::vector<double> y; + for (auto& idx : distances) + { + x.emplace_back(rowm(samples[idx.second].x, active_dims)); + y.emplace_back(samples[idx.second].y); + } + + if (radius == 0) + { + for (auto& idx : distances) + radius = std::max(radius, length(rowm(best_x-samples[idx.second].x, active_dims)) ); + // Shrink the radius a little so we are always going to be making the sampling of + // points near the best current point smaller. + radius *= 0.95; + } + + + auto tmp = find_max_quadraticly_interpolated_vector(rowm(best_x,active_dims), radius, x, y, rowm(lower,active_dims), rowm(upper,active_dims)); + + // stick the integer variables back into the solution + for (long i = 0; i < active_dims.size(); ++i) + best_x(active_dims(i)) = tmp.best_x(i); + + tmp.best_x = best_x; + return tmp; + } + + // ---------------------------------------------------------------------------------------- + + matrix<double,0,1> make_random_vector( + dlib::rand& rnd, + const matrix<double,0,1>& lower, + const matrix<double,0,1>& upper, + const std::vector<bool>& is_integer_variable + ) + { + matrix<double,0,1> temp(lower.size()); + for (long i = 0; i < temp.size(); ++i) + { + temp(i) = rnd.get_double_in_range(lower(i), upper(i)); + if (is_integer_variable[i]) + temp(i) = std::round(temp(i)); + } + return temp; + } + + // ---------------------------------------------------------------------------------------- + + struct max_upper_bound_function + { 
+ max_upper_bound_function() = default; + + template <typename EXP> + max_upper_bound_function( + const matrix_exp<EXP>& x, + double predicted_improvement, + double upper_bound + ) : x(x), predicted_improvement(predicted_improvement), upper_bound(upper_bound) {} + + matrix<double,0,1> x; + double predicted_improvement = 0; + double upper_bound = 0; + }; + + // ------------------------------------------------------------------------------------ + + max_upper_bound_function pick_next_sample_as_max_upper_bound ( + dlib::rand& rnd, + const upper_bound_function& ub, + const matrix<double,0,1>& lower, + const matrix<double,0,1>& upper, + const std::vector<bool>& is_integer_variable, + const size_t num_random_samples + ) + { + DLIB_CASSERT(ub.num_points() > 0); + + + + // now do a simple random search to find the maximum upper bound + double best_ub_so_far = -std::numeric_limits<double>::infinity(); + matrix<double,0,1> vtemp(lower.size()), v; + for (size_t rounds = 0; rounds < num_random_samples; ++rounds) + { + vtemp = make_random_vector(rnd, lower, upper, is_integer_variable); + + double bound = ub(vtemp); + if (bound > best_ub_so_far) + { + best_ub_so_far = bound; + v = vtemp; + } + } + + double max_value = -std::numeric_limits<double>::infinity(); + for (auto& v : ub.get_points()) + max_value = std::max(max_value, v.y); + + return max_upper_bound_function(v, best_ub_so_far - max_value, best_ub_so_far); + } + + } // end of namespace qopt_impl; + + using namespace qopt_impl; + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + function_spec::function_spec( + matrix<double,0,1> bound1, + matrix<double,0,1> bound2 + ) : + lower(std::move(bound1)), upper(std::move(bound2)) + { + DLIB_CASSERT(lower.size() == upper.size()); + for (long i = 0; i < lower.size(); ++i) + { + if (upper(i) < lower(i)) + std::swap(lower(i), upper(i)); + 
DLIB_CASSERT(upper(i) != lower(i), "The upper and lower bounds can't be equal."); + } + is_integer_variable.assign(lower.size(), false); + } + +// ---------------------------------------------------------------------------------------- + + function_spec::function_spec( + matrix<double,0,1> bound1, + matrix<double,0,1> bound2, + std::vector<bool> is_integer + ) : + function_spec(std::move(bound1),std::move(bound2)) + { + is_integer_variable = std::move(is_integer); + DLIB_CASSERT(lower.size() == (long)is_integer_variable.size()); + + + // Make sure any integer variables have integer bounds. + for (size_t i = 0; i < is_integer_variable.size(); ++i) + { + if (is_integer_variable[i]) + { + DLIB_CASSERT(std::round(lower(i)) == lower(i), "If you say a variable is an integer variable then it must have an integer lower bound. \n" + << "lower[i] = " << lower(i)); + DLIB_CASSERT(std::round(upper(i)) == upper(i), "If you say a variable is an integer variable then it must have an integer upper bound. \n" + << "upper[i] = " << upper(i)); + } + } + } + +// ---------------------------------------------------------------------------------------- + + namespace gopt_impl + { + upper_bound_function funct_info::build_upper_bound_with_all_function_evals ( + ) const + { + upper_bound_function tmp(ub); + + // we are going to add the outstanding evals into this and assume the + // outstanding evals are going to take y values equal to their nearest + // neighbor complete evals. 
+ for (auto& eval : outstanding_evals) + { + function_evaluation e; + e.x = eval.x; + e.y = find_nn(ub.get_points(), eval.x); + tmp.add(e); + } + + return tmp; + } + + // ------------------------------------------------------------------------------------ + + double funct_info::find_nn ( + const std::vector<function_evaluation>& evals, + const matrix<double,0,1>& x + ) + { + double best_y = 0; + double best_dist = std::numeric_limits<double>::infinity(); + for (auto& v : evals) + { + double dist = length_squared(v.x-x); + if (dist < best_dist) + { + best_dist = dist; + best_y = v.y; + } + } + return best_y; + } + + } // end namespace gopt_impl + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + function_evaluation_request::function_evaluation_request( + function_evaluation_request&& item + ) + { + m_has_been_evaluated = item.m_has_been_evaluated; + req = item.req; + info = item.info; + item.info.reset(); + + item.m_has_been_evaluated = true; + } + +// ---------------------------------------------------------------------------------------- + + function_evaluation_request& function_evaluation_request:: + operator=( + function_evaluation_request&& item + ) + { + function_evaluation_request(std::move(item)).swap(*this); + return *this; + } + +// ---------------------------------------------------------------------------------------- + + void function_evaluation_request:: + swap( + function_evaluation_request& item + ) + { + std::swap(m_has_been_evaluated, item.m_has_been_evaluated); + std::swap(req, item.req); + std::swap(info, item.info); + } + +// ---------------------------------------------------------------------------------------- + + size_t function_evaluation_request:: + function_idx ( + ) const + { + return info->function_idx; + } 
+ + const matrix<double,0,1>& function_evaluation_request:: + x ( + ) const + { + return req.x; + } + +// ---------------------------------------------------------------------------------------- + + bool function_evaluation_request:: + has_been_evaluated ( + ) const + { + return m_has_been_evaluated; + } + +// ---------------------------------------------------------------------------------------- + + function_evaluation_request:: + ~function_evaluation_request() + { + if (!m_has_been_evaluated) + { + std::lock_guard<std::mutex> lock(*info->m); + + // remove the evaluation request from the outstanding list. + auto i = std::find(info->outstanding_evals.begin(), info->outstanding_evals.end(), req); + info->outstanding_evals.erase(i); + } + } + +// ---------------------------------------------------------------------------------------- + + void function_evaluation_request:: + set ( + double y + ) + { + DLIB_CASSERT(has_been_evaluated() == false); + std::lock_guard<std::mutex> lock(*info->m); + + m_has_been_evaluated = true; + + + // move the evaluation from outstanding to complete + auto i = std::find(info->outstanding_evals.begin(), info->outstanding_evals.end(), req); + DLIB_CASSERT(i != info->outstanding_evals.end()); + info->outstanding_evals.erase(i); + info->ub.add(function_evaluation(req.x,y)); + + + // Now do trust region radius maintenance and keep track of the best objective + // values and all that. + if (req.was_trust_region_generated_request) + { + // Adjust trust region radius based on how good this evaluation + // was. 
+ double measured_improvement = y-req.anchor_objective_value; + double rho = measured_improvement/std::abs(req.predicted_improvement); + //std::cout << "rho: "<< rho << std::endl; + //std::cout << "radius: "<< info->radius << std::endl; + if (rho < 0.25) + info->radius *= 0.5; + else if (rho > 0.75) + info->radius *= 2; + } + + if (y > info->best_objective_value) + { + if (!req.was_trust_region_generated_request && length(req.x - info->best_x) > info->radius*1.001) + { + //std::cout << "reset radius because of big move, " << length(req.x - info->best_x) << " radius was " << info->radius << std::endl; + // reset trust region radius since we made a big move. Doing this will + // cause the radius to be reset to the size of the local region. + info->radius = 0; + } + info->best_objective_value = y; + info->best_x = std::move(req.x); + } + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + global_function_search:: + global_function_search( + const function_spec& function + ) : global_function_search(std::vector<function_spec>(1,function)) {} + +// ---------------------------------------------------------------------------------------- + + global_function_search:: + global_function_search( + const std::vector<function_spec>& functions_ + ) + { + DLIB_CASSERT(functions_.size() > 0); + m = std::make_shared<std::mutex>(); + functions.reserve(functions_.size()); + for (size_t i = 0; i < functions_.size(); ++i) + functions.emplace_back(std::make_shared<gopt_impl::funct_info>(functions_[i],i,m)); + } + +// ---------------------------------------------------------------------------------------- + + global_function_search:: + global_function_search( + const std::vector<function_spec>& functions_, + const 
std::vector<std::vector<function_evaluation>>& initial_function_evals, + const double relative_noise_magnitude_ + ) : + global_function_search(functions_) + { + DLIB_CASSERT(functions_.size() > 0); + DLIB_CASSERT(functions_.size() == initial_function_evals.size()); + DLIB_CASSERT(relative_noise_magnitude >= 0); + relative_noise_magnitude = relative_noise_magnitude_; + for (size_t i = 0; i < initial_function_evals.size(); ++i) + { + functions[i]->ub = upper_bound_function(initial_function_evals[i], relative_noise_magnitude); + } + } + +// ---------------------------------------------------------------------------------------- + + size_t global_function_search:: + num_functions( + ) const + { + return functions.size(); + } + +// ---------------------------------------------------------------------------------------- + + void global_function_search:: + set_seed ( + time_t seed + ) + { + rnd = dlib::rand(seed); + } + +// ---------------------------------------------------------------------------------------- + + void global_function_search:: + get_function_evaluations ( + std::vector<function_spec>& specs, + std::vector<std::vector<function_evaluation>>& function_evals + ) const + { + std::lock_guard<std::mutex> lock(*m); + specs.clear(); + function_evals.clear(); + for (size_t i = 0; i < functions.size(); ++i) + { + specs.emplace_back(functions[i]->spec); + function_evals.emplace_back(functions[i]->ub.get_points()); + } + } + +// ---------------------------------------------------------------------------------------- + + void global_function_search:: + get_best_function_eval ( + matrix<double,0,1>& x, + double& y, + size_t& function_idx + ) const + { + DLIB_CASSERT(num_functions() != 0); + + std::lock_guard<std::mutex> lock(*m); + + // find the largest value + auto& info = *best_function(function_idx); + y = info.best_objective_value; + x = info.best_x; + } + +// ---------------------------------------------------------------------------------------- + + 
    // Returns the next point the solver wants evaluated, as a request object the caller
    // completes via set().  Priority order visible below: (1) seed each function with the
    // center of its bounds, then random points, until it has max(3,dims) evaluations;
    // (2) alternate a trust-region step on the current best function with (3) a Monte
    // Carlo maximization of the upper bound function over all functions; (4) otherwise
    // sample a function and point uniformly at random.
    function_evaluation_request global_function_search::
    get_next_x (
    )
    {
        DLIB_CASSERT(num_functions() != 0);

        using namespace gopt_impl;

        std::lock_guard<std::mutex> lock(*m);


        // the first thing we do is make sure each function has at least max(3,dimensionality of function) evaluations
        for (auto& info : functions)
        {
            const long dims = info->spec.lower.size();
            // If this is the very beginning of the optimization process
            if (info->ub.num_points()+info->outstanding_evals.size() < 1)
            {
                outstanding_function_eval_request new_req;
                new_req.request_id = next_request_id++;
                // Pick the point right in the center of the bounds to evaluate first since
                // people will commonly center the bound on a location they think is good.
                // So might as well try there first.
                new_req.x = (info->spec.lower + info->spec.upper)/2.0;
                for (long i = 0; i < new_req.x.size(); ++i)
                {
                    // keep integer variables on the integer lattice
                    if (info->spec.is_integer_variable[i])
                        new_req.x(i) = std::round(new_req.x(i));
                }
                info->outstanding_evals.emplace_back(new_req);
                return function_evaluation_request(new_req,info);
            }
            else if (info->ub.num_points() < std::max<long>(3,dims))
            {
                // not enough completed evaluations yet: sample uniformly inside the bounds
                outstanding_function_eval_request new_req;
                new_req.request_id = next_request_id++;
                new_req.x = make_random_vector(rnd, info->spec.lower, info->spec.upper, info->spec.is_integer_variable);
                info->outstanding_evals.emplace_back(new_req);
                return function_evaluation_request(new_req,info);
            }
        }


        if (do_trust_region_step && !has_outstanding_trust_region_request())
        {
            // find the currently best performing function, we will do a trust region
            // step on it.
            auto info = best_function();
            const long dims = info->spec.lower.size();
            // if we have enough points to do a trust region step
            if (info->ub.num_points() > dims+1)
            {
                auto tmp = pick_next_sample_using_trust_region(info->ub.get_points(),
                    info->radius, info->spec.lower, info->spec.upper, info->spec.is_integer_variable);
                //std::cout << "QP predicted improvement: "<< tmp.predicted_improvement << std::endl;
                // only take the step if the quadratic model predicts a non-trivial gain
                if (tmp.predicted_improvement > min_trust_region_epsilon)
                {
                    do_trust_region_step = false;
                    outstanding_function_eval_request new_req;
                    new_req.request_id = next_request_id++;
                    new_req.x = tmp.best_x;
                    // record prediction data so set() can later update the trust region radius
                    new_req.was_trust_region_generated_request = true;
                    new_req.anchor_objective_value = info->best_objective_value;
                    new_req.predicted_improvement = tmp.predicted_improvement;
                    info->outstanding_evals.emplace_back(new_req);
                    return function_evaluation_request(new_req, info);
                }
            }
        }

        // make it so we alternate between upper bounded and trust region steps.
        do_trust_region_step = true;

        if (rnd.get_random_double() >= pure_random_search_probability)
        {
            // pick a point at random to sample according to the upper bound
            double best_upper_bound = -std::numeric_limits<double>::infinity();
            std::shared_ptr<funct_info> best_funct;
            matrix<double,0,1> next_sample;
            // so figure out if any function has a good upper bound and if so pick the
            // function with the largest upper bound for evaluation.
            for (auto& info : functions)
            {
                auto tmp = pick_next_sample_as_max_upper_bound(rnd,
                    info->build_upper_bound_with_all_function_evals(), info->spec.lower, info->spec.upper,
                    info->spec.is_integer_variable, num_random_samples);
                if (tmp.predicted_improvement > 0 && tmp.upper_bound > best_upper_bound)
                {
                    best_upper_bound = tmp.upper_bound;
                    next_sample = std::move(tmp.x);
                    best_funct = info;
                }
            }

            // if we found a good function to evaluate then return that.
            if (best_funct)
            {
                outstanding_function_eval_request new_req;
                new_req.request_id = next_request_id++;
                new_req.x = std::move(next_sample);
                best_funct->outstanding_evals.emplace_back(new_req);
                return function_evaluation_request(new_req, best_funct);
            }
        }


        // pick entirely at random
        size_t function_idx = rnd.get_integer(functions.size());
        auto info = functions[function_idx];
        outstanding_function_eval_request new_req;
        new_req.request_id = next_request_id++;
        new_req.x = make_random_vector(rnd, info->spec.lower, info->spec.upper, info->spec.is_integer_variable);
        info->outstanding_evals.emplace_back(new_req);
        return function_evaluation_request(new_req, info);

    }

// ----------------------------------------------------------------------------------------

    // Probability of taking a purely random sample instead of an upper-bound-guided one.
    double global_function_search::
    get_pure_random_search_probability (
    ) const
    {
        return pure_random_search_probability;
    }

// ----------------------------------------------------------------------------------------

    void global_function_search::
    set_pure_random_search_probability (
        double prob
    )
    {
        DLIB_CASSERT(0 <= prob && prob <= 1);
        pure_random_search_probability = prob;
    }

// ----------------------------------------------------------------------------------------

    // Minimum predicted improvement required before a trust region step is taken.
    double global_function_search::
    get_solver_epsilon (
    ) const
    {
        return min_trust_region_epsilon;
    }

// ----------------------------------------------------------------------------------------

    void global_function_search::
    set_solver_epsilon (
        double eps
    )
    {
        DLIB_CASSERT(0 <= eps);
        min_trust_region_epsilon = eps;
    }

// ----------------------------------------------------------------------------------------

    // Assumed relative noise level used when building the upper bound functions.
    double global_function_search::
    get_relative_noise_magnitude (
    ) const
    {
        return relative_noise_magnitude;
    }

// ----------------------------------------------------------------------------------------

    void global_function_search::
    set_relative_noise_magnitude (
        double value
    )
    {
        DLIB_CASSERT(0 <= value);
        relative_noise_magnitude = value;
        if (m)
        {
            std::lock_guard<std::mutex> lock(*m);
            // recreate all the upper bound functions with the new relative noise magnitude
            for (auto& f : functions)
                f->ub = upper_bound_function(f->ub.get_points(), relative_noise_magnitude);
        }
    }

// ----------------------------------------------------------------------------------------

    // Number of random candidates drawn when maximizing the upper bound function.
    size_t global_function_search::
    get_monte_carlo_upper_bound_sample_num (
    ) const
    {
        return num_random_samples;
    }

// ----------------------------------------------------------------------------------------

    void global_function_search::
    set_monte_carlo_upper_bound_sample_num (
        size_t num
    )
    {
        // NOTE(review): num is unsigned so this assert is always true; presumably kept
        // for symmetry with the other setters — confirm whether num > 0 was intended.
        DLIB_CASSERT(0 <= num);
        num_random_samples = num;
    }

// ----------------------------------------------------------------------------------------

    // Returns the function with the largest best_objective_value so far.
    std::shared_ptr<gopt_impl::funct_info> global_function_search::
    best_function(
    ) const
    {
        size_t idx = 0;
        return best_function(idx);
    }

// ----------------------------------------------------------------------------------------

    // Same as above but also reports the winning function's index via idx.
    std::shared_ptr<gopt_impl::funct_info> global_function_search::
    best_function(
        size_t& idx
    ) const
    {
        auto compare = [](const std::shared_ptr<gopt_impl::funct_info>& a, const std::shared_ptr<gopt_impl::funct_info>& b)
            { return a->best_objective_value < b->best_objective_value; };

        auto i = std::max_element(functions.begin(), functions.end(), compare);

        idx = std::distance(functions.begin(),i);
        return *i;
    }

// ----------------------------------------------------------------------------------------

    // True if any function still has an unanswered trust-region-generated request.
    bool global_function_search::
    has_outstanding_trust_region_request (
    ) const
    {
        for (auto& f : functions)
        {
            for (auto& i : f->outstanding_evals)
            {
                if (i.was_trust_region_generated_request)
                    return true;
            }
        }
        return false;
    }

//
---------------------------------------------------------------------------------------- + +} + diff --git a/ml/dlib/dlib/global_optimization/global_function_search.h b/ml/dlib/dlib/global_optimization/global_function_search.h new file mode 100644 index 000000000..fa036884a --- /dev/null +++ b/ml/dlib/dlib/global_optimization/global_function_search.h @@ -0,0 +1,245 @@ +// Copyright (C) 2017 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_GLOBAL_FuNCTION_SEARCH_Hh_ +#define DLIB_GLOBAL_FuNCTION_SEARCH_Hh_ + +#include "global_function_search_abstract.h" +#include <vector> +#include "../matrix.h" +#include <mutex> +#include "../rand.h" +#include "upper_bound_function.h" +#include "../test_for_odr_violations.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + struct function_spec + { + function_spec( + matrix<double,0,1> bound1, + matrix<double,0,1> bound2 + ); + + function_spec( + matrix<double,0,1> bound1, + matrix<double,0,1> bound2, + std::vector<bool> is_integer + ); + + matrix<double,0,1> lower; + matrix<double,0,1> upper; + std::vector<bool> is_integer_variable; + }; + +// ---------------------------------------------------------------------------------------- + + namespace gopt_impl + { + struct outstanding_function_eval_request + { + size_t request_id = 0; // unique id for this eval request + matrix<double,0,1> x; // function x to evaluate + + // trust region specific stuff + bool was_trust_region_generated_request = false; + double predicted_improvement = std::numeric_limits<double>::quiet_NaN(); + double anchor_objective_value = std::numeric_limits<double>::quiet_NaN(); // objective value at center of TR step + + bool operator==(const outstanding_function_eval_request& item) const { return request_id == item.request_id; } + }; + + struct funct_info + { + funct_info() = delete; + funct_info(const funct_info&) = delete; 
+ funct_info& operator=(const funct_info&) = delete; + + funct_info( + const function_spec& spec, + size_t function_idx, + const std::shared_ptr<std::mutex>& m + ) : + spec(spec), function_idx(function_idx), m(m) + { + best_x = zeros_matrix(spec.lower); + } + + upper_bound_function build_upper_bound_with_all_function_evals ( + ) const; + + static double find_nn ( + const std::vector<function_evaluation>& evals, + const matrix<double,0,1>& x + ); + + + function_spec spec; + size_t function_idx = 0; + std::shared_ptr<std::mutex> m; + upper_bound_function ub; + std::vector<outstanding_function_eval_request> outstanding_evals; + matrix<double,0,1> best_x; + double best_objective_value = -std::numeric_limits<double>::infinity(); + double radius = 0; + }; + + } + +// ---------------------------------------------------------------------------------------- + + class function_evaluation_request + { + public: + + function_evaluation_request() = delete; + function_evaluation_request(const function_evaluation_request&) = delete; + function_evaluation_request& operator=(const function_evaluation_request&) = delete; + + + function_evaluation_request(function_evaluation_request&& item); + function_evaluation_request& operator=(function_evaluation_request&& item); + + ~function_evaluation_request(); + + size_t function_idx ( + ) const; + + const matrix<double,0,1>& x ( + ) const; + + bool has_been_evaluated ( + ) const; + + void set ( + double y + ); + + void swap(function_evaluation_request& item); + + private: + + friend class global_function_search; + + explicit function_evaluation_request( + const gopt_impl::outstanding_function_eval_request& req, + const std::shared_ptr<gopt_impl::funct_info>& info + ) : req(req), info(info) {} + + bool m_has_been_evaluated = false; + gopt_impl::outstanding_function_eval_request req; + std::shared_ptr<gopt_impl::funct_info> info; + }; + +// ---------------------------------------------------------------------------------------- + + class 
global_function_search + { + public: + + global_function_search() = default; + + explicit global_function_search( + const function_spec& function + ); + + explicit global_function_search( + const std::vector<function_spec>& functions_ + ); + + global_function_search( + const std::vector<function_spec>& functions_, + const std::vector<std::vector<function_evaluation>>& initial_function_evals, + const double relative_noise_magnitude = 0.001 + ); + + global_function_search(const global_function_search&) = delete; + global_function_search& operator=(const global_function_search& item) = delete; + + global_function_search(global_function_search&& item) = default; + global_function_search& operator=(global_function_search&& item) = default; + + size_t num_functions( + ) const; + + void set_seed ( + time_t seed + ); + + void get_function_evaluations ( + std::vector<function_spec>& specs, + std::vector<std::vector<function_evaluation>>& function_evals + ) const; + + void get_best_function_eval ( + matrix<double,0,1>& x, + double& y, + size_t& function_idx + ) const; + + function_evaluation_request get_next_x ( + ); + + double get_pure_random_search_probability ( + ) const; + + void set_pure_random_search_probability ( + double prob + ); + + double get_solver_epsilon ( + ) const; + + void set_solver_epsilon ( + double eps + ); + + double get_relative_noise_magnitude ( + ) const; + + void set_relative_noise_magnitude ( + double value + ); + + size_t get_monte_carlo_upper_bound_sample_num ( + ) const; + + void set_monte_carlo_upper_bound_sample_num ( + size_t num + ); + + private: + + std::shared_ptr<gopt_impl::funct_info> best_function( + ) const; + + std::shared_ptr<gopt_impl::funct_info> best_function( + size_t& idx + ) const; + + bool has_outstanding_trust_region_request ( + ) const; + + + dlib::rand rnd; + double pure_random_search_probability = 0.02; + double min_trust_region_epsilon = 0; + double relative_noise_magnitude = 0.001; + size_t num_random_samples = 5000; + 
bool do_trust_region_step = true; + + size_t next_request_id = 1; + + std::vector<std::shared_ptr<gopt_impl::funct_info>> functions; + std::shared_ptr<std::mutex> m; + + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_GLOBAL_FuNCTION_SEARCH_Hh_ + diff --git a/ml/dlib/dlib/global_optimization/global_function_search_abstract.h b/ml/dlib/dlib/global_optimization/global_function_search_abstract.h new file mode 100644 index 000000000..c8bfc3993 --- /dev/null +++ b/ml/dlib/dlib/global_optimization/global_function_search_abstract.h @@ -0,0 +1,605 @@ +// Copyright (C) 2017 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_GLOBAL_FuNCTION_SEARCH_ABSTRACT_Hh_ +#ifdef DLIB_GLOBAL_FuNCTION_SEARCH_ABSTRACT_Hh_ + +#include <vector> +#include "../matrix.h" +#include "upper_bound_function_abstract.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + struct function_spec + { + /*! + WHAT THIS OBJECT REPRESENTS + This object is a simple struct that lets you define the valid inputs to a + multivariate function. It lets you define bound constraints for each + variable as well as say if a variable is integer valued or not. Therefore, + an instance of this struct says that a function takes upper.size() input + variables, where the ith variable must be in the range [lower(i) upper(i)] + and be an integer if is_integer_variable[i]==true. + !*/ + + function_spec( + matrix<double,0,1> bound1, + matrix<double,0,1> bound2 + ); + /*! 
+ requires + - bound1.size() == bound2.size() + - for all valid i: bound1(i) != bound2(i) + ensures + - #is_integer_variable.size() == bound1.size() + - #lower.size() == bound1.size() + - #upper.size() == bound1.size() + - for all valid i: + - #is_integer_variable[i] == false + - #lower(i) == min(bound1(i), bound2(i)) + - #upper(i) == max(bound1(i), bound2(i)) + !*/ + + function_spec( + matrix<double,0,1> bound1, + matrix<double,0,1> bound2, + std::vector<bool> is_integer + ); + /*! + requires + - bound1.size() == bound2.size() == is_integer.size() + - for all valid i: bound1(i) != bound2(i) + ensures + - #is_integer_variable.size() == bound1.size() + - #lower.size() == bound1.size() + - #upper.size() == bound1.size() + - for all valid i: + - #is_integer_variable[i] == is_integer[i] + - #lower(i) == min(bound1(i), bound2(i)) + - #upper(i) == max(bound1(i), bound2(i)) + !*/ + + matrix<double,0,1> lower; + matrix<double,0,1> upper; + std::vector<bool> is_integer_variable; + }; + +// ---------------------------------------------------------------------------------------- + + class function_evaluation_request + { + /*! + WHAT THIS OBJECT REPRESENTS + This object represents a request, by the global_function_search object, to + evaluate a real-valued function and report back the results. + + THREAD SAFETY + You shouldn't let more than one thread touch a function_evaluation_request + at the same time. However, it is safe to send instances of this class to + other threads for processing. This lets you evaluate multiple + function_evaluation_requests in parallel. Any appropriate synchronization + with regard to the originating global_function_search instance is handled + automatically. + !*/ + + public: + + // You can't make or copy this object, the only way to get one is from the + // global_function_search class via get_next_x(). 
+ function_evaluation_request() = delete; + function_evaluation_request(const function_evaluation_request&) = delete; + function_evaluation_request& operator=(const function_evaluation_request&) = delete; + + // You can however move and swap this object. + function_evaluation_request(function_evaluation_request&& item); + function_evaluation_request& operator=(function_evaluation_request&& item); + /*! + ensures + - *this takes the state of item. + - #item.has_been_evaluated() == true + !*/ + + ~function_evaluation_request( + ); + /*! + ensures + - frees all resources associated with this object. + - It's fine to destruct function_evaluation_requests even if they haven't + been evaluated yet. If this happens it will simply be as if the request + was never issued. + !*/ + + size_t function_idx ( + ) const; + /*! + ensures + - Returns the function index that identifies which function is to be + evaluated. + !*/ + + const matrix<double,0,1>& x ( + ) const; + /*! + ensures + - returns the input parameters to the function to be evaluated. + !*/ + + bool has_been_evaluated ( + ) const; + /*! + ensures + - If this evaluation request is still outstanding then returns false, + otherwise returns true. That is, if the global_function_search is still + waiting for you to report back by calling set() then + has_been_evaluated()==false. + !*/ + + void set ( + double y + ); + /*! + requires + - has_been_evaluated() == false + ensures + - #has_been_evaluated() == true + - Notifies the global_function_search instance that created this object + that when the function_idx()th function is evaluated with x() as input + then the output is y. + !*/ + + void swap( + function_evaluation_request& item + ); + /*! + ensures + - swaps the state of *this and item + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + + class global_function_search + { + /*! 
+ WHAT THIS OBJECT REPRESENTS + This object performs global optimization of a set of user supplied + functions. The goal is to maximize the following objective function: + max_{function_i,x_i}: function_i(x_i) + subject to bound constraints on each element of x_i. Moreover, each + element of x_i can be either real valued or integer valued. Each of the + functions can also take a different number of variables. Therefore, the + final result of the optimization tells you which function produced the + largest output and what input (i.e. the x value) to that function is + necessary to obtain that maximal value. + + Importantly, the global_function_search object does not require the user to + supply derivatives. Moreover, the functions may contain discontinuities, + behave stochastically, and have many local maxima. The global_function_search + object will attempt to find the global optima in the face of these challenges. + It is also designed to use as few function evaluations as possible, making + it suitable for optimizing functions that are very expensive to evaluate. + + It does this by alternating between two modes. A global exploration mode + and a local optima refinement mode. This is accomplished by building and + maintaining two models of the objective function: + 1. A global model that upper bounds our objective function. This is a + non-parametric piecewise linear model based on all function + evaluations ever seen by the global_function_search object. + 2. A local quadratic model fit around the best point seen so far. + + The optimization procedure therefore looks like this: + + while(not done) + { + DO GLOBAL EXPLORE STEP: + Find the point that maximizes the upper bounding model since + that is the point with the largest possible improvement in the + objective function. + + Evaluate the new point and incorporate it into our models. + + DO LOCAL REFINEMENT STEP: + Find the optimal solution to the local quadratic model. 
+ + If this point looks like it will improve on the "best point seen + so far" by at least get_solver_epsilon() then we evaluate that + point and incorporate it into our models, otherwise we ignore + it. + } + + You can see that we alternate between global search and local refinement, + except in the case where the local model seems to have converged to within + get_solver_epsilon() accuracy. In that case only global search steps are + used. We do this in the hope that the global search will find a new and + better local optima to explore, which would then reactivate local + refinement when it has something productive to do. + + + Now let's turn our attention to the specific API defined by the + global_function_search object. We will begin by showing a short example of + its use: + + // Suppose we want to find which of these functions, F() and G(), have + // the largest output and what input is necessary to produce the + // maximal output. + auto F = [](double a, double b) { return -std::pow(a-2,2.0) - std::pow(b-4,2.0); }; + auto G = [](double x) { return 2-std::pow(x-5,2.0); }; + + // We first define function_spec objects that specify bounds on the + // inputs to each function. The search process will only search within + // these bounds. + function_spec spec_F({-10,-10}, {10,10}); + function_spec spec_G({-2}, {6}); + // Then we create a global_function_search object with those function specifications. + global_function_search opt({spec_F, spec_G}); + + // Here we run 15 iterations of the search process. Note that the user + // of global_function_search writes the main solver loop, which is + // somewhat unusual. We will discuss why that is in a moment, but for + // now let's look at this example. + for (int i = 0; i < 15; ++i) + { + // All we do here is ask the global_function_search object what to + // evaluate next, then do what it asked, and then report the + // results back by calling function_evaluation_request's set() + // method. 
+ function_evaluation_request next = opt.get_next_x(); + // next.function_idx() tells you which of the functions you should + // evaluate. We have 2 functions here (F and G) so function_idx() + // can take only the values 0 and 1. If, for example, we had 10 + // functions it would take the values 0 through 9. + if (next.function_idx() == 0) + { + // Call F with the inputs requested by the + // global_function_search and report them back. + double a = next.x()(0); + double b = next.x()(1); + next.set(F(a,b)); // Tell the solver what happened. + } + else + { + double x = next.x()(0); + next.set(G(x)); + } + } + + // Find out what point gave the largest outputs: + matrix<double,0,1> x; + double y; + size_t function_idx; + opt.get_best_function_eval(x,y,function_idx); + + cout << "function_idx: "<< function_idx << endl; + cout << "y: " << y << endl; + cout << "x: " << x << endl; + + The above cout statements will print this: + + function_idx: 1 + y: 2 + x: 5 + + Which is the correct result since G(5) gives the largest possible output in + our example. + + So why does the user write the main loop? Why isn't it embedded inside + dlib? Well, there are two answers to this. The first is that it is. Most + users should just call dlib::find_max_global() which does exactly that, it + runs the loop for you. However, the API shown above gives you the + opportunity to run multiple function evaluations in parallel. For + instance, it is perfectly valid to call get_next_x() multiple times and + send the resulting function_evaluation_request objects to separate threads + for processing. Those separate threads can run the functions being + optimized (e.g. F and G or whatever) and report back by calling + function_evaluation_request::set(). You could even spread the work across + a compute cluster if you have one. + + So what happens if you have N outstanding function evaluation requests? 
+ Or in other words, what happens if you called get_next_x() N times and + haven't yet called their set() methods? Well, 1 of the N requests will be + a local refinement step while the N-1 other requests will be global + exploration steps generated from the current upper bounding model. This + should give you an idea of the usefulness of this kind of parallelism. If + for example, your functions being optimized were simple convex functions + this kind of parallelism wouldn't help since essentially all the + interesting work in the solver is going to be done by the local optimizer. + However, if your function has a lot of local optima, running many global + exploration steps in parallel might significantly reduce the time it takes + to find a good solution. + + It should also be noted that our upper bounding model is implemented by the + dlib::upper_bound_function object, which is a tool that allows us to create + a tight upper bound on our objective function. This upper bound is + non-parametric and gets progressively more accurate as the optimization + progresses, but also more and more expensive to maintain. It causes the + runtime of the entire optimization procedure to be O(N^2) where N is the + number of objective function evaluations. So problems that require millions + of function evaluations to find a good solution are not appropriate for the + global_function_search tool. However, if your objective function is very + expensive to evaluate then this relatively expensive upper bounding model + is well worth its computational cost. + + Finally, let's introduce some background literature on this algorithm. The + two most relevant papers in the optimization literature are: + Global optimization of Lipschitz functions Malherbe, Cédric and Vayatis, + Nicolas International Conference on Machine Learning - 2017 + and + The NEWUOA software for unconstrained optimization without derivatives By + M.J.D. 
Powell, 40th Workshop on Large Scale Nonlinear Optimization (Erice, + Italy, 2004) + + Our upper bounding model is an extension of the AdaLIPO method in the + Malherbe paper. See the documentation of dlib::upper_bound_function for more + details on that, as we make a number of important extensions. The other + part of our method, our local refinement model, is essentially the same + type of trust region model proposed by Powell in the above paper. That is, + each time we do a local refinement step we identify the best point seen so + far, fit a quadratic function around it using the function evaluations we + have collected so far, and then use a simple trust region procedure to + decide the next best point to evaluate based on our quadratic model. + + The method proposed by Malherbe gives excellent global search performance + but has terrible convergence properties in the area around a maxima. + Powell's method on the other hand has excellent convergence in the area + around a local maxima, as expected by a quadratic trust region method, but + is aggressively local maxima seeking. It will simply get stuck in the + nearest local optima. Combining the two together as we do here gives us + excellent performance in both global search and final convergence speed + near a local optima. Causing the global_function_search to perform well + for functions with many local optima while still giving high precision + solutions. For instance, on typical test problems, like the Holder table + function, the global_function_search object can reliably find the globally + optimal solution to full floating point precision in under a few hundred + steps. + + + THREAD SAFETY + You shouldn't let more than one thread touch a global_function_search + instance at the same time. + !*/ + + public: + + global_function_search( + ); + /*! 
+ ensures + - #num_functions() == 0 + - #get_relative_noise_magnitude() == 0.001 + - #get_solver_epsilon() == 0 + - #get_monte_carlo_upper_bound_sample_num() == 5000 + - #get_pure_random_search_probability() == 0.02 + !*/ + + explicit global_function_search( + const function_spec& function + ); + /*! + ensures + - #num_functions() == 1 + - #get_function_evaluations() will indicate that there are no function evaluations yet. + - #get_relative_noise_magnitude() == 0.001 + - #get_solver_epsilon() == 0 + - #get_monte_carlo_upper_bound_sample_num() == 5000 + - #get_pure_random_search_probability() == 0.02 + !*/ + + explicit global_function_search( + const std::vector<function_spec>& functions + ); + /*! + ensures + - #num_functions() == functions.size() + - #get_function_evaluations() will indicate that there are no function evaluations yet. + - #get_relative_noise_magnitude() == 0.001 + - #get_solver_epsilon() == 0 + - #get_monte_carlo_upper_bound_sample_num() == 5000 + - #get_pure_random_search_probability() == 0.02 + !*/ + + global_function_search( + const std::vector<function_spec>& functions, + const std::vector<std::vector<function_evaluation>>& initial_function_evals, + const double relative_noise_magnitude = 0.001 + ); + /*! + requires + - functions.size() == initial_function_evals.size() + - relative_noise_magnitude >= 0 + ensures + - #num_functions() == functions.size() + - #get_function_evaluations() will return the provided initial_function_evals. + - #get_relative_noise_magnitude() == relative_noise_magnitude + - #get_solver_epsilon() == 0 + - #get_monte_carlo_upper_bound_sample_num() == 5000 + - #get_pure_random_search_probability() == 0.02 + !*/ + + // This object can't be copied. 
+ global_function_search(const global_function_search&) = delete; + global_function_search& operator=(const global_function_search& item) = delete; + // But it can be moved + global_function_search(global_function_search&& item) = default; + global_function_search& operator=(global_function_search&& item) = default; + /*! + ensures + - moves the state of item into *this + - #item.num_functions() == 0 + !*/ + + void set_seed ( + time_t seed + ); + /*! + ensures + - Part of this object's algorithm uses random sampling to decide what + points to evaluate next. Calling set_seed() lets you set the seed used + by the random number generator. Note that if you don't call set_seed() + you will always get the same deterministic behavior. + !*/ + + size_t num_functions( + ) const; + /*! + ensures + - returns the number of functions being optimized. + !*/ + + void get_function_evaluations ( + std::vector<function_spec>& specs, + std::vector<std::vector<function_evaluation>>& function_evals + ) const; + /*! + ensures + - #specs.size() == num_functions() + - #function_evals.size() == num_functions() + - This function allows you to query the state of the solver. In + particular, you can find the function_specs for each function being + optimized and their recorded evaluations. + - for all valid i: + - function_evals[i] == all the function evaluations that have been + recorded for the ith function (i.e. the function with the + function_spec #specs[i]). That is, this is the record of all the x + and y values reported back by function_evaluation_request::set() + calls. + !*/ + + void get_best_function_eval ( + matrix<double,0,1>& x, + double& y, + size_t& function_idx + ) const; + /*! + requires + - num_functions() != 0 + ensures + - if (no function evaluations have been recorded yet) then + - The outputs of this function are in a valid but undefined state. + - else + - This function tells you which function has produced the largest + output seen so far. 
It also tells you the inputs to that function + that leads to those outputs (x) as well as the output value itself (y). + - 0 <= #function_idx < num_functions() + - #function_idx == the index of the function that produced the largest output seen so far. + - #x == the input parameters to the function that produced the largest outputs seen so far. + - #y == the largest output seen so far. + !*/ + + function_evaluation_request get_next_x ( + ); + /*! + requires + - num_functions() != 0 + ensures + - Generates and returns a function evaluation request. See the discussion + in the WHAT THIS OBJECT REPRESENTS section above for details. + !*/ + + double get_pure_random_search_probability ( + ) const; + /*! + ensures + - When we decide to do a global explore step we will, with probability + get_pure_random_search_probability(), sample a point completely at random + rather than using the upper bounding model. Therefore, if you set this + probability to 0 then we will depend entirely on the upper bounding + model. Alternatively, if you set get_pure_random_search_probability() to + 1 then we won't use the upper bounding model at all and instead use pure + random search to do global exploration. Pure random search is much + faster than using the upper bounding model, so if you know that your + objective function is especially simple it can be faster to use pure + random search. However, if you really know your function that well you + should probably use a gradient based optimizer :) + !*/ + + void set_pure_random_search_probability ( + double prob + ); + /*! + requires + - prob >= 0 + ensures + - #get_pure_random_search_probability() == prob + !*/ + + double get_solver_epsilon ( + ) const; + /*! + ensures + - As discussed in the WHAT THIS OBJECT REPRESENTS section, we only do a + local refinement step if we haven't already found the peak of the current + local optima. get_solver_epsilon() sets the tolerance for deciding if + the local search method has found the local optima. 
Therefore, when the + local trust region model runs we check if its predicted improvement in + the objective function is greater than get_solver_epsilon(). If it isn't + then we assume it has converged and we should focus entirely on global + search. + + This means that, for instance, setting get_solver_epsilon() to 0 + essentially instructs the solver to find each local optima to full + floating point precision and only then to focus on pure global search. + !*/ + + void set_solver_epsilon ( + double eps + ); + /*! + requires + - eps >= 0 + ensures + - #get_solver_epsilon() == eps + !*/ + + double get_relative_noise_magnitude ( + ) const; + /*! + ensures + - Returns the value of the relative noise magnitude parameter to the + dlib::upper_bound_function's used by this object. See the + upper_bound_function's documentation for a detailed discussion of this + parameter's meaning. Most users should leave this value as its default + setting. + !*/ + + void set_relative_noise_magnitude ( + double value + ); + /*! + requires + - value >= 0 + ensures + - #get_relative_noise_magnitude() == value + !*/ + + size_t get_monte_carlo_upper_bound_sample_num ( + ) const; + /*! + ensures + - To find the point that maximizes the upper bounding model we use + get_monte_carlo_upper_bound_sample_num() random evaluations and select + the largest upper bound from that set. So this parameter influences how + well we estimate the maximum point on the upper bounding model. + !*/ + + void set_monte_carlo_upper_bound_sample_num ( + size_t num + ); + /*! 
+ requires + - num > 0 + ensures + - #get_monte_carlo_upper_bound_sample_num() == num + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_GLOBAL_FuNCTION_SEARCH_ABSTRACT_Hh_ + + diff --git a/ml/dlib/dlib/global_optimization/upper_bound_function.h b/ml/dlib/dlib/global_optimization/upper_bound_function.h new file mode 100644 index 000000000..d1957623e --- /dev/null +++ b/ml/dlib/dlib/global_optimization/upper_bound_function.h @@ -0,0 +1,286 @@ +// Copyright (C) 2017 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_UPPER_bOUND_FUNCTION_Hh_ +#define DLIB_UPPER_bOUND_FUNCTION_Hh_ + +#include "upper_bound_function_abstract.h" +#include "../svm/svm_c_linear_dcd_trainer.h" +#include "../statistics.h" +#include <limits> +#include <utility> + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + struct function_evaluation + { + function_evaluation() = default; + function_evaluation(const matrix<double,0,1>& x, double y) :x(x), y(y) {} + + matrix<double,0,1> x; + double y = std::numeric_limits<double>::quiet_NaN(); + }; + +// ---------------------------------------------------------------------------------------- + + class upper_bound_function + { + + public: + + upper_bound_function( + ) = default; + + upper_bound_function( + const double relative_noise_magnitude, + const double solver_eps + ) : relative_noise_magnitude(relative_noise_magnitude), solver_eps(solver_eps) + { + DLIB_CASSERT(relative_noise_magnitude >= 0); + DLIB_CASSERT(solver_eps > 0); + } + + explicit upper_bound_function( + const std::vector<function_evaluation>& _points, + const double relative_noise_magnitude = 0.001, + const double solver_eps = 0.0001 + ) : relative_noise_magnitude(relative_noise_magnitude), solver_eps(solver_eps), points(_points) + { + DLIB_CASSERT(relative_noise_magnitude >= 
0); + DLIB_CASSERT(solver_eps > 0); + + if (points.size() > 1) + { + DLIB_CASSERT(points[0].x.size() > 0, "The vectors can't be empty."); + + const long dims = points[0].x.size(); + for (auto& p : points) + DLIB_CASSERT(p.x.size() == dims, "All the vectors given to upper_bound_function must have the same dimensionality."); + + learn_params(); + } + + } + + void add ( + const function_evaluation& point + ) + { + DLIB_CASSERT(point.x.size() != 0, "The vectors can't be empty."); + if (points.size() == 0) + { + points.push_back(point); + return; + } + + DLIB_CASSERT(point.x.size() == dimensionality(), "All the vectors given to upper_bound_function must have the same dimensionality."); + + if (points.size() < 4) + { + points.push_back(point); + *this = upper_bound_function(points, relative_noise_magnitude, solver_eps); + return; + } + + points.push_back(point); + // add constraints between the new point and the old points + for (size_t i = 0; i < points.size()-1; ++i) + active_constraints.push_back(std::make_pair(i,points.size()-1)); + + learn_params(); + } + + long num_points( + ) const + { + return points.size(); + } + + long dimensionality( + ) const + { + if (points.size() == 0) + return 0; + else + return points[0].x.size(); + } + + const std::vector<function_evaluation>& get_points( + ) const + { + return points; + } + + double operator() ( + const matrix<double,0,1>& x + ) const + { + DLIB_CASSERT(num_points() > 0); + DLIB_CASSERT(x.size() == dimensionality()); + + + + double upper_bound = std::numeric_limits<double>::infinity(); + + for (size_t i = 0; i < points.size(); ++i) + { + const double local_bound = points[i].y + std::sqrt(offsets[i] + dot(slopes, squared(x-points[i].x))); + upper_bound = std::min(upper_bound, local_bound); + } + + return upper_bound; + } + + private: + + void learn_params ( + ) + { + const long dims = points[0].x.size(); + + using sample_type = std::vector<std::pair<size_t,double>>; + using kernel_type = 
sparse_linear_kernel<sample_type>; + std::vector<sample_type> x; + std::vector<double> y; + + // We are going to normalize the data so the values aren't extreme. First, we + // collect statistics on our data. + std::vector<running_stats<double>> x_rs(dims); + running_stats<double> y_rs; + for (auto& v : points) + { + for (long i = 0; i < v.x.size(); ++i) + x_rs[i].add(v.x(i)); + y_rs.add(v.y); + } + + + // compute normalization vectors for the data. The only reason we do this is + // to make the optimization well conditioned. In particular, scaling the y + // values will prevent numerical errors in the 1-diff*diff computation below that + // would otherwise result when diff is really big. Also, scaling the xvalues + // to be about 1 will similarly make the optimization more stable and it also + // has the added benefit of keeping the relative_noise_magnitude's scale + // constant regardless of the size of x values. + const double yscale = 1.0/y_rs.stddev(); + std::vector<double> xscale(dims); + for (size_t i = 0; i < xscale.size(); ++i) + xscale[i] = 1.0/(x_rs[i].stddev()*yscale); // make it so that xscale[i]*yscale == 1/x_rs[i].stddev() + + sample_type samp; + auto add_constraint = [&](long i, long j) { + samp.clear(); + for (long k = 0; k < dims; ++k) + { + double temp = (points[i].x(k) - points[j].x(k))*xscale[k]*yscale; + samp.push_back(std::make_pair(k, temp*temp)); + } + + if (points[i].y > points[j].y) + samp.push_back(std::make_pair(dims + j, relative_noise_magnitude)); + else + samp.push_back(std::make_pair(dims + i, relative_noise_magnitude)); + + const double diff = (points[i].y - points[j].y)*yscale; + samp.push_back(std::make_pair(dims + points.size(), 1-diff*diff)); + + x.push_back(samp); + y.push_back(1); + }; + + if (active_constraints.size() == 0) + { + x.reserve(points.size()*(points.size()-1)/2); + y.reserve(points.size()*(points.size()-1)/2); + for (size_t i = 0; i < points.size(); ++i) + { + for (size_t j = i+1; j < points.size(); ++j) + { + 
add_constraint(i,j); + } + } + } + else + { + for (auto& p : active_constraints) + add_constraint(p.first, p.second); + } + + + + + svm_c_linear_dcd_trainer<kernel_type> trainer; + trainer.set_c(std::numeric_limits<double>::infinity()); + //trainer.be_verbose(); + trainer.force_last_weight_to_1(true); + trainer.set_epsilon(solver_eps); + + svm_c_linear_dcd_trainer<kernel_type>::optimizer_state state; + auto df = trainer.train(x,y, state); + + // save the active constraints for later so we can use them inside add() to add + // new points efficiently. + if (active_constraints.size() == 0) + { + long k = 0; + for (size_t i = 0; i < points.size(); ++i) + { + for (size_t j = i+1; j < points.size(); ++j) + { + if (state.get_alpha()[k++] != 0) + active_constraints.push_back(std::make_pair(i,j)); + } + } + } + else + { + DLIB_CASSERT(state.get_alpha().size() == active_constraints.size()); + new_active_constraints.clear(); + for (size_t i = 0; i < state.get_alpha().size(); ++i) + { + if (state.get_alpha()[i] != 0) + new_active_constraints.push_back(active_constraints[i]); + } + active_constraints.swap(new_active_constraints); + } + + //std::cout << "points.size(): " << points.size() << std::endl; + //std::cout << "active_constraints.size(): " << active_constraints.size() << std::endl; + + + const auto& bv = df.basis_vectors(0); + slopes.set_size(dims); + for (long i = 0; i < dims; ++i) + slopes(i) = bv[i].second*xscale[i]*xscale[i]; + + //std::cout << "slopes:" << trans(slopes); + + offsets.assign(points.size(),0); + + + for (size_t i = 0; i < points.size(); ++i) + { + offsets[i] += bv[slopes.size()+i].second*relative_noise_magnitude; + } + } + + + + double relative_noise_magnitude = 0.001; + double solver_eps = 0.0001; + std::vector<std::pair<size_t,size_t>> active_constraints, new_active_constraints; + + std::vector<function_evaluation> points; + std::vector<double> offsets; // offsets.size() == points.size() + matrix<double,0,1> slopes; // slopes.size() == 
points[0].x.size() + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_UPPER_bOUND_FUNCTION_Hh_ + + diff --git a/ml/dlib/dlib/global_optimization/upper_bound_function_abstract.h b/ml/dlib/dlib/global_optimization/upper_bound_function_abstract.h new file mode 100644 index 000000000..56b361597 --- /dev/null +++ b/ml/dlib/dlib/global_optimization/upper_bound_function_abstract.h @@ -0,0 +1,212 @@ +// Copyright (C) 2017 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_UPPER_bOUND_FUNCTION_ABSTRACT_Hh_ +#ifdef DLIB_UPPER_bOUND_FUNCTION_ABSTRACT_Hh_ + +#include "../matrix.h" +#include <limits> + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + struct function_evaluation + { + /*! + WHAT THIS OBJECT REPRESENTS + This object records the output of a real valued function in response to + some input. + + In particular, if you have a function F(x) then the function_evaluation is + simply a struct that records x and the scalar value F(x). + !*/ + + function_evaluation() = default; + function_evaluation(const matrix<double,0,1>& x, double y) :x(x), y(y) {} + + matrix<double,0,1> x; + double y = std::numeric_limits<double>::quiet_NaN(); + }; + +// ---------------------------------------------------------------------------------------- + + class upper_bound_function + { + /*! + WHAT THIS OBJECT REPRESENTS + This object represents a piecewise linear non-parametric function that can + be used to define an upper bound on some more complex and unknown function. + To describe this precisely, let's assume there is a function F(x) which you + are capable of sampling from but otherwise know nothing about, and that you + would like to find an upper bounding function U(x) such that U(x) >= F(x) + for any x. It would also be good if U(x)-F(x) was minimal. I.e. 
we would + like U(x) to be a tight upper bound, not something vacuous like U(x) = + infinity. + + The upper_bound_function class is a tool for creating this kind of upper + bounding function from a set of function_evaluations of F(x). We do this + by considering only U(x) of the form: + U = [](matrix<double,0,1> x) { + double min_ub = infinity; + for (size_t i = 0; i < POINTS.size(); ++i) { + function_evaluation p = POINTS[i] + double local_bound = p.y + sqrt(noise_terms[i] + trans(p.x-x)*M*(p.x-x)) + min_ub = min(min_ub, local_bound) + } + return min_ub; + } + Where POINTS is an array of function_evaluation instances drawn from F(x), + M is a diagonal matrix, and noise_terms is an array of scalars. + + To create an upper bound U(x), the upper_bound_function takes a POINTS array + containing evaluations of F(x) as input and solves the following quadratic + program to find the parameters of U(x): + + min_{M,noise_terms}: sum(squared(M)) + sum(squared(noise_terms/relative_noise_magnitude)) + s.t. U(POINTS[i].x) >= POINTS[i].y, for all i + noise_terms[i] >= 0 + min(M) >= 0 + M is a diagonal matrix + + Therefore, the quadratic program finds the U(x) that always upper bounds + F(x) on the supplied POINTS, but is otherwise as small as possible. + + + + The inspiration for the upper_bound_function object came from the AdaLIPO + algorithm from this excellent paper: + Global optimization of Lipschitz functions + Malherbe, Cédric and Vayatis, Nicolas + International Conference on Machine Learning - 2017 + In that paper, they propose to use a simpler U(x) where noise_terms is + always 0 and M is a diagonal matrix where each diagonal element is the same + value. Therefore, there is only a single scalar parameter for U(x) in + their formulation of the problem. This causes difficulties if F(x) is + stochastic or has discontinuities since, without the noise term, M will + become really huge and the upper bound becomes vacuously large. 
It is also + problematic if the gradient of F(x) with respect to x contains elements of + widely varying magnitude since the simpler formulation of U(x) assumes a + uniform rate of change regardless of which dimension is varying. + !*/ + + public: + + upper_bound_function( + ); + /*! + ensures + - #num_points() == 0 + - #dimensionality() == 0 + !*/ + + explicit upper_bound_function( + const std::vector<function_evaluation>& points, + const double relative_noise_magnitude = 0.001, + const double solver_eps = 0.0001 + ); + /*! + requires + - all the x vectors in points must have the same non-zero dimensionality. + - relative_noise_magnitude >= 0 + - solver_eps > 0 + ensures + - Creates an upper bounding function U(x), as described above, assuming that + the given points are drawn from F(x). + - Uses the provided relative_noise_magnitude when solving the QP, as + described above. Note that relative_noise_magnitude can be set to 0. If + you do this then all the noise terms are constrained to 0. You should + only do this if you know F(x) is non-stochastic and continuous + everywhere. + - When solving the QP used to find the parameters of U(x), the upper + bounding function, we solve the QP to solver_eps accuracy. It's + possible that large enough solver_eps can lead to upper bounds that don't + upper bound all the supplied points. But for reasonable epsilon values + this shouldn't be a problem. + - #num_points() == points.size() + - #dimensionality() == points[0].x.size() + !*/ + + upper_bound_function( + const double relative_noise_magnitude, + const double solver_eps + ); + /*! + requires + - relative_noise_magnitude >= 0 + - solver_eps > 0 + ensures + - #num_points() == 0 + - #dimensionality() == 0 + - This constructor is the same as calling the above constructor with points.size()==0 + !*/ + + + void add ( + const function_evaluation& point + ); + /*! 
+ requires + - num_points() == 0 || point.x.size() == dimensionality() + - point.x.size() != 0 + ensures + - Adds point to get_points(). + - Incrementally updates the upper bounding function with the given function + evaluation. That is, we assume that F(point.x)==point.y and solve the QP + described above to find the new U(x) that upper bounds all the points + this object knows about (i.e. all the points in get_points() and the new point). + - Calling add() is much faster than recreating the upper_bound_function + from scratch with all the points. This is because we warm start with the + previous solution to the QP. This is done by discarding any non-active + constraints and solving the QP again with only the previously active + constraints and the new constraints formed by all the pairs of the new + point and the old points. This means the QP solved by add() is much + smaller than the QP that would be solved by a fresh call to the + upper_bound_function constructor. + !*/ + + const std::vector<function_evaluation>& get_points( + ) const; + /*! + ensures + - returns the points from F(x) used to define this upper bounding function. + These are all the function_evaluation objects given to this object via + its constructor and add(). + !*/ + + long num_points( + ) const; + /*! + ensures + - returns the number of points used to define the upper bounding function. + (i.e. returns get_points().size()) + !*/ + + long dimensionality( + ) const; + /*! + ensures + - returns the dimensionality of the input vectors to the upper bounding function. + !*/ + + double operator() ( + const matrix<double,0,1>& x + ) const; + /*! + requires + - num_points() > 0 + - x.size() == dimensionality() + ensures + - return U(x) + (i.e. returns the upper bound on F(x) at x given by our upper bounding function) + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_UPPER_bOUND_FUNCTION_ABSTRACT_Hh_ + + |