Diffstat (limited to 'ml/dlib/dlib/dnn/solvers.h')
-rw-r--r-- | ml/dlib/dlib/dnn/solvers.h | 405
1 file changed, 0 insertions, 405 deletions
diff --git a/ml/dlib/dlib/dnn/solvers.h b/ml/dlib/dlib/dnn/solvers.h
deleted file mode 100644
index 204541a7e..000000000
--- a/ml/dlib/dlib/dnn/solvers.h
+++ /dev/null
@@ -1,405 +0,0 @@
-// Copyright (C) 2015 Davis E. King (davis@dlib.net)
-// License: Boost Software License   See LICENSE.txt for the full license.
-#ifndef DLIB_DNn_SOLVERS_H_
-#define DLIB_DNn_SOLVERS_H_
-
-#include "solvers_abstract.h"
-#include "tensor.h"
-#include <iostream>
-#include "layers.h"
-
-namespace dlib
-{
-    class sgd
-    {
-    public:
-
-        explicit sgd(
-            float weight_decay_,
-            float momentum_ = 0.9
-        )
-        {
-            weight_decay = weight_decay_;
-            momentum = momentum_;
-        }
-
-        sgd(
-        ) : sgd(0.0005, 0.9)
-        {
-        }
-
-        float get_momentum (
-        ) const { return momentum; }
-
-        float get_weight_decay (
-        ) const { return weight_decay; }
-
-        template <typename layer_type>
-        const tensor& operator() (
-            const float learning_rate,
-            const layer_type& l,
-            const tensor& params_grad
-        )
-        {
-            const tensor& params = l.get_layer_params();
-
-            DLIB_CASSERT(params.size() != 0);
-            if (v.size() == 0)
-            {
-                v.copy_size(params_grad);
-                v = 0;
-            }
-
-            const double lr = learning_rate*get_learning_rate_multiplier(l);
-            const double wd = weight_decay*get_weight_decay_multiplier(l);
-
-            //perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad);
-            tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr);
-
-            return v;
-        }
-
-        template <unsigned long N>
-        const tensor& operator() (
-            const float learning_rate,
-            const fc_<N,FC_HAS_BIAS>& l,
-            const tensor& params_grad
-        )
-        {
-            update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.get_num_outputs());
-            return v;
-        }
-
-        template <
-            long _num_filters,
-            long _nr,
-            long _nc,
-            int _stride_y,
-            int _stride_x,
-            int _padding_y,
-            int _padding_x
-            >
-        const tensor& operator() (
-            const float learning_rate,
-            const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
-            const tensor& params_grad
-        )
-        {
-            update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters());
-            return v;
-        }
-
-        template <
-            long _num_filters,
-            long _nr,
-            long _nc,
-            int _stride_y,
-            int _stride_x,
-            int _padding_y,
-            int _padding_x
-            >
-        const tensor& operator() (
-            const float learning_rate,
-            const cont_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
-            const tensor& params_grad
-        )
-        {
-            update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters());
-            return v;
-        }
-
-        template < layer_mode mode >
-        const tensor& operator() (
-            const float learning_rate,
-            const bn_<mode>& l,
-            const tensor& params_grad
-        )
-        {
-            update_considering_bias(learning_rate, l, params_grad, params_grad.size()/2);
-            return v;
-        }
-
-        friend void serialize(const sgd& item, std::ostream& out)
-        {
-            serialize("sgd2", out);
-            serialize(item.v, out);
-            serialize(item.weight_decay, out);
-            serialize(item.momentum, out);
-        }
-
-        friend void deserialize(sgd& item, std::istream& in)
-        {
-            std::string version;
-            deserialize(version, in);
-            if (version != "sgd2")
-                throw serialization_error("Unexpected version found while deserializing dlib::sgd.");
-            deserialize(item.v, in);
-            deserialize(item.weight_decay, in);
-            deserialize(item.momentum, in);
-        }
-
-        friend std::ostream& operator<< (std::ostream& out, const sgd& item)
-        {
-            out << "sgd: weight_decay="<<item.get_weight_decay() << ", momentum="<<item.get_momentum();
-            return out;
-        }
-
-    private:
-
-        template <typename layer_type>
-        void update_considering_bias(
-            const float learning_rate,
-            const layer_type& l,
-            const tensor& params_grad,
-            unsigned long bias_offset
-        )
-        {
-            const tensor& params = l.get_layer_params();
-
-            DLIB_CASSERT(params.size() != 0);
-            if (v.size() == 0)
-            {
-                v.copy_size(params_grad);
-                v = 0;
-            }
-
-            double lr = learning_rate*get_learning_rate_multiplier(l);
-            double wd = weight_decay*get_weight_decay_multiplier(l);
-
-            //perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad);
-
-            if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1)
-            {
-                tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr);
-            }
-            else
-            {
-
-                tt::affine_transform_range(0, bias_offset, v, v, params, params_grad, momentum, -wd*lr, -lr);
-
-                // now update the biases but apply their multipliers
-                lr *= l.get_bias_learning_rate_multiplier();
-                wd *= l.get_bias_weight_decay_multiplier();
-                tt::affine_transform_range(bias_offset, v.size(), v, v, params, params_grad, momentum, -wd*lr, -lr);
-            }
-        }
-
-        resizable_tensor v;
-        float weight_decay;
-        float momentum;
-
-    };
-
-// ----------------------------------------------------------------------------------------
-
-    class adam
-    {
-    public:
-
-        adam(
-            float weight_decay_,
-            float momentum1_,
-            float momentum2_
-        )
-        {
-            weight_decay = weight_decay_;
-            momentum1 = momentum1_;
-            momentum2 = momentum2_;
-            t = 0;
-        }
-
-        adam(
-        ) : adam(0.0005, 0.9, 0.999)
-        {}
-
-        float get_momentum1 (
-        ) const { return momentum1; }
-
-        float get_momentum2 (
-        ) const { return momentum2; }
-
-        float get_weight_decay (
-        ) const { return weight_decay; }
-
-        template <typename layer_type>
-        const tensor& operator() (
-            const float learning_rate,
-            const layer_type& l,
-            const tensor& params_grad
-        )
-        {
-            const tensor& params = l.get_layer_params();
-            DLIB_CASSERT(params.size() != 0);
-            if (v.size() == 0)
-            {
-                m.copy_size(params_grad);
-                m = 0;
-                v.copy_size(params_grad);
-                v = 0;
-                s.copy_size(params_grad);
-            }
-
-            ++t;
-
-
-            tt::compute_adam_update(0, params.size(), s, m, v, t,
-                                    learning_rate*get_learning_rate_multiplier(l),
-                                    weight_decay*get_weight_decay_multiplier(l),
-                                    momentum1, momentum2, params, params_grad);
-
-            return s;
-        }
-
-        template <unsigned long N>
-        const tensor& operator() (
-            const float learning_rate,
-            const fc_<N,FC_HAS_BIAS>& l,
-            const tensor& params_grad
-        )
-        {
-            update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.get_num_outputs());
-            return s;
-        }
-
-        template <
-            long _num_filters,
-            long _nr,
-            long _nc,
-            int _stride_y,
-            int _stride_x,
-            int _padding_y,
-            int _padding_x
-            >
-        const tensor& operator() (
-            const float learning_rate,
-            const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
-            const tensor& params_grad
-        )
-        {
-            update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters());
-            return s;
-        }
-
-        template <
-            long _num_filters,
-            long _nr,
-            long _nc,
-            int _stride_y,
-            int _stride_x,
-            int _padding_y,
-            int _padding_x
-            >
-        const tensor& operator() (
-            const float learning_rate,
-            const cont_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
-            const tensor& params_grad
-        )
-        {
-            update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters());
-            return s;
-        }
-
-        template < layer_mode mode >
-        const tensor& operator() (
-            const float learning_rate,
-            const bn_<mode>& l,
-            const tensor& params_grad
-        )
-        {
-            update_considering_bias(learning_rate, l, params_grad, params_grad.size()/2);
-            return s;
-        }
-
-
-        friend void serialize(const adam& item, std::ostream& out)
-        {
-            serialize("adam2", out);
-            serialize(item.m, out);
-            serialize(item.v, out);
-            serialize(item.s, out);
-            serialize(item.weight_decay, out);
-            serialize(item.momentum1, out);
-            serialize(item.momentum2, out);
-            serialize(item.t, out);
-        }
-
-        friend void deserialize(adam& item, std::istream& in)
-        {
-            std::string version;
-            deserialize(version, in);
-            if (version != "adam2")
-                throw serialization_error("Unexpected version found while deserializing dlib::adam.");
-            deserialize(item.m, in);
-            deserialize(item.v, in);
-            deserialize(item.s, in);
-            deserialize(item.weight_decay, in);
-            deserialize(item.momentum1, in);
-            deserialize(item.momentum2, in);
-            deserialize(item.t, in);
-        }
-
-        friend std::ostream& operator<< (std::ostream& out, const adam& item)
-        {
-            out << "adam: weight_decay="<<item.get_weight_decay() << ", momentum1="<<item.get_momentum1() << ", momentum2="<<item.get_momentum2();
-            return out;
-        }
-
-    private:
-
-        template <typename layer_type>
-        void update_considering_bias(
-            const float learning_rate,
-            const layer_type& l,
-            const tensor& params_grad,
-            unsigned long bias_offset
-        )
-        {
-            const tensor& params = l.get_layer_params();
-            DLIB_CASSERT(params.size() != 0);
-            if (v.size() == 0)
-            {
-                m.copy_size(params_grad);
-                m = 0;
-                v.copy_size(params_grad);
-                v = 0;
-                s.copy_size(params_grad);
-            }
-
-
-            ++t;
-
-            if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1)
-            {
-                tt::compute_adam_update(0, params.size(), s, m, v, t,
-                                        learning_rate*get_learning_rate_multiplier(l),
-                                        weight_decay*get_weight_decay_multiplier(l),
-                                        momentum1, momentum2, params, params_grad);
-            }
-            else
-            {
-                tt::compute_adam_update(0, bias_offset, s, m, v, t,
-                                        learning_rate*get_learning_rate_multiplier(l),
-                                        weight_decay*get_weight_decay_multiplier(l),
-                                        momentum1, momentum2, params, params_grad);
-
-                tt::compute_adam_update(bias_offset, params.size(), s, m, v, t,
-                                        learning_rate*get_learning_rate_multiplier(l)*l.get_bias_learning_rate_multiplier(),
-                                        weight_decay*get_weight_decay_multiplier(l)*l.get_bias_weight_decay_multiplier(),
-                                        momentum1, momentum2, params, params_grad);
-            }
-        }
-
-        resizable_tensor m;
-        resizable_tensor v;
-        resizable_tensor s;
-        float weight_decay;
-        float momentum1;
-        float momentum2;
-        float t;
-    };
-
-// ----------------------------------------------------------------------------------------
-
-}
-
-#endif // DLIB_DNn_SOLVERS_H_
-
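
For reference: the two classes deleted here, sgd and adam, are the solver objects that dlib's dnn_trainer invokes once per layer per update step. Each operator() receives the current learning rate, the layer, and that layer's parameter gradient, and returns the update tensor to add to the parameters; the update_considering_bias() overloads for fc_, con_, cont_, and bn_ layers exist so that the bias slice of the parameter tensor gets the layer's separate bias learning-rate and weight-decay multipliers applied. The sketch below shows how these solvers plugged into dnn_trainer; the toy network is purely illustrative (not part of this diff), but the solver constructors match the deleted header:

    #include <dlib/dnn.h>
    using namespace dlib;

    // Illustrative toy network: one fully connected layer over float inputs.
    using net_type = loss_multiclass_log<fc<2, input<matrix<float>>>>;

    int main()
    {
        net_type net;

        // sgd(weight_decay, momentum): the defaults in the deleted header
        // are sgd(0.0005, 0.9). dnn_trainer keeps one copy of the solver
        // per layer, so each layer's momentum tensor v stays independent.
        dnn_trainer<net_type, sgd> trainer(net, sgd(0.0005f, 0.9f));
        trainer.set_learning_rate(0.01);

        // The adam solver is selected the same way; its defaults are
        // adam(0.0005, 0.9, 0.999):
        //   dnn_trainer<net_type, adam> trainer2(net, adam(0.0005f, 0.9f, 0.999f));

        // trainer.train(samples, labels);  // supply real training data here
        return 0;
    }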