Diffstat (limited to 'ml/dlib/dlib/dnn/trainer_abstract.h')
 ml/dlib/dlib/dnn/trainer_abstract.h | 765
 1 file changed, 765 insertions, 0 deletions
diff --git a/ml/dlib/dlib/dnn/trainer_abstract.h b/ml/dlib/dlib/dnn/trainer_abstract.h
new file mode 100644
index 000000000..3bfb6dc99
--- /dev/null
+++ b/ml/dlib/dlib/dnn/trainer_abstract.h
@@ -0,0 +1,765 @@
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#undef DLIB_DNn_TRAINER_ABSTRACT_H_
#ifdef DLIB_DNn_TRAINER_ABSTRACT_H_

#include "core_abstract.h"
#include "solvers_abstract.h"
#include <vector>
#include <chrono>


namespace dlib
{

// ----------------------------------------------------------------------------------------

    enum class force_flush_to_disk {
        no = 0,
        yes = 1
    };

// ----------------------------------------------------------------------------------------

    template <
        typename net_type,
        typename solver_type = sgd
        >
    class dnn_trainer
    {
        /*!
            REQUIREMENTS ON net_type
                - net_type is an add_loss_layer object.

            REQUIREMENTS ON solver_type
                - solver_type is an implementation of the EXAMPLE_SOLVER interface
                  defined in solvers_abstract.h

            WHAT THIS OBJECT REPRESENTS
                This object is a tool for training a deep neural network.  To use it you
                supply a neural network type and a solver, then you call train() with
                your training data and it will output a new network instance that has
                hopefully learned something useful from your training data.

                If you are compiling with CUDA then this object will use the GPU that is
                currently selected (i.e. the one indicated by cudaGetDevice()) when
                dnn_trainer is constructed.  It will continue to use that device even if
                you later change it by a call to cudaSetDevice().

            EXCEPTIONS
                If an exception is thrown by any part of the neural network during
                training then the exception will be propagated out of the trainer to the
                user.  Moreover, the trainer instance will be unusable and should be
                destroyed.
        !*/

    public:

        typedef typename net_type::training_label_type training_label_type;
        typedef typename net_type::input_type input_type;
        const static size_t num_computational_layers = net_type::num_computational_layers;

        dnn_trainer() = delete;
        dnn_trainer(const dnn_trainer&) = delete;
        dnn_trainer& operator=(const dnn_trainer&) = delete;

        dnn_trainer(
            net_type& net,
            const solver_type& solver = solver_type(),
            const std::vector<int>& cuda_extra_devices = {}
        );
        /*!
            requires
                - for all valid i:
                    - 0 <= cuda_extra_devices[i] < dlib::cuda::get_num_devices()
            ensures
                - &#get_net() == &net
                  (i.e. The dnn_trainer holds a reference to net, it does not copy it.
                  Therefore, you must ensure net has a lifetime at least as long as the
                  dnn_trainer.)
                - #get_solvers() == a set of solvers that are all initialized with the
                  provided solver instance.
                - #get_max_num_epochs() == 10000
                - #get_mini_batch_size() == 128
                - #get_learning_rate() == 1e-2
                - #get_min_learning_rate() == 1e-5
                - #get_iterations_without_progress_threshold() == 2000
                - #get_test_iterations_without_progress_threshold() == 500
                - #get_learning_rate_shrink_factor() == 0.1
                - #get_learning_rate_schedule().size() == 0
                - #get_train_one_step_calls() == 0
                - #get_test_one_step_calls() == 0
                - #get_synchronization_file() == ""
                - if (cuda_extra_devices.size() > 0) then
                    - This object will use multiple graphics cards to run the learning
                      algorithms.  In particular, it will always use whatever device is
                      currently selected on the calling thread (the device indicated by
                      cudaGetDevice()).  In addition, you can ask to use additional
                      devices, which you do by putting their device numbers into
                      cuda_extra_devices.
        !*/
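        // A minimal construction sketch.  The network definition below is
        // hypothetical and exists only to make the example self-contained; any
        // add_loss_layer network works the same way:
        //
        //     using net_type = loss_multiclass_log<fc<10,
        //                          relu<fc<64,
        //                          input<matrix<float>>>>>>;
        //     net_type net;
        //     dnn_trainer<net_type> trainer(net, sgd());
        //     // Or, to also spread work across GPUs 1 and 2 in addition to the
        //     // currently selected device:
        //     // dnn_trainer<net_type> trainer(net, sgd(), {1,2});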
        net_type& get_net (
            force_flush_to_disk force_flush = force_flush_to_disk::yes
        );
        /*!
            ensures
                - returns the neural network object used by this trainer.  This is the
                  network that is optimized when you call train() or train_one_step().
                  Recall that the dnn_trainer doesn't contain the net_type object but
                  simply holds a reference to an external network which was provided to
                  the dnn_trainer's constructor.
                - This function blocks until all threads inside the dnn_trainer have
                  stopped touching the net.
                - If force_flush is yes, then this function will sync the trainer state
                  to disk if the current state hasn't already been synced to disk since
                  the last network modification.
        !*/

        const std::vector<solver_type>& get_solvers (
        ) const;
        /*!
            ensures
                - returns the solvers used to optimize each layer of the neural network
                  get_net().  In particular, the first layer's solver is
                  get_solvers()[0], the second layer's solver is get_solvers()[1], and
                  so on.
                - This function blocks until all threads inside the dnn_trainer have
                  stopped touching the net.
        !*/

        unsigned long get_mini_batch_size (
        ) const;
        /*!
            ensures
                - During training, we call the network's update() routine over and over
                  with training data.  The number of training samples we give to each
                  call to update is the "mini-batch size", which is defined by
                  get_mini_batch_size().
        !*/

        void set_mini_batch_size (
            unsigned long batch_size
        );
        /*!
            requires
                - batch_size > 0
            ensures
                - #get_mini_batch_size() == batch_size
        !*/

        unsigned long get_max_num_epochs (
        ) const;
        /*!
            ensures
                - train() will execute at most get_max_num_epochs() iterations over the
                  training data before returning.
        !*/

        void set_max_num_epochs (
            unsigned long num
        );
        /*!
            requires
                - num > 0
            ensures
                - #get_max_num_epochs() == num
        !*/

        void set_learning_rate (
            double lr
        );
        /*!
            requires
                - lr > 0
            ensures
                - #get_learning_rate() == lr
                - #get_learning_rate_schedule().size() == 0
                - This function blocks until all threads inside the dnn_trainer have
                  stopped touching the net.
        !*/

        double get_learning_rate(
        ) const;
        /*!
            ensures
                - During each training step, a solver tells us how to modify the
                  parameters of each layer in the network.  It does this by outputting a
                  step vector that, when added to the parameters, will hopefully result
                  in improved network performance.  The learning rate is one of the
                  inputs to the solver and influences the size of this step vector.
                  This function returns the current learning rate, that is, the learning
                  rate that will be used during the next training step.
        !*/

        void set_min_learning_rate (
            double lr
        );
        /*!
            requires
                - lr > 0
            ensures
                - #get_min_learning_rate() == lr
                - #get_learning_rate_schedule().size() == 0
                - This function blocks until all threads inside the dnn_trainer have
                  stopped touching the net.
        !*/

        double get_min_learning_rate (
        ) const;
        /*!
            ensures
                - During training via this->train(), this object will test if progress
                  is still being made and if it isn't then it will reduce
                  get_learning_rate() by setting it to
                  get_learning_rate()*get_learning_rate_shrink_factor().  However, it
                  will not reduce it below get_min_learning_rate().  Once this minimum
                  learning rate is crossed the training will terminate.
                - get_min_learning_rate() doesn't apply if you are using
                  train_one_step().  You can keep calling train_one_step() as many times
                  as you want and the learning rate will keep shrinking, getting
                  arbitrarily close to 0, if you run long enough.
        !*/
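        // A sketch of the typical configuration calls (the values are
        // illustrative, not recommendations):
        //
        //     trainer.set_mini_batch_size(64);
        //     trainer.set_learning_rate(1e-1);
        //     trainer.set_min_learning_rate(1e-4);   // train() stops below this
        //     trainer.set_max_num_epochs(3000);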
        template <typename EXP>
        void set_learning_rate_schedule (
            const matrix_exp<EXP>& schedule
        );
        /*!
            requires
                - schedule.size() > 0
                - min(schedule) > 0
            ensures
                - #get_learning_rate_schedule() == reshape_to_column_vector(schedule)
                - #get_learning_rate() == schedule(0,0)
                - #get_min_learning_rate() == min(schedule)
                - #get_learning_rate_shrink_factor() == 1
        !*/

        const matrix<double,0,1>& get_learning_rate_schedule (
        ) const;
        /*!
            ensures
                - if (this function returns a non-empty matrix) then
                    - This trainer will use an explicit learning rate schedule defined
                      by the learning rate values in get_learning_rate_schedule().  For
                      example, if get_learning_rate_schedule() returned {0.1, 0.09,
                      0.08, 0.07, 0.06} then the first training mini-batch would use a
                      learning rate of 0.1, then the next training mini-batch uses 0.09,
                      and then 0.08, and so on until the end of the schedule is reached.

                      If you continue to run training after the end of the schedule has
                      been reached then the learning rate will be fixed to 0.99 times
                      the final value.  So in our example, eventually the learning rate
                      would be fixed to 0.99*0.06.  This allows you to test if the end
                      of the schedule has been reached by checking if
                      get_learning_rate() < 0.06.
        !*/

        unsigned long get_steps_without_progress (
        ) const;
        /*!
            ensures
                - if (get_learning_rate_shrink_factor() != 1) then
                    - returns an estimate of how many mini-batches have executed without
                      us observing a statistically significant decrease in the training
                      error.
                - else
                    - returns 0
        !*/

        void set_iterations_without_progress_threshold (
            unsigned long thresh
        );
        /*!
            ensures
                - #get_iterations_without_progress_threshold() == thresh
                - #get_learning_rate_schedule().size() == 0
                - This function blocks until all threads inside the dnn_trainer have
                  stopped touching the net.
        !*/

        unsigned long get_iterations_without_progress_threshold (
        ) const;
        /*!
            ensures
                - This object monitors the progress of training and estimates if the
                  training error is being reduced.  It does this by looking at the
                  previous get_iterations_without_progress_threshold() mini-batch
                  results and applying the statistical test defined by the
                  running_gradient object to see if the training error is getting
                  smaller.  If it isn't being reduced then get_learning_rate() is made
                  smaller by a factor of get_learning_rate_shrink_factor().

                  Therefore, get_iterations_without_progress_threshold() should always
                  be set to something sensibly large so that this test can be done with
                  reasonably high confidence.  Think of this test as saying "if the loss
                  hasn't decreased for the previous
                  get_iterations_without_progress_threshold() mini-batches then shrink
                  the learning rate".
        !*/

        void set_learning_rate_shrink_factor (
            double shrink
        );
        /*!
            requires
                - 0 < shrink && shrink <= 1
            ensures
                - #get_learning_rate_shrink_factor() == shrink
                - #get_learning_rate_schedule().size() == 0
                - This function blocks until all threads inside the dnn_trainer have
                  stopped touching the net.
        !*/
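        // A sketch of driving training with an explicit schedule.  This assumes
        // dlib's logspace(), which yields logarithmically spaced values from 10^a
        // down to 10^b:
        //
        //     // 10000 learning rates decaying smoothly from 1e-1 to 1e-4.
        //     trainer.set_learning_rate_schedule(logspace(-1, -4, 10000));
        //     // Note: this also sets the shrink factor to 1, disabling the
        //     // automatic progress-based learning rate reduction.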
        double get_learning_rate_shrink_factor (
        ) const;
        /*!
            ensures
                - Whenever the training routine thinks it isn't making progress anymore
                  it will reduce get_learning_rate() by multiplying it by
                  get_learning_rate_shrink_factor().
                - You can disable the automatic learning rate reduction by setting
                  get_learning_rate_shrink_factor() to 1.
        !*/

        unsigned long long get_train_one_step_calls (
        ) const;
        /*!
            ensures
                - returns the number of times train_one_step() has been called.
        !*/

        unsigned long long get_test_one_step_calls (
        ) const;
        /*!
            ensures
                - returns the number of times test_one_step() has been called.
        !*/

        void be_verbose (
        );
        /*!
            ensures
                - This object will print status messages to standard out so that a user
                  can observe the progress of the algorithm.
        !*/

        void be_quiet (
        );
        /*!
            ensures
                - This object will not print anything to standard out.
        !*/

        void set_synchronization_file (
            const std::string& filename,
            std::chrono::seconds time_between_syncs = std::chrono::minutes(15)
        );
        /*!
            ensures
                - #get_synchronization_file() == filename
                - While training is running, either via train() or repeated calls to
                  train_one_step(), this object will save its entire state, including
                  the state of get_net(), to disk in the file named filename every
                  time_between_syncs seconds.
                - If the filename file already exists then the state of this trainer
                  will be loaded from that file by this call to
                  set_synchronization_file().  This allows you to resume a training
                  session which was previously interrupted.
                - It should be noted that when saving, the trainer will alternate
                  between saving to a file called filename and another file called
                  filename+"_".  We do this because it's possible that your computer
                  might crash (not because of dlib, just in general) before the data is
                  safely saved to disk.  This way, you will always have a backup file if
                  the write to disk gets corrupted or is incomplete.  Moreover, when
                  loading, we will always load from the newest of the two possible
                  files.
        !*/

        const std::string& get_synchronization_file (
        );
        /*!
            ensures
                - Returns the name of the file the dnn_trainer will periodically save
                  its state to.  If the return value is "" then synchronization is
                  disabled.
        !*/
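        // A sketch of enabling logging and crash-resumable training (the file
        // name is hypothetical):
        //
        //     trainer.be_verbose();
        //     // Snapshot the trainer (and network) every 5 minutes.  If
        //     // "trainer_state.dat" already exists, training resumes from it.
        //     trainer.set_synchronization_file("trainer_state.dat",
        //                                      std::chrono::minutes(5));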
        void train (
            const std::vector<input_type>& data,
            const std::vector<training_label_type>& labels
        );
        /*!
            requires
                - data.size() == labels.size()
                - data.size() > 0
                - net_type uses a supervised loss.
                  i.e. net_type::training_label_type != no_label_type.
            ensures
                - Trains a supervised neural network based on the given training data.
                  The goal of training is to find the network parameters that minimize
                  get_net().compute_loss(data.begin(), data.end(), labels.begin()).
                - The optimizer will run until get_learning_rate() <
                  get_min_learning_rate() or get_max_num_epochs() training epochs have
                  been executed.
                - Each layer in the network will be optimized by its corresponding
                  solver in get_solvers().
                - Each call to train DOES NOT reinitialize the state of get_net() or
                  get_solvers().  That is, the existing state of the solvers and network
                  is the starting point for the optimization each time train() is
                  called.  In particular, if you use the set_synchronization_file()
                  method you can resume an interrupted train() call by simply calling
                  train() again and it will pick up from the last synchronization point.
                - You can obtain the average loss value during the final training epoch
                  by calling get_average_loss().
                - This function blocks until all threads inside the dnn_trainer have
                  stopped touching the net.
        !*/

        void train (
            const std::vector<input_type>& data
        );
        /*!
            requires
                - data.size() > 0
                - net_type uses an unsupervised loss.
                  i.e. net_type::training_label_type == no_label_type.
            ensures
                - Trains an unsupervised neural network based on the given training
                  data.  The goal of training is to find the network parameters that
                  minimize get_net().compute_loss(data.begin(), data.end()).
                - The optimizer will run until get_learning_rate() <
                  get_min_learning_rate() or get_max_num_epochs() training epochs have
                  been executed.
                - Each layer in the network will be optimized by its corresponding
                  solver in get_solvers().
                - Each call to train DOES NOT reinitialize the state of get_net() or
                  get_solvers().  That is, the existing state of the solvers and network
                  is the starting point for the optimization each time train() is
                  called.  In particular, if you use the set_synchronization_file()
                  method you can resume an interrupted train() call by simply calling
                  train() again and it will pick up from the last synchronization point.
                - You can obtain the average loss value during the final training epoch
                  by calling get_average_loss().
                - This function blocks until all threads inside the dnn_trainer have
                  stopped touching the net.
        !*/

        void train_one_step (
            const std::vector<input_type>& data,
            const std::vector<training_label_type>& labels
        );
        /*!
            requires
                - data.size() == labels.size()
                - data.size() > 0
                - net_type uses a supervised loss.
                  i.e. net_type::training_label_type != no_label_type.
            ensures
                - Performs one stochastic gradient update step based on the mini-batch
                  of data and labels supplied to this function.  In particular, calling
                  train_one_step() in a loop is equivalent to calling the train() method
                  defined above.  However, train_one_step() allows you to stream data
                  from disk into the training process while train() requires you to
                  first load all the training data into RAM.  Otherwise, these training
                  methods are equivalent.
                - You can observe the current average loss value by calling
                  get_average_loss().
                - The network training will happen in another thread.  Therefore, after
                  calling this function you should call get_net() before you touch the
                  net object from the calling thread to ensure no other threads are
                  still accessing the network.
                - #get_train_one_step_calls() == get_train_one_step_calls() + 1.
        !*/
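        // A sketch of a streaming training loop built on train_one_step().
        // load_next_mini_batch() is a hypothetical helper that fills the two
        // vectors from disk:
        //
        //     std::vector<input_type> samples;
        //     std::vector<training_label_type> labels;
        //     // get_min_learning_rate() isn't enforced here, so stop manually
        //     // once the learning rate has shrunk far enough.
        //     while (trainer.get_learning_rate() >= 1e-4)
        //     {
        //         load_next_mini_batch(samples, labels);
        //         trainer.train_one_step(samples, labels);
        //     }
        //     trainer.get_net();  // block until the training threads are done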
        template <
            typename data_iterator,
            typename label_iterator
            >
        void train_one_step (
            data_iterator dbegin,
            data_iterator dend,
            label_iterator lbegin
        );
        /*!
            requires
                - std::advance(lbegin, std::distance(dbegin, dend) - 1) is
                  dereferenceable
                - std::distance(dbegin, dend) > 0
                - net_type uses a supervised loss.
                  i.e. net_type::training_label_type != no_label_type.
            ensures
                - Performs one stochastic gradient update step based on the mini-batch
                  of data and labels supplied to this function.  In particular, calling
                  train_one_step() in a loop is equivalent to calling the train() method
                  defined above.  However, train_one_step() allows you to stream data
                  from disk into the training process while train() requires you to
                  first load all the training data into RAM.  Otherwise, these training
                  methods are equivalent.
                - You can observe the current average loss value by calling
                  get_average_loss().
                - The network training will happen in another thread.  Therefore, after
                  calling this function you should call get_net() before you touch the
                  net object from the calling thread to ensure no other threads are
                  still accessing the network.
                - #get_train_one_step_calls() == get_train_one_step_calls() + 1.
        !*/

        void train_one_step (
            const std::vector<input_type>& data
        );
        /*!
            requires
                - data.size() > 0
                - net_type uses an unsupervised loss.
                  i.e. net_type::training_label_type == no_label_type.
            ensures
                - Performs one stochastic gradient update step based on the mini-batch
                  of data supplied to this function.  In particular, calling
                  train_one_step() in a loop is equivalent to calling the train() method
                  defined above.  However, train_one_step() allows you to stream data
                  from disk into the training process while train() requires you to
                  first load all the training data into RAM.  Otherwise, these training
                  methods are equivalent.
                - You can observe the current average loss value by calling
                  get_average_loss().
                - The network training will happen in another thread.  Therefore, after
                  calling this function you should call get_net() before you touch the
                  net object from the calling thread to ensure no other threads are
                  still accessing the network.
                - #get_train_one_step_calls() == get_train_one_step_calls() + 1.
        !*/

        template <
            typename data_iterator
            >
        void train_one_step (
            data_iterator dbegin,
            data_iterator dend
        );
        /*!
            requires
                - std::distance(dbegin, dend) > 0
                - net_type uses an unsupervised loss.
                  i.e. net_type::training_label_type == no_label_type.
            ensures
                - Performs one stochastic gradient update step based on the mini-batch
                  of data supplied to this function.  In particular, calling
                  train_one_step() in a loop is equivalent to calling the train() method
                  defined above.  However, train_one_step() allows you to stream data
                  from disk into the training process while train() requires you to
                  first load all the training data into RAM.  Otherwise, these training
                  methods are equivalent.
                - You can observe the current average loss value by calling
                  get_average_loss().
                - The network training will happen in another thread.  Therefore, after
                  calling this function you should call get_net() before you touch the
                  net object from the calling thread to ensure no other threads are
                  still accessing the network.
                - #get_train_one_step_calls() == get_train_one_step_calls() + 1.
        !*/

        double get_average_loss (
        ) const;
        /*!
            ensures
                - returns the average loss value observed during previous calls to
                  train_one_step() or train().  That is, the average output of
                  net_type::update() during the previous mini-batch updates.
                - Note that, if be_verbose() has been called, then this object will
                  automatically call clear_average_loss() periodically when it logs the
                  loss to the console.
                - This function blocks until all threads inside the dnn_trainer have
                  stopped touching the net.
        !*/

        void clear_average_loss (
        );
        /*!
            ensures
                - #get_average_loss() == 0
                - get_average_loss() uses a dlib::running_stats object to keep a running
                  average of the loss values seen during the previous mini-batch updates
                  applied during training.  Calling clear_average_loss() resets the
                  running_stats object so it forgets about all previous loss values
                  observed.
                - This function blocks until all threads inside the dnn_trainer have
                  stopped touching the net.
        !*/

        // ----------------------
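        // A sketch of periodic loss reporting when running quietly; the
        // step-count check is illustrative:
        //
        //     if (trainer.get_train_one_step_calls() % 100 == 0)
        //     {
        //         std::cout << "avg loss: " << trainer.get_average_loss() << "\n";
        //         trainer.clear_average_loss();  // start a fresh running average
        //     }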
        double get_average_test_loss (
        ) const;
        /*!
            ensures
                - returns the average loss value observed during previous calls to
                  test_one_step().
                - This function blocks until all threads inside the dnn_trainer have
                  stopped touching the net.
        !*/

        void test_one_step (
            const std::vector<input_type>& data,
            const std::vector<training_label_type>& labels
        );
        /*!
            requires
                - data.size() == labels.size()
                - data.size() > 0
                - net_type uses a supervised loss.
                  i.e. net_type::training_label_type != no_label_type.
            ensures
                - Runs the given data through the network and computes and records the
                  loss.
                - This call does not modify network parameters.  The point of
                  test_one_step() is twofold: to allow you to observe the accuracy of
                  the network on held-out data during training, and to allow the trainer
                  to automatically adjust the learning rate when the test loss stops
                  improving.  You are not required to use test_one_step() at all, but it
                  is available if you want this kind of monitoring.
                - You can observe the current average loss value by calling
                  get_average_test_loss().
                - The computation will happen in another thread.  Therefore, after
                  calling this function you should call get_net() before you touch the
                  net object from the calling thread to ensure no other threads are
                  still accessing the network.
                - #get_test_one_step_calls() == get_test_one_step_calls() + 1.
        !*/

        template <
            typename data_iterator,
            typename label_iterator
            >
        void test_one_step (
            data_iterator dbegin,
            data_iterator dend,
            label_iterator lbegin
        );
        /*!
            requires
                - std::advance(lbegin, std::distance(dbegin, dend) - 1) is
                  dereferenceable
                - std::distance(dbegin, dend) > 0
                - net_type uses a supervised loss.
                  i.e. net_type::training_label_type != no_label_type.
            ensures
                - Runs the given data through the network and computes and records the
                  loss.
                - This call does not modify network parameters.  The point of
                  test_one_step() is twofold: to allow you to observe the accuracy of
                  the network on held-out data during training, and to allow the trainer
                  to automatically adjust the learning rate when the test loss stops
                  improving.  You are not required to use test_one_step() at all, but it
                  is available if you want this kind of monitoring.
                - You can observe the current average loss value by calling
                  get_average_test_loss().
                - The computation will happen in another thread.  Therefore, after
                  calling this function you should call get_net() before you touch the
                  net object from the calling thread to ensure no other threads are
                  still accessing the network.
                - #get_test_one_step_calls() == get_test_one_step_calls() + 1.
        !*/
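        // A sketch of interleaving held-out evaluation with training so the
        // trainer can also shrink the learning rate when the test loss plateaus.
        // The helpers and the 10:1 ratio are hypothetical:
        //
        //     while (trainer.get_learning_rate() >= 1e-4)
        //     {
        //         load_next_mini_batch(samples, labels);          // hypothetical
        //         trainer.train_one_step(samples, labels);
        //         if (trainer.get_train_one_step_calls() % 10 == 0)
        //         {
        //             load_next_test_batch(tsamples, tlabels);    // hypothetical
        //             trainer.test_one_step(tsamples, tlabels);
        //         }
        //     }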
        void test_one_step (
            const std::vector<input_type>& data
        );
        /*!
            requires
                - data.size() > 0
                - net_type uses an unsupervised loss.
                  i.e. net_type::training_label_type == no_label_type.
            ensures
                - Runs the given data through the network and computes and records the
                  loss.
                - This call does not modify network parameters.  The point of
                  test_one_step() is twofold: to allow you to observe the accuracy of
                  the network on held-out data during training, and to allow the trainer
                  to automatically adjust the learning rate when the test loss stops
                  improving.  You are not required to use test_one_step() at all, but it
                  is available if you want this kind of monitoring.
                - You can observe the current average loss value by calling
                  get_average_test_loss().
                - The computation will happen in another thread.  Therefore, after
                  calling this function you should call get_net() before you touch the
                  net object from the calling thread to ensure no other threads are
                  still accessing the network.
                - #get_test_one_step_calls() == get_test_one_step_calls() + 1.
        !*/

        template <
            typename data_iterator
            >
        void test_one_step (
            data_iterator dbegin,
            data_iterator dend
        );
        /*!
            requires
                - std::distance(dbegin, dend) > 0
                - net_type uses an unsupervised loss.
                  i.e. net_type::training_label_type == no_label_type.
            ensures
                - Runs the given data through the network and computes and records the
                  loss.
                - This call does not modify network parameters.  The point of
                  test_one_step() is twofold: to allow you to observe the accuracy of
                  the network on held-out data during training, and to allow the trainer
                  to automatically adjust the learning rate when the test loss stops
                  improving.  You are not required to use test_one_step() at all, but it
                  is available if you want this kind of monitoring.
                - You can observe the current average loss value by calling
                  get_average_test_loss().
                - The computation will happen in another thread.  Therefore, after
                  calling this function you should call get_net() before you touch the
                  net object from the calling thread to ensure no other threads are
                  still accessing the network.
                - #get_test_one_step_calls() == get_test_one_step_calls() + 1.
        !*/

        void set_test_iterations_without_progress_threshold (
            unsigned long thresh
        );
        /*!
            ensures
                - #get_test_iterations_without_progress_threshold() == thresh
                - #get_learning_rate_schedule().size() == 0
                - This function blocks until all threads inside the dnn_trainer have
                  stopped touching the net.
        !*/

        unsigned long get_test_iterations_without_progress_threshold (
        ) const;
        /*!
            ensures
                - This object monitors the progress of training and estimates if the
                  testing error is being reduced.  It does this by looking at the
                  previous get_test_iterations_without_progress_threshold() mini-batch
                  results from test_one_step() and applying the statistical test defined
                  by the running_gradient object to see if the testing error is getting
                  smaller.  If it isn't being reduced then get_learning_rate() is made
                  smaller by a factor of get_learning_rate_shrink_factor().

                  Therefore, get_test_iterations_without_progress_threshold() should
                  always be set to something sensibly large so that this test can be
                  done with reasonably high confidence.  Think of this test as saying
                  "if the testing loss hasn't decreased for the previous
                  get_test_iterations_without_progress_threshold() calls to
                  test_one_step() then shrink the learning rate".
        !*/

        unsigned long get_test_steps_without_progress (
        ) const;
        /*!
            ensures
                - if (get_learning_rate_shrink_factor() != 1) then
                    - returns an estimate of how many mini-batches have executed without
                      us observing a statistically significant decrease in the testing
                      error (i.e. the error on the data given to the trainer via
                      test_one_step() calls).
                - else
                    - returns 0
        !*/
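        // A sketch of tuning both progress tests; the values are illustrative.
        // Larger thresholds mean more patience before the rate is shrunk:
        //
        //     trainer.set_iterations_without_progress_threshold(5000);
        //     trainer.set_test_iterations_without_progress_threshold(800);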
    };

// ----------------------------------------------------------------------------------------

    template <
        typename net_type,
        typename solver_type
        >
    std::ostream& operator<< (
        std::ostream& out,
        dnn_trainer<net_type,solver_type>& trainer
    );
    /*!
        ensures
            - Prints a log of the current parameters of trainer to out.
    !*/

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_DNn_TRAINER_ABSTRACT_H_
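// Usage sketch for the stream operator, e.g. inside a monitoring loop:
//
//     std::cout << trainer << std::endl;  // logs the trainer's current parameters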