Diffstat (limited to 'ml/dlib/dlib/dnn')
38 files changed, 33283 insertions, 0 deletions
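Throughout core.h, optional layer methods are called through a tag-dispatch detection idiom: an overload taking dlib's special_ tag is SFINAE-constrained on int_<decltype(&T::method)>::type, and an overload taking general_ (the base of special_) serves as the fallback when the method doesn't exist. A minimal self-contained sketch of the pattern, using simplified stand-ins for dlib's special_/general_/int_ helpers rather than the real ones:

    #include <iostream>

    struct general_ {};
    struct special_ : general_ {};   // ranks above general_ in overload resolution
    template <typename T> struct int_ { typedef int type; };

    // Selected only when T has a get_learning_rate_multiplier() member; otherwise
    // the decltype fails and SFINAE removes this overload from consideration.
    template <typename T, typename int_<decltype(&T::get_learning_rate_multiplier)>::type = 0>
    double get_learning_rate_multiplier(const T& obj, special_)
    { return obj.get_learning_rate_multiplier(); }

    // Fallback for every other T.
    template <typename T>
    double get_learning_rate_multiplier(const T&, general_) { return 1; }

    struct custom_layer { double get_learning_rate_multiplier() const { return 0.1; } };
    struct plain_layer {};

    int main()
    {
        std::cout << get_learning_rate_multiplier(custom_layer(), special_()) << "\n"; // prints 0.1
        std::cout << get_learning_rate_multiplier(plain_layer(), special_()) << "\n";  // prints 1
    }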
diff --git a/ml/dlib/dlib/dnn/core.h b/ml/dlib/dlib/dnn/core.h new file mode 100644 index 000000000..5f1d05498 --- /dev/null +++ b/ml/dlib/dlib/dnn/core.h @@ -0,0 +1,3599 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNn_CORE_H_ +#define DLIB_DNn_CORE_H_ + +#include "core_abstract.h" +#include "tensor.h" +#include <iterator> +#include <memory> +#include <sstream> +#include <type_traits> +#include "../statistics.h" +#include "../rand.h" +#include "../algs.h" +#include <utility> +#include <tuple> +#include <cmath> +#include <vector> +#include "tensor_tools.h" +#include <type_traits> +#include "../metaprogramming.h" + +#ifdef _MSC_VER +// Tell Visual Studio not to recursively inline functions very much because otherwise it +// takes hours to compile the DNN code sometimes. It's crazy. Hopefully we can remove +// this some day when the visual studio compiler is more efficient. +#pragma inline_depth(2) +#endif + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <typename T, typename int_<decltype(&T::get_learning_rate_multiplier)>::type = 0> + double get_learning_rate_multiplier ( + const T& obj, + special_ + ) { return obj.get_learning_rate_multiplier(); } + + template <typename T> + double get_learning_rate_multiplier ( const T& , general_) { return 1; } + } + template <typename T> + double get_learning_rate_multiplier(const T& obj) { return impl::get_learning_rate_multiplier(obj, special_()); } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <typename T, typename int_<decltype(&T::get_weight_decay_multiplier)>::type = 0> + double get_weight_decay_multiplier ( + const T& obj, + special_ + ) { return obj.get_weight_decay_multiplier(); } + + template <typename T> + double get_weight_decay_multiplier ( const T& , general_) { return 1; } + } + template <typename T> + double get_weight_decay_multiplier(const T& obj) { return impl::get_weight_decay_multiplier(obj, special_()); } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + // The reason we return an int for this version rather than doing the more straight forward thing (like we do above) is to avoid a bug in visual studio 2015. + template <typename T> + auto call_clean_method_if_exists ( + T& obj, + special_ + ) -> typename int_<decltype(&T::clean)>::type { obj.clean(); return 0; } + + template <typename T> + void call_clean_method_if_exists (T& , general_) {} + } + template <typename T> + void call_clean_method_if_exists(T& obj) { impl::call_clean_method_if_exists(obj, special_()); } + /*! + ensures + - calls obj.clean() if obj has a .clean() method. + !*/ + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + class repeat_input_layer + { + /*! + None of the declarations in this object are really used. The only reason it + exists is to allow the repeat object to use a special input layer in its + internal networks which will cause add_tag_layer objects that happen to be + right at the input to not create copies of their input tensors. So + introducing the repeat_input_layer object allows us to optimize the + implementation of add_tag_layer for a special case that arises when it's + used in the context of the repeat layer. 
+            !*/
+        public:
+            typedef int input_type;
+
+            template <typename forward_iterator>
+            void to_tensor (
+                forward_iterator ,
+                forward_iterator ,
+                resizable_tensor&
+            ) const
+            {
+            }
+
+            friend void serialize(const repeat_input_layer&, std::ostream&){}
+            friend void deserialize(repeat_input_layer&, std::istream&){}
+            friend std::ostream& operator<<(std::ostream& out, const repeat_input_layer&) { return out; }
+        };
+
+        inline std::string tensor_to_str (
+            const tensor& t,
+            int& min_length
+        )
+        {
+            if (t.size() == 0)
+                return "";
+
+            std::ostringstream sout;
+            sout << "output size=(num:"<< t.num_samples() << ", ";
+            sout << "k:" << t.k() << ",";
+            while (sout.tellp() < 28) sout << " ";
+            sout << "nr:" << t.nr() << ",";
+            while (sout.tellp() < 28+8) sout << " ";
+            sout << "nc:" << t.nc() << ")";
+            while (sout.tellp() < min_length) sout << " ";
+            min_length = sout.tellp();
+            sout << "\t";
+            return sout.str();
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Tell us if T is one of the special layer types (i.e. add_layer, repeat, add_tag_layer, or
+    // add_skip_layer).
+    template <typename T> struct is_nonloss_layer_type : std::false_type {};
+    // Tell us if T is an instance of add_loss_layer.
+    template <typename T> struct is_loss_layer_type : std::false_type {};
+    // Tell us if T is an instance of add_layer
+    template <typename T> struct is_add_layer : std::false_type {};
+
+    namespace impl
+    {
+        template <size_t... indices, typename Tuple>
+        auto tuple_subset(
+            const Tuple& item,
+            compile_time_integer_list<indices...>
+        ) -> decltype(std::make_tuple(std::get<indices>(item)...))
+        {
+            return std::make_tuple(std::get<indices>(item)...);
+        }
+
+        template <typename Head, typename... Tail>
+        std::tuple<Tail...> basic_tuple_tail(
+            const std::tuple<Head, Tail...>& item
+        )
+        {
+            return tuple_subset(item, typename make_compile_time_integer_range<sizeof...(Tail)>::type());
+        }
+
+        template <typename T>
+        std::tuple<T> tuple_flatten(const T& t)
+        {
+            return std::make_tuple(t);
+        }
+
+        template <typename... T>
+        auto tuple_flatten(
+            const std::tuple<T...>& item
+        ) -> decltype(tuple_flatten(item, typename make_compile_time_integer_range<sizeof...(T)>::type()))
+        {
+            return tuple_flatten(item, typename make_compile_time_integer_range<sizeof...(T)>::type());
+        }
+
+        template <size_t... indices, typename... T>
+        auto tuple_flatten(
+            const std::tuple<T...>& item,
+            compile_time_integer_list<indices...>
+        ) -> decltype(std::tuple_cat(tuple_flatten(std::get<indices-1>(item))...))
+        {
+            return std::tuple_cat(tuple_flatten(std::get<indices-1>(item))...);
+        }
+
+        template <typename T>
+        struct tuple_head_helper
+        {
+            typedef T type;
+            static const type& get(const T& item)
+            {
+                return item;
+            }
+        };
+
+        template <typename T, typename... U>
+        struct tuple_head_helper<std::tuple<T, U...>>
+        {
+            typedef typename tuple_head_helper<T>::type type;
+            static const type& get(const std::tuple<T,U...>& item)
+            {
+                return tuple_head_helper<T>::get(std::get<0>(item));
+            }
+        };
+
+        template <typename T> struct alwaysbool { typedef bool type; };
+        // one more structure for VS 2015 UP3 support workaround
+        template <typename T> struct alwaysbool2 { typedef bool type; };
+
+        resizable_tensor& rt();
+
+        // The significance of a layer's backward method requiring forward's outputs is
+        // that such a layer can't have an in-place layer stacked on top of it because
+        // in-place layers overwrite the output of the layer they sit on top of.
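The overloads that follow (backward_requires_forward_output, has_inplace_backward, is_inplace_layer) probe which of two signature conventions a layer provides. Concretely, the conventions look like this on a layer class (hypothetical names; the authoritative interface spec is in layers_abstract.h):

    // Out-of-place convention: backward() receives the output computed by forward(),
    // so that tensor must be kept alive (backward_requires_forward_output -> true).
    class example_layer_
    {
    public:
        template <typename SUBNET>
        void backward(const tensor& computed_output, const tensor& gradient_input,
                      SUBNET& sub, tensor& params_grad);
    };

    // In-place convention: backward_inplace() writes straight into the subnetwork's
    // gradient tensor and never sees forward's output (has_inplace_backward -> true,
    // backward_requires_forward_output -> false).
    class example_inplace_layer_
    {
    public:
        void backward_inplace(const tensor& gradient_input, tensor& data_grad,
                              tensor& params_grad);
    };

Both conventions also come in a variant that takes computed_output as the leading argument; the presence or absence of that argument is exactly what backward_requires_forward_output detects.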
+ template <typename layer_type, typename SUBNET> + constexpr auto backward_requires_forward_output( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool<decltype(layer.backward(rt(),rt(),sub,rt()))>::type + { + return true; + } + + template <typename layer_type, typename SUBNET> + constexpr auto backward_requires_forward_output( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool<decltype(layer.backward(rt(),sub,rt()))>::type + { + return false; + } + + template <typename layer_type, typename SUBNET> + constexpr auto backward_requires_forward_output( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),rt(),sub.get_gradient_input(),rt()))>::type + { + return true; + } + + template <typename layer_type, typename SUBNET> + constexpr auto backward_requires_forward_output( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),sub.get_gradient_input(),rt()))>::type + { + return false; + } + + template <typename layer_type, typename SUBNET> + constexpr auto has_inplace_backward( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool2<decltype(layer.backward(rt(),rt(),sub,rt()))>::type + { + return false; + } + + template <typename layer_type, typename SUBNET> + constexpr auto has_inplace_backward( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool2<decltype(layer.backward(rt(),sub,rt()))>::type + { + return false; + } + + template <typename layer_type, typename SUBNET> + constexpr auto has_inplace_backward( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool2<decltype(layer.backward_inplace(rt(),rt(),sub.get_gradient_input(),rt()))>::type + { + return true; + } + + template <typename layer_type, typename SUBNET> + constexpr auto has_inplace_backward( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool2<decltype(layer.backward_inplace(rt(),sub.get_gradient_input(),rt()))>::type + { + return true; + } + + template <typename layer_type, typename SUBNET> + constexpr auto is_inplace_layer( + layer_type& layer, + const SUBNET& sub + ) -> typename alwaysbool2<decltype(layer.forward(sub,rt()))>::type + { + return false; + } + + template <typename layer_type, typename SUBNET> + constexpr auto is_inplace_layer( + layer_type& layer, + const SUBNET& sub + ) -> typename alwaysbool<decltype(layer.forward_inplace(sub.get_output(),rt()))>::type + { + return true; + } + + template <typename layer_type, typename SUBNET> + auto call_layer_backward( + layer_type& layer, + const tensor& computed_output, + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ) -> decltype(layer.backward(computed_output,gradient_input,sub,params_grad)) + { + layer.backward(computed_output,gradient_input,sub,params_grad); + } + + template <typename layer_type, typename SUBNET> + auto call_layer_backward( + layer_type& layer, + const tensor& , + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ) -> decltype(layer.backward(gradient_input,sub,params_grad)) + { + layer.backward(gradient_input,sub,params_grad); + } + + template <typename layer_type, typename SUBNET> + auto call_layer_backward( + layer_type& layer, + const tensor& computed_output, + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ) -> decltype(layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad)) + { + layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad); + } + + template <typename layer_type, 
typename SUBNET> + auto call_layer_backward( + layer_type& layer, + const tensor& , + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ) -> decltype(layer.backward_inplace(gradient_input,sub.get_gradient_input(),params_grad)) + { + layer.backward_inplace(gradient_input,sub.get_gradient_input(),params_grad); + } + + + template <typename layer_type, typename SUBNET> + auto call_layer_forward( + layer_type& layer, + const SUBNET& sub, + tensor& /*data_output*/ + ) -> decltype(layer.forward(sub,rt())) + { + // This overload of call_layer_forward() is here because this template + // naturally gets instantiated but only on code paths that never get executed. + // So rather than writing a bunch of hard to read template magic around call + // sites we just have this overload that doesn't do anything (and an assert to + // make sure that's the case). + DLIB_CASSERT(false, "This should never happen"); + } + + template <typename layer_type, typename SUBNET> + auto call_layer_forward( + layer_type& layer, + const SUBNET& sub, + resizable_tensor& data_output + ) -> decltype(layer.forward(sub,data_output)) + { + layer.forward(sub,data_output); + } + + template <typename layer_type, typename SUBNET> + auto call_layer_forward( + layer_type& layer, + const SUBNET& sub, + tensor& data_output + ) -> decltype(layer.forward_inplace(sub.get_output(),data_output)) + { + layer.forward_inplace(sub.get_output(),data_output); + } + + template <typename layer_type, typename SUBNET> + auto call_layer_forward( + layer_type& layer, + const SUBNET& sub, + resizable_tensor& data_output + ) -> decltype(layer.forward_inplace(sub.get_output(),data_output)) + { + if (!have_same_dimensions(data_output, sub.get_output())) + data_output.copy_size(sub.get_output()); + layer.forward_inplace(sub.get_output(),static_cast<tensor&>(data_output)); + } + + + } // end namespace impl + + template <typename... T> + typename impl::tuple_head_helper<std::tuple<T...>>::type tuple_head ( + const std::tuple<T...>& item + ) + { + return impl::tuple_head_helper<std::tuple<T...>>::get(item); + } + + template <typename... 
T> + auto tuple_tail( + const std::tuple<T...>& item + ) -> decltype(impl::basic_tuple_tail(impl::tuple_flatten(item))) + { + return impl::basic_tuple_tail(impl::tuple_flatten(item)); + } + + inline std::tuple<> tuple_tail( + const std::tuple<>& item + ) + { + return item; + } +// ---------------------------------------------------------------------------------------- + + template <typename T> + class sstack + { + public: + typedef T value_type; + + sstack() = delete; + + sstack ( + T* data_, + size_t s + ) : data(data_), mysize(s) {} + + const T& top() const + { + DLIB_CASSERT(size() != 0, "You can't call top() on an empty stack"); + return *data; + } + T& top() + { + DLIB_CASSERT(size() != 0, "You can't call top() on an empty stack"); + return *data; + } + + size_t size() const { return mysize; } + + sstack pop(size_t num=1) + { + DLIB_CASSERT(num <= size(), "You can't pop more things from the stack than it has in it."); + return sstack(data+num, mysize-num); + } + + private: + + T* data; + size_t mysize; + }; + + template <typename T> + sstack<T> make_sstack(std::vector<T>& item) + { + return sstack<T>(item.data(), item.size()); + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + namespace dimpl + { + template <typename T, bool is_first = true, typename enabled=void> + class subnet_wrapper + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a tool that makes an add_layer or add_loss_layer object + expose only the part of its interface defined by the SUBNET + type in layers_abstract.h. This way, when we pass subnetwork + objects to the layer callbacks those callbacks won't be able to + interact with the subnetworks in a way other than specified + by the SUBNET interface spec. + + We also allow the top layer of a subnet_wrapper stack to call the + private_get_output() and private_get_gradient_input() functions. This + way, layers that have had their output/gradient overwritten by in-place + layers can only be accessed from the in-place layers that sit directly + on top of them since those in-place layers are the only layers that + know how to interact with them properly. + !*/ + + public: + subnet_wrapper(const subnet_wrapper&) = delete; + subnet_wrapper& operator=(const subnet_wrapper&) = delete; + + subnet_wrapper(T& l_, unsigned int sef) : l(l_),_sample_expansion_factor(sef) {} + // Not much here because in this case T is one of the input layer types + // that doesn't have anything in it. 
+ typedef T layer_details_type; + const layer_details_type& layer_details() const { return l; } + unsigned int sample_expansion_factor() const { return _sample_expansion_factor; } + private: + T& l; + unsigned int _sample_expansion_factor; + }; + + template <typename T> + class subnet_wrapper<T,true, typename std::enable_if<is_nonloss_layer_type<T>::value>::type> + { + + public: + subnet_wrapper(const subnet_wrapper&) = delete; + subnet_wrapper& operator=(const subnet_wrapper&) = delete; + + typedef T wrapped_type; + const static size_t num_computational_layers = T::num_computational_layers; + const static size_t num_layers = T::num_layers; + typedef typename T::layer_details_type layer_details_type; + + subnet_wrapper(T& l_, unsigned int = 0) : l(l_),subnetwork(l.subnet(), l.sample_expansion_factor()) {} + + const tensor& get_output() const { return l.private_get_output(); } + tensor& get_gradient_input() { return l.private_get_gradient_input(); } + + const layer_details_type& layer_details() const { return l.layer_details(); } + + const subnet_wrapper<typename T::subnet_type,false>& subnet() const { return subnetwork; } + subnet_wrapper<typename T::subnet_type,false>& subnet() { return subnetwork; } + unsigned int sample_expansion_factor() const { return l.sample_expansion_factor(); } + + private: + T& l; + subnet_wrapper<typename T::subnet_type,false> subnetwork; + }; + + template <typename T> + class subnet_wrapper<T,false, typename std::enable_if<is_nonloss_layer_type<T>::value>::type> + { + + public: + subnet_wrapper(const subnet_wrapper&) = delete; + subnet_wrapper& operator=(const subnet_wrapper&) = delete; + + typedef T wrapped_type; + const static size_t num_computational_layers = T::num_computational_layers; + const static size_t num_layers = T::num_layers; + typedef typename T::layer_details_type layer_details_type; + + subnet_wrapper(T& l_, unsigned int = 0) : l(l_),subnetwork(l.subnet(), l.sample_expansion_factor()) {} + + const tensor& get_output() const { return l.get_output(); } + tensor& get_gradient_input() { return l.get_gradient_input(); } + + const layer_details_type& layer_details() const { return l.layer_details(); } + + const subnet_wrapper<typename T::subnet_type,false>& subnet() const { return subnetwork; } + subnet_wrapper<typename T::subnet_type,false>& subnet() { return subnetwork; } + unsigned int sample_expansion_factor() const { return l.sample_expansion_factor(); } + + private: + T& l; + subnet_wrapper<typename T::subnet_type,false> subnetwork; + }; + } + +// ---------------------------------------------------------------------------------------- + + template <typename LAYER_DETAILS, typename SUBNET, typename enabled = void> + class add_layer; + + template <typename LAYER_DETAILS, typename SUBNET, typename enabled> + void serialize(const add_layer<LAYER_DETAILS,SUBNET,enabled>& item, std::ostream& out); + template <typename LAYER_DETAILS, typename SUBNET, typename enabled> + void deserialize(add_layer<LAYER_DETAILS,SUBNET,enabled>& item, std::istream& in); + + template <typename T, typename U> + struct is_nonloss_layer_type<add_layer<T,U>> : std::true_type {}; + + template <typename LAYER_DETAILS, typename SUBNET> + class add_layer<LAYER_DETAILS,SUBNET, + typename std::enable_if<is_nonloss_layer_type<SUBNET>::value>::type> + { + public: + typedef LAYER_DETAILS layer_details_type; + typedef SUBNET subnet_type; + typedef typename subnet_type::input_type input_type; + const static size_t num_layers = subnet_type::num_layers + 1; + const static size_t 
num_computational_layers = subnet_type::num_computational_layers + 1; + + add_layer( + ): + subnetwork(new subnet_type()), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + add_layer(const add_layer& item) + { + details = item.details; + subnetwork.reset(new subnet_type(*item.subnetwork)); + this_layer_setup_called = item.this_layer_setup_called; + gradient_input_is_stale = item.gradient_input_is_stale; + get_output_and_gradient_input_disabled = item.get_output_and_gradient_input_disabled; + x_grad = item.x_grad; + cached_output = item.cached_output; + params_grad = item.params_grad; + temp_tensor = item.temp_tensor; + } + add_layer& operator=(const add_layer& item) { add_layer(item).swap(*this); return *this;} + add_layer(add_layer&& item) : add_layer() { swap(item); } + add_layer& operator=(add_layer&& item) { swap(item); return *this; } + + template <typename T, typename U, typename E> + friend class add_layer; + template <typename T, bool is_first, typename E> + friend class dimpl::subnet_wrapper; + template <unsigned long T, typename U, typename E> + friend class add_tag_layer; + template <template<typename> class T, typename U> + friend class add_skip_layer; + template <size_t N, template<typename> class L, typename S> + friend class repeat; + + // Allow copying networks from one to another as long as their corresponding + // layers can be constructed from each other. + template <typename T, typename U, typename E> + add_layer( + const add_layer<T,U,E>& item + ) : + details(item.layer_details()), + subnetwork(new subnet_type(item.subnet())), + this_layer_setup_called(item.this_layer_setup_called), + gradient_input_is_stale(item.gradient_input_is_stale), + get_output_and_gradient_input_disabled(item.get_output_and_gradient_input_disabled), + x_grad(item.x_grad), + cached_output(item.cached_output) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + template <typename ...T> + add_layer( + const LAYER_DETAILS& layer_det, + T&& ...args + ) : + details(layer_det), + subnetwork(new subnet_type(std::forward<T>(args)...)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + template <typename T, typename ...U> + struct disable_forwarding_constr + { + const static bool value = std::is_constructible<LAYER_DETAILS,T>::value; + }; + template <typename ...T, typename ...U> + struct disable_forwarding_constr<std::tuple<T...>,U...> + { + const static bool value = disable_forwarding_constr<typename std::remove_reference<T>::type...>::value; + }; + template <typename T, typename ...U> + struct disable_forwarding_constr<std::tuple<T>,U...> + { + const static bool value = disable_forwarding_constr<typename std::remove_reference<T>::type>::value; + }; + template <typename ...U> + struct disable_forwarding_constr<std::tuple<>,U...> + { + const static bool value = true; + }; + template <typename ...T> + struct disable_forwarding_constr<add_layer<T...>> + { + const static bool value = true; + }; + + template < + typename ...T, + typename = typename std::enable_if<!disable_forwarding_constr<typename std::remove_reference<T>::type...>::value>::type + > + add_layer( + T&& ...args + ) : + subnetwork(new 
subnet_type(std::forward<T>(args)...)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + template <typename ...T> + add_layer( + LAYER_DETAILS&& layer_det, + T&& ...args + ) : + details(std::move(layer_det)), + subnetwork(new subnet_type(std::forward<T>(args)...)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + template <typename ...T, typename LD, typename ...U> + add_layer( + const std::tuple<LD,U...>& layer_det, + T&& ...args + ) : + details(tuple_head(layer_det)), + subnetwork(new subnet_type(tuple_tail(layer_det),std::forward<T>(args)...)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + template <typename ...T, typename LD, typename ...U> + add_layer( + std::tuple<>, + const std::tuple<LD,U...>& layer_det, + T&& ...args + ) : add_layer(layer_det,args...) { } + + add_layer ( + std::tuple<> + ) : add_layer() {} + + template <typename ...T> + add_layer( + std::tuple<>, + LAYER_DETAILS&& layer_det, + T&& ...args + ) : add_layer(layer_det, args...) { } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + subnetwork->to_tensor(ibegin,iend,data); + } + + template <typename forward_iterator> + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + to_tensor(ibegin,iend,temp_tensor); + return forward(temp_tensor); + } + + + const tensor& operator() (const input_type& x) + { + return (*this)(&x, &x+1); + } + + const tensor& forward(const tensor& x) + { + subnetwork->forward(x); + const dimpl::subnet_wrapper<subnet_type> wsub(*subnetwork); + if (!this_layer_setup_called) + { + details.setup(wsub); + this_layer_setup_called = true; + } + if (this_layer_operates_inplace()) + impl::call_layer_forward(details, wsub, private_get_output()); + else + impl::call_layer_forward(details, wsub, cached_output); + + gradient_input_is_stale = true; + return private_get_output(); + } + + private: + tensor& private_get_output() const + { + if (const_cast<add_layer&>(*this).this_layer_operates_inplace()) + return subnetwork->private_get_output(); + else + return const_cast<resizable_tensor&>(cached_output); + } + tensor& private_get_gradient_input() + { + if (this_layer_operates_inplace()) + { + return subnetwork->private_get_gradient_input(); + } + else + { + if (gradient_input_is_stale) + { + gradient_input_is_stale = false; + x_grad.copy_size(private_get_output()); + x_grad = 0; + } + return x_grad; + } + } + void disable_output_and_gradient_getters ( + ) { get_output_and_gradient_input_disabled = true; } + public: + const tensor& get_output() const + { + if (get_output_and_gradient_input_disabled) + throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it."); + return private_get_output(); + } + tensor& get_gradient_input() + { + if (get_output_and_gradient_input_disabled) + throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it."); + return 
private_get_gradient_input(); + } + + const tensor& get_final_data_gradient( + ) const { return subnetwork->get_final_data_gradient(); } + + void back_propagate_error(const tensor& x) + { + back_propagate_error(x, private_get_gradient_input()); + } + void back_propagate_error(const tensor& x, const tensor& gradient_input) + { + dimpl::subnet_wrapper<subnet_type> wsub(*subnetwork); + params_grad.copy_size(details.get_layer_params()); + impl::call_layer_backward(details, private_get_output(), + gradient_input, wsub, static_cast<tensor&>(params_grad)); + + subnetwork->back_propagate_error(x); + + // zero out get_gradient_input() + gradient_input_is_stale = true; + } + + template <typename solver_type> + void update_parameters(sstack<solver_type> solvers, double learning_rate) + { + DLIB_CASSERT(solvers.size()>=num_computational_layers); + // Don't try to adjust the parameters if this layer doesn't have any or the + // learning rate is disabled for this layer. + if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0) + { + const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad)); + tt::add(details.get_layer_params(), details.get_layer_params(), step); + } + subnetwork->update_parameters(solvers.pop(), learning_rate); + } + + const tensor& get_parameter_gradient( + ) const { return params_grad; } + + tensor& get_parameter_gradient ( + ) { return params_grad; } + + const subnet_type& subnet() const { return *subnetwork; } + subnet_type& subnet() { return *subnetwork; } + + const layer_details_type& layer_details() const { return details; } + layer_details_type& layer_details() { return details; } + + unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } + + void clean() + { + x_grad.clear(); + cached_output.clear(); + params_grad.clear(); + temp_tensor.clear(); + gradient_input_is_stale = true; + subnetwork->clean(); + call_clean_method_if_exists(details); + } + + friend void serialize(const add_layer& item, std::ostream& out) + { + int version = 2; + serialize(version, out); + serialize(*item.subnetwork, out); + serialize(item.details, out); + serialize(item.this_layer_setup_called, out); + serialize(item.gradient_input_is_stale, out); + serialize(item.get_output_and_gradient_input_disabled, out); + serialize(item.x_grad, out); + serialize(item.cached_output, out); + serialize(item.params_grad, out); + } + + friend void deserialize(add_layer& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (!(1 <= version && version <= 2)) + throw serialization_error("Unexpected version found while deserializing dlib::add_layer."); + deserialize(*item.subnetwork, in); + deserialize(item.details, in); + deserialize(item.this_layer_setup_called, in); + deserialize(item.gradient_input_is_stale, in); + deserialize(item.get_output_and_gradient_input_disabled, in); + deserialize(item.x_grad, in); + deserialize(item.cached_output, in); + if (version == 2) + deserialize(item.params_grad, in); + } + + friend std::ostream& operator<< (std::ostream& out, const add_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<" << idx << ">\t" << impl::tensor_to_str(private_get_output(), min_length) << layer_details() << "\n"; + subnet().print(out, idx+1, min_length); + } + + private: + + bool this_layer_operates_inplace( + ) + { + // This layer can run in-place if it's 
an in-place capable layer and also if + // the layer it's on top of doesn't need its own output tensor (since in-place + // layers overwrite that tensor) + return impl::is_inplace_layer(details, *subnetwork) && !subnetwork->this_layer_requires_forward_output(); + } + bool this_layer_requires_forward_output( + ) + { + return impl::backward_requires_forward_output(details, *subnetwork); + } + + void swap(add_layer& item) + { + std::swap(subnetwork,item.subnetwork); + std::swap(details, item.details); + std::swap(this_layer_setup_called, item.this_layer_setup_called); + std::swap(gradient_input_is_stale, item.gradient_input_is_stale); + std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled); + std::swap(x_grad, item.x_grad); + std::swap(cached_output, item.cached_output); + std::swap(params_grad, item.params_grad); + } + + + LAYER_DETAILS details; + std::unique_ptr<subnet_type> subnetwork; + bool this_layer_setup_called; + bool gradient_input_is_stale; + bool get_output_and_gradient_input_disabled; + // Note that if this_layer_operates_inplace()==true then x_grad and cached_output + // are not used at all. Instead, this layer uses these variables from the lower + // layer. + resizable_tensor x_grad; + resizable_tensor cached_output; + + resizable_tensor params_grad; + + // temp_tensor doesn't logically contribute to the state of this object. + // It is here only to prevent it from being reallocated over and over. + resizable_tensor temp_tensor; + + }; + + template <typename T, typename U, typename E> + struct is_add_layer<add_layer<T,U,E>> : std::true_type {}; + template <typename T, typename U, typename E> + struct is_add_layer<const add_layer<T,U,E>> : std::true_type {}; + template <typename T, typename U, typename E> + struct is_add_layer<add_layer<T,U,E>&> : std::true_type {}; + template <typename T, typename U, typename E> + struct is_add_layer<const add_layer<T,U,E>&> : std::true_type {}; + +// ---------------------------------------------------------------------------------------- + +// This version of add_layer handles the special case where the subnetwork being given is +// just an input layer object. + template <typename LAYER_DETAILS, typename INPUT_LAYER, typename enabled> + class add_layer + { + public: + typedef LAYER_DETAILS layer_details_type; + typedef INPUT_LAYER subnet_type; + typedef typename INPUT_LAYER::input_type input_type; + const static size_t num_layers = 2; + const static size_t num_computational_layers = 1; + + add_layer( + ): + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(0) + {} + + add_layer(const add_layer&) = default; + add_layer(add_layer&& item) : add_layer() { swap(item); } + add_layer& operator=(const add_layer&) = default; + add_layer& operator=(add_layer&& item) { swap(item); return *this; } + + template <typename T, typename U, typename E> + friend class add_layer; + template <typename T, bool is_first, typename E> + friend class dimpl::subnet_wrapper; + template <unsigned long T, typename U, typename E> + friend class add_tag_layer; + template <template<typename> class T, typename U> + friend class add_skip_layer; + template <size_t N, template<typename> class L, typename S> + friend class repeat; + + // Allow copying networks from one to another as long as their corresponding + // layers can be constructed from each other. 
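        // For example (a sketch; fc, relu, bn_con, affine, and input are layer
        // templates from dlib's layers.h, and affine_ is constructible from bn_),
        // this constructor family is what lets a deployment network be built from
        // a trained one:
        //
        //     using train_net  = fc<10, relu<bn_con<fc<84, input<matrix<float>>>>>>;
        //     using deploy_net = fc<10, relu<affine<fc<84, input<matrix<float>>>>>>;
        //
        //     train_net tnet;         // ... train tnet ...
        //     deploy_net dnet(tnet);  // each layer copy-converts; bn_ -> affine_ here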
+ template <typename T, typename U, typename E> + add_layer( + const add_layer<T,U,E>& item + ): + input_layer(item.subnet()), + details(item.layer_details()), + this_layer_setup_called(item.this_layer_setup_called), + gradient_input_is_stale(item.gradient_input_is_stale), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(item._sample_expansion_factor), + x_grad(item.x_grad), + cached_output(item.cached_output), + grad_final(item.grad_final) + { + } + + add_layer( + const LAYER_DETAILS& layer_det + ) : + details(layer_det), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(0) + {} + + add_layer( + const INPUT_LAYER& il + ) : + input_layer(il), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(0) + {} + + add_layer( + LAYER_DETAILS&& layer_det + ) : + details(std::move(layer_det)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(0) + {} + + add_layer( + LAYER_DETAILS layer_det, + INPUT_LAYER il + ) : + details(std::move(layer_det)), + input_layer(std::move(il)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(0) + {} + + add_layer( + std::tuple<>, + const LAYER_DETAILS& layer_det + ) : add_layer(layer_det) {} + + add_layer( + std::tuple<>, + LAYER_DETAILS&& layer_det + ) : add_layer(layer_det) {} + + add_layer( + std::tuple<>, + LAYER_DETAILS layer_det, + INPUT_LAYER il + ) : add_layer(layer_det,il) {} + + add_layer( + const std::tuple<LAYER_DETAILS>& layer_det + ) : add_layer(tuple_head(layer_det)) {} + + add_layer( + const std::tuple<LAYER_DETAILS>& layer_det, + INPUT_LAYER il + ) : add_layer(tuple_head(layer_det),il) {} + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + input_layer.to_tensor(ibegin, iend, data); + // make sure the input layer's to_tensor() function is implemented properly. 
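            // For example, an input layer that emitted, say, 5 jittered crops per
            // input image (a hypothetical layer) would produce
            // data.num_samples() == 5*std::distance(ibegin,iend) here, and the
            // division below would record a sample expansion factor of 5.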
+ DLIB_CASSERT(data.num_samples() >= std::distance(ibegin,iend), + "The input layer can't produce fewer output tensors than there are inputs."); + DLIB_CASSERT(data.num_samples()%std::distance(ibegin,iend) == 0, + "The number of tensors produced by the input layer must be an integer multiple of the number of input objects."); + + _sample_expansion_factor = data.num_samples()/std::distance(ibegin,iend); + data.async_copy_to_device(); + } + + + template <typename forward_iterator> + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + to_tensor(ibegin,iend,temp_tensor); + return forward(temp_tensor); + } + + + const tensor& operator() (const input_type& x) + { + return (*this)(&x, &x+1); + } + + const tensor& forward (const tensor& x) + { + DLIB_CASSERT(sample_expansion_factor() != 0, "You must call to_tensor() before this function can be used."); + DLIB_CASSERT(x.num_samples()%sample_expansion_factor() == 0); + subnet_wrapper wsub(x, grad_final, _sample_expansion_factor); + if (!this_layer_setup_called) + { + details.setup(wsub); + this_layer_setup_called = true; + } + impl::call_layer_forward(details, wsub, cached_output); + gradient_input_is_stale = true; + return private_get_output(); + } + + private: + tensor& private_get_output() const { return const_cast<resizable_tensor&>(cached_output); } + tensor& private_get_gradient_input() + { + if (gradient_input_is_stale) + { + gradient_input_is_stale = false; + x_grad.copy_size(private_get_output()); + x_grad = 0; + } + return x_grad; + } + void disable_output_and_gradient_getters ( + ) { get_output_and_gradient_input_disabled = true; } + public: + const tensor& get_output() const + { + if (get_output_and_gradient_input_disabled) + throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it."); + return private_get_output(); + } + tensor& get_gradient_input() + { + if (get_output_and_gradient_input_disabled) + throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it."); + return private_get_gradient_input(); + } + + const tensor& get_final_data_gradient( + ) const { return grad_final; } + + void back_propagate_error(const tensor& x) + { + back_propagate_error(x, private_get_gradient_input()); + } + void back_propagate_error(const tensor& x, const tensor& gradient_input) + { + // make sure grad_final is initialized to 0 + if (!have_same_dimensions(x, grad_final)) + grad_final.copy_size(x); + grad_final = 0; + + subnet_wrapper wsub(x, grad_final, _sample_expansion_factor); + params_grad.copy_size(details.get_layer_params()); + impl::call_layer_backward(details, private_get_output(), + gradient_input, wsub, static_cast<tensor&>(params_grad)); + + // zero out get_gradient_input() + gradient_input_is_stale = true; + } + + template <typename solver_type> + void update_parameters(sstack<solver_type> solvers, double learning_rate) + { + DLIB_CASSERT(solvers.size()>=num_computational_layers); + // Don't try to adjust the parameters if this layer doesn't have any or the + // learning rate is disabled for this layer. 
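            // (For context: solvers.top() is a solver object such as dlib's sgd, and
            // its operator() returns the step tensor that gets added to the parameters
            // below. The step already folds in the learning rate, this layer's
            // learning-rate and weight-decay multipliers, and any momentum state the
            // solver keeps.)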
+ if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0) + { + const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad)); + tt::add(details.get_layer_params(), details.get_layer_params(), step); + } + } + + const tensor& get_parameter_gradient( + ) const { return params_grad; } + + tensor& get_parameter_gradient ( + ) { return params_grad; } + + const subnet_type& subnet() const { return input_layer; } + subnet_type& subnet() { return input_layer; } + + const layer_details_type& layer_details() const { return details; } + layer_details_type& layer_details() { return details; } + + unsigned int sample_expansion_factor() const { return _sample_expansion_factor; } + + void clean() + { + x_grad.clear(); + grad_final.clear(); + cached_output.clear(); + params_grad.clear(); + temp_tensor.clear(); + gradient_input_is_stale = true; + call_clean_method_if_exists(details); + } + + friend void serialize(const add_layer& item, std::ostream& out) + { + int version = 3; + serialize(version, out); + serialize(item.input_layer, out); + serialize(item.details, out); + serialize(item.this_layer_setup_called, out); + serialize(item.gradient_input_is_stale, out); + serialize(item.get_output_and_gradient_input_disabled, out); + serialize(item.x_grad, out); + serialize(item.cached_output, out); + serialize(item.grad_final, out); + serialize(item._sample_expansion_factor, out); + } + + friend void deserialize(add_layer& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (!(2 <= version && version <= 3)) + throw serialization_error("Unexpected version found while deserializing dlib::add_layer."); + deserialize(item.input_layer, in); + deserialize(item.details, in); + deserialize(item.this_layer_setup_called, in); + deserialize(item.gradient_input_is_stale, in); + deserialize(item.get_output_and_gradient_input_disabled, in); + deserialize(item.x_grad, in); + deserialize(item.cached_output, in); + deserialize(item.grad_final, in); + if (version >= 3) + deserialize(item._sample_expansion_factor, in); + else + item._sample_expansion_factor = 1; // all layer types set this to 1 in older dlib versions, so that's what we put here. + } + + friend std::ostream& operator<< (std::ostream& out, const add_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<" << idx << ">\t" << impl::tensor_to_str(private_get_output(), min_length) << layer_details() << "\n"; + + // Don't print the repeat_input_layer since it doesn't exist from the user's + // point of view. It's just an artifact of how repeat<> works. 
+ if (!std::is_same<subnet_type, impl::repeat_input_layer>::value) + out << "layer<" << idx+1 << ">\t" << subnet() << "\n"; + } + + private: + + bool this_layer_requires_forward_output( + ) + { + subnet_wrapper wsub(grad_final, grad_final, _sample_expansion_factor); + return impl::backward_requires_forward_output(details, wsub); + } + + class subnet_wrapper + { + public: + subnet_wrapper(const tensor& x_, resizable_tensor& grad_final_, unsigned int sef) : + x(x_), grad_final(grad_final_), _sample_expansion_factor(sef) {} + + subnet_wrapper(const subnet_wrapper&) = delete; + subnet_wrapper& operator=(const subnet_wrapper&) = delete; + + unsigned int sample_expansion_factor() const { return _sample_expansion_factor;} + const tensor& get_output() const { return x; } + tensor& get_gradient_input() + { + if (!have_same_dimensions(x, grad_final)) + { + grad_final.copy_size(x); + grad_final = 0; + } + return grad_final; + } + + private: + const tensor& x; + resizable_tensor& grad_final; + unsigned int _sample_expansion_factor; + }; + + void swap(add_layer& item) + { + std::swap(input_layer, item.input_layer); + std::swap(details, item.details); + std::swap(this_layer_setup_called, item.this_layer_setup_called); + std::swap(gradient_input_is_stale, item.gradient_input_is_stale); + std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled); + std::swap(x_grad, item.x_grad); + std::swap(cached_output, item.cached_output); + std::swap(grad_final, item.grad_final); + std::swap(_sample_expansion_factor, item._sample_expansion_factor); + } + + subnet_type input_layer; + LAYER_DETAILS details; + bool this_layer_setup_called; + bool gradient_input_is_stale; + bool get_output_and_gradient_input_disabled; + mutable unsigned int _sample_expansion_factor; + resizable_tensor x_grad; + resizable_tensor cached_output; + resizable_tensor grad_final; + + // The following 2 objects don't logically contribute to the state of this class. + // They are only here to prevent them from being reallocated over and over in + // member functions. + resizable_tensor params_grad; + resizable_tensor temp_tensor; + }; + +// ---------------------------------------------------------------------------------------- + + template <unsigned long ID, typename SUBNET, typename enabled=void> + class add_tag_layer; + + template <template<typename SUBNET> class tag> + struct tag_id + { + const static unsigned long id = tag<impl::repeat_input_layer>::id; + }; + + template <unsigned long ID, typename SUBNET> + class add_tag_layer<ID,SUBNET, + typename std::enable_if<is_nonloss_layer_type<SUBNET>::value>::type> + { + public: + typedef SUBNET subnet_type; + typedef typename subnet_type::input_type input_type; + typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper. + const static size_t num_layers = subnet_type::num_layers + 1; + const static size_t num_computational_layers = subnet_type::num_computational_layers; + const static unsigned long id = ID; + + add_tag_layer() {}; + add_tag_layer(const add_tag_layer&) = default; + add_tag_layer(add_tag_layer&&) = default; + add_tag_layer& operator=(add_tag_layer&&) = default; + add_tag_layer& operator=(const add_tag_layer&) = default; + + template <typename T> + add_tag_layer( + const add_tag_layer<ID,T>& item + ) : subnetwork(item.subnet()) + {} + + template <typename ...T> + add_tag_layer( + T ...args + ) : + subnetwork(std::move(args)...) 
+ { + } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + subnetwork.to_tensor(ibegin,iend,data); + } + + template <typename forward_iterator> + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + return subnetwork(ibegin,iend); + } + + const tensor& operator() (const input_type& x) + { + return subnetwork(x); + } + + const tensor& forward(const tensor& x) + { + return subnetwork.forward(x); + } + + const tensor& get_output() const { return subnetwork.get_output(); } + + tensor& get_gradient_input() + { + return subnetwork.get_gradient_input(); + } + + const tensor& get_final_data_gradient( + ) const { return subnetwork.get_final_data_gradient(); } + + void back_propagate_error(const tensor& x) + { + subnetwork.back_propagate_error(x); + } + void back_propagate_error(const tensor& x, const tensor& gradient_input) + { + subnetwork.back_propagate_error(x,gradient_input); + } + + template <typename solver_type> + void update_parameters(sstack<solver_type> solvers, double learning_rate) + { + subnetwork.update_parameters(solvers, learning_rate); + } + + const tensor& get_parameter_gradient( + ) const { return params_grad; } + + tensor& get_parameter_gradient ( + ) { return params_grad; } + + const subnet_type& subnet() const { return subnetwork; } + subnet_type& subnet() { return subnetwork; } + + unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } + + void clean() + { + subnetwork.clean(); + } + + friend void serialize(const add_tag_layer& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.subnetwork, out); + } + + friend void deserialize(add_tag_layer& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::add_tag_layer."); + deserialize(item.subnetwork, in); + } + + friend std::ostream& operator<< (std::ostream& out, const add_tag_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<" << idx << ">\t" << impl::tensor_to_str(private_get_output(), min_length) << "tag" << ID << "\n"; + subnet().print(out, idx+1, min_length); + } + + private: + + template <typename T, typename U, typename E> + friend class add_layer; + template <typename T, bool is_first, typename E> + friend class dimpl::subnet_wrapper; + template <unsigned long T, typename U, typename E> + friend class add_tag_layer; + template <template<typename> class T, typename U> + friend class add_skip_layer; + template <size_t N, template<typename> class L, typename S> + friend class repeat; + + // You wouldn't put a tag on a layer if you didn't want to access its forward + // outputs. So this is always true. + bool this_layer_requires_forward_output( + ) { return true; } + + void disable_output_and_gradient_getters ( + ) + { + // This should never happen because only inplace layers call + // disable_output_and_gradient_getters(), however, putting a tag layer right + // before an inplace layer basically means you don't want the following layer + // to operate in place. So the inplace layer should turn itself into an + // out-of-place layer and not call disable_output_and_gradient_getters(). 
+ DLIB_CASSERT(false,"This should never happen"); + } + + tensor& private_get_output() const + { return subnetwork.private_get_output(); } + tensor& private_get_gradient_input() + { return subnetwork.private_get_gradient_input(); } + + subnet_type subnetwork; + + // This member doesn't logically contribute to the state of the object since it is + // always empty. It's just here so we can have the get_parameter_gradient() methods + // which have to return something. So they return this empty tensor. + resizable_tensor params_grad; + }; + +// ---------------------------------------------------------------------------------------- + + template <typename ...T> + struct decorator_repeat_group + { + decorator_repeat_group( + T&& ...args + ) : data(std::forward<T>(args)...) {} + + std::tuple<T...> data; + }; + template <typename ...T> + decorator_repeat_group<T...> repeat_group ( + T&& ...args + ) + { + return decorator_repeat_group<T...>(std::forward<T>(args)...); + } + + template < + size_t num, + template<typename> class REPEATED_LAYER, + typename SUBNET + > + class repeat + { + static_assert(num > 0, "You can't have a layer repeated 0 times."); + public: + typedef SUBNET subnet_type; + typedef typename SUBNET::input_type input_type; + typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper. + const static size_t comp_layers_in_each_group = (REPEATED_LAYER<SUBNET>::num_computational_layers-SUBNET::num_computational_layers); + const static size_t comp_layers_in_repeated_group = comp_layers_in_each_group*num; + const static size_t num_computational_layers = comp_layers_in_repeated_group + SUBNET::num_computational_layers; + + const static size_t layers_in_each_group = (REPEATED_LAYER<SUBNET>::num_layers-SUBNET::num_layers); + const static size_t layers_in_repeated_group = layers_in_each_group*num; + const static size_t num_layers = subnet_type::num_layers + layers_in_repeated_group; + + + typedef REPEATED_LAYER<impl::repeat_input_layer> repeated_layer_type; + + repeat( + ) : + details(num) + { + } + + size_t num_repetitions ( + ) const { return num; } + + const repeated_layer_type& get_repeated_layer ( + size_t i + ) const + { + DLIB_CASSERT(i < num_repetitions()); + return details[i]; + } + + repeated_layer_type& get_repeated_layer ( + size_t i + ) + { + DLIB_CASSERT(i < num_repetitions()); + return details[i]; + } + + repeat(const repeat&) = default; + repeat(repeat&&) = default; + repeat& operator=(repeat&&) = default; + repeat& operator=(const repeat&) = default; + + template <template<typename> class T, typename U> + repeat( + const repeat<num,T,U>& item + ) : + subnetwork(item.subnetwork) + { + for (auto&& d : item.details) + details.emplace_back(d); + } + + template <typename T, typename ...U> + repeat( + T arg1, + U ...args2 + ): + details(num, std::move(arg1)), + subnetwork(std::move(args2)...) + { + } + + template <typename ...T, typename ...U> + repeat( + decorator_repeat_group<T...>&& arg1, + U ...args2 + ): + details(num, arg1.data), + subnetwork(std::move(args2)...) + { + } + + template <typename T, typename ...U> + repeat( + std::tuple<>, + T arg1, + U ...args2 + ): + details(num, std::move(arg1)), + subnetwork(std::move(args2)...) 
+ { + } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + subnetwork.to_tensor(ibegin,iend,data); + // call to_tensor on the networks in details just to populate the + // _sample_expansion_factor values in those networks. Other than that this + // call is a noop. + for (auto& d : details) + d.to_tensor(ibegin, iend, data); + } + + template <typename forward_iterator> + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + to_tensor(ibegin,iend,temp_tensor); + return forward(temp_tensor); + } + + const tensor& operator() (const input_type& x) + { + return (*this)(&x, &x+1); + } + + const tensor& forward(const tensor& x) + { + subnetwork.forward(x); + details[details.size()-1].forward(subnetwork.get_output()); + for (long i = details.size()-2; i >= 0; --i) + details[i].forward(details[i+1].get_output()); + return private_get_output(); + } + + private: + tensor& private_get_output() const + { + return details[0].private_get_output(); + } + tensor& private_get_gradient_input() + { + return details[0].private_get_gradient_input(); + } + public: + const tensor& get_output() const + { + return details[0].get_output(); + } + tensor& get_gradient_input() + { + return details[0].get_gradient_input(); + } + + const tensor& get_parameter_gradient( + ) const { return details[0].get_parameter_gradient(); } + + tensor& get_parameter_gradient ( + ) { return details[0].get_parameter_gradient(); } + + void back_propagate_error(const tensor& x) + { + back_propagate_error(x, private_get_gradient_input()); + } + void back_propagate_error(const tensor& x, const tensor& gradient_input) + { + if (details.size() > 1) + { + details[0].back_propagate_error(details[1].get_output(), gradient_input); + for (size_t i = 1; i < details.size(); ++i) + { + if (i+1 < details.size()) + details[i].back_propagate_error(details[i+1].get_output(), details[i-1].get_final_data_gradient()); + else + details[i].back_propagate_error(subnetwork.get_output(), details[i-1].get_final_data_gradient()); + } + } + else + { + details[0].back_propagate_error(subnetwork.get_output(), gradient_input); + } + subnetwork.back_propagate_error(x, details.back().get_final_data_gradient()); + } + + template <typename solver_type> + void update_parameters(sstack<solver_type> solvers, double learning_rate) + { + for (size_t i = 0; i < details.size(); ++i) + details[i].update_parameters(solvers.pop(comp_layers_in_each_group*i),learning_rate); + subnetwork.update_parameters(solvers.pop(comp_layers_in_each_group*details.size()),learning_rate); + } + + const subnet_type& subnet() const { return subnetwork; } + subnet_type& subnet() { return subnetwork; } + + unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } + + void clean() + { + temp_tensor.clear(); + subnetwork.clean(); + for (auto&& d : details) + d.clean(); + } + + friend void serialize(const repeat& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.details, out); + serialize(item.subnetwork, out); + } + + friend void deserialize(repeat& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::repeat."); + deserialize(item.details, in); + deserialize(item.subnetwork, in); + } + + friend std::ostream& operator<< (std::ostream& out, const repeat& item) + { + int min_length 
= 0;
+            item.print(out, 0, min_length);
+            return out;
+        }
+
+        void print (std::ostream& out, unsigned long idx, int& min_length) const
+        {
+            for (size_t i = 0; i < num_repetitions(); ++i)
+            {
+                get_repeated_layer(i).print(out, idx, min_length);
+                idx += layers_in_each_group;
+            }
+            subnet().print(out, idx, min_length);
+        }
+    private:
+
+
+        template <typename T, typename U, typename E>
+        friend class add_layer;
+        template <typename T, bool is_first, typename E>
+        friend class dimpl::subnet_wrapper;
+        template <unsigned long T, typename U, typename E>
+        friend class add_tag_layer;
+        template <template<typename> class T, typename U>
+        friend class add_skip_layer;
+        template <size_t N, template<typename> class L, typename S>
+        friend class repeat;
+
+        bool this_layer_requires_forward_output(
+        )
+        {
+            return details[0].this_layer_requires_forward_output();
+        }
+
+        void disable_output_and_gradient_getters (
+        )
+        {
+            details[0].disable_output_and_gradient_getters();
+        }
+
+
+        std::vector<repeated_layer_type> details;
+        subnet_type subnetwork;
+
+        // temp_tensor doesn't logically contribute to the state of this class.
+        // It is here only to avoid needing to reallocate it over and over.
+        resizable_tensor temp_tensor;
+    };
+
+    template <
+        size_t num,
+        template<typename> class REPEATED_LAYER,
+        typename SUBNET
+        >
+    struct is_nonloss_layer_type<repeat<num,REPEATED_LAYER,SUBNET>> : std::true_type {};
+
+// ----------------------------------------------------------------------------------------
+
+// This version of add_tag_layer handles the special case where the subnetwork being given
+// is just an input layer object.
+    template <unsigned long ID, typename INPUT_LAYER, typename enabled>
+    class add_tag_layer
+    {
+    public:
+        typedef INPUT_LAYER subnet_type;
+        typedef typename subnet_type::input_type input_type;
+        typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper.
+        const static size_t num_computational_layers = 0;
+        const static size_t num_layers = 2;
+        const static unsigned long id = ID;
+
+        add_tag_layer():cached_output_ptr(nullptr),gradient_input_is_stale(true),_sample_expansion_factor(0) {}
+
+        add_tag_layer(const add_tag_layer&) = default;
+        add_tag_layer& operator=(const add_tag_layer&) = default;
+        add_tag_layer(add_tag_layer&& item) : add_tag_layer() { swap(item); }
+        add_tag_layer& operator=(add_tag_layer&& item) { swap(item); return *this; }
+
+        template <typename T, typename E>
+        add_tag_layer(
+            const add_tag_layer<ID,T,E>& item
+        ) : input_layer(item.subnet()),
+            cached_output(item.cached_output),
+            cached_output_ptr(nullptr),
+            grad_final(item.grad_final),
+            gradient_input_is_stale(item.gradient_input_is_stale),
+            _sample_expansion_factor(0)
+        {}
+
+        template <typename ...T>
+        add_tag_layer(
+            T ...args
+        ) :
+            input_layer(std::move(args)...),
+            cached_output_ptr(nullptr),
+            gradient_input_is_stale(true),
+            _sample_expansion_factor(0)
+        {
+        }
+
+        add_tag_layer (
+            std::tuple<>
+        ) :
+            cached_output_ptr(nullptr),
+            gradient_input_is_stale(true),
+            _sample_expansion_factor(0)
+        {}
+
+        template <typename forward_iterator>
+        void to_tensor (
+            forward_iterator ibegin,
+            forward_iterator iend,
+            resizable_tensor& data
+        ) const
+        {
+            input_layer.to_tensor(ibegin,iend,data);
+
+            // make sure the input layer's to_tensor() function is implemented properly.
+ DLIB_CASSERT(data.num_samples() >= std::distance(ibegin,iend), + "The input layer can't produce fewer output tensors than there are inputs."); + DLIB_CASSERT(data.num_samples()%std::distance(ibegin,iend) == 0, + "The number of tensors produced by the input layer must be an integer multiple of the number of input objects."); + + _sample_expansion_factor = data.num_samples()/std::distance(ibegin,iend); + data.async_copy_to_device(); + } + + unsigned int sample_expansion_factor() const { return _sample_expansion_factor; } + + template <typename forward_iterator> + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + input_layer.to_tensor(ibegin,iend,cached_output); + cached_output_ptr = nullptr; + return get_output(); + } + + const tensor& operator() (const input_type& x) + { + return (*this)(&x, &x+1); + } + + const tensor& forward(const tensor& x) + { + // If this tag is the first layer in one of the sub networks inside a repeat + // layer then we don't want it to be creating copies of x. This is because, we + // can just hold a pointer to x since the way repeat is constructed guarantees + // that x will have a lifetime larger than this pointer. + if (is_same_type<INPUT_LAYER, impl::repeat_input_layer>::value) + cached_output_ptr = const_cast<tensor*>(&x); + else + cached_output = x; + gradient_input_is_stale = true; + return get_output(); + } + + const tensor& get_output() const + { + if (cached_output_ptr) + return *cached_output_ptr; + else + return cached_output; + } + + const tensor& get_final_data_gradient( + ) const { return grad_final; } + + tensor& get_gradient_input() + { + if (!have_same_dimensions(get_output(), grad_final) || + gradient_input_is_stale) + { + grad_final.copy_size(get_output()); + grad_final = 0; + gradient_input_is_stale = false; + } + return grad_final; + } + + void back_propagate_error(const tensor& /*x*/) + { + // nothing to do + } + void back_propagate_error(const tensor& /*x*/, const tensor& /*gradient_input*/) + { + // nothing to do + } + + template <typename solver_type> + void update_parameters(sstack<solver_type> /*solvers*/, double /*learning_rate*/) + { + // nothing to do + } + + const subnet_type& subnet() const { return input_layer; } + subnet_type& subnet() { return input_layer; } + + void clean() + { + grad_final.clear(); + cached_output.clear(); + cached_output_ptr = 0; + } + + friend void serialize(const add_tag_layer& item, std::ostream& out) + { + int version = 2; + serialize(version, out); + serialize(item.input_layer, out); + serialize(item.cached_output, out); + serialize(item.grad_final, out); + serialize(item.gradient_input_is_stale, out); + serialize(item._sample_expansion_factor, out); + } + + friend void deserialize(add_tag_layer& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (!(1 <= version && version <= 2)) + throw serialization_error("Unexpected version found while deserializing dlib::add_tag_layer."); + deserialize(item.input_layer, in); + deserialize(item.cached_output, in); + deserialize(item.grad_final, in); + deserialize(item.gradient_input_is_stale, in); + item.cached_output_ptr = nullptr; + if (version >= 2) + deserialize(item._sample_expansion_factor, in); + else + item._sample_expansion_factor = 1; // all layer types set this to 1 in older dlib versions, so that's what we put here. 
+ + } + + friend std::ostream& operator<< (std::ostream& out, const add_tag_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<"<<idx << ">\t"<<impl::tensor_to_str(private_get_output(), min_length)<< "tag" << ID << "\n"; + // Don't print the repeat_input_layer since it doesn't exist from the user's + // point of view. It's just an artifact of how repeat<> works. + if (!std::is_same<subnet_type, impl::repeat_input_layer>::value) + out << "layer<"<< idx+1 << ">\t" << subnet() << "\n"; + } + + private: + + template <typename T, typename U, typename E> + friend class add_layer; + template <typename T, bool is_first, typename E> + friend class dimpl::subnet_wrapper; + template <unsigned long T, typename U, typename E> + friend class add_tag_layer; + template <template<typename> class T, typename U> + friend class add_skip_layer; + template <size_t N, template<typename> class L, typename S> + friend class repeat; + + // You wouldn't put a tag on a layer if you didn't want to access its forward + // outputs. So this is always true. + bool this_layer_requires_forward_output( + ) { return true; } + + void disable_output_and_gradient_getters ( + ) + { + // This should never happen because only inplace layers call + // disable_output_and_gradient_getters(), however, putting a tag layer right + // before an inplace layer basically means you don't want the following layer + // to operate in place. So the inplace layer should turn itself into an + // out-of-place layer and not call disable_output_and_gradient_getters(). + DLIB_CASSERT(false,"This should never happen"); + } + + tensor& private_get_output() const + { return const_cast<tensor&>(get_output()); } + tensor& private_get_gradient_input() + { return get_gradient_input(); } + + void swap(add_tag_layer& item) + { + std::swap(input_layer, item.input_layer); + std::swap(cached_output, item.cached_output); + std::swap(cached_output_ptr, item.cached_output_ptr); + std::swap(grad_final, item.grad_final); + std::swap(gradient_input_is_stale, item.gradient_input_is_stale); + std::swap(_sample_expansion_factor, item._sample_expansion_factor); + } + + subnet_type input_layer; + resizable_tensor cached_output; + tensor* cached_output_ptr; + resizable_tensor grad_final; + bool gradient_input_is_stale; + mutable unsigned int _sample_expansion_factor; + }; + + template <unsigned long ID, typename U, typename E> + struct is_nonloss_layer_type<add_tag_layer<ID,U,E>> : std::true_type {}; + + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + template <typename LOSS_DETAILS, typename SUBNET> + class add_loss_layer; + + class no_label_type + { + private: + // We don't want anyone making these no_label_type objects. They are here only to + // allow add_loss_layer::training_label_type and dnn_trainer::training_label_type + // to exist which avoids needing to overload add_loss_layer and dnn_trainer for + // supervised and unsupervised losses. It also can be a type to use in template + // metaprogramming to indicate "no label". So here we make the constructor private + // with the exception that add_loss_layer objects can make it (again, just to + // simplify add_loss_layer's implementation).
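Because unsupervised losses expose no_label_type as their training_label_type, generic code can detect them at compile time rather than carrying parallel overload sets. A minimal sketch (the trait name is illustrative, not part of dlib):

#include <dlib/dnn.h>
#include <type_traits>

// True when net_type's loss takes no labels (e.g. an autoencoder-style loss).
template <typename net_type>
struct is_unsupervised_net
    : std::is_same<typename net_type::training_label_type, dlib::no_label_type> {};

This is the same convention that lets dnn_trainer offer label-free training calls for such losses.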
+ no_label_type(){}; + template <typename LOSS_DETAILS, typename SUBNET> friend class add_loss_layer; + template < typename net_type, typename solver_type > friend class dnn_trainer; + }; + +// ---------------------------------------------------------------------------------------- + + template <typename LOSS_DETAILS, typename SUBNET> + class add_loss_layer + { + template <typename T, typename enabled=void> + struct get_loss_layer_training_label_type + { + typedef no_label_type type; + }; + template <typename T> + struct get_loss_layer_training_label_type<T,typename std::enable_if<sizeof(typename T::training_label_type)!=0>::type> + { + typedef typename T::training_label_type type; + }; + + template <typename T, typename enabled=void> + struct get_loss_layer_output_label_type + { + typedef no_label_type type; + }; + template <typename T> + struct get_loss_layer_output_label_type<T,typename std::enable_if<sizeof(typename T::output_label_type)!=0>::type> + { + typedef typename T::output_label_type type; + }; + + public: + typedef LOSS_DETAILS loss_details_type; + typedef SUBNET subnet_type; + typedef typename subnet_type::input_type input_type; + const static size_t num_layers = subnet_type::num_layers + 1; + // Note that the loss layer doesn't count as an additional computational layer. + const static size_t num_computational_layers = subnet_type::num_computational_layers; + typedef typename get_loss_layer_training_label_type<LOSS_DETAILS>::type training_label_type; + typedef typename get_loss_layer_output_label_type<LOSS_DETAILS>::type output_label_type; + + static_assert(is_nonloss_layer_type<SUBNET>::value, + "SUBNET must be of type add_layer, add_skip_layer, or add_tag_layer."); + + + add_loss_layer() {}; + add_loss_layer(const add_loss_layer&) = default; + add_loss_layer& operator=(const add_loss_layer&) = default; + add_loss_layer(add_loss_layer&& item) : add_loss_layer() { swap(item); } + add_loss_layer& operator=(add_loss_layer&& item) { swap(item); return *this; } + + template <typename T, typename U> + add_loss_layer( + const add_loss_layer<T,U>& item + ) : + loss(item.loss_details()), + subnetwork(item.subnet()) + {} + + template <typename ...T> + add_loss_layer( + const LOSS_DETAILS& layer_det, + T&& ...args + ) : + loss(layer_det), + subnetwork(std::forward<T>(args)...) + { + } + + template <typename ...T> + add_loss_layer( + LOSS_DETAILS&& layer_det, + T&& ...args + ) : + loss(std::move(layer_det)), + subnetwork(std::forward<T>(args)...) + { + } + + template <typename T, typename ...U> + struct disable_forwarding_constr + { + const static bool value = std::is_constructible<LOSS_DETAILS,T>::value; + }; + template <typename ...T> + struct disable_forwarding_constr<add_loss_layer<T...>> + { + const static bool value = true; + }; + + template < + typename ...T, + typename = typename std::enable_if<!disable_forwarding_constr<typename std::remove_reference<T>::type...>::value>::type + > + add_loss_layer( + T&& ...args + ) : + subnetwork(std::forward<T>(args)...) 
+ { + } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + subnetwork.to_tensor(ibegin,iend,data); + } + + unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } + + template <typename output_iterator> + void operator() ( + const tensor& x, + output_iterator obegin + ) + { + subnetwork.forward(x); + const dimpl::subnet_wrapper<subnet_type> wsub(subnetwork); + loss.to_label(x, wsub, obegin); + } + + template <typename forward_iterator, typename output_iterator> + void operator() ( + forward_iterator ibegin, + forward_iterator iend, + output_iterator obegin + ) + { + to_tensor(ibegin,iend,temp_tensor); + (*this)(temp_tensor, obegin); + } + + const output_label_type& operator() (const input_type& x) + { + (*this)(&x, &x+1, &temp_label); + return temp_label; + } + + template <typename ...T> + const output_label_type& process (const input_type& x, T&& ...args) + { + to_tensor(&x,&x+1,temp_tensor); + subnetwork.forward(temp_tensor); + const dimpl::subnet_wrapper<subnet_type> wsub(subnetwork); + loss.to_label(temp_tensor, wsub, &temp_label, std::forward<T>(args)...); + return temp_label; + } + + template <typename iterable_type, typename ...T> + std::vector<output_label_type> process_batch (const iterable_type& data, size_t batch_size, T&& ...args) + { + std::vector<output_label_type> results(std::distance(data.begin(), data.end())); + auto o = results.begin(); + auto i = data.begin(); + auto num_remaining = results.size(); + while(num_remaining != 0) + { + auto inc = std::min(batch_size, num_remaining); + to_tensor(i,i+inc,temp_tensor); + subnetwork.forward(temp_tensor); + const dimpl::subnet_wrapper<subnet_type> wsub(subnetwork); + loss.to_label(temp_tensor, wsub, o, std::forward<T>(args)...); + + i += inc; + o += inc; + num_remaining -= inc; + } + return results; + } + + template <typename iterable_type> + std::vector<output_label_type> operator() ( + const iterable_type& data, + size_t batch_size = 128 + ) + { + std::vector<output_label_type> results(std::distance(data.begin(), data.end())); + auto o = results.begin(); + auto i = data.begin(); + auto num_remaining = results.size(); + while(num_remaining != 0) + { + auto inc = std::min(batch_size, num_remaining); + (*this)(i, i+inc, o); + i += inc; + o += inc; + num_remaining -= inc; + } + return results; + } + + template <typename label_iterator> + double compute_loss ( + const tensor& x, + label_iterator lbegin + ) + { + subnetwork.forward(x); + dimpl::subnet_wrapper<subnet_type> wsub(subnetwork); + return loss.compute_loss_value_and_gradient(x, lbegin, wsub); + } + + template <typename forward_iterator, typename label_iterator> + double compute_loss ( + forward_iterator ibegin, + forward_iterator iend, + label_iterator lbegin + ) + { + to_tensor(ibegin,iend,temp_tensor); + return compute_loss(temp_tensor, lbegin); + } + + double compute_loss ( + const tensor& x + ) + { + subnetwork.forward(x); + dimpl::subnet_wrapper<subnet_type> wsub(subnetwork); + return loss.compute_loss_value_and_gradient(x, wsub); + } + + template <typename forward_iterator> + double compute_loss ( + forward_iterator ibegin, + forward_iterator iend + ) + { + to_tensor(ibegin,iend,temp_tensor); + return compute_loss(temp_tensor); + } + + template <typename label_iterator> + double compute_parameter_gradients ( + const tensor& x, + label_iterator lbegin + ) + { + subnetwork.forward(x); + dimpl::subnet_wrapper<subnet_type> 
wsub(subnetwork); + double l = loss.compute_loss_value_and_gradient(x, lbegin, wsub); + subnetwork.back_propagate_error(x); + return l; + } + template <typename forward_iterator, typename label_iterator> + double compute_parameter_gradients ( + forward_iterator ibegin, + forward_iterator iend, + label_iterator lbegin + ) + { + to_tensor(ibegin,iend,temp_tensor); + return compute_parameter_gradients(temp_tensor, lbegin); + } + double compute_parameter_gradients ( + const tensor& x + ) + { + subnetwork.forward(x); + dimpl::subnet_wrapper<subnet_type> wsub(subnetwork); + double l = loss.compute_loss_value_and_gradient(x, wsub); + subnetwork.back_propagate_error(x); + return l; + } + template <typename forward_iterator> + double compute_parameter_gradients ( + forward_iterator ibegin, + forward_iterator iend + ) + { + to_tensor(ibegin,iend,temp_tensor); + return compute_parameter_gradients(temp_tensor); + } + + template <typename solver_type> + void update_parameters ( + sstack<solver_type> solvers, + double learning_rate + ) + { + subnetwork.update_parameters(solvers, learning_rate); + } + + const subnet_type& subnet() const { return subnetwork; } + subnet_type& subnet() { return subnetwork; } + const loss_details_type& loss_details() const { return loss; } + loss_details_type& loss_details() { return loss; } + + void clean ( + ) + { + temp_tensor.clear(); + subnetwork.clean(); + } + + template <typename T, typename U> + friend void serialize(const add_loss_layer<T,U>& item, std::ostream& out); + template <typename T, typename U> + friend void deserialize(add_loss_layer<T,U>& item, std::istream& in); + + friend std::ostream& operator<< (std::ostream& out, const add_loss_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<" << idx << ">\t" << loss_details() << "\n"; + subnet().print(out, idx+1, min_length); + } + + private: + + + void swap(add_loss_layer& item) + { + std::swap(loss, item.loss); + std::swap(subnetwork, item.subnetwork); + } + + loss_details_type loss; + subnet_type subnetwork; + + // These two objects don't logically contribute to the state of this object. They + // are here to prevent them from being reallocated over and over. 
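The members above are the whole training-side surface of add_loss_layer, and dlib's dnn_trainer is essentially a driver around the compute_parameter_gradients()/update_parameters() pair. A hand-rolled SGD step might look like this sketch (the toy network and function name are illustrative only):

#include <dlib/dnn.h>
using namespace dlib;

using toy_net = loss_multiclass_log<fc<10, relu<fc<32, input<matrix<float>>>>>>;

void sgd_step (
    toy_net& net,
    const std::vector<matrix<float>>& samples,
    const std::vector<unsigned long>& labels,
    std::vector<sgd>& solvers,  // must hold >= toy_net::num_computational_layers solvers
    double learning_rate
)
{
    // Forward pass, loss evaluation, and error backpropagation in one call.
    net.compute_parameter_gradients(samples.begin(), samples.end(), labels.begin());
    // Pass each layer's parameter gradient through its solver and apply the deltas.
    net.update_parameters(make_sstack(solvers), learning_rate);
}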
+ output_label_type temp_label; + resizable_tensor temp_tensor; + }; + + template <typename LOSS_DETAILS, typename SUBNET> + void serialize(const add_loss_layer<LOSS_DETAILS,SUBNET>& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.loss, out); + serialize(item.subnetwork, out); + } + + template <typename LOSS_DETAILS, typename SUBNET> + void deserialize(add_loss_layer<LOSS_DETAILS,SUBNET>& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::add_loss_layer."); + deserialize(item.loss, in); + deserialize(item.subnetwork, in); + } + + + template <typename T, typename U> + struct is_loss_layer_type<add_loss_layer<T,U>> : std::true_type {}; + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <unsigned int i, typename T, typename enabled = void> + struct layer_helper + { + static_assert(i < T::num_layers, "Call to layer() attempted to access non-existing layer in neural network."); + static T& makeT(); + using next_type = typename std::remove_reference<decltype(makeT().subnet())>::type; + using type = typename layer_helper<i-1,next_type>::type; + static type& layer(T& n) + { + return layer_helper<i-1,next_type>::layer(n.subnet()); + } + }; + template < + unsigned int i, + size_t N, template<typename> class L, typename S + > + struct layer_helper<i,repeat<N,L,S>, typename std::enable_if<(i!=0&&i>=repeat<N,L,S>::layers_in_repeated_group)>::type> + { + const static size_t layers_in_repeated_group = repeat<N,L,S>::layers_in_repeated_group; + + static repeat<N,L,S>& makeT(); + using next_type = typename std::remove_reference<decltype(makeT().subnet())>::type; + using type = typename layer_helper<i-layers_in_repeated_group,next_type>::type; + static type& layer(repeat<N,L,S>& n) + { + return layer_helper<i-layers_in_repeated_group,next_type>::layer(n.subnet()); + } + }; + template < + unsigned int i, + size_t N, template<typename> class L, typename S + > + struct layer_helper<i,repeat<N,L,S>, typename std::enable_if<(i!=0&&i<repeat<N,L,S>::layers_in_repeated_group)>::type> + { + const static size_t layers_in_each_group = repeat<N,L,S>::layers_in_each_group; + typedef typename repeat<N,L,S>::repeated_layer_type repeated_layer_type; + using next_type = repeated_layer_type; + using type = typename layer_helper<i%layers_in_each_group,next_type>::type; + static type& layer(repeat<N,L,S>& n) + { + return layer_helper<i%layers_in_each_group,next_type>::layer(n.get_repeated_layer(i/layers_in_each_group)); + } + }; + template < + size_t N, template<typename> class L, typename S + > + struct layer_helper<0,repeat<N,L,S>, void> + { + typedef typename repeat<N,L,S>::repeated_layer_type repeated_layer_type; + using type = repeated_layer_type; + static type& layer(repeat<N,L,S>& n) + { + return n.get_repeated_layer(0); + } + }; + + + + template < + unsigned int i, + size_t N, template<typename> class L, typename S + > + struct layer_helper<i,const repeat<N,L,S>, typename std::enable_if<(i!=0&&i>=repeat<N,L,S>::layers_in_repeated_group)>::type> + { + const static size_t layers_in_repeated_group = repeat<N,L,S>::layers_in_repeated_group; + + static const repeat<N,L,S>& makeT(); + using next_type 
= const typename std::remove_reference<decltype(makeT().subnet())>::type; + using type = const typename layer_helper<i-layers_in_repeated_group,next_type>::type; + static type& layer(const repeat<N,L,S>& n) + { + return layer_helper<i-layers_in_repeated_group,next_type>::layer(n.subnet()); + } + }; + template < + unsigned int i, + size_t N, template<typename> class L, typename S + > + struct layer_helper<i,const repeat<N,L,S>, typename std::enable_if<(i!=0&&i<repeat<N,L,S>::layers_in_repeated_group)>::type> + { + const static size_t layers_in_each_group = repeat<N,L,S>::layers_in_each_group; + typedef typename repeat<N,L,S>::repeated_layer_type repeated_layer_type; + using next_type = const repeated_layer_type; + using type = const typename layer_helper<i%layers_in_each_group,next_type>::type; + static type& layer(const repeat<N,L,S>& n) + { + return layer_helper<i%layers_in_each_group,next_type>::layer(n.get_repeated_layer(i/layers_in_each_group)); + } + }; + template < + size_t N, template<typename> class L, typename S + > + struct layer_helper<0,const repeat<N,L,S>, void> + { + typedef typename repeat<N,L,S>::repeated_layer_type repeated_layer_type; + using type = const repeated_layer_type; + static type& layer(const repeat<N,L,S>& n) + { + return n.get_repeated_layer(0); + } + }; + + + + template <typename T> + struct layer_helper<0,T,void> + { + using type = T; + static type& layer(T& n) + { + return n; + } + }; + + template <template<typename> class Match, typename T, unsigned int i, typename enabled = void> + struct layer_helper_match + { + static T& makeT(); + using next_type = typename std::remove_reference<decltype(makeT().subnet())>::type; + using type = typename layer_helper_match<Match,next_type,i>::type; + static type& layer(T& n) + { + return layer_helper_match<Match,next_type,i>::layer(n.subnet()); + } + }; + // This overload catches add_layer and add_loss_layer templates. + template <template<typename> class Match, typename T, unsigned int i> + struct layer_helper_match<Match,T,i, + typename std::enable_if<std::is_same<const T,const Match<typename T::subnet_type>>::value>::type> + { + using type = typename layer_helper<i,T>::type; + static type& layer(T& n) + { + return layer_helper<i,T>::layer(n); + } + }; + // This overload catches input templates. + template <template<typename> class Match, typename T, unsigned int i> + struct layer_helper_match<Match,T,i, + typename std::enable_if<std::is_same<const T,const Match<typename T::input_type>>::value>::type> + { + using type = typename layer_helper<i,T>::type; + static type& layer(T& n) + { + return layer_helper<i,T>::layer(n); + } + }; + // This overload catches subnet_wrapper templates. 
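These layer_helper_match overloads are what back the template-matching form of the layer() accessors defined just below: a layer can be addressed either by its index from the top of the network or by naming a layer template such as a tag. A sketch of both forms (toy network for illustration):

#include <dlib/dnn.h>
using namespace dlib;

using net_type = loss_multiclass_log<fc<10, tag1<relu<fc<32, input<matrix<float>>>>>>>;

void inspect (net_type& net)
{
    auto& by_tag   = layer<tag1>(net);  // jump straight to the tag1 layer
    auto& by_index = layer<2>(net);     // the same layer, addressed by index
    // Either reference exposes the forward output cached at that point:
    const tensor& t = by_tag.get_output();
}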
+ template <template<typename> class Match, typename T, unsigned int i> + struct layer_helper_match<Match,T,i, + typename std::enable_if<std::is_same<const typename T::wrapped_type, + const Match<typename T::wrapped_type::subnet_type>>::value>::type> + { + using type = typename layer_helper<i,T>::type; + static type& layer(T& n) + { + return layer_helper<i,T>::layer(n); + } + }; + } + + template <unsigned int i, typename T> + typename impl::layer_helper<i,T>::type& layer (T& n) + { + return impl::layer_helper<i,T>::layer(n); + } + + template <template<typename> class Match, typename T> + typename impl::layer_helper_match<Match,T,0>::type& layer (T& n) + { + return impl::layer_helper_match<Match,T,0>::layer(n); + } + + template <template<typename> class Match, unsigned int i, typename T> + typename impl::layer_helper_match<Match,T,i>::type& layer (T& n) + { + return impl::layer_helper_match<Match,T,i>::layer(n); + } + +// ---------------------------------------------------------------------------------------- + + + namespace dimpl + { + template <typename T> + T& get_input_details ( + T& net + ) + { + return net; + } + + template <typename T, bool is_first, typename enabled> + auto get_input_details ( + dimpl::subnet_wrapper<T,is_first,enabled>& net + ) -> decltype(net.layer_details())& + { + return net.layer_details(); + } + + template <typename T, bool is_first, typename enabled> + auto get_input_details ( + const dimpl::subnet_wrapper<T,is_first,enabled>& net + ) -> decltype(net.layer_details())& + { + return net.layer_details(); + } + } + + template <typename net_type> + auto input_layer ( + net_type& net + ) -> decltype(dimpl::get_input_details(layer<net_type::num_layers-1>(net)))& + { + // Calling input_layer() on a subnet_wrapper is a little funny since the behavior of + // .subnet() returns another subnet_wrapper rather than an input details object as it + // does in add_layer. + return dimpl::get_input_details(layer<net_type::num_layers-1>(net)); + } + +// ---------------------------------------------------------------------------------------- + + template <template<typename> class TAG_TYPE, typename SUBNET> + class add_skip_layer + { + public: + typedef SUBNET subnet_type; + typedef typename subnet_type::input_type input_type; + typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper. + const static size_t num_layers = subnet_type::num_layers + 1; + const static size_t num_computational_layers = subnet_type::num_computational_layers; + const static unsigned long id = tag_id<TAG_TYPE>::id; + + add_skip_layer() {}; + add_skip_layer(const add_skip_layer&) = default; + add_skip_layer(add_skip_layer&&) = default; + add_skip_layer& operator=(add_skip_layer&&) = default; + add_skip_layer& operator=(const add_skip_layer&) = default; + + template <typename T> + add_skip_layer( + const add_skip_layer<TAG_TYPE,T>& item + ) : subnetwork(item.subnet()) + {} + + template <typename ...T> + add_skip_layer( + T ...args + ) : + subnetwork(std::move(args)...) 
+ { + } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + subnetwork.to_tensor(ibegin,iend,data); + } + + template <typename forward_iterator> + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + subnetwork(ibegin,iend); + return layer<TAG_TYPE>(subnetwork).get_output(); + } + + const tensor& operator() (const input_type& x) + { + subnetwork(x); + return layer<TAG_TYPE>(subnetwork).get_output(); + } + + const tensor& forward(const tensor& x) + { + subnetwork.forward(x); + return layer<TAG_TYPE>(subnetwork).get_output(); + } + + const tensor& get_output() const + { + return layer<TAG_TYPE>(subnetwork).get_output(); + } + + tensor& get_gradient_input() + { + return layer<TAG_TYPE>(subnetwork).get_gradient_input(); + } + + const tensor& get_final_data_gradient( + ) const + { + return subnetwork.get_final_data_gradient(); + } + + void back_propagate_error(const tensor& x) + { + subnetwork.back_propagate_error(x); + } + + template <typename solver_type> + void update_parameters(sstack<solver_type> solvers, double learning_rate) + { + subnetwork.update_parameters(solvers, learning_rate); + } + + const tensor& get_parameter_gradient( + ) const { return params_grad; } + + tensor& get_parameter_gradient ( + ) { return params_grad; } + + + const subnet_type& subnet() const + { + return subnetwork; + } + + subnet_type& subnet() + { + return subnetwork; + } + + unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } + + void clean() + { + subnetwork.clean(); + } + + friend void serialize(const add_skip_layer& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.subnetwork, out); + } + + friend void deserialize(add_skip_layer& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::add_skip_layer."); + deserialize(item.subnetwork, in); + } + + friend std::ostream& operator<< (std::ostream& out, const add_skip_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<" << idx << ">\t"<<impl::tensor_to_str(private_get_output(), min_length) <<"skip"<<id<<"\n"; + subnet().print(out, idx+1, min_length); + } + + private: + + + template <typename T, typename U, typename E> + friend class add_layer; + template <typename T, bool is_first, typename E> + friend class dimpl::subnet_wrapper; + template <unsigned long T, typename U, typename E> + friend class add_tag_layer; + template <template<typename> class T, typename U> + friend class add_skip_layer; + template <size_t N, template<typename> class L, typename S> + friend class repeat; + + bool this_layer_requires_forward_output( + ) { return layer<TAG_TYPE>(subnetwork).this_layer_requires_forward_output(); } + + void disable_output_and_gradient_getters ( + ) { layer<TAG_TYPE>(subnetwork).disable_output_and_gradient_getters(); } + + tensor& private_get_output() const + { return layer<TAG_TYPE>(subnetwork).private_get_output(); } + tensor& private_get_gradient_input() + { return layer<TAG_TYPE>(subnetwork).private_get_gradient_input(); } + + subnet_type subnetwork; + + // This member doesn't logically contribute to the state of the object since it is + // always empty. 
It's just here so we can have the get_parameter_gradient() methods + // which have to return something. So they return this empty tensor. + resizable_tensor params_grad; + }; + template <template<typename> class T, typename U> + struct is_nonloss_layer_type<add_skip_layer<T,U>> : std::true_type {}; + + template <typename SUBNET> using tag1 = add_tag_layer< 1, SUBNET>; + template <typename SUBNET> using tag2 = add_tag_layer< 2, SUBNET>; + template <typename SUBNET> using tag3 = add_tag_layer< 3, SUBNET>; + template <typename SUBNET> using tag4 = add_tag_layer< 4, SUBNET>; + template <typename SUBNET> using tag5 = add_tag_layer< 5, SUBNET>; + template <typename SUBNET> using tag6 = add_tag_layer< 6, SUBNET>; + template <typename SUBNET> using tag7 = add_tag_layer< 7, SUBNET>; + template <typename SUBNET> using tag8 = add_tag_layer< 8, SUBNET>; + template <typename SUBNET> using tag9 = add_tag_layer< 9, SUBNET>; + template <typename SUBNET> using tag10 = add_tag_layer<10, SUBNET>; + + template <typename SUBNET> using skip1 = add_skip_layer< tag1, SUBNET>; + template <typename SUBNET> using skip2 = add_skip_layer< tag2, SUBNET>; + template <typename SUBNET> using skip3 = add_skip_layer< tag3, SUBNET>; + template <typename SUBNET> using skip4 = add_skip_layer< tag4, SUBNET>; + template <typename SUBNET> using skip5 = add_skip_layer< tag5, SUBNET>; + template <typename SUBNET> using skip6 = add_skip_layer< tag6, SUBNET>; + template <typename SUBNET> using skip7 = add_skip_layer< tag7, SUBNET>; + template <typename SUBNET> using skip8 = add_skip_layer< tag8, SUBNET>; + template <typename SUBNET> using skip9 = add_skip_layer< tag9, SUBNET>; + template <typename SUBNET> using skip10 = add_skip_layer<tag10, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + namespace timpl + { + inline void fill_with_gassuan_random_numbers ( + tensor& t, + dlib::rand& rnd, + double sigma = 1 + ) + { + float* data = t.host(); + for (size_t i = 0; i < t.size(); ++i) + data[i] = rnd.get_random_gaussian()*sigma; + } + + class test_layer_subnet + { + public: + test_layer_subnet ( + dlib::rand& rnd_ + ) : rnd(rnd_) + { + // Output and gradient_input have to have the same dimensions in each + // layer. + const long num_samples = rnd.get_random_32bit_number()%4+3; + const long k = rnd.get_random_32bit_number()%4+2; + const long nr = rnd.get_random_32bit_number()%4+2; + const long nc = rnd.get_random_32bit_number()%4+2; + + output.set_size(num_samples, k, nr, nc); + gradient_input.set_size(num_samples, k, nr, nc); + + // Use a non-zero initial gradient to make sure the layers add to it + // rather than assign and blow away the initial value. 
+ fill_with_gassuan_random_numbers(gradient_input, rnd, 0.01); + + fill_with_gassuan_random_numbers(output, rnd); + } + + + tensor& get_mutable_output() { return output; } + const tensor& get_output() const { return output; } + const tensor& private_get_output() const { return get_output(); } + const test_layer_subnet& subnet() const { init_sub(); return *subnetwork; } + + tensor& get_gradient_input() { return gradient_input; } + tensor& private_get_gradient_input() { return get_gradient_input(); } + test_layer_subnet& subnet() { init_sub(); return *subnetwork; } + + + + unsigned long count_outputs() const + { + if (subnetwork) + return subnetwork->count_outputs() + output.size(); + else + return output.size(); + } + + float& get_output_element(unsigned long i) + { + if (i < output.size()) + return output.host()[i]; + else + return subnet().get_output_element(i-output.size()); + } + + float get_gradient_input_element(unsigned long i) const + { + if (i < gradient_input.size()) + return gradient_input.host()[i]; + else + return subnet().get_gradient_input_element(i-gradient_input.size()); + } + + + private: + // We lazily initialize sub-layers as needed when someone tries to call + // subnet() + void init_sub() const + { + if (!subnetwork) + subnetwork.reset(new test_layer_subnet(rnd)); + } + + dlib::rand& rnd; + mutable std::unique_ptr<test_layer_subnet> subnetwork; + resizable_tensor output; + resizable_tensor gradient_input; + }; + + } + + struct layer_test_results + { + layer_test_results() : was_good(true) {} + explicit layer_test_results(const std::string& l) : log(l),was_good(false) {} + + std::string log; + bool was_good; + + operator bool() const { return was_good; } + }; + + inline std::ostream& operator<< (std::ostream& out, const layer_test_results& item) + { + out << item.log; + return out; + } + + template < + typename layer_details_type + > + layer_test_results impl_test_layer ( + layer_details_type l, + const float base_eps + ) + { + using namespace timpl; + // Do some setup + running_stats<double> rs_data, rs_params; + dlib::rand rnd; + std::ostringstream sout; + for (int iter = 0; iter < 10; ++iter) + { + test_layer_subnet subnetwork(rnd); + resizable_tensor output, out2, out3; + // Run setup() and forward() as well to make sure any calls to subnet() have + // happened before we start assuming we know how many data elements there are + // (since we do a lazy layer creation thing based on calls to subnet() inside + // test_layer_subnet). + l.setup(subnetwork); + impl::call_layer_forward(l, subnetwork, output); + + resizable_tensor input_grad; + input_grad.copy_size(output); + fill_with_gassuan_random_numbers(input_grad, rnd); + + + // The f() we are computing gradients of is this thing. It's value at the current + // parameter and data values is: + //sout << "f(data,params): " << dot(output, input_grad) << std::endl; + + // We are going to save a copy of the subnetwork.get_gradient_input() data before we do + // backpropagation since the backward() function is supposed to *add* to the + // gradients rather than overwrite them. We will use this saved data to check if + // that is the case. + const unsigned long num_data_inputs = subnetwork.count_outputs(); + std::vector<float> initial_gradient_input(num_data_inputs); + for (unsigned long i = 0; i < num_data_inputs; ++i) + initial_gradient_input[i] = subnetwork.get_gradient_input_element(i); + + + // Now tell the layer to compute all the gradients. 
In the rest of this function + // we will just be checking that these gradients were computed correctly by + // comparing them to a central differences approximation. + resizable_tensor params_grad; + params_grad.copy_size(l.get_layer_params()); + // But first, set the params grad to something crazy so that it's very obvious if + // it doesn't get fully assigned. + params_grad = std::numeric_limits<float>::infinity(); + impl::call_layer_backward(l, output, input_grad, subnetwork, params_grad); + + static_assert(impl::is_inplace_layer(l, subnetwork) == impl::has_inplace_backward(l, subnetwork), + "Layer not defined correctly. forward and backward methods must either both be in-place or both out-of-place. "); + + // Make sure the outputs of forward() and backward() are the same when they are run + // in in-place mode. + if (impl::is_inplace_layer(l, subnetwork)) + { + test_layer_subnet subnetwork2(rnd); + layer_details_type ll(l); + ll.setup(subnetwork2); + resizable_tensor ip_out; + impl::call_layer_forward(ll, subnetwork2, ip_out); + impl::call_layer_forward(ll, subnetwork2, subnetwork2.get_mutable_output()); + const auto forward_error = max(abs(mat(ip_out) - mat(subnetwork2.get_output()))); + if (forward_error > 0.00001) + { + using namespace std; + sout << "This layer is supposed to support in-place computations but the output of forward_inplace()\n"; + sout << "changes when invoked in-place vs. out-of-place. The error was: " << forward_error << endl; + return layer_test_results(sout.str()); + } + + resizable_tensor params_grad; + params_grad.copy_size(ll.get_layer_params()); + params_grad = std::numeric_limits<float>::infinity(); + + resizable_tensor input_grad; + input_grad.copy_size(ip_out); + fill_with_gassuan_random_numbers(input_grad, rnd); + resizable_tensor params_grad1, params_grad2, data_grad1, data_grad2; + params_grad1 = params_grad; + params_grad2 = params_grad; + // Now call backward() and make sure it works as well. Recall that when an + // in-place layer works in-place it assigns to it's outputs but when it's + // not running in-place it adds. So we initialize to a non-zero value to + // check that this is the behavior that really executes. + subnetwork2.get_gradient_input() = 9; + impl::call_layer_backward(ll, ip_out, input_grad, subnetwork2, params_grad1); + data_grad1 = subnetwork2.get_gradient_input(); + + subnetwork2.get_gradient_input() = mat(input_grad); + impl::call_layer_backward(ll, ip_out, subnetwork2.get_gradient_input(), subnetwork2, params_grad2); + data_grad2 = subnetwork2.get_gradient_input(); + if (params_grad.size() != 0) + { + const auto backward_param_error = max(abs(mat(params_grad1) - mat(params_grad2))); + if (backward_param_error > 0.00001) + { + using namespace std; + sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n"; + sout << "changes when invoked in-place vs. out-of-place. The error was: " << backward_param_error << endl; + return layer_test_results(sout.str()); + } + } + const auto backward_data_error = max(abs(mat(data_grad1)-9 - mat(data_grad2))); + if (backward_data_error > 0.00001) + { + using namespace std; + sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n"; + sout << "changes when invoked in-place vs. out-of-place. 
The error was: " << backward_data_error << endl; + return layer_test_results(sout.str()); + } + } + + // ================================================================== + // first validate the way the parameter gradients are computed + for (unsigned long i = 0; i < params_grad.size(); ++i) + { + layer_details_type l1(l); + + float eps = l1.get_layer_params().host()[i]*base_eps; + if (eps == 0) + eps = base_eps; + const float oldval = l1.get_layer_params().host()[i]; + l1.get_layer_params().host()[i] = oldval+eps; + impl::call_layer_forward(l1, subnetwork, out2); + l1.get_layer_params().host()[i] = oldval-eps; + impl::call_layer_forward(l1, subnetwork, out3); + l1.get_layer_params().host()[i] = oldval; + + // Compute a reference derivative via a central differences approximation and + // compare it to the one output by the layer and make sure they match. + double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps); + double output_derivative = params_grad.host()[i]; + double relative_error; + if (reference_derivative*output_derivative != 0) + relative_error = (reference_derivative - output_derivative)/(reference_derivative); + else + relative_error = (reference_derivative - output_derivative); + double absolute_error = (reference_derivative - output_derivative); + rs_params.add(std::abs(relative_error)); + if (std::abs(relative_error) > 0.05 && std::abs(absolute_error) > 0.006) + { + using namespace std; + sout << "Gradient error in parameter #" << i <<". Relative error: "<< relative_error << endl; + sout << "expected derivative: " << reference_derivative << endl; + sout << "output derivative: " << output_derivative << endl; + sout << "iteration: " << iter << endl; + return layer_test_results(sout.str()); + } + } + + // ================================================================== + // now validate the data gradients + for (unsigned long i = 0; i < num_data_inputs; ++i) + { + const float oldval = subnetwork.get_output_element(i); + float eps = oldval*base_eps; + if (eps == 0) + eps = base_eps; + subnetwork.get_output_element(i) = oldval+eps; + impl::call_layer_forward(l, subnetwork, out2); + subnetwork.get_output_element(i) = oldval-eps; + impl::call_layer_forward(l, subnetwork, out3); + subnetwork.get_output_element(i) = oldval; + + // Compute a reference derivative via a central differences approximation and + // compare it to the one output by the layer and make sure they match. + double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps); + double output_derivative = subnetwork.get_gradient_input_element(i); + output_derivative -= initial_gradient_input[i]; + double relative_error; + if (reference_derivative*output_derivative != 0) + relative_error = (reference_derivative - output_derivative)/(reference_derivative); + else + relative_error = (reference_derivative - output_derivative); + double absolute_error = (reference_derivative - output_derivative); + rs_data.add(std::abs(relative_error)); + if (std::abs(relative_error) > 0.05 && std::abs(absolute_error) > 0.006) + { + using namespace std; + sout << "Gradient error in data variable #" << i <<". 
Relative error: "<< relative_error << endl; + sout << "expected derivative: " << reference_derivative << endl; + sout << "output derivative: " << output_derivative << endl; + sout << "iteration: " << iter << endl; + return layer_test_results(sout.str()); + } + } + + } // end for (int iter = 0; iter < 10; ++iter) + + if (rs_params.mean() > 0.003) + { + using namespace std; + sout << "Average parameter gradient error is somewhat large at: "<< rs_params.mean() << endl; + return layer_test_results(sout.str()); + } + if (rs_data.mean() > 0.003) + { + using namespace std; + sout << "Average data gradient error is somewhat large at: "<< rs_data.mean() << endl; + return layer_test_results(sout.str()); + } + + return layer_test_results(); + } + + template < + typename layer_details_type + > + layer_test_results test_layer ( + layer_details_type l + ) + { + // Try a few different derivative step sizes to see if any work. + for (float base_eps = 0.0001; base_eps < 0.1; base_eps *= 2) + { + auto result = impl_test_layer(l, base_eps); + if (result) + return result; + } + // However, if none of the step sizes worked then try this one and probably result + // in returning an error. + return impl_test_layer(l, 0.01); + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <size_t i, size_t num> + struct vlp_loop + { + template <typename T, typename U> + static typename std::enable_if<!is_add_layer<U>::value>::type invoke_functor(T&& , size_t& , U&& ) + { + // intentionally left empty + } + + template <typename T, typename U> + static typename std::enable_if<is_add_layer<U>::value>::type invoke_functor(T&& v , size_t& comp_i, U&& l ) + { + v(comp_i, l.layer_details().get_layer_params()); + ++comp_i; + } + + template < + typename net_type, + typename visitor + > + static void visit( + size_t comp_i, + net_type& net, + visitor&& v + ) + { + invoke_functor(v, comp_i, layer<i>(net)); + vlp_loop<i+1, num>::visit(comp_i, net,v); + } + }; + + template <size_t num> + struct vlp_loop<num,num> + { + template < + typename net_type, + typename visitor + > + static void visit( + size_t, + net_type&, + visitor&& + ) + { + // Base case of recursion. Don't do anything. + } + }; + + } + + template < + typename net_type, + typename visitor + > + void visit_layer_parameters( + net_type& net, + visitor v + ) + { + size_t comp_i = 0; + impl::vlp_loop<0, net_type::num_layers>::visit(comp_i, net, v); + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <size_t i, size_t num> + struct vlpg_loop + { + template <typename T, typename U> + static typename std::enable_if<!is_add_layer<U>::value>::type invoke_functor(T&& , size_t& , U&& ) + { + // intentionally left empty + } + + template <typename T, typename U> + static typename std::enable_if<is_add_layer<U>::value>::type invoke_functor(T&& v , size_t& comp_i, U&& l ) + { + v(comp_i, l.get_parameter_gradient()); + ++comp_i; + } + + template < + typename net_type, + typename visitor + > + static void visit( + size_t comp_i, + net_type& net, + visitor&& v + ) + { + invoke_functor(v, comp_i, layer<i>(net)); + vlpg_loop<i+1, num>::visit(comp_i, net,v); + } + }; + + template <size_t num> + struct vlpg_loop<num,num> + { + template < + typename net_type, + typename visitor + > + static void visit( + size_t, + net_type&, + visitor&& + ) + { + // Base case of recursion. Don't do anything. 
+ } + }; + + } + + template < + typename net_type, + typename visitor + > + void visit_layer_parameter_gradients( + net_type& net, + visitor v + ) + { + size_t comp_i = 0; + impl::vlpg_loop<0, net_type::num_layers>::visit(comp_i, net, v); + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <size_t i, size_t num> + struct vl_loop + { + template < + typename net_type, + typename visitor + > + static void visit( + net_type& net, + visitor&& v + ) + { + v(i, layer<i>(net)); + vl_loop<i+1, num>::visit(net,v); + } + }; + + template <size_t num> + struct vl_loop<num,num> + { + template < + typename net_type, + typename visitor + > + static void visit( + net_type&, + visitor&& + ) + { + // Base case of recursion. Don't do anything. + } + }; + + template <size_t i, size_t num> + struct vl_loop_backwards + { + template < + typename net_type, + typename visitor + > + static void visit( + net_type& net, + visitor&& v + ) + { + vl_loop_backwards<i+1, num>::visit(net,v); + v(i, layer<i>(net)); + } + }; + + template <size_t num> + struct vl_loop_backwards<num,num> + { + template < + typename net_type, + typename visitor + > + static void visit( + net_type&, + visitor&& + ) + { + // Base case of recursion. Don't do anything. + } + }; + + } + + template < + typename net_type, + typename visitor + > + void visit_layers( + net_type& net, + visitor v + ) + { + impl::vl_loop<0, net_type::num_layers>::visit(net, v); + } + + template < + typename net_type, + typename visitor + > + void visit_layers_backwards( + net_type& net, + visitor v + ) + { + impl::vl_loop_backwards<0, net_type::num_layers>::visit(net, v); + } + + template < + size_t begin, + size_t end, + typename net_type, + typename visitor + > + void visit_layers_range( + net_type& net, + visitor v + ) + { + static_assert(begin <= end, "Invalid range"); + static_assert(end <= net_type::num_layers, "Invalid range"); + impl::vl_loop<begin,end>::visit(net, v); + } + + template < + size_t begin, + size_t end, + typename net_type, + typename visitor + > + void visit_layers_backwards_range( + net_type& net, + visitor v + ) + { + static_assert(begin <= end, "Invalid range"); + static_assert(end <= net_type::num_layers, "Invalid range"); + impl::vl_loop_backwards<begin,end>::visit(net, v); + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <size_t i, unsigned long tag_id> + struct vl_until_tag + { + template < + typename net_type, + typename next_net_type, + typename visitor + > + static void visit( + net_type& net, + next_net_type& next_net, + visitor&& v + ) + { + v(next_net); + vl_until_tag<i+1,tag_id>::visit(net,layer<i+1>(net),v); + } + + template < + typename net_type, + typename SUBNET, + typename visitor + > + static void visit( + net_type& net, + const add_tag_layer<tag_id,SUBNET>& next_net, + visitor&& v + ) + { + v(next_net); + } + + template < + typename net_type, + typename SUBNET, + typename visitor + > + static void visit( + net_type& net, + add_tag_layer<tag_id,SUBNET>& next_net, + visitor&& v + ) + { + v(next_net); + } + }; + } + + template < + unsigned long tag_id, + typename net_type, + typename visitor + > + void visit_layers_until_tag( + net_type& net, + visitor v + ) + { + impl::vl_until_tag<0,tag_id>::visit(net, net, v); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_CORE_H_ + + diff --git 
a/ml/dlib/dlib/dnn/core_abstract.h b/ml/dlib/dlib/dnn/core_abstract.h new file mode 100644 index 000000000..db168a88b --- /dev/null +++ b/ml/dlib/dlib/dnn/core_abstract.h @@ -0,0 +1,1700 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_DNn_CORE_ABSTRACT_H_ +#ifdef DLIB_DNn_CORE_ABSTRACT_H_ + +#include "tensor_abstract.h" +#include <memory> +#include <type_traits> +#include <tuple> +#include <vector> +#include "../rand.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename... T + > + auto tuple_tail( + const std::tuple<T...>& item + ); + /*! + ensures + - returns a tuple that contains everything in item except for tuple_head(item). + The items will be in the same order as they are in item, just without + tuple_head(item). + - This function will correctly handle nested tuples. + !*/ + + template <typename... T> + auto tuple_head ( + const std::tuple<T...>& item + ); + /*! + ensures + - returns a copy of the first thing in the tuple that isn't a std::tuple. + Essentially, this function calls std::get<0>() recursively on item until + a non-std::tuple object is found. + !*/ + +// ---------------------------------------------------------------------------------------- + + template <typename T> + double get_learning_rate_multiplier( + const T& obj + ); + /*! + ensures + - if (obj has a get_learning_rate_multiplier() member function) then + - returns obj.get_learning_rate_multiplier() + - else + - returns 1 + !*/ + + template <typename T> + double get_weight_decay_multiplier( + const T& obj + ); + /*! + ensures + - if (obj has a get_weight_decay_multiplier() member function) then + - returns obj.get_weight_decay_multiplier() + - else + - returns 1 + !*/ + +// ---------------------------------------------------------------------------------------- + + bool dnn_prefer_fastest_algorithms( + ); + /*! + ensures + - If dlib should prefer to use fast algorithms rather than ones that use less + RAM then this function returns true and false otherwise. + - On program startup this function will default to true. + !*/ + + void set_dnn_prefer_fastest_algorithms( + ); + /*! + ensures + - #dnn_prefer_fastest_algorithms() == true + !*/ + + void set_dnn_prefer_smallest_algorithms( + ); + /*! + ensures + - #dnn_prefer_fastest_algorithms() == false + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename T + > + class sstack + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a basic stack of T objects. It contains no data itself but simply + points to a memory range of T objects and allows you to access that block of + T objects as a stack. + !*/ + + public: + typedef T value_type; + + sstack() = delete; + + sstack ( + T* data, + size_t s + ); + /*! + ensures + - #size() == s + - #top() == *data + - #pop(i).top() == data[i] + !*/ + + const T& top( + ) const; + /*! + requires + - size() != 0 + ensures + - returns the top element of the stack. + !*/ + + T& top( + ); + /*! + requires + - size() != 0 + ensures + - returns the top element of the stack. + !*/ + + size_t size( + ) const; + /*! + ensures + - returns the number of elements in this stack. + !*/ + + sstack pop( + size_t num = 1 + ); + /*! + requires + - num <= size() + ensures + - returns a sub-stack S such that: + - S.size() == size()-num. + - S.top() is num elements down the stack.
+ !*/ + }; + + template < + typename T + > + sstack<T> make_sstack( + std::vector<T>& item + ) { return sstack<T>(item.data(), item.size()); } + /*! + ensures + - returns a sstack that sits on top of the given std::vector. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename LAYER_DETAILS, + typename SUBNET + > + class add_layer + { + /*! + REQUIREMENTS ON LAYER_DETAILS + - Must be a type that implements the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined in layers_abstract.h + + REQUIREMENTS ON SUBNET + - One of the following must be true: + - SUBNET implements the EXAMPLE_INPUT_LAYER interface defined in + input_abstract.h. + - SUBNET is an add_layer object. + - SUBNET is an add_tag_layer object. + - SUBNET is an add_skip_layer object. + - SUBNET is a repeat object. + + WHAT THIS OBJECT REPRESENTS + This object represents a deep neural network. In particular, it is a tool + for adding another layer on top of the neural network of type SUBNET, which + is specified as a template argument. The specific layer added is defined + by the LAYER_DETAILS details template argument. + !*/ + + public: + typedef LAYER_DETAILS layer_details_type; + typedef SUBNET subnet_type; + typedef typename subnet_type::input_type input_type; + // num_computational_layers will always give the number of layers in the network + // that transform tensors (i.e. layers defined by something that implements the + // EXAMPLE_COMPUTATIONAL_LAYER_ interface). This is all the layers except for + // loss, tag, and skip layers. + const static size_t num_computational_layers = subnet_type::num_computational_layers + 1; + // num_layers counts all the layers in the network regardless of their type. + const static size_t num_layers = subnet_type::num_layers + 1; + + add_layer( + ); + /*! + ensures + - default constructs all the layers in this network. + - #sample_expansion_factor() == 0 + !*/ + + add_layer(const add_layer&) = default; + add_layer(add_layer&&) = default; + add_layer& operator=(add_layer&&) = default; + add_layer& operator=(const add_layer&) = default; + /*! + ensures + - this object is copyable and movable. + !*/ + + template <typename T, typename U> + add_layer( + const add_layer<T,U>& item + ); + /*! + ensures + - This constructor allows you to copy neural network objects from one to + another as long as their corresponding layers can be constructed from + each other. + - #layer_details() == layer_details_type(item.layer_details()) + - #subnet() == subnet_type(item.subnet()) + - #sample_expansion_factor() == item.sample_expansion_factor() + !*/ + + template <typename ...T, typename LD, typename ...U> + add_layer( + const std::tuple<LD,U...>& layer_det, + T&& ...args + ); + /*! + ensures + - #layer_details() == layer_details_type(tuple_head(layer_det)) + - #subnet() == subnet_type(tuple_tail(layer_det),args) + - #sample_expansion_factor() == 0 + !*/ + + template <typename ...T> + add_layer( + const layer_details_type& layer_det, + T&& ...args + ); + /*! + ensures + - #layer_details() == layer_details_type(layer_det) + - #subnet() == subnet_type(args) + - #sample_expansion_factor() == 0 + !*/ + + template <typename ...T> + add_layer( + T&& ...args + ); + /*! + ensures + - This version of the constructor is only called if layer_details_type + can't be constructed from the first thing in args. In this case, the + args are simply passed on to the sub layers in their entirety. 
+ - #layer_details() == layer_details_type() + - #subnet() == subnet_type(args) + - #sample_expansion_factor() == 0 + !*/ + + template <typename ...T> + add_layer( + layer_details_type&& layer_det, + T&& ...args + ); + /*! + ensures + - #layer_details() == layer_det + - #subnet() == subnet_type(args) + - #sample_expansion_factor() == 0 + !*/ + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const; + /*! + requires + - [ibegin, iend) is an iterator range over input_type objects. + - std::distance(ibegin,iend) > 0 + ensures + - Converts the iterator range into a tensor and stores it into #data. + - #data.num_samples()%distance(ibegin,iend) == 0. + - #sample_expansion_factor() == #data.num_samples()/distance(ibegin,iend). + - #sample_expansion_factor() > 0 + - The data in the ith sample of #data corresponds to the input_type object + *(ibegin+i/#sample_expansion_factor()). + - Invokes data.async_copy_to_device() so that the data begins transferring + to the GPU device, if present. + - This function is implemented by calling the to_tensor() routine defined + at the input layer of this network. + !*/ + + unsigned int sample_expansion_factor ( + ) const; + /*! + ensures + - When to_tensor() is invoked on this network's input layer it converts N + input objects into M samples, all stored inside a resizable_tensor. It + is always the case that M is some integer multiple of N. + sample_expansion_factor() returns the value of this multiplier. To be + very specific, it is always true that M==I*N where I is some integer. + This integer I is what is returned by sample_expansion_factor(). + !*/ + + const subnet_type& subnet( + ) const; + /*! + ensures + - returns the immediate subnetwork of *this network. + !*/ + + subnet_type& subnet( + ); + /*! + ensures + - returns the immediate subnetwork of *this network. + !*/ + + const layer_details_type& layer_details( + ) const; + /*! + ensures + - returns the layer_details_type instance that defines the behavior of the + layer at the top of this network. I.e. returns the layer details that + defines the behavior of the layer nearest to the network output rather + than the input layer. + !*/ + + layer_details_type& layer_details( + ); + /*! + ensures + - returns the layer_details_type instance that defines the behavior of the + layer at the top of this network. I.e. returns the layer details that + defines the behavior of the layer nearest to the network output rather + than the input layer. + !*/ + + template <typename forward_iterator> + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ); + /*! + requires + - [ibegin, iend) is an iterator range over input_type objects. + - std::distance(ibegin,iend) > 0 + ensures + - runs [ibegin,iend) through the network and returns the results. + In particular, this function performs: + to_tensor(ibegin,iend,temp_tensor); + return forward(temp_tensor); + - The return value from this function is also available in #get_output(). + i.e. this function returns #get_output(). + - have_same_dimensions(#get_gradient_input(), #get_output()) == true. + - All elements of #get_gradient_input() are set to 0. + i.e. calling this function clears out #get_gradient_input() and ensures + it has the same dimensions as the most recent output. + !*/ + + const tensor& operator() ( + const input_type& x + ); + /*! + ensures + - runs a single x through the network and returns the output. + I.e. 
returns (*this)(&x, &x+1);
+        !*/
+
+        const tensor& forward(
+            const tensor& x
+        );
+        /*!
+            requires
+                - sample_expansion_factor() != 0
+                  (i.e. to_tensor() must have been called to set sample_expansion_factor()
+                  to something non-zero.)
+                - x.num_samples()%sample_expansion_factor() == 0
+                - x.num_samples() > 0
+            ensures
+                - Runs x through the network and returns the results.  In particular, this
+                  function performs the equivalent of:
+                    subnet().forward(x);
+                    if (this is the first time forward() has been called) then
+                        layer_details().setup(subnet());
+                    layer_details().forward(subnet(), get_output());
+                - The return value from this function is also available in #get_output().
+                  i.e. this function returns #get_output().
+                - have_same_dimensions(#get_gradient_input(), #get_output()) == true
+                - All elements of #get_gradient_input() are set to 0.
+                  i.e. calling this function clears out #get_gradient_input() and ensures
+                  it has the same dimensions as the most recent output.
+        !*/
+
+        const tensor& get_output(
+        ) const;
+        /*!
+            ensures
+                - returns the output for the last tensor that was run through the network.
+                  If nothing has been run through the network yet then returns an empty
+                  tensor.
+        !*/
+
+        tensor& get_gradient_input(
+        );
+        /*!
+            ensures
+                - returns the error gradient for this network.  That is, this is the error
+                  gradient that this network will use to compute parameter gradients when
+                  back_propagate_error() is called.  Therefore, when performing back
+                  propagation, layers that sit on top of this network layer write their
+                  back-propagated error gradients into get_gradient_input().  Or to put it
+                  another way, during back-propagation, layers take the contents of their
+                  get_gradient_input() and back-propagate it through themselves and store
+                  the result into their subnetwork's get_gradient_input().
+
+                  This means you should consider get_gradient_input() as an input to the
+                  back_propagate_error() method.
+        !*/
+
+        const tensor& get_final_data_gradient(
+        ) const;
+        /*!
+            ensures
+                - if back_propagate_error() has been called to back-propagate a gradient
+                  through this network then you can call get_final_data_gradient() to
+                  obtain the last data gradient computed.  That is, this function returns
+                  the gradient of the network with respect to its inputs.
+                - Note that there is only one "final data gradient" for an entire network,
+                  not one per layer, since there is only one input to the entire network.
+        !*/
+
+        const tensor& get_parameter_gradient(
+        ) const;
+        /*!
+            ensures
+                - if back_propagate_error() has been called then you can call
+                  get_parameter_gradient() to find the gradient of this layer's parameters.
+                  When we update the parameters by calling update_parameters(), it will use
+                  the gradient in get_parameter_gradient() to perform the update.
+                  Therefore, you should consider get_parameter_gradient() as an input to
+                  update_parameters().
+        !*/
+
+        tensor& get_parameter_gradient (
+        );
+        /*!
+            ensures
+                - returns a non-const reference to the tensor returned by the above
+                  get_parameter_gradient() method.  You could use this method to modify the
+                  parameter gradient in some way before invoking update_parameters().
+        !*/
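+
+        // A bare-bones manual training step might look like the sketch below ("net",
+        // "x", and "solvers" are illustrative names; most programs should use
+        // dnn_trainer rather than calling these members directly):
+        //   net.forward(x);
+        //   ... write the loss gradient into net.get_gradient_input() ...
+        //   net.back_propagate_error(x);
+        //   net.update_parameters(make_sstack(solvers), 0.01);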
+
+        void back_propagate_error(
+            const tensor& x
+        );
+        /*!
+            requires
+                - forward(x) was called to forward propagate x through the network.
+                  Moreover, this was the most recent call to forward() and x has not been
+                  subsequently modified in any way.
+                - get_gradient_input() has been set equal to the gradient of this network's
+                  output with respect to some loss function.
+            ensures
+                - Back propagates the error gradient, get_gradient_input(), through this
+                  network and computes parameter and data gradients, via backpropagation.
+                  Specifically, this function populates get_final_data_gradient() and also,
+                  for each layer, the tensor returned by get_parameter_gradient().
+                - All elements of #get_gradient_input() are set to 0.
+                - have_same_dimensions(#get_final_data_gradient(), x) == true.
+                - have_same_dimensions(#get_parameter_gradient(), layer_details().get_layer_params()) == true.
+                - #get_final_data_gradient() contains the gradient of the network with
+                  respect to x.
+        !*/
+
+        void back_propagate_error(
+            const tensor& x,
+            const tensor& gradient_input
+        );
+        /*!
+            requires
+                - forward(x) was called to forward propagate x through the network.
+                  Moreover, this was the most recent call to forward() and x has not been
+                  subsequently modified in any way.
+                - have_same_dimensions(gradient_input, get_output()) == true
+            ensures
+                - This function is identical to the version of back_propagate_error()
+                  defined immediately above except that it back-propagates gradient_input
+                  through the network instead of get_gradient_input().  Therefore, this
+                  version of back_propagate_error() is equivalent to performing:
+                    get_gradient_input() = gradient_input;
+                    back_propagate_error(x);
+                  Except that calling back_propagate_error(x,gradient_input) avoids the
+                  copy and is therefore slightly more efficient.
+                - All elements of #get_gradient_input() are set to 0.
+                - have_same_dimensions(#get_final_data_gradient(), x) == true.
+                - have_same_dimensions(#get_parameter_gradient(), layer_details().get_layer_params()) == true.
+                - #get_final_data_gradient() contains the gradient of the network with
+                  respect to x.
+        !*/
+
+        template <typename solver_type>
+        void update_parameters(
+            sstack<solver_type> solvers,
+            double learning_rate
+        );
+        /*!
+            requires
+                - solver_type is an implementation of the EXAMPLE_SOLVER interface defined
+                  in solvers_abstract.h
+                - back_propagate_error() has been called.
+                - The given solvers have only ever been used with this network.  That is,
+                  if you want to call update_parameters() on some other neural network
+                  object then you must NOT reuse the same solvers object.
+                - solvers.size() >= num_computational_layers
+                - 0 < learning_rate <= 1
+            ensures
+                - Updates all the parameters in the network.  In particular, we pass each
+                  layer's parameter gradient (i.e. the tensor returned by the layer's
+                  get_parameter_gradient() member) through that layer's corresponding
+                  solver object.  This produces a parameter delta vector which we add to
+                  the layer's parameters.
+                - The solvers use the given learning rate.
+        !*/
+
+        void clean(
+        );
+        /*!
+            ensures
+                - Causes the network to forget about everything but its parameters.
+                  That is, for each layer we will have:
+                    - get_output().num_samples() == 0
+                    - get_gradient_input().num_samples() == 0
+                  However, running new input data through this network will still produce
+                  the same output it would have produced regardless of any calls to
+                  clean().  The purpose of clean() is to compact the network object prior
+                  to saving it to disk so that it takes up less space and the IO is
+                  quicker.
+                - This also calls the .clean() method on any layer details objects that
+                  define a .clean() method.
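+
+                  For example, a common pattern before saving a network to disk is
+                  (a sketch; "net.dat" is just an illustrative file name):
+                    net.clean();
+                    serialize("net.dat") << net;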
+ !*/ + + }; + + template <typename T, typename U> + std::ostream& operator<<(std::ostream& out, const add_layer<T,U>& item); + /*! + prints the network architecture to the given output stream. + !*/ + + template <typename T, typename U> + void serialize(const add_layer<T,U>& item, std::ostream& out); + template <typename T, typename U> + void deserialize(add_layer<T,U>& item, std::istream& in); + /*! + provides serialization support + !*/ + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + class no_label_type; + + template < + typename LOSS_DETAILS, + typename SUBNET + > + class add_loss_layer + { + /*! + REQUIREMENTS ON LOSS_DETAILS + - Must be a type that implements the EXAMPLE_LOSS_LAYER_ interface defined + in loss_abstract.h + + REQUIREMENTS ON SUBNET + - One of the following must be true: + - SUBNET is an add_layer object. + - SUBNET is an add_tag_layer object. + - SUBNET is an add_skip_layer object. + - SUBNET is a repeat object. + + WHAT THIS OBJECT REPRESENTS + This object represents a deep neural network. In particular, it is a tool + for adding a loss layer on top of the neural network of type SUBNET, which + is specified as a template argument. The specific layer added is defined + by the LOSS_DETAILS details template argument. Importantly, a loss layer + is the last layer in a deep neural network. So once it is added you can't + add any other layers of any type. + !*/ + + public: + typedef LOSS_DETAILS loss_details_type; + typedef SUBNET subnet_type; + typedef typename subnet_type::input_type input_type; + const static size_t num_computational_layers = subnet_type::num_computational_layers; + const static size_t num_layers = subnet_type::num_layers + 1; + // If LOSS_DETAILS is an unsupervised loss then training_label_type==no_label_type. + // Otherwise it is defined as follows: + typedef typename LOSS_DETAILS::training_label_type training_label_type; + // Similarly, if LOSS_DETAILS doesn't provide any output conversion then + // output_label_type==no_label_type. + typedef typename LOSS_DETAILS::output_label_type output_label_type; + + + + add_loss_layer() = default; + /*! + ensures + - default constructs all the layers in this network. + !*/ + + add_loss_layer(const add_loss_layer&) = default; + add_loss_layer(add_loss_layer&&) = default; + add_loss_layer& operator=(add_loss_layer&&) = default; + add_loss_layer& operator=(const add_loss_layer&) = default; + /*! + ensures + - this object is copyable and movable. + !*/ + + template <typename T, typename U> + add_loss_layer( + const add_loss_layer<T,U>& item + ); + /*! + ensures + - This constructor allows you to copy neural network objects from one to + another as long as their corresponding layers can be constructed from + each other. + - #loss_details() == loss_details_type(item.loss_details()) + - #subnet() == subnet_type(item.subnet()) + !*/ + + template <typename ...T> + add_loss_layer( + const LOSS_DETAILS& layer_det, + T&& ...args + ); + /*! + ensures + - #loss_details() == loss_details_type(layer_det) + - #subnet() == subnet_type(args) + !*/ + + template <typename ...T> + add_loss_layer( + LOSS_DETAILS&& layer_det, + T&& ...args + ); + /*! 
+ ensures + - #loss_details() == loss_details_type(layer_det) + - #subnet() == subnet_type(args) + !*/ + + template <typename ...T> + add_loss_layer( + T&& ...args + ); + /*! + ensures + - This version of the constructor is only called if loss_details_type can't + be constructed from the first thing in args. In this case, the args are + simply passed on to the sub layers in their entirety. + - #loss_details() == loss_details_type() + - #subnet() == subnet_type(args) + !*/ + + const subnet_type& subnet( + ) const; + /*! + ensures + - returns the immediate subnetwork of *this network. + !*/ + + subnet_type& subnet( + ); + /*! + ensures + - returns the immediate subnetwork of *this network. + !*/ + + const loss_details_type& loss_details( + ) const; + /*! + ensures + - returns the loss_details_type instance that defines the behavior of the + loss layer used by this network. + !*/ + + loss_details_type& loss_details( + ); + /*! + ensures + - returns the loss_details_type instance that defines the behavior of the + loss layer used by this network. + !*/ + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const; + /*! + requires + - [ibegin, iend) is an iterator range over input_type objects. + - std::distance(ibegin,iend) > 0 + ensures + - Converts the iterator range into a tensor and stores it into #data. + - #data.num_samples()%distance(ibegin,iend) == 0. + - #sample_expansion_factor() == #data.num_samples()/distance(ibegin,iend). + - #sample_expansion_factor() > 0 + - The data in the ith sample of #data corresponds to the input_type object + *(ibegin+i/sample_expansion_factor()). + - Invokes data.async_copy_to_device() so that the data begins transferring + to the GPU device, if present. + - This function is implemented by calling the to_tensor() routine defined + at the input layer of this network. + !*/ + + unsigned int sample_expansion_factor ( + ) const; + /*! + ensures + - When to_tensor() is invoked on this network's input layer it converts N + input objects into M samples, all stored inside a resizable_tensor. It + is always the case that M is some integer multiple of N. + sample_expansion_factor() returns the value of this multiplier. To be + very specific, it is always true that M==I*N where I is some integer. + This integer I is what is returned by sample_expansion_factor(). + !*/ + + // ------------- + + template <typename output_iterator> + void operator() ( + const tensor& x, + output_iterator obegin + ); + /*! + requires + - sample_expansion_factor() != 0 + (i.e. to_tensor() must have been called to set sample_expansion_factor() + to something non-zero.) + - x.num_samples()%sample_expansion_factor() == 0 + - x.num_samples() > 0 + - obegin == iterator pointing to the start of a range of + x.num_samples()/sample_expansion_factor() output_label_type elements. + ensures + - runs x through the network and writes the output to the range at obegin. + - loss_details().to_label() is used to write the network output into + obegin. + !*/ + + template <typename forward_iterator, typename label_iterator> + void operator() ( + forward_iterator ibegin, + forward_iterator iend, + label_iterator obegin + ); + /*! + requires + - [ibegin, iend) is an iterator range over input_type objects. + - std::distance(ibegin,iend) > 0 + - obegin == iterator pointing to the start of a range of + std::distance(ibegin,iend) output_label_type elements. 
+            ensures
+                - runs [ibegin,iend) through the network and writes the output to the range
+                  at obegin.
+                - loss_details().to_label() is used to write the network output into
+                  obegin.
+        !*/
+
+        // -------------
+
+        const output_label_type& operator() (
+            const input_type& x
+        );
+        /*!
+            ensures
+                - runs a single object, x, through the network and returns the output.
+                - loss_details().to_label() is used to convert the network output into an
+                  output_label_type.
+        !*/
+
+        template <typename iterable_type>
+        std::vector<output_label_type> operator() (
+            const iterable_type& data,
+            size_t batch_size = 128
+        );
+        /*!
+            requires
+                - batch_size > 0
+                - data must have a .begin() and .end() that supply iterators over a
+                  sequence of input_type elements.  E.g. data could have a type of
+                  std::vector<input_type>
+            ensures
+                - runs all the objects in data through the network and returns their
+                  predicted labels.  This means this function returns a vector V such that:
+                    - V.size() == data.size()
+                    - for all valid i: V[i] == the predicted label of data[i].
+                - Elements of data are run through the network in batches of batch_size
+                  items.  Using a batch_size > 1 can be faster because it better exploits
+                  the available hardware parallelism.
+                - loss_details().to_label() is used to convert the network output into an
+                  output_label_type.
+        !*/
+
+        template <typename ...T>
+        const output_label_type& process (
+            const input_type& x,
+            T&& ...args
+        );
+        /*!
+            ensures
+                - This function is just like (*this)(x), i.e. it runs a single object, x,
+                  through the network and returns the output.  But we additionally pass the
+                  given args to loss_details().to_label() as the 4th argument (or more,
+                  depending on how many things are in args) when converting the network
+                  output to an output_label_type.  This is useful, for instance, with loss
+                  layers like loss_mmod_ which has an optional adjust_threshold argument to
+                  to_label() that adjusts the detection threshold.  Therefore, for such
+                  networks you could call them like: net.process(some_image, -0.5), and -0.5
+                  would be passed as the adjust_threshold argument to to_label().
+        !*/
+
+        template <typename iterable_type, typename ...T>
+        std::vector<output_label_type> process_batch (
+            const iterable_type& data,
+            size_t batch_size,
+            T&& ...args
+        );
+        /*!
+            requires
+                - batch_size > 0
+                - data must have a .begin() and .end() that supply iterators over a
+                  sequence of input_type elements.  E.g. data could have a type of
+                  std::vector<input_type>
+            ensures
+                - This function is just like (*this)(data,batch_size), i.e. it runs a
+                  bunch of objects through the network and returns the outputs.  But we
+                  additionally pass the given args to loss_details().to_label() as the 4th
+                  argument (or more, depending on how many things are in args) when
+                  converting the network output to output_label_types.  This is useful,
+                  for instance, with loss layers like loss_mmod_ which has an optional
+                  adjust_threshold argument to to_label() that adjusts the detection
+                  threshold.  Therefore, for such networks you could call them like:
+                  net.process_batch(std::vector<image_type>({some_image, another_image}), 128, -0.5),
+                  and -0.5 would be passed as the adjust_threshold argument to to_label().
+        !*/
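+
+        // For instance, assuming a detection network whose loss layer's to_label()
+        // takes an adjust_threshold argument ("net" and "img" are illustrative names):
+        //   auto dets = net.process(img, -0.5);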
+
+        // -------------
+
+        template <typename label_iterator>
+        double compute_loss (
+            const tensor& x,
+            label_iterator lbegin
+        );
+        /*!
+            requires
+                - sample_expansion_factor() != 0
+                  (i.e. to_tensor() must have been called to set sample_expansion_factor()
+                  to something non-zero.)
+                - x.num_samples()%sample_expansion_factor() == 0
+                - x.num_samples() > 0
+                - lbegin == iterator pointing to the start of a range of
+                  x.num_samples()/sample_expansion_factor() training_label_type elements.
+            ensures
+                - runs x through the network, compares the output to the expected output
+                  pointed to by lbegin, and returns the resulting loss.
+                - for all valid k:
+                    - the expected label of the kth sample in x is *(lbegin+k/sample_expansion_factor()).
+                - This function does not update the network parameters.
+        !*/
+
+        template <typename forward_iterator, typename label_iterator>
+        double compute_loss (
+            forward_iterator ibegin,
+            forward_iterator iend,
+            label_iterator lbegin
+        );
+        /*!
+            requires
+                - [ibegin, iend) is an iterator range over input_type objects.
+                - std::distance(ibegin,iend) > 0
+                - lbegin == iterator pointing to the start of a range of
+                  std::distance(ibegin,iend) training_label_type elements.
+            ensures
+                - runs [ibegin,iend) through the network, compares the output to the
+                  expected output pointed to by lbegin, and returns the resulting loss.
+                - for all valid k:
+                    - the expected label of *(ibegin+k) is *(lbegin+k).
+                - This function does not update the network parameters.
+        !*/
+
+        // -------------
+
+        double compute_loss (
+            const tensor& x
+        );
+        /*!
+            requires
+                - LOSS_DETAILS is an unsupervised loss.  i.e. training_label_type==no_label_type.
+                - sample_expansion_factor() != 0
+                  (i.e. to_tensor() must have been called to set sample_expansion_factor()
+                  to something non-zero.)
+                - x.num_samples()%sample_expansion_factor() == 0
+                - x.num_samples() > 0
+            ensures
+                - runs x through the network and returns the resulting loss.
+                - This function does not update the network parameters.
+        !*/
+
+        template <typename forward_iterator>
+        double compute_loss (
+            forward_iterator ibegin,
+            forward_iterator iend
+        );
+        /*!
+            requires
+                - LOSS_DETAILS is an unsupervised loss.  i.e. training_label_type==no_label_type.
+                - [ibegin, iend) is an iterator range over input_type objects.
+                - std::distance(ibegin,iend) > 0
+            ensures
+                - runs [ibegin,iend) through the network and returns the resulting loss.
+                - This function does not update the network parameters.
+        !*/
+
+        // -------------
+
+        template <typename label_iterator>
+        double compute_parameter_gradients (
+            const tensor& x,
+            label_iterator lbegin
+        );
+        /*!
+            requires
+                - sample_expansion_factor() != 0
+                  (i.e. to_tensor() must have been called to set sample_expansion_factor()
+                  to something non-zero.)
+                - x.num_samples()%sample_expansion_factor() == 0
+                - x.num_samples() > 0
+                - lbegin == iterator pointing to the start of a range of
+                  x.num_samples()/sample_expansion_factor() training_label_type elements.
+            ensures
+                - runs x through the network, compares the output to the expected output
+                  pointed to by lbegin, and computes parameter and data gradients with
+                  respect to the loss, via backpropagation.  Specifically, this function
+                  updates get_final_data_gradient() and also, for each layer, the tensor
+                  returned by get_parameter_gradient().
+                - for all valid k:
+                    - the expected label of the kth sample in x is *(lbegin+k/sample_expansion_factor()).
+                - returns compute_loss(x,lbegin)
+        !*/
+
+        template <typename forward_iterator, typename label_iterator>
+        double compute_parameter_gradients (
+            forward_iterator ibegin,
+            forward_iterator iend,
+            label_iterator lbegin
+        );
+        /*!
+            requires
+                - [ibegin, iend) is an iterator range over input_type objects.
+ - std::distance(ibegin,iend) > 0 + - lbegin == iterator pointing to the start of a range of + std::distance(ibegin,iend) training_label_type elements. + ensures + - runs [ibegin,iend) through the network, compares the output to the + expected output pointed to by lbegin, and computes parameter and data + gradients with respect to the loss, via backpropagation. Specifically, + this function updates get_final_data_gradient() and also, for each layer, + the tensor returned by get_parameter_gradient(). + - for all valid k: + - the expected label of *(ibegin+k) is *(lbegin+k). + - returns compute_loss(ibegin,iend,lbegin) + !*/ + + double compute_parameter_gradients ( + const tensor& x + ); + /*! + requires + - LOSS_DETAILS is an unsupervised loss. i.e. training_label_type==no_label_type. + - sample_expansion_factor() != 0 + (i.e. to_tensor() must have been called to set sample_expansion_factor() + to something non-zero.) + - x.num_samples()%sample_expansion_factor() == 0 + - x.num_samples() > 0 + ensures + - runs x through the network and computes parameter and data gradients with + respect to the loss, via backpropagation. Specifically, this function + updates get_final_data_gradient() and also, for each layer, the tensor + returned by get_parameter_gradient(). + - returns compute_loss(x) + !*/ + + template <typename forward_iterator> + double compute_parameter_gradients ( + forward_iterator ibegin, + forward_iterator iend + ); + /*! + requires + - LOSS_DETAILS is an unsupervised loss. i.e. training_label_type==no_label_type. + - [ibegin, iend) is an iterator range over input_type objects. + - std::distance(ibegin,iend) > 0 + ensures + - runs [ibegin,iend) through the network and computes parameter and data + gradients with respect to the loss, via backpropagation. Specifically, + this function updates get_final_data_gradient() and also, for each layer, + the tensor returned by get_parameter_gradient(). + - returns compute_loss(ibegin,iend) + !*/ + + template <typename solver_type> + void update_parameters ( + sstack<solver_type> solvers, + double learning_rate + ); + /*! + requires + - solver_type is an implementation of the EXAMPLE_SOLVER interface defined + in solvers_abstract.h + - compute_parameter_gradients() has been called. + - The given solvers have only ever been used with this network. That + is, if you want to call update_parameters() on some other neural network + object then you must NOT reuse the same solvers object. + - solvers.size() >= num_computational_layers + - 0 < learning_rate <= 1 + ensures + - Updates all the parameters in the network. In particular, we pass each + layer's parameter gradient (i.e. the tensor returned by the layer's + get_parameter_gradient() member) through that layer's corresponding + solver object. This produces a parameter delta vector which we add to + the layer's parameters. + - The solvers use the given learning rate. + !*/ + + // ------------- + + void clean ( + ); + /*! + ensures + - Causes the network to forget about everything but its parameters. + - invokes subnet().clean() + !*/ + }; + + template <typename T, typename U> + std::ostream& operator<<(std::ostream& out, const add_loss_layer<T,U>& item); + /*! + prints the network architecture to the given output stream. + !*/ + + template <typename T, typename U> + void serialize(const add_loss_layer<T,U>& item, std::ostream& out); + template <typename T, typename U> + void deserialize(add_loss_layer<T,U>& item, std::istream& in); + /*! 
+        provides serialization support
+    !*/
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    template <typename ...T>
+    decorator_repeat_group<T...> repeat_group (
+        T&& ...args
+    );
+    /*!
+        ensures
+            - Decorates a group of variables.  This is essentially like std::make_tuple()
+              except its only purpose is to group variables together so they can be passed
+              to the repeat object's constructor.
+    !*/
+
+    template <
+        size_t num,
+        template<typename> class REPEATED_LAYER,
+        typename SUBNET
+        >
+    class repeat
+    {
+        /*!
+            REQUIREMENTS ON num
+                - num > 0
+
+            REQUIREMENTS ON REPEATED_LAYER
+                - REPEATED_LAYER must be a template that stacks more layers onto a deep neural
+                  network.  For example, if net_type were a network without a loss layer,
+                  then it should be legal to create a deeper network with a type of
+                  REPEATED_LAYER<net_type>.
+
+            REQUIREMENTS ON SUBNET
+                - One of the following must be true:
+                    - SUBNET is an add_layer object.
+                    - SUBNET is an add_tag_layer object.
+                    - SUBNET is an add_skip_layer object.
+                    - SUBNET is a repeat object.
+
+            WHAT THIS OBJECT REPRESENTS
+                This object adds more layers to a deep neural network.  In particular, it
+                adds REPEATED_LAYER on top of SUBNET num times.  So for example, if num were 2 then
+                repeat<2,REPEATED_LAYER,SUBNET> would create a network equivalent to REPEATED_LAYER<REPEATED_LAYER<SUBNET>>.
+
+                Also, this object provides an interface identical to the one defined by the
+                add_layer object except that we add the num_repetitions() and
+                get_repeated_layer() methods.  These additions are shown below along with
+                some additional explanatory comments.
+        !*/
+
+    public:
+
+        typedef SUBNET subnet_type;
+        typedef typename SUBNET::input_type input_type;
+        const static size_t num_computational_layers = (REPEATED_LAYER<SUBNET>::num_computational_layers-SUBNET::num_computational_layers)*num + SUBNET::num_computational_layers;
+        const static size_t num_layers = (REPEATED_LAYER<SUBNET>::num_layers-SUBNET::num_layers)*num + SUBNET::num_layers;
+        typedef REPEATED_LAYER<an_unspecified_input_type> repeated_layer_type;
+
+        template <typename T, typename ...U>
+        repeat(
+            T arg1,
+            U ...args2
+        );
+        /*!
+            ensures
+                - arg1 is used to initialize the num_repetitions() copies of REPEATED_LAYER inside
+                  this object.  That is, all the REPEATED_LAYER elements are initialized identically
+                  by being given copies of arg1.
+                - The rest of the arguments to the constructor, i.e. args2, are passed to
+                  SUBNET's constructor.
+        !*/
+
+        template <typename ...T, typename ...U>
+        repeat(
+            decorator_repeat_group<T...>&& arg1,
+            U ...args2
+        );
+        /*!
+            ensures
+                - arg1 is used to initialize the num_repetitions() copies of REPEATED_LAYER inside
+                  this object.  That is, all the REPEATED_LAYER elements are initialized identically
+                  by being given copies of an undecorated arg1.
+                - The rest of the arguments to the constructor, i.e. args2, are passed to
+                  SUBNET's constructor.
+        !*/
+
+        size_t num_repetitions (
+        ) const;
+        /*!
+            ensures
+                - returns num (i.e. the number of times REPEATED_LAYER was stacked on top of SUBNET)
+        !*/
+
+        const repeated_layer_type& get_repeated_layer (
+            size_t i
+        ) const;
+        /*!
+            requires
+                - i < num_repetitions()
+            ensures
+                - returns a reference to the i-th instance of REPEATED_LAYER.
For example, + get_repeated_layer(0) returns the instance of REPEATED_LAYER that is on the top of + the network while get_repeated_layer(num_repetitions()-1) returns the + instance of REPEATED_LAYER that is stacked immediately on top of SUBNET. + !*/ + + repeated_layer_type& get_repeated_layer ( + size_t i + ); + /*! + requires + - i < num_repetitions() + ensures + - returns a reference to the i-th instance of REPEATED_LAYER. For example, + get_repeated_layer(0) returns the instance of REPEATED_LAYER that is on the top of + the network while get_repeated_layer(num_repetitions()-1) returns the + instance of REPEATED_LAYER that is stacked immediately on top of SUBNET. + !*/ + + const subnet_type& subnet( + ) const; + /*! + ensures + - returns the SUBNET base network that repeat sits on top of. If you want + to access the REPEATED_LAYER components then you must use get_repeated_layer(). + !*/ + + subnet_type& subnet( + ); + /*! + ensures + - returns the SUBNET base network that repeat sits on top of. If you want + to access the REPEATED_LAYER components then you must use get_repeated_layer(). + !*/ + }; + + template < size_t num, template<typename> class T, typename U > + std::ostream& operator<<(std::ostream& out, const repeat<num,T,U>& item); + /*! + prints the network architecture to the given output stream. + !*/ + + template < size_t num, template<typename> class T, typename U > + void serialize(const repeat<num,T,U>& item, std::ostream& out); + template < size_t num, template<typename> class T, typename U > + void deserialize(repeat<num,T,U>& item, std::istream& in); + /*! + provides serialization support + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + unsigned long ID, + typename SUBNET + > + class add_tag_layer + { + /*! + REQUIREMENTS ON SUBNET + - One of the following must be true: + - SUBNET implements the EXAMPLE_INPUT_LAYER interface defined in + input_abstract.h. + - SUBNET is an add_layer object. + - SUBNET is an add_tag_layer object. + - SUBNET is an add_skip_layer object. + - SUBNET is a repeat object. + + WHAT THIS OBJECT REPRESENTS + This object adds a new layer to a deep neural network. However, this layer + simply performs the identity transform. This means it is a no-op and its + presence does not change the behavior of the network. It exists solely to + be used by add_skip_layer to reference a particular part of a network. + + Also, this object provides an interface identical to the one defined by the + add_layer object. + !*/ + }; + + template <unsigned long ID, typename U> + std::ostream& operator<<(std::ostream& out, const add_tag_layer<ID,U>& item); + /*! + prints the network architecture to the given output stream. + !*/ + + template <unsigned long ID, typename U> + void serialize(const add_tag_layer<ID,U>& item, std::ostream& out); + template <unsigned long ID, typename U> + void deserialize(add_tag_layer<ID,U>& item, std::istream& in); + /*! 
+ provides serialization support + !*/ + + template <typename SUBNET> using tag1 = add_tag_layer< 1, SUBNET>; + template <typename SUBNET> using tag2 = add_tag_layer< 2, SUBNET>; + template <typename SUBNET> using tag3 = add_tag_layer< 3, SUBNET>; + template <typename SUBNET> using tag4 = add_tag_layer< 4, SUBNET>; + template <typename SUBNET> using tag5 = add_tag_layer< 5, SUBNET>; + template <typename SUBNET> using tag6 = add_tag_layer< 6, SUBNET>; + template <typename SUBNET> using tag7 = add_tag_layer< 7, SUBNET>; + template <typename SUBNET> using tag8 = add_tag_layer< 8, SUBNET>; + template <typename SUBNET> using tag9 = add_tag_layer< 9, SUBNET>; + template <typename SUBNET> using tag10 = add_tag_layer<10, SUBNET>; + + template <template<typename SUBNET> class tag> + struct tag_id + { + /*! + REQUIREMENTS ON tag + Tag should be an add_tag_layer template such as tag1, tag2, etc. + + WHAT THIS OBJECT REPRESENTS + This is a tool for finding the numeric ID of a tag layer. For example, + tag_id<tag3>::id == 3. + !*/ + + const static unsigned long id; + }; + +// ---------------------------------------------------------------------------------------- + + template < + template<typename> class TAG_TYPE, + typename SUBNET + > + class add_skip_layer + { + /*! + REQUIREMENTS ON SUBNET + - One of the following must be true: + - SUBNET is an add_layer object. + - SUBNET is an add_tag_layer object. + - SUBNET is an add_skip_layer object. + - SUBNET is a repeat object. + + WHAT THIS OBJECT REPRESENTS + This object adds a new layer to a deep neural network which draws its + inputs from layer<TAG_TYPE>(subnet()) and performs the identity transform. + + Also, this object provides an interface identical to the one defined by the + add_layer object. + !*/ + }; + + template <template<typename> class T, typename U> + std::ostream& operator<<(std::ostream& out, const add_skip_layer<T,U>& item); + /*! + prints the network architecture to the given output stream. + !*/ + + template <template<typename> class T, typename U> + void serialize(const add_skip_layer<T,U>& item, std::ostream& out); + template <template<typename> class T, typename U> + void deserialize(add_skip_layer<T,U>& item, std::istream& in); + /*! + provides serialization support + !*/ + + template <typename SUBNET> using skip1 = add_skip_layer< tag1, SUBNET>; + template <typename SUBNET> using skip2 = add_skip_layer< tag2, SUBNET>; + template <typename SUBNET> using skip3 = add_skip_layer< tag3, SUBNET>; + template <typename SUBNET> using skip4 = add_skip_layer< tag4, SUBNET>; + template <typename SUBNET> using skip5 = add_skip_layer< tag5, SUBNET>; + template <typename SUBNET> using skip6 = add_skip_layer< tag6, SUBNET>; + template <typename SUBNET> using skip7 = add_skip_layer< tag7, SUBNET>; + template <typename SUBNET> using skip8 = add_skip_layer< tag8, SUBNET>; + template <typename SUBNET> using skip9 = add_skip_layer< tag9, SUBNET>; + template <typename SUBNET> using skip10 = add_skip_layer<tag10, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + unsigned int i, + typename net_type + > + auto& layer ( + net_type& n + ); + /*! + requires + - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or + add_tag_layer. + - i < net_type::num_layers + ensures + - This function allows you to access any layer in a network by its layer index + i. Therefore, it will walk i steps down the network and return the layer + object there. 
Since networks can be big, the best way to find layer index
+              numbers is to print a network to the screen since the print out will include
+              indexes for each layer.
+            - In general, this function chains together i calls to n.subnet() and returns
+              the result.  So for example:
+                - if (i == 0)
+                    - returns n
+                - else if (i == 1)
+                    - returns n.subnet()
+                - else if (i == 2)
+                    - returns n.subnet().subnet()
+                - else if (i == 3)
+                    - returns n.subnet().subnet().subnet()
+                - else
+                    - etc.
+              Except that when it hits a repeat layer it recurses into the repeated layers
+              contained inside.  That is, if the layer index indicates a layer in a repeat
+              object this function will make the appropriate call to get_repeated_layer()
+              and do the right thing.
+    !*/
+
+    template <
+        template<typename> class Match,
+        typename net_type
+        >
+    auto& layer (
+        net_type& n
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+        ensures
+            - returns the first layer in n that is of type Match.  E.g. if net_type is
+              fc<relu<fc<input<sample_type>>>> then calling layer<relu>(n) would return
+              layer<1>(n), that is, a reference to the relu layer.
+    !*/
+
+    template <
+        template<typename> class Match,
+        unsigned int i,
+        typename net_type
+        >
+    auto& layer (
+        net_type& n
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+        ensures
+            - returns layer<i>(layer<Match>(n))
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename net_type>
+    auto& input_layer (
+        net_type& net
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+        ensures
+            - returns the input layer of the given network object.  Specifically, this
+              function is equivalent to calling:
+                layer<net_type::num_layers-1>(net);
+              That is, you get the input layer details object for the network.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename net_type,
+        typename visitor
+        >
+    void visit_layer_parameters(
+        net_type& net,
+        visitor v
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+            - v is a function object with a signature equivalent to:
+                v(size_t idx, tensor& t)
+        ensures
+            - Loops over all the computational layers (i.e. layers with parameters, as
+              opposed to loss, tag, or input layers) in net and passes their parameters to
+              v().  To be specific, this function essentially performs the following:
+
+                size_t computational_layer_idx = 0;
+                for (size_t i = 0; i < net_type::num_layers; ++i)
+                {
+                    if (layer<i>(net) is a computational layer)
+                    {
+                        v(computational_layer_idx, layer<i>(net).layer_details().get_layer_params());
+                        ++computational_layer_idx;
+                    }
+                }
+            - When v() is called, the first argument is always < net_type::num_computational_layers.
+    !*/
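+
+    // For example, one way to count the total number of parameters in a network
+    // (a sketch; "net" is an illustrative, already-constructed network object):
+    //   size_t count = 0;
+    //   visit_layer_parameters(net, [&count](size_t, tensor& t){ count += t.size(); });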
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename net_type,
+        typename visitor
+        >
+    void visit_layer_parameter_gradients(
+        net_type& net,
+        visitor v
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+            - v is a function object with a signature equivalent to:
+                v(size_t idx, tensor& t)
+        ensures
+            - Loops over all the computational layers (i.e. layers with parameters, as
+              opposed to loss, tag, or input layers) in net and passes their parameter
+              gradients to v().  To be specific, this function essentially performs the
+              following:
+
+                size_t computational_layer_idx = 0;
+                for (size_t i = 0; i < net_type::num_layers; ++i)
+                {
+                    if (layer<i>(net) is a computational layer)
+                    {
+                        v(computational_layer_idx, layer<i>(net).get_parameter_gradient());
+                        ++computational_layer_idx;
+                    }
+                }
+            - When v() is called, the first argument is always < net_type::num_computational_layers.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename net_type,
+        typename visitor
+        >
+    void visit_layers(
+        net_type& net,
+        visitor v
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+            - v is a function object with a signature equivalent to:
+                v(size_t idx, any_net_type& t)
+              That is, it must take a size_t and then any of the network types such as
+              add_layer, add_loss_layer, etc.
+        ensures
+            - Loops over all the layers in net and calls v() on them.  To be specific, this
+              function essentially performs the following:
+
+                for (size_t i = 0; i < net_type::num_layers; ++i)
+                    v(i, layer<i>(net));
+    !*/
+
+    template <
+        typename net_type,
+        typename visitor
+        >
+    void visit_layers_backwards(
+        net_type& net,
+        visitor v
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+            - v is a function object with a signature equivalent to:
+                v(size_t idx, any_net_type& t)
+              That is, it must take a size_t and then any of the network types such as
+              add_layer, add_loss_layer, etc.
+        ensures
+            - Loops over all the layers in net and calls v() on them.  The loop happens in
+              the reverse order of visit_layers().  To be specific, this function
+              essentially performs the following:
+
+                for (size_t i = net_type::num_layers; i != 0; --i)
+                    v(i-1, layer<i-1>(net));
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        size_t begin,
+        size_t end,
+        typename net_type,
+        typename visitor
+        >
+    void visit_layers_range(
+        net_type& net,
+        visitor v
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+            - v is a function object with a signature equivalent to:
+                v(size_t idx, any_net_type& t)
+              That is, it must take a size_t and then any of the network types such as
+              add_layer, add_loss_layer, etc.
+            - begin <= end <= net_type::num_layers
+        ensures
+            - Loops over the layers in the range [begin,end) in net and calls v() on them.
+              The loop happens in the same order as visit_layers(), i.e. in order of
+              increasing layer index.  To be specific, this function essentially performs
+              the following:
+
+                for (size_t i = begin; i < end; ++i)
+                    v(i, layer<i>(net));
+    !*/
+
+    template <
+        size_t begin,
+        size_t end,
+        typename net_type,
+        typename visitor
+        >
+    void visit_layers_backwards_range(
+        net_type& net,
+        visitor v
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+            - v is a function object with a signature equivalent to:
+                v(size_t idx, any_net_type& t)
+              That is, it must take a size_t and then any of the network types such as
+              add_layer, add_loss_layer, etc.
+            - begin <= end <= net_type::num_layers
+        ensures
+            - Loops over the layers in the range [begin,end) in net and calls v() on them.
+ The loop happens in the reverse order of visit_layers_range(). To be specific, + this function essentially performs the following: + + for (size_t i = end; i != begin; --i) + v(i-1, layer<i-1>(net)); + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + unsigned long tag_id, + typename net_type, + typename visitor + > + void visit_layers_until_tag( + net_type& net, + visitor v + ); + /*! + requires + - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or + add_tag_layer. + - v is a function object with a signature equivalent to: + v(any_net_type& t) + That is, it must take any of the network types such as add_layer, + add_loss_layer, etc. + ensures + - Loops over all the layers in net beginning with layer<0>(net) and going until + a tag layer with an ID of tag_id is encountered. To be specific, this + function essentially performs the following: + + size_t i = 0; + while(layer<i>(net) isn't an add_tag_layer with ID == tag_id) { + v(layer<i>(net)); + ++i; + } + v(layer<i>(net)); // also visits the tag layer itself at the very end. + !*/ + +// ---------------------------------------------------------------------------------------- + + struct layer_test_results + { + std::string log; + bool was_good; + + operator bool() const { return was_good; } + }; + + inline std::ostream& operator<< (std::ostream& out, const layer_test_results& item) + { + out << item.log; + return out; + } + + template < + typename layer_details_type + > + layer_test_results test_layer ( + layer_details_type l + ); + /*! + ensures + - Checks if l correctly implements the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined in layers_abstract.h. Importantly, it computes numerical approximations + to the gradients and compares them to the outputs of the layer. + - The results of the testing are returned. In particular, if the returned object + is RESULT then we will have: + - RESULT.was_good == false if and only if the layer failed the testing. + - RESULT.log == a string describing why the testing failed if was_good==false. + - Note that this function is only capable of checking layers that take + arbitrary subnetworks as input. So if you have designed a layer that expects + only a certain restricted type of subnetwork then you might get a compile or + runtime error when you call this function. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_CORE_ABSTRACT_H_ + diff --git a/ml/dlib/dlib/dnn/cpu_dlib.cpp b/ml/dlib/dlib/dnn/cpu_dlib.cpp new file mode 100644 index 000000000..ed5661102 --- /dev/null +++ b/ml/dlib/dlib/dnn/cpu_dlib.cpp @@ -0,0 +1,2170 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
+#ifndef DLIB_DNN_CPU_cPP_ +#define DLIB_DNN_CPU_cPP_ + +// This file contains CPU implementations of the GPU based functions in cuda_dlib.h + +#include "cpu_dlib.h" +#include "tensor_tools.h" +#include "../image_transforms/interpolation.h" +#include "../threads.h" + +namespace dlib +{ + namespace cpu + { + + // ----------------------------------------------------------------------------------- + + void multiply ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + DLIB_CASSERT(dest.k() == src1.k() && src1.k() == src2.k() && + dest.nr() == src1.nr() && src1.nr() == src2.nr() && + dest.nc() == src1.nc() && src1.nc() == src2.nc() ); + const long MD = std::max(std::max(dest.num_samples(),src1.num_samples()),src2.num_samples()); + DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) && + (src1.num_samples()==1 || src1.num_samples()==MD) && + (src2.num_samples()==1 || src2.num_samples()==MD) ); + + if (dest.size() == 0) + return; + + const size_t max_size = std::max(std::max(dest.size(),src1.size()),src2.size()); + const auto d = dest.host(); + const auto s1 = src1.host(); + const auto s2 = src2.host(); + if (dest.size() == src1.size() && src1.size() == src2.size()) + { + if (add_to) + { + for (size_t i = 0; i < src1.size(); ++i) + d[i] += s1[i]*s2[i]; + } + else + { + for (size_t i = 0; i < src1.size(); ++i) + d[i] = s1[i]*s2[i]; + } + } + else if (dest.num_samples() == 1) + { + if (!add_to) + { + for (size_t i = 0; i < dest.size(); ++i) + d[i] = 0; + } + for (size_t i = 0; i < max_size; ++i) + d[i%dest.size()] += s1[i%src1.size()]*s2[i%src2.size()]; + } + else + { + if (add_to) + { + for (size_t i = 0; i < max_size; ++i) + d[i] += s1[i%src1.size()]*s2[i%src2.size()]; + } + else + { + for (size_t i = 0; i < max_size; ++i) + d[i] = s1[i%src1.size()]*s2[i%src2.size()]; + } + } + } + + // ------------------------------------------------------------------------------------ + + void multiply_conv ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + auto d = dest.host(); + auto s1 = src1.host(); + auto s2 = src2.host(); + if (have_same_dimensions(dest,src1)) + { + DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 && src2.nc() == 1 && src2.k() == src1.k()); + + if (add_to) + { + for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + *d++ += (*s1++)*s2[k]; + } + } + } + } + } + else + { + for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + *d++ = (*s1++)*s2[k]; + } + } + } + } + } + } + else + { + DLIB_CASSERT(have_same_dimensions(src1,src2)); + DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 && dest.nc() == 1 && dest.k() == src1.k()); + + if (!add_to) + { + for (long k = 0; k < src1.k(); ++k) + d[k] = 0; + } + + for (long n = 0; n < src1.num_samples(); ++n) + { + for (long k = 0; k < src1.k(); ++k) + { + for (long r = 0; r < src1.nr(); ++r) + { + for (long c = 0; c < src1.nc(); ++c) + { + d[k] += (*s1++)*(*s2++); + } + } + } + } + } + } + + // ------------------------------------------------------------------------------------ + + void scale_channels ( + bool add_to, + tensor& dest, + const tensor& src, + const tensor& scales + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src) && + scales.num_samples() == src.num_samples() && + scales.k() == src.k() && + 
scales.nr() == 1 && + scales.nc() == 1 ); + + if (dest.size() == 0) + return; + + if (add_to) + { + auto d = dest.host(); + auto s = src.host(); + auto scal = scales.host(); + + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < src.k(); ++k) + { + const auto scale = scal[n*scales.k() + k]; + for (long r = 0; r < src.nr(); ++r) + { + for (long c = 0; c < src.nc(); ++c) + { + *d++ += (*s++) * scale; + } + } + } + } + + + } + else + { + auto d = dest.host_write_only(); + auto s = src.host(); + auto scal = scales.host(); + + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < src.k(); ++k) + { + const auto scale = scal[n*scales.k() + k]; + for (long r = 0; r < src.nr(); ++r) + { + for (long c = 0; c < src.nc(); ++c) + { + *d++ = (*s++) * scale; + } + } + } + } + } + } + + // ------------------------------------------------------------------------------------ + + void add( + float beta, + tensor& dest, + float alpha, + const tensor& src + ) + { + DLIB_CASSERT( + (have_same_dimensions(src, dest) || + (src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1) || + (src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc()) || + (src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc()) || + (src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1)) && + is_same_object(src,dest) == false , + "\n\t dest.num_samples(): " << dest.num_samples() + <<"\n\t dest.k(): " << dest.k() + <<"\n\t dest.nr(): " << dest.nr() + <<"\n\t dest.nc(): " << dest.nc() + <<"\n\t src.num_samples(): " << src.num_samples() + <<"\n\t src.k(): " << src.k() + <<"\n\t src.nr(): " << src.nr() + <<"\n\t src.nc(): " << src.nc() + ); + + + if (beta == 0 && alpha == 0) + { + dest = 0; + return; + } + + auto d = dest.host(); + auto s = src.host(); + for (long n = 0; n < dest.num_samples(); ++n) + { + const auto sn = src.num_samples()==1 ? 0:n; + for (long k = 0; k < dest.k(); ++k) + { + const auto sk = src.k()==1 ? 0:k; + for (long r = 0; r < dest.nr(); ++r) + { + const auto sr = src.nr()==1 ? 0:r; + for (long c = 0; c < dest.nc(); ++c) + { + const auto sc = src.nc()==1 ? 0:c; + + const auto s_idx = ((sn*src.k() + sk)*src.nr() + sr)*src.nc() + sc; + *d = beta*(*d) + alpha*s[s_idx]; + ++d; + } + } + } + } + } + + // ---------------------------------------------------------------------------------------- + + void add ( + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + auto d = dest.host(); + auto s1 = src1.host(); + auto s2 = src2.host(); + + // Do the simple and fast version if everything has the same dimensions + if (have_same_dimensions(dest, src1) && + have_same_dimensions(dest, src2)) + { + for (size_t i = 0; i < dest.size(); ++i) + d[i] = s1[i] + s2[i]; + return; + } + + // Otherwise, do the more complex version with bounds checking. 
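+            // The tensors may differ in shape, so each (n,k,r,c) coordinate of dest
+            // gets the sum of the corresponding src1 and src2 elements, where an
+            // element that falls outside a source tensor is treated as 0.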
+ for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + float v1 = 0; + float v2 = 0; + + // if this index is inside src1 + if (n < src1.num_samples() && + k < src1.k() && + r < src1.nr() && + c < src1.nc() ) + { + const auto s_idx = ((n*src1.k() + k)*src1.nr() + r)*src1.nc() + c; + v1 = s1[s_idx]; + } + + // if this index is inside src2 + if (n < src2.num_samples() && + k < src2.k() && + r < src2.nr() && + c < src2.nc() ) + { + const auto s_idx = ((n*src2.k() + k)*src2.nr() + r)*src2.nc() + c; + v2 = s2[s_idx]; + } + + *d = v1 + v2; + ++d; + } + } + } + } + } + + // ---------------------------------------------------------------------------------------- + + void multiply_zero_padded ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + auto d = dest.host(); + auto s1 = src1.host(); + auto s2 = src2.host(); + + // Do the simple and fast version if everything has the same dimensions + if (have_same_dimensions(dest, src1) && + have_same_dimensions(dest, src2)) + { + if (add_to) + { + for (size_t i = 0; i < dest.size(); ++i) + d[i] += s1[i] * s2[i]; + } + else + { + for (size_t i = 0; i < dest.size(); ++i) + d[i] = s1[i] * s2[i]; + } + return; + } + + // Otherwise, do the more complex version with bounds checking. + for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + float v1 = 0; + float v2 = 0; + + // if this index is inside src1 + if (n < src1.num_samples() && + k < src1.k() && + r < src1.nr() && + c < src1.nc() ) + { + const auto s_idx = ((n*src1.k() + k)*src1.nr() + r)*src1.nc() + c; + v1 = s1[s_idx]; + } + + // if this index is inside src2 + if (n < src2.num_samples() && + k < src2.k() && + r < src2.nr() && + c < src2.nc() ) + { + const auto s_idx = ((n*src2.k() + k)*src2.nr() + r)*src2.nc() + c; + v2 = s2[s_idx]; + } + + if (add_to) + *d += v1 * v2; + else + *d = v1 * v2; + ++d; + } + } + } + } + } + + // ---------------------------------------------------------------------------------------- + + void assign_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + grad.num_samples() == 1 && + gradient_input.k() == grad.k() && + gradient_input.nr() == grad.nr() && + gradient_input.nc() == grad.nc() && + gradient_input.size() > 0); + + auto out = grad.host(); + auto in = gradient_input.host(); + + for (size_t i = 0; i < grad.size(); ++i) + out[i] = *in++; + + for (long j = 1; j < gradient_input.num_samples(); ++j) + { + for (size_t i = 0; i < grad.size(); ++i) + out[i] += *in++; + } + } + + // ------------------------------------------------------------------------------------ + + void assign_conv_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + grad.num_samples() == 1 && + grad.k() >= 1 && + grad.nr() == 1 && + grad.nc() == 1 && + gradient_input.k() == grad.k() && + gradient_input.size() > 0 && + is_same_object(grad,gradient_input) == false + ); + + auto g = grad.host(); + auto gi = gradient_input.host(); + + for (long k = 0; k < gradient_input.k(); ++k) + g[k] = 0; + + for (long n = 0; n < gradient_input.num_samples(); ++n) + { + for (long k = 0; k < gradient_input.k(); ++k) + { + for (long r = 0; r < gradient_input.nr(); ++r) + { + for (long c = 0; c < gradient_input.nc(); ++c) + { + g[k] += (*gi++); + } + } + } + } + } + + // 
----------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const float A, + const float B + ) + { + DLIB_CASSERT(dest.size()==src.size()); + const auto d = dest.host(); + const auto s = src.host(); + for (size_t i = 0; i < src.size(); ++i) + d[i] = A*s[i] + B; + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B, + const float C + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + const auto d = dest.host(); + const auto s1 = src1.host(); + const auto s2 = src2.host(); + for (size_t i = 0; i < src1.size(); ++i) + d[i] = A*s1[i] + B*s2[i] + C; + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C, + const float D + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + DLIB_CASSERT(dest.size()==src3.size()); + const auto d = dest.host(); + const auto s1 = src1.host(); + const auto s2 = src2.host(); + const auto s3 = src3.host(); + for (size_t i = 0; i < src1.size(); ++i) + d[i] = A*s1[i] + B*s2[i] + C*s3[i] + D; + } + + void affine_transform_range( + size_t begin, + size_t end, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + DLIB_CASSERT(dest.size()==src3.size()); + DLIB_CASSERT(begin <= end && end <= dest.size()); + const auto d = dest.host(); + const auto s1 = src1.host(); + const auto s2 = src2.host(); + const auto s3 = src3.host(); + for (size_t i = begin; i < end; ++i) + d[i] = A*s1[i] + B*s2[i] + C*s3[i]; + } + + // ----------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + DLIB_CASSERT( + ((A.num_samples()==1 && B.num_samples()==1) || + (A.num_samples()==src.num_samples() && B.num_samples()==src.num_samples())) && + A.nr()==B.nr() && B.nr()==src.nr() && + A.nc()==B.nc() && B.nc()==src.nc() && + A.k() ==B.k() && B.k()==src.k()); + + auto d = dest.host(); + auto s = src.host(); + const auto a = A.host(); + const auto b = B.host(); + if (A.num_samples() == 1) + { + const long num = src.size()/src.num_samples(); + for (long i = 0; i < src.num_samples(); ++i) + { + for (long j = 0; j < num; ++j) + { + *d = a[j]*(*s) + b[j]; + d++; + s++; + } + } + } + else + { + for (size_t i = 0; i < src.size(); ++i) + d[i] = a[i]*s[i] + b[i]; + } + } + + // ----------------------------------------------------------------------------------- + + void affine_transform_conv( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + DLIB_CASSERT(have_same_dimensions(A,B)); + DLIB_CASSERT(A.num_samples() == 1 && + A.nr() == 1 && + A.nc() == 1 && + A.k() == src.k()); + + auto d = dest.host(); + auto s = src.host(); + const auto a = A.host(); + const auto b = B.host(); + for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + *d++ = a[k]*(*s++) + b[k]; + } + } + } + } + } + + // 
---------------------------------------------------------------------------------------- + + void affine_transform( + const rectangle& rect, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + float A, + float B, + float C + ) + { + DLIB_CASSERT(dest.size() == src1.size()); + DLIB_CASSERT(dest.size() == src2.size()); + DLIB_CASSERT(dest.size() == src3.size()); + DLIB_CASSERT(dest.num_samples() == src1.num_samples()); + DLIB_CASSERT(dest.num_samples() == src2.num_samples()); + DLIB_CASSERT(dest.num_samples() == src3.num_samples()); + DLIB_CASSERT(rectangle(0,0, dest.size()/dest.num_samples()-1, dest.num_samples()-1).contains(rect)); + + + auto d = dest.host(); + auto s1 = src1.host(); + auto s2 = src2.host(); + auto s3 = src3.host(); + + const auto nc = dest.size()/dest.num_samples(); + + for (long r = rect.top(); r <= rect.bottom(); ++r) + { + for (long c = rect.left(); c <= rect.right(); ++c) + { + auto idx = r*nc + c; + d[idx] = s1[idx]*A + s2[idx]*B + s3[idx]*C; + } + } + + } + + // ----------------------------------------------------------------------------------- + + void compute_adam_update ( + size_t begin, + size_t end, + tensor& s, + tensor& m, + tensor& v, + const float t, + const float learning_rate, + const float weight_decay, + const float momentum1, + const float momentum2, + const tensor& params, + const tensor& params_grad + ) + { + DLIB_CASSERT(s.size() == m.size() && + s.size() == v.size() && + s.size() == params.size() && + s.size() == params_grad.size()); + DLIB_CASSERT(begin <= end && end <= params.size()); + const float eps = 1e-8; + const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t)); + + // The loop is equivalent to doing this: + // m = momentum1*m + (1-momentum1) * (weight_decay*params + params_grad); + // v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad); + // s = -alpha*m/(sqrt(v) + eps); + auto pm = m.host(); + auto pv = v.host(); + auto ps = s.host_write_only(); + auto pparams = params.host(); + auto ppgrad = params_grad.host(); + for (size_t i = begin; i < end; ++i) + { + float g = weight_decay*pparams[i] + ppgrad[i]; + pm[i] = momentum1*pm[i] + (1-momentum1)*g; + pv[i] = momentum2*pv[i] + (1-momentum2)*g*g; + ps[i] = -alpha*pm[i]/(std::sqrt(pv[i]) + eps); + } + } + + // ----------------------------------------------------------------------------------- + + void batch_normalize_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ) + { + DLIB_CASSERT( + gamma.num_samples() == 1 && + gamma.nr() == src.nr() && + gamma.nc() == src.nc() && + gamma.k() == src.k() && + have_same_dimensions(gamma, beta) && + have_same_dimensions(gamma, running_means) && + have_same_dimensions(gamma, running_variances) && + eps > 0, + "\ngamma.num_samples(): " << gamma.num_samples() << + "\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\nbeta.num_samples(): " << beta.num_samples() << + "\nbeta.k(): " << beta.k() << + "\nbeta.nr(): " << beta.nr() << + "\nbeta.nc(): " << beta.nc() << + "\nrunning_means.num_samples(): " << running_means.num_samples() << + "\nrunning_means.k(): " << running_means.k() << + "\nrunning_means.nr(): " << running_means.nr() << + "\nrunning_means.nc(): " << running_means.nc() << + "\nrunning_variances.num_samples(): " << running_variances.num_samples() << + 
"\nrunning_variances.k(): " << running_variances.k() << + "\nrunning_variances.nr(): " << running_variances.nr() << + "\nrunning_variances.nc(): " << running_variances.nc() << + "\nsrc.k(): " << src.k() << + "\nsrc.nr(): " << src.nr() << + "\nsrc.nc(): " << src.nc() << + "\neps: " << eps + ); + dest.copy_size(src); + + auto d = dest.host(); + auto s = src.host(); + auto g = gamma.host(); + auto b = beta.host(); + auto m = running_means.host(); + auto v = running_variances.host(); + + const long num = src.k()*src.nr()*src.nc(); + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < num; ++k) + { + *d = g[k]*(*s - m[k])/std::sqrt(v[k]+eps) + b[k]; + ++d; + ++s; + } + } + } + + void batch_normalize ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& invstds, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ) + { + DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor); + DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means)); + DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds)); + DLIB_CASSERT( + src.num_samples() > 1 && + gamma.num_samples() == 1 && + beta.num_samples() == 1 && + gamma.nr() == beta.nr() && beta.nr() == src.nr() && + gamma.nc() == beta.nc() && beta.nc() == src.nc() && + gamma.k() == beta.k() && beta.k() == src.k() && + eps > 0, + "\ngamma.num_samples(): " << gamma.num_samples() << + "\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\nbeta.num_samples(): " << beta.num_samples() << + "\nbeta.k(): " << beta.k() << + "\nbeta.nr(): " << beta.nr() << + "\nbeta.nc(): " << beta.nc() << + "\nsrc.k(): " << src.k() << + "\nsrc.nr(): " << src.nr() << + "\nsrc.nc(): " << src.nc() << + "\neps: " << eps + ); + + dest.copy_size(src); + means.set_size(1, src.k(), src.nr(), src.nc()); + invstds.set_size(1, src.k(), src.nr(), src.nc()); + + // first compute means and invstds + means = 0; + invstds = 0; + const auto p_invstds = invstds.host(); + const auto p_means = means.host(); + auto p_src = src.host(); + const long num = src.k()*src.nr()*src.nc(); + // compute means, and sum of squares + for (long i = 0; i < num; ++i) + { + for (long n = 0; n < src.num_samples(); ++n) + { + float val = p_src[n*num+i]; + p_means[i] += val; + p_invstds[i] += val*val; + } + } + means /= src.num_samples(); + invstds /= src.num_samples(); + // copy data back to host + invstds.host(); means.host(); + + // compute variances + running_variances.copy_size(invstds); + auto rvar = running_variances.host(); + // This scale makes the running variances unbiased. 
+ const double scale = (src.num_samples())/(src.num_samples()-1.0); + for (long i = 0; i < num; ++i) + { + auto actual_var = p_invstds[i] - p_means[i]*p_means[i]; + if (averaging_factor == 1) + rvar[i] = scale*actual_var; + else + rvar[i] = (1-averaging_factor)*rvar[i] + scale*averaging_factor*actual_var; + + p_invstds[i] = 1.0f/std::sqrt(actual_var + eps); + } + + p_src = src.host(); + auto p_dest = dest.host(); + const auto p_gamma = gamma.host(); + const auto p_beta = beta.host(); + for (long n = 0; n < src.num_samples(); ++n) + { + for (long i = 0; i < num; ++i) + { + *p_dest = (*p_src - p_means[i])*p_invstds[i]; + *p_dest = (*p_dest)*p_gamma[i] + p_beta[i]; + ++p_src; + ++p_dest; + } + } + + // now keep track of the running means + running_means.copy_size(means); + if (averaging_factor != 1) + running_means = (1-averaging_factor)*mat(running_means) + averaging_factor*mat(means); + else + running_means = means; + } + + void batch_normalize_gradient ( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ) + { + + const long num = src.k()*src.nr()*src.nc(); + DLIB_CASSERT(src.num_samples() > 1); + DLIB_CASSERT(num == (long)means.size()); + DLIB_CASSERT(num == (long)invstds.size()); + DLIB_CASSERT(num == (long)gamma.size()); + DLIB_CASSERT(num == (long)gamma_grad.size()); + DLIB_CASSERT(num == (long)beta_grad.size()); + DLIB_CASSERT(have_same_dimensions(gradient_input, src)); + DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad)); + DLIB_CASSERT(eps > 0); + + beta_grad = 0; + gamma_grad = 0; + auto p_grad = gradient_input.host(); + auto p_src = src.host(); + const auto p_gamma = gamma.host(); + const auto p_gamma_grad = gamma_grad.host(); + const auto p_beta_grad = beta_grad.host(); + const auto p_invstds = invstds.host(); + const auto p_means = means.host(); + + resizable_tensor dvars, dmeans; + dvars.copy_size(invstds); + dmeans.copy_size(means); + dvars = 0; + dmeans = 0; + const auto p_dvars = dvars.host(); + const auto p_dmeans = dmeans.host(); + + for (long n = 0; n < src.num_samples(); ++n) + { + for (long i = 0; i < num; ++i) + { + const float x_hat = (*p_src - p_means[i])*p_invstds[i]; + p_beta_grad[i] += *p_grad; + p_gamma_grad[i] += (*p_grad)*x_hat; + + const float dx = *p_grad * p_gamma[i]; + + p_dvars[i] += dx*(*p_src - p_means[i])*-0.5*std::pow(p_invstds[i], 3.0f); + + ++p_grad; + ++p_src; + } + } + + const float invnum = 1.0f/src.num_samples(); + p_grad = gradient_input.host(); + p_src = src.host(); + for (long n = 0; n < src.num_samples(); ++n) + { + for (long i = 0; i < num; ++i) + { + const float dx = *p_grad * p_gamma[i]; + + p_dmeans[i] += dx*-p_invstds[i] + p_dvars[i] * -2*(*p_src - p_means[i])*invnum; + + ++p_grad; + ++p_src; + } + } + p_grad = gradient_input.host(); + p_src = src.host(); + auto p_src_grad = src_grad.host(); + for (long n = 0; n < src.num_samples(); ++n) + { + for (long i = 0; i < num; ++i) + { + const float dx = *p_grad * p_gamma[i]; + + *p_src_grad += dx*p_invstds[i] + + p_dvars[i] *2*(*p_src - p_means[i])*invnum + + p_dmeans[i]*invnum; + + + ++p_grad; + ++p_src; + ++p_src_grad; + } + } + } + + // ---------------------------------------------------------------------------------------- + + void batch_normalize_conv_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& 
running_variances + ) + { + DLIB_CASSERT( + gamma.num_samples() == 1 && + gamma.nr() == 1 && + gamma.nc() == 1 && + gamma.k() == src.k() && + have_same_dimensions(gamma, beta) && + have_same_dimensions(gamma, running_means) && + have_same_dimensions(gamma, running_variances) && + eps > 0, + "\ngamma.num_samples(): " << gamma.num_samples() << + "\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\nbeta.num_samples(): " << beta.num_samples() << + "\nbeta.k(): " << beta.k() << + "\nbeta.nr(): " << beta.nr() << + "\nbeta.nc(): " << beta.nc() << + "\nrunning_means.num_samples(): " << running_means.num_samples() << + "\nrunning_means.k(): " << running_means.k() << + "\nrunning_means.nr(): " << running_means.nr() << + "\nrunning_means.nc(): " << running_means.nc() << + "\nrunning_variances.num_samples(): " << running_variances.num_samples() << + "\nrunning_variances.k(): " << running_variances.k() << + "\nrunning_variances.nr(): " << running_variances.nr() << + "\nrunning_variances.nc(): " << running_variances.nc() << + "\nsrc.k(): " << src.k() << + "\nsrc.nr(): " << src.nr() << + "\nsrc.nc(): " << src.nc() << + "\neps: " << eps + ); + dest.copy_size(src); + + auto d = dest.host(); + auto s = src.host(); + auto g = gamma.host(); + auto b = beta.host(); + auto m = running_means.host(); + auto v = running_variances.host(); + + const long num = src.nr()*src.nc(); + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < src.k(); ++k) + { + const float invstd = 1.0f/std::sqrt(v[k] + eps); + for (long j = 0; j < num; ++j) + { + *d = g[k]*(*s - m[k])*invstd + b[k]; + ++d; + ++s; + } + } + } + } + + void batch_normalize_conv ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& invstds, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ) + { + DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor); + DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means)); + DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds)); + DLIB_CASSERT( + src.num_samples() > 1 && + gamma.num_samples() == 1 && + beta.num_samples() == 1 && + gamma.nr() == 1 && + beta.nr() == 1 && + gamma.nc() == 1 && + beta.nc() == 1 && + gamma.k() == beta.k() && beta.k() == src.k() && + eps > 0, + "\ngamma.num_samples(): " << gamma.num_samples() << + "\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\nbeta.num_samples(): " << beta.num_samples() << + "\nbeta.k(): " << beta.k() << + "\nbeta.nr(): " << beta.nr() << + "\nbeta.nc(): " << beta.nc() << + "\nsrc.k(): " << src.k() << + "\nsrc.nr(): " << src.nr() << + "\nsrc.nc(): " << src.nc() << + "\neps: " << eps + ); + + dest.copy_size(src); + means.set_size(1, src.k()); + invstds.set_size(1, src.k()); + + // first compute means and invstds + means = 0; + invstds = 0; + const auto p_invstds = invstds.host(); + const auto p_means = means.host(); + const auto p_gamma = gamma.host(); + const auto p_beta = beta.host(); + auto p_src = src.host(); + const long num = src.nr()*src.nc(); + // compute means, and sum of squares + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < src.k(); ++k) + { + for (long i = 0; i < num; ++i) + { + p_means[k] += *p_src; + p_invstds[k] += (*p_src)*(*p_src); + ++p_src; 
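+                    // (p_means[k] and p_invstds[k] hold per-channel sums of x and x*x;
+                    // the divisions just below turn them into E[x] and E[x^2])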
+ } + } + } + means /= src.num_samples()*num; + invstds /= src.num_samples()*num; + // copy data back to host + invstds.host(); means.host(); + + p_src = src.host(); + // compute variances + running_variances.copy_size(invstds); + auto rvar = running_variances.host(); + // This scale makes the running variances unbiased. + const double scale = (src.num_samples()*num)/(src.num_samples()*num-1.0); + for (long k = 0; k < src.k(); ++k) + { + float actual_var = p_invstds[k] - p_means[k]*p_means[k]; + if (averaging_factor == 1) + rvar[k] = scale*actual_var; + else + rvar[k] = (1-averaging_factor)*rvar[k] + scale*averaging_factor*actual_var; + + p_invstds[k] = 1.0f/std::sqrt(actual_var + eps); + } + + p_src = src.host(); + auto p_dest = dest.host(); + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < src.k(); ++k) + { + for (long i = 0; i < num; ++i) + { + *p_dest = (*p_src - p_means[k])*p_invstds[k]; + *p_dest = (*p_dest)*p_gamma[k] + p_beta[k]; + ++p_src; + ++p_dest; + } + } + } + + // now keep track of the running means + running_means.copy_size(means); + if (averaging_factor != 1) + running_means = (1-averaging_factor)*mat(running_means) + averaging_factor*mat(means); + else + running_means = means; + } + + void batch_normalize_conv_gradient( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ) + { + + const long num = src.nr()*src.nc(); + DLIB_CASSERT(src.num_samples() > 1); + DLIB_CASSERT(src.k() == (long)means.size()); + DLIB_CASSERT(src.k() == (long)invstds.size()); + DLIB_CASSERT(src.k() == (long)gamma.size()); + DLIB_CASSERT(src.k() == (long)gamma_grad.size()); + DLIB_CASSERT(src.k() == (long)beta_grad.size()); + DLIB_CASSERT(have_same_dimensions(gradient_input, src)); + DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad)); + DLIB_CASSERT(eps > 0); + + beta_grad = 0; + gamma_grad = 0; + + auto p_grad = gradient_input.host(); + auto p_src = src.host(); + const auto p_gamma = gamma.host(); + const auto p_gamma_grad = gamma_grad.host(); + const auto p_beta_grad = beta_grad.host(); + const auto p_invstds = invstds.host(); + const auto p_means = means.host(); + + resizable_tensor dvars, dmeans; + dvars.copy_size(invstds); + dmeans.copy_size(means); + dvars = 0; + dmeans = 0; + const auto p_dvars = dvars.host(); + const auto p_dmeans = dmeans.host(); + + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < src.k(); ++k) + { + const float invstd_pow = -0.5*std::pow(p_invstds[k], 3.0f); + for (long i = 0; i < num; ++i) + { + const float x_hat = (*p_src - p_means[k])*p_invstds[k]; + p_beta_grad[k] += *p_grad; + p_gamma_grad[k] += (*p_grad)*x_hat; + + const float dx = *p_grad * p_gamma[k]; + + p_dvars[k] += dx*(*p_src - p_means[k])*invstd_pow; + + ++p_grad; + ++p_src; + } + } + } + + p_grad = gradient_input.host(); + p_src = src.host(); + const float invnum = 1.0f/(src.num_samples()*num); + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < src.k(); ++k) + { + for (long i = 0; i < num; ++i) + { + const float dx = *p_grad * p_gamma[k]; + + p_dmeans[k] += -dx*p_invstds[k] + p_dvars[k] * -2*(*p_src - p_means[k])*invnum; + + ++p_grad; + ++p_src; + } + } + } + p_grad = gradient_input.host(); + p_src = src.host(); + auto p_src_grad = src_grad.host(); + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < src.k(); ++k) + { + for (long i = 0; i < num; ++i) + { + 
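// Final pass of the chain rule, applied per element:
+                        //   dL/dx = dL/dx_hat*invstd + dL/dvar*2*(x-mean)/N + dL/dmean/N
+                        // with N == src.num_samples()*num; dx below is dL/dx_hat.
+                        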
const float dx = *p_grad * p_gamma[k]; + + *p_src_grad += dx*p_invstds[k] + + p_dvars[k]*2*(*p_src - p_means[k])*invnum + + p_dmeans[k]*invnum; + + + ++p_grad; + ++p_src; + ++p_src_grad; + } + } + } + } + + // ----------------------------------------------------------------------------------- + + void threshold ( + tensor& data, + float thresh + ) + { + const auto d = data.host(); + for (size_t i = 0; i < data.size(); ++i) + d[i] = d[i]>thresh ? 1:0; + } + + void dot ( + const tensor& a, + const tensor& b, + tensor& result, + size_t idx + ) + { + DLIB_CASSERT(a.size() == b.size()); + DLIB_CASSERT(idx < result.size()); + + const auto aa = a.host(); + const auto bb = b.host(); + auto r = result.host(); + for (size_t i = 0; i < a.size(); ++i) + r[idx] += aa[i]*bb[i]; + } + + // ----------------------------------------------------------------------------------- + // ----------------------------------------------------------------------------------- + // ----------------------------------------------------------------------------------- + + namespace ttimpl + { + void softmax ( + const long num_locations, + const long num_channels, + tensor& dest, + const tensor& src + ) + { + DLIB_ASSERT(num_channels*num_locations == src.nr()*src.nc()*src.k()); + DLIB_CASSERT(have_same_dimensions(dest,src)); + const auto d = dest.host(); + const auto s = src.host(); + + // Note that we subtract out the max values in each channel before applying + // exp() to avoid numeric overflow in the subsequent computations. Doing this + // doesn't change the resulting output, it just makes it more numerically + // stable. + for (long n = 0; n < src.num_samples(); ++n) + { + auto ss = s + num_locations*num_channels*n; + auto dd = d + num_locations*num_channels*n; + for (long i = 0; i < num_locations; ++i) + { + float max_val = -std::numeric_limits<float>::infinity(); + for (long k = 0; k < num_channels; ++k) + max_val = std::max(max_val, ss[k*num_locations]); + + for (long k = 0; k < num_channels; ++k) + dd[k*num_locations] = std::exp(ss[k*num_locations]-max_val); + + ++ss; + ++dd; + } + } + + // Now normalize each channel so they sum to 1. 
+ for (long n = 0; n < src.num_samples(); ++n) + { + const auto dd = d + num_locations*num_channels*n; + for (long i = 0; i < num_locations; ++i) + { + const auto ddd = dd+i; + + float temp = 0; + for (long k = 0; k < num_channels; ++k) + temp += ddd[k*num_locations]; + for (long k = 0; k < num_channels; ++k) + ddd[k*num_locations] /= temp; + } + } + } + + void softmax_gradient ( + const long num_locations, + const long num_channels, + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + DLIB_ASSERT(num_channels*num_locations == grad.nr()*grad.nc()*grad.k()); + DLIB_CASSERT(have_same_dimensions(grad,dest)); + DLIB_CASSERT(have_same_dimensions(grad,gradient_input)); + const auto d = dest.host(); + const auto g = grad.host(); + const auto in = gradient_input.host(); + + + for (long n = 0; n < grad.num_samples(); ++n) + { + const auto d2 = d + num_locations*num_channels*n; + const auto g2 = g + num_locations*num_channels*n; + const auto in2 = in + num_locations*num_channels*n; + for (long i = 0; i < num_locations; ++i) + { + const auto d3 = d2+i; + const auto g3 = g2+i; + const auto in3 = in2+i; + + float temp = 0; + for (long k = 0; k < num_channels; ++k) + temp += -d3[k*num_locations]*in3[k*num_locations]; + if (is_same_object(gradient_input, grad)) + { + for (long k = 0; k < num_channels; ++k) + g3[k*num_locations] = d3[k*num_locations]*(temp+in3[k*num_locations]); + } + else + { + for (long k = 0; k < num_channels; ++k) + g3[k*num_locations] += d3[k*num_locations]*(temp+in3[k*num_locations]); + } + } + } + } + } + + // ---------------------------------------------------------------------------------------- + + void softmax ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + ttimpl::softmax(src.nr()*src.nc(), src.k(), dest, src); + } + + void softmax_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + DLIB_CASSERT(have_same_dimensions(grad,dest)); + DLIB_CASSERT(have_same_dimensions(grad,gradient_input)); + ttimpl::softmax_gradient(grad.nr()*grad.nc(), grad.k(), grad, dest, gradient_input); + } + + // ------------------------------------------------------------------------------------ + + void softmax_all ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + ttimpl::softmax(1, src.nr()*src.nc()*src.k(), dest, src); + } + + void softmax_all_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + DLIB_CASSERT(have_same_dimensions(grad,dest)); + DLIB_CASSERT(have_same_dimensions(grad,gradient_input)); + ttimpl::softmax_gradient(1, grad.nr()*grad.nc()*grad.k(), grad, dest, gradient_input); + } + + // ------------------------------------------------------------------------------------ + + void sigmoid ( + tensor& dest, + const tensor& src + ) + { + const auto d = dest.host(); + const auto s = src.host(); + for (size_t i = 0; i < src.size(); ++i) + d[i] = 1/(1+std::exp(-s[i])); + } + + void sigmoid_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + const auto g = grad.host(); + const auto d = dest.host(); + const auto in = gradient_input.host(); + if (is_same_object(gradient_input, grad)) + { + for (size_t i = 0; i < dest.size(); ++i) + g[i] = in[i]*d[i]*(1-d[i]); + } + else + { + for (size_t i = 0; i < dest.size(); ++i) + g[i] += in[i]*d[i]*(1-d[i]); + } + } + + // ------------------------------------------------------------------------------------ + + void relu ( + tensor& 
dest, + const tensor& src + ) + { + dest = lowerbound(mat(src), 0); + } + + void relu_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + const float* gi = gradient_input.host(); + const float* in = dest.host(); + float* out = grad.host(); + if (is_same_object(grad, gradient_input)) + { + for (size_t i = 0; i < dest.size(); ++i) + { + if (in[i] > 0) + out[i] = gi[i]; + else + out[i] = 0; + } + } + else + { + for (size_t i = 0; i < dest.size(); ++i) + { + if (in[i] > 0) + out[i] += gi[i]; + } + } + } + + // ---------------------------------------------------------------------------------------- + + void prelu ( + tensor& dest, + const tensor& src, + const tensor& param + ) + { + const float p = param.host()[0]; + const float* s = src.host(); + float* d = dest.host(); + for (size_t i = 0; i < dest.size(); ++i) + { + if (s[i] > 0) + d[i] = s[i]; + else + d[i] = p*s[i]; + } + } + + void prelu_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input, + const tensor& param, + tensor& params_grad + ) + { + DLIB_CASSERT(is_same_object(grad, gradient_input) == false); + const float p = param.host()[0]; + const float* gi = gradient_input.host(); + const float* s = src.host(); + float* out = grad.host(); + float pgrad = 0; + for (size_t i = 0; i < src.size(); ++i) + { + if (s[i] > 0) + { + out[i] += gi[i]; + } + else + { + out[i] += p*gi[i]; + pgrad += gi[i]*s[i]; + } + } + params_grad.host()[0] = pgrad; + } + + // ------------------------------------------------------------------------------------ + + void tanh ( + tensor& dest, + const tensor& src + ) + { + const auto d = dest.host(); + const auto s = src.host(); + for (size_t i = 0; i < src.size(); ++i) + d[i] = std::tanh(s[i]); + } + + void tanh_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + const auto g = grad.host(); + const auto d = dest.host(); + const auto in = gradient_input.host(); + if (is_same_object(grad, gradient_input)) + { + for (size_t i = 0; i < dest.size(); ++i) + g[i] = in[i]*(1-d[i]*d[i]); + } + else + { + for (size_t i = 0; i < dest.size(); ++i) + g[i] += in[i]*(1-d[i]*d[i]); + } + } + + // ---------------------------------------------------------------------------------------- + + void resize_bilinear ( + tensor& dest, + long dest_row_stride, + long dest_channel_stride, + const tensor& src, + long src_row_stride, + long src_channel_stride + ) + { + DLIB_CASSERT(is_same_object(dest, src)==false); + DLIB_CASSERT(dest.num_samples() == src.num_samples()); + DLIB_CASSERT(dest.k() == src.k()); + + if (dest.size() == 0 || src.size() == 0) + return; + + const float* s = src.host(); + float* d = dest.host(); + + parallel_for(0, dest.k()*dest.num_samples(), [&](long i) + { + auto simg = sub_image(s+i*src_channel_stride, src.nr(), src.nc(), src_row_stride); + auto dimg = sub_image(d+i*dest_channel_stride, dest.nr(), dest.nc(), dest_row_stride); + + resize_image(simg, dimg); + }); + } + + void resize_bilinear_gradient ( + tensor& grad, + long grad_row_stride, + long grad_channel_stride, + const tensor& gradient_input, + long gradient_input_row_stride, + long gradient_input_channel_stride + ) + { + DLIB_CASSERT(is_same_object(grad, gradient_input)==false); + DLIB_CASSERT(gradient_input.num_samples() == grad.num_samples()); + DLIB_CASSERT(gradient_input.k() == grad.k()); + + if (gradient_input.size() == 0 || grad.size() == 0) + return; + + const float* gi = gradient_input.host(); + float* g = grad.host(); + const float x_scale = 
(grad.nc()-1)/(float)std::max<long>((gradient_input.nc()-1),1); + const float y_scale = (grad.nr()-1)/(float)std::max<long>((gradient_input.nr()-1),1); + for (long long samp = 0; samp < gradient_input.num_samples(); ++samp) + { + for (long long k = 0; k < gradient_input.k(); ++k) + { + for (long long r = 0; r < gradient_input.nr(); ++r) + { + const float y = r*y_scale; + const long long top = static_cast<long long>(std::floor(y)); + const long long bottom = std::min(top+1, grad.nr()-1); + const float tb_frac = y - top; + for (long long c = 0; c < gradient_input.nc(); ++c) + { + const float x = c*x_scale; + const long long left = static_cast<long long>(std::floor(x)); + const long long right = std::min(left+1, grad.nc()-1); + const float lr_frac = x - left; + + const float tmp = gi[r*gradient_input_row_stride+c]; + + g[top*grad_row_stride+left] += tmp*(1-tb_frac)*(1-lr_frac); + g[top*grad_row_stride+right] += tmp*(1-tb_frac)*(lr_frac); + g[bottom*grad_row_stride+left] += tmp*(tb_frac)*(1-lr_frac); + g[bottom*grad_row_stride+right] += tmp*(tb_frac)*(lr_frac); + } + } + + g += grad_channel_stride; + gi += gradient_input_channel_stride; + } + } + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + pooling::pooling ( + ) : window_height(0),window_width(0),stride_y(0),stride_x(0),padding_y(0),padding_x(0),do_max_pooling(true) + { + } + + void pooling:: + clear( + ) + { + window_height = 0; + window_width = 0; + stride_y = 0; + stride_x = 0; + padding_y = 0; + padding_x = 0; + } + + void pooling:: + setup_max_pooling( + int window_height_, + int window_width_, + int stride_y_, + int stride_x_, + int padding_y_, + int padding_x_ + ) + { + DLIB_CASSERT(window_width_ > 0); + DLIB_CASSERT(window_height_ > 0); + DLIB_CASSERT(stride_y_ > 0); + DLIB_CASSERT(stride_x_ > 0); + DLIB_CASSERT(0 <= padding_y_ && padding_y_ < window_height_); + DLIB_CASSERT(0 <= padding_x_ && padding_x_ < window_width_); + + window_height = window_height_; + window_width = window_width_; + stride_y = stride_y_; + stride_x = stride_x_; + padding_y = padding_y_; + padding_x = padding_x_; + do_max_pooling = true; + } + + void pooling:: + setup_avg_pooling( + int window_height_, + int window_width_, + int stride_y_, + int stride_x_, + int padding_y_, + int padding_x_ + ) + { + DLIB_CASSERT(window_width_ > 0); + DLIB_CASSERT(window_height_ > 0); + DLIB_CASSERT(stride_y_ > 0); + DLIB_CASSERT(stride_x_ > 0); + DLIB_CASSERT(0 <= padding_y_ && padding_y_ < window_height_); + DLIB_CASSERT(0 <= padding_x_ && padding_x_ < window_width_); + + window_height = window_height_; + window_width = window_width_; + stride_y = stride_y_; + stride_x = stride_x_; + padding_y = padding_y_; + padding_x = padding_x_; + do_max_pooling = false; + } + + void pooling:: + operator() ( + resizable_tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(window_width > 0); + DLIB_CASSERT(window_height > 0); + DLIB_CASSERT(stride_y > 0); + DLIB_CASSERT(stride_x > 0); + DLIB_CASSERT(0 <= padding_y && padding_y < window_height); + DLIB_CASSERT(0 <= padding_x && padding_x < window_width); + DLIB_CASSERT(window_width <= src.nc() + 2*padding_x, + "Pooling windows must be small enough to fit into the padded image."); + DLIB_CASSERT(window_height <= src.nr() + 2*padding_y, + "Pooling windows must be small enough to fit into the padded image."); + 
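+        // There is one output element per window placement, hence:
+        //   out_nr = 1 + (src.nr() + 2*padding_y - window_height)/stride_y
+        //   out_nc = 1 + (src.nc() + 2*padding_x - window_width)/stride_x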
+ dest.set_size( + src.num_samples(), + src.k(), + 1+(src.nr()+2*padding_y-window_height)/stride_y, + 1+(src.nc()+2*padding_x-window_width)/stride_x + ); + + if (src.size() == 0) + { + dest = 0; + return; + } + + + auto d = dest.host(); + const long x_offset = window_width/2 - padding_x; + const long y_offset = window_height/2 - padding_y; + if (does_max_pooling()) + { + for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + auto simg = image_plane(src,n,k); + auto dimg = d + (n*dest.k() + k)*dest.nr()*dest.nc(); + + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + auto win = centered_rect(c*stride_x+x_offset, + r*stride_y+y_offset, + window_width, + window_height); + dimg[r*dest.nc() + c] = max(subm_clipped(simg,win)); + } + } + } + } + } + else + { + for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + auto simg = image_plane(src,n,k); + auto dimg = d + (n*dest.k() + k)*dest.nr()*dest.nc(); + + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + auto win = centered_rect(c*stride_x+x_offset, + r*stride_y+y_offset, + window_width, + window_height); + dimg[r*dest.nc() + c] = mean(subm_clipped(simg,win)); + } + } + } + } + } + + } + + void pooling::get_gradient( + const tensor& gradient_input, + const tensor& dest, + const tensor& src, + tensor& grad + ) + { + DLIB_CASSERT(have_same_dimensions(gradient_input,dest)); + DLIB_CASSERT(have_same_dimensions(src,grad)); + + + if (src.size() == 0) + { + return; + } + + + auto gi = gradient_input.host(); + auto g = grad.host(); + const long x_offset = window_width/2 - padding_x; + const long y_offset = window_height/2 - padding_y; + if (does_max_pooling()) + { + for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + auto simg = image_plane(src,n,k); + auto gimg = g + (n*grad.k() + k)*grad.nr()*grad.nc(); + auto giimg = gi + (n*dest.k() + k)*dest.nr()*dest.nc(); + auto imgbox = get_rect(simg); + + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + auto win = centered_rect(c*stride_x+x_offset, + r*stride_y+y_offset, + window_width, + window_height).intersect(imgbox); + auto p = max_point(subm(simg,win))+win.tl_corner(); + gimg[p.y()*grad.nc()+p.x()] += giimg[r*dest.nc()+c]; + } + } + } + } + } + else + { + for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + auto simg = image_plane(src,n,k); + auto gimg = g + (n*grad.k() + k)*grad.nr()*grad.nc(); + auto giimg = gi + (n*dest.k() + k)*dest.nr()*dest.nc(); + auto imgbox = get_rect(simg); + + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + auto win = centered_rect(c*stride_x+x_offset, + r*stride_y+y_offset, + window_width, + window_height).intersect(imgbox); + const float delta = giimg[r*dest.nc()+c]/win.area(); + for (long y = win.top(); y <= win.bottom(); ++y) + { + for (long x = win.left(); x <= win.right(); ++x) + { + gimg[y*grad.nc()+x] += delta; + } + } + } + } + } + } + } + + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + void img2col( + matrix<float>& output, + const tensor& data, + long n, + long filter_nr, + long filter_nc, + long stride_y, + long stride_x, + long 
padding_y, + long padding_x + ) + { + const auto d = data.host() + data.k()*data.nr()*data.nc()*n; + const rectangle boundary = get_rect(data); + + const long out_nr = 1+(data.nr()+2*padding_y-filter_nr)/stride_y; + const long out_nc = 1+(data.nc()+2*padding_x-filter_nc)/stride_x; + + output.set_size(out_nr*out_nc, + data.k()*filter_nr*filter_nc); + DLIB_CASSERT(output.size() != 0); + float* t = &output(0,0); + + // now fill in the Toeplitz output matrix for the n-th sample in data. + size_t cnt = 0; + const long max_r = data.nr() + padding_y-(filter_nr-1); + const long max_c = data.nc() + padding_x-(filter_nc-1); + for (long r = -padding_y; r < max_r; r+=stride_y) + { + for (long c = -padding_x; c < max_c; c+=stride_x) + { + for (long k = 0; k < data.k(); ++k) + { + for (long y = 0; y < filter_nr; ++y) + { + for (long x = 0; x < filter_nc; ++x) + { + DLIB_ASSERT(cnt < output.size()); + long xx = c+x; + long yy = r+y; + if (boundary.contains(xx,yy)) + *t = d[(k*data.nr() + yy)*data.nc() + xx]; + else + *t = 0; + ++t; + ++cnt; + } + } + } + } + } + } + + void col2img( + const matrix<float>& output, + tensor& data, + long n, + long filter_nr, + long filter_nc, + long stride_y, + long stride_x, + long padding_y, + long padding_x + ) + { + const auto d = data.host() + data.k()*data.nr()*data.nc()*n; + const rectangle boundary = get_rect(data); + + DLIB_CASSERT(output.size() != 0); + const float* t = &output(0,0); + + // now fill in the Toeplitz output matrix for the n-th sample in data. + const long max_r = data.nr() + padding_y-(filter_nr-1); + const long max_c = data.nc() + padding_x-(filter_nc-1); + for (long r = -padding_y; r < max_r; r+=stride_y) + { + for (long c = -padding_x; c < max_c; c+=stride_x) + { + for (long k = 0; k < data.k(); ++k) + { + for (long y = 0; y < filter_nr; ++y) + { + for (long x = 0; x < filter_nc; ++x) + { + long xx = c+x; + long yy = r+y; + if (boundary.contains(xx,yy)) + d[(k*data.nr() + yy)*data.nc() + xx] += *t; + ++t; + } + } + } + } + } + } + + void tensor_conv::operator() ( + const bool add_to_output, + resizable_tensor& output, + const tensor& data, + const tensor& filters + ) + { + DLIB_CASSERT(last_stride_y > 0 && last_stride_x > 0, "You must call setup() before calling this function."); + output.set_size(data.num_samples(), + filters.num_samples(), + 1+(data.nr()+2*last_padding_y-filters.nr())/last_stride_y, + 1+(data.nc()+2*last_padding_x-filters.nc())/last_stride_x); + (*this)(add_to_output, static_cast<tensor&>(output),data,filters); + } + + void tensor_conv::operator() ( + const bool add_to_output, + tensor& output, + const tensor& data, + const tensor& filters + ) + { + DLIB_CASSERT(is_same_object(output,data) == false); + DLIB_CASSERT(is_same_object(output,filters) == false); + DLIB_CASSERT(filters.k() == data.k()); + DLIB_CASSERT(last_stride_y > 0 && last_stride_x > 0, "You must call setup() before calling this function."); + DLIB_CASSERT(filters.nr() <= data.nr() + 2*last_padding_y, + "Filter windows must be small enough to fit into the padded image."); + DLIB_CASSERT(filters.nc() <= data.nc() + 2*last_padding_x, + "Filter windows must be small enough to fit into the padded image."); + + DLIB_CASSERT(output.num_samples() == data.num_samples()); + DLIB_CASSERT(output.k() == filters.num_samples()); + DLIB_CASSERT(output.nr() == 1+(data.nr()+2*last_padding_y-filters.nr())/last_stride_y); + DLIB_CASSERT(output.nc() == 1+(data.nc()+2*last_padding_x-filters.nc())/last_stride_x); + + + matrix<float> temp; + for (long n = 0; n < data.num_samples(); ++n) 
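// one im2col per sample followed by a single GEMM: mat(filters)*trans(temp)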
+ { + img2col(temp, data, n, filters.nr(), filters.nc(), last_stride_y, last_stride_x, last_padding_y, last_padding_x); + + if (add_to_output) + output.add_to_sample(n, mat(filters)*trans(temp)); + else + output.set_sample(n, mat(filters)*trans(temp)); + } + } + + // ------------------------------------------------------------------------------------ + + void tensor_conv:: + get_gradient_for_data ( + const bool add_to_output, + const tensor& gradient_input, + const tensor& filters, + tensor& data_gradient + ) + { + matrix<float> temp; + if (!add_to_output) + data_gradient = 0; + for (long n = 0; n < gradient_input.num_samples(); ++n) + { + auto gi = mat(gradient_input.host()+gradient_input.k()*gradient_input.nr()*gradient_input.nc()*n, + gradient_input.k(), + gradient_input.nr()*gradient_input.nc()); + + + temp = trans(gi)*mat(filters); + col2img(temp, data_gradient, n, filters.nr(), filters.nc(), last_stride_y, last_stride_x, last_padding_y, last_padding_x); + } + } + + // ------------------------------------------------------------------------------------ + + void tensor_conv:: + get_gradient_for_filters ( + const bool add_to_output, + const tensor& gradient_input, + const tensor& data, + tensor& filters_gradient + ) + { + matrix<float> temp; + for (long n = 0; n < gradient_input.num_samples(); ++n) + { + auto gi = mat(gradient_input.host()+gradient_input.k()*gradient_input.nr()*gradient_input.nc()*n, + gradient_input.k(), + gradient_input.nr()*gradient_input.nc()); + + + img2col(temp, data, n, filters_gradient.nr(), filters_gradient.nc(), last_stride_y, last_stride_x, last_padding_y, last_padding_x); + if (n == 0) + { + if (add_to_output) + filters_gradient += gi*temp; + else + filters_gradient = gi*temp; + } + else + { + filters_gradient += gi*temp; + } + } + } + + // ------------------------------------------------------------------------------------ + + void copy_tensor( + bool add_to, + tensor& dest, + size_t dest_k_offset, + const tensor& src, + size_t src_k_offset, + size_t count_k + ) + { + const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k()); + const size_t src_sample_size = static_cast<size_t>(src.nc() * src.nr() * src.k()); + + const size_t block_size = count_k * dest.nc() * dest.nr(); + + DLIB_CASSERT(dest.num_samples() == src.num_samples() && + dest.nc() == src.nc() && dest.nr() == src.nr(), "All sources should fit into dest tensor size"); + DLIB_CASSERT(dest.k() - dest_k_offset >= count_k, "Not enough space in dest tensor"); + DLIB_CASSERT(src.k() - src_k_offset >= count_k, "Not enough space in src tensor"); + + float* dest_p = dest.host() + dest_k_offset * dest.nc() * dest.nr(); + const float* src_p = src.host() + src_k_offset * src.nc() * src.nr(); + + for (long i = 0; i < src.num_samples(); ++i) + { + if (add_to) + { + for (size_t j = 0; j < block_size; ++j) + dest_p[j] += src_p[j]; + } + else + { + ::memcpy(dest_p, src_p, block_size * sizeof(float)); + } + + dest_p += dest_sample_size; + src_p += src_sample_size; + } + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + } +} + + +#endif // DLIB_DNN_CPU_cPP_ + + diff --git a/ml/dlib/dlib/dnn/cpu_dlib.h b/ml/dlib/dlib/dnn/cpu_dlib.h new file mode 100644 index 000000000..330df01a2 --- /dev/null +++ b/ml/dlib/dlib/dnn/cpu_dlib.h @@ -0,0 +1,505 @@ +// Copyright (C) 2015 
Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_CPU_H_ +#define DLIB_DNN_CPU_H_ + +// This file contains CPU implementations of the GPU based functions in cuda_dlib.h +// and cudnn_dlibapi.h + +#include "tensor.h" +#include "../geometry/rectangle.h" + +namespace dlib +{ + namespace cpu + { + + // ----------------------------------------------------------------------------------- + + void multiply ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + + void multiply_conv ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + + void multiply_zero_padded ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + + void scale_channels ( + bool add_to, + tensor& dest, + const tensor& src, + const tensor& scales + ); + + void add( + float beta, + tensor& dest, + float alpha, + const tensor& src + ); + + void assign_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ); + + void add ( + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + + void assign_conv_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ); + + // ----------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const float A, + const float B + ); + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B, + const float C + ); + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C, + const float D + ); + + void affine_transform_range( + size_t begin, + size_t end, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C + ); + + // ----------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ); + + // ----------------------------------------------------------------------------------- + + void affine_transform_conv( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ); + + // ----------------------------------------------------------------------------------- + + void affine_transform( + const rectangle& rect, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + float A, + float B, + float C + ); + + // ----------------------------------------------------------------------------------- + + void compute_adam_update ( + size_t begin, + size_t end, + tensor& s, + tensor& m, + tensor& v, + const float t, + const float learning_rate, + const float weight_decay, + const float momentum1, + const float momentum2, + const tensor& params, + const tensor& params_grad + ); + + // ----------------------------------------------------------------------------------- + + void batch_normalize_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ); + + void batch_normalize ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& invstds, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + 
const tensor& beta + ); + + void batch_normalize_gradient ( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ); + + void batch_normalize_conv_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ); + + void batch_normalize_conv ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& invstds, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ); + + void batch_normalize_conv_gradient ( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ); + + // ----------------------------------------------------------------------------------- + + void threshold ( + tensor& data, + float thresh + ); + + void dot ( + const tensor& a, + const tensor& b, + tensor& result, + size_t idx + ); + + // ----------------------------------------------------------------------------------- + + void softmax ( + tensor& dest, + const tensor& src + ); + + void softmax_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + + // ------------------------------------------------------------------------------------ + + void softmax_all ( + tensor& dest, + const tensor& src + ); + + void softmax_all_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + + // ------------------------------------------------------------------------------------ + + void sigmoid ( + tensor& dest, + const tensor& src + ); + + void sigmoid_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + + // ------------------------------------------------------------------------------------ + + void relu ( + tensor& dest, + const tensor& src + ); + + void relu_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + + // ---------------------------------------------------------------------------------------- + + void prelu ( + tensor& dest, + const tensor& src, + const tensor& param + ); + + void prelu_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input, + const tensor& param, + tensor& params_grad + ); + + // ------------------------------------------------------------------------------------ + + void tanh ( + tensor& dest, + const tensor& src + ); + + void tanh_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + + // ---------------------------------------------------------------------------------------- + + void resize_bilinear ( + tensor& dest, + long dest_row_stride, + long dest_channel_stride, + const tensor& src, + long src_row_stride, + long src_channel_stride + ); + + void resize_bilinear_gradient ( + tensor& grad, + long grad_row_stride, + long grad_channel_stride, + const tensor& gradient_input, + long gradient_input_row_stride, + long gradient_input_channel_stride + ); + + inline void resize_bilinear ( + tensor& dest, + const tensor& src + ) { resize_bilinear(dest, dest.nc(), dest.nr()*dest.nc(), src, src.nc(), src.nr()*src.nc()); } + + inline void 
resize_bilinear_gradient ( + tensor& grad, + const tensor& gradient_input + ) { resize_bilinear_gradient(grad, grad.nc(), grad.nr()*grad.nc(), gradient_input, gradient_input.nc(), gradient_input.nr()*gradient_input.nc()); } + + // ----------------------------------------------------------------------------------- + + class pooling + { + public: + + pooling(const pooling&) = delete; + pooling& operator=(const pooling&) = delete; + + pooling ( + ); + + void clear( + ); + + void setup_max_pooling( + int window_height, + int window_width, + int stride_y, + int stride_x, + int padding_y, + int padding_x + ); + + void setup_avg_pooling( + int window_height, + int window_width, + int stride_y, + int stride_x, + int padding_y, + int padding_x + ); + + bool does_max_pooling( + ) const { return do_max_pooling; } + + void operator() ( + resizable_tensor& dest, + const tensor& src + ); + + void get_gradient( + const tensor& gradient_input, + const tensor& dest, + const tensor& src, + tensor& grad + ); + + private: + int window_height; + int window_width; + int stride_y; + int stride_x; + int padding_y; + int padding_x; + bool do_max_pooling; + + }; + + // ----------------------------------------------------------------------------------- + + class tensor_conv + { + public: + tensor_conv(const tensor_conv&) = delete; + tensor_conv& operator=(const tensor_conv&) = delete; + + tensor_conv() {} + + void clear( + ) {} + + void setup( + const tensor& data, /* not used but required for interface */ + const tensor& filters, /* not used but required for interface */ + int stride_y, + int stride_x, + int padding_y, + int padding_x + ) + { + (void)data; /* silence compiler */ + DLIB_CASSERT(stride_y > 0 && stride_x > 0); + DLIB_CASSERT(0 <= padding_y && padding_y < filters.nr()); + DLIB_CASSERT(0 <= padding_x && padding_x < filters.nc()); + last_stride_y = stride_y; + last_stride_x = stride_x; + last_padding_y = padding_y; + last_padding_x = padding_x; + } + + void operator() ( + const bool add_to_output, + resizable_tensor& output, + const tensor& data, + const tensor& filters + ); + + void operator() ( + const bool add_to_output, + tensor& output, + const tensor& data, + const tensor& filters + ); + + void get_gradient_for_data ( + const bool add_to_output, + const tensor& gradient_input, + const tensor& filters, + tensor& data_gradient + ); + + void get_gradient_for_filters ( + const bool add_to_output, + const tensor& gradient_input, + const tensor& data, + tensor& filters_gradient + ); + + private: + + long last_stride_y = 0; + long last_stride_x = 0; + long last_padding_y = 0; + long last_padding_x = 0; + }; + + // ----------------------------------------------------------------------------------- + + void copy_tensor( + bool add_to, + tensor& dest, + size_t dest_k_offset, + const tensor& src, + size_t src_k_offset, + size_t count_k + ); + + // ----------------------------------------------------------------------------------- + + } +} + +#ifdef NO_MAKEFILE +#include "cpu_dlib.cpp" +#endif + +#endif // DLIB_DNN_CPU_H_ + + diff --git a/ml/dlib/dlib/dnn/cublas_dlibapi.cpp b/ml/dlib/dlib/dnn/cublas_dlibapi.cpp new file mode 100644 index 000000000..376cc9f00 --- /dev/null +++ b/ml/dlib/dlib/dnn/cublas_dlibapi.cpp @@ -0,0 +1,165 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
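+// This file provides the cuBLAS-backed implementation of the gemm() routine
+// declared in cublas_dlibapi.h.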
+#ifndef DLIB_DNN_CuBLAS_CPP_ +#define DLIB_DNN_CuBLAS_CPP_ + +#ifdef DLIB_USE_CUDA + +#include "cublas_dlibapi.h" +#include "cuda_utils.h" + +#include <cublas_v2.h> +#include <vector> + +static const char* cublas_get_error_string(cublasStatus_t s) +{ + switch(s) + { + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUDA Runtime API initialization failed."; + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUDA Resources could not be allocated."; + default: + return "A call to cuBLAS failed"; + } +} + +// Check the return value of a call to the cuBLAS runtime for an error condition. +#define CHECK_CUBLAS(call) \ +do{ \ + const cublasStatus_t error = call; \ + if (error != CUBLAS_STATUS_SUCCESS) \ + { \ + std::ostringstream sout; \ + sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\ + sout << "code: " << error << ", reason: " << cublas_get_error_string(error);\ + throw dlib::cublas_error(sout.str()); \ + } \ +}while(false) + +namespace dlib +{ + namespace cuda + { + + // ----------------------------------------------------------------------------------- + + class cublas_context + { + public: + // not copyable + cublas_context(const cublas_context&) = delete; + cublas_context& operator=(const cublas_context&) = delete; + + cublas_context() + { + handles.resize(16); + } + ~cublas_context() + { + for (auto h : handles) + { + if (h) + cublasDestroy(h); + } + } + + cublasHandle_t get_handle ( + ) + { + int new_device_id; + CHECK_CUDA(cudaGetDevice(&new_device_id)); + // make room for more devices if needed + if (new_device_id >= (long)handles.size()) + handles.resize(new_device_id+16); + + // If we don't have a handle already for this device then make one + if (!handles[new_device_id]) + CHECK_CUBLAS(cublasCreate(&handles[new_device_id])); + + // Finally, return the handle for the current device + return handles[new_device_id]; + } + + private: + + std::vector<cublasHandle_t> handles; + }; + + static cublasHandle_t context() + { + thread_local cublas_context c; + return c.get_handle(); + } + + // ----------------------------------------------------------------------------------- + + void gemm ( + float beta, + tensor& dest, + float alpha, + const tensor& lhs, + bool trans_lhs, + const tensor& rhs, + bool trans_rhs + ) + { + // Recall that BLAS uses column major order so to deal with that we flip the + // order of the lhs and rhs arguments. + const auto transa = trans_lhs ? CUBLAS_OP_T : CUBLAS_OP_N; + const auto transb = trans_rhs ? CUBLAS_OP_T : CUBLAS_OP_N; + + const int dest_nr = dest.num_samples(); + const int dest_nc = dest.size()/dest_nr; + const int lhs_nr = lhs.num_samples(); + const int lhs_nc = lhs.size()/lhs_nr; + const int rhs_nr = rhs.num_samples(); + const int rhs_nc = rhs.size()/rhs_nr; + if (trans_lhs && trans_rhs) + { + DLIB_ASSERT( dest_nr == lhs_nc && + dest_nc == rhs_nr && + lhs_nr == rhs_nc) + } + else if (!trans_lhs && trans_rhs) + { + DLIB_ASSERT( dest_nr == lhs_nr && + dest_nc == rhs_nr && + lhs_nc == rhs_nc) + } + else if (trans_lhs && !trans_rhs) + { + DLIB_ASSERT( dest_nr == lhs_nc && + dest_nc == rhs_nc && + lhs_nr == rhs_nr) + } + else + { + DLIB_ASSERT( dest_nr == lhs_nr && + dest_nc == rhs_nc && + lhs_nc == rhs_nr) + } + + const int k = trans_rhs ? 
rhs_nc : rhs_nr; + CHECK_CUBLAS(cublasSgemm(context(), + transb, + transa, + dest_nc, dest_nr, k, + &alpha, + rhs.device(), rhs_nc, + lhs.device(), lhs_nc, + &beta, + dest.device(),dest_nc)); + } + + // ------------------------------------------------------------------------------------ + + } +} + +#endif // DLIB_USE_CUDA + +#endif // DLIB_DNN_CuBLAS_CPP_ + + + diff --git a/ml/dlib/dlib/dnn/cublas_dlibapi.h b/ml/dlib/dlib/dnn/cublas_dlibapi.h new file mode 100644 index 000000000..b46fd25ca --- /dev/null +++ b/ml/dlib/dlib/dnn/cublas_dlibapi.h @@ -0,0 +1,50 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_CuBLAS_H_ +#define DLIB_DNN_CuBLAS_H_ + +#ifdef DLIB_USE_CUDA + +#include "tensor.h" +#include "cuda_errors.h" + +namespace dlib +{ + namespace cuda + { + + // ----------------------------------------------------------------------------------- + + void gemm ( + float beta, + tensor& dest, + float alpha, + const tensor& lhs, + bool trans_lhs, + const tensor& rhs, + bool trans_rhs + ); + /*! + requires + - The dimensions of lhs and rhs must be compatible for matrix + multiplication. In particular: + - Let L == trans_lhs ? trans(mat(lhs)) : mat(lhs) + - Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs) + - Let D == mat(dest) + - D.nr() == L.nr() && D.nc() == R.nc() + (i.e. dest must be preallocated and have the correct output dimensions) + - L.nc() == R.nr() + ensures + - performs: dest = alpha*L*R + beta*mat(dest) + !*/ + + // ------------------------------------------------------------------------------------ + + } +} + +#endif // DLIB_USE_CUDA + +#endif // DLIB_DNN_CuBLAS_H_ + + diff --git a/ml/dlib/dlib/dnn/cuda_data_ptr.cpp b/ml/dlib/dlib/dnn/cuda_data_ptr.cpp new file mode 100644 index 000000000..8abce0695 --- /dev/null +++ b/ml/dlib/dlib/dnn/cuda_data_ptr.cpp @@ -0,0 +1,71 @@ +// Copyright (C) 2017 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_CuDA_DATA_PTR_CPP_ +#define DLIB_DNN_CuDA_DATA_PTR_CPP_ + +#ifdef DLIB_USE_CUDA + +#include "cuda_data_ptr.h" +#include "cuda_utils.h" + +namespace dlib +{ + namespace cuda + { + + // ----------------------------------------------------------------------------------- + + cuda_data_void_ptr:: + cuda_data_void_ptr( + size_t n + ) : num(n) + { + if (n == 0) + return; + + void* data = nullptr; + + CHECK_CUDA(cudaMalloc(&data, n)); + pdata.reset(data, [](void* ptr){ + auto err = cudaFree(ptr); + if(err!=cudaSuccess) + std::cerr << "cudaFree() failed. 
Reason: " << cudaGetErrorString(err) << std::endl;
+            });
+        }
+
+        // ------------------------------------------------------------------------------------
+
+        void memcpy(
+            void* dest,
+            const cuda_data_void_ptr& src
+        )
+        {
+            if (src.size() != 0)
+            {
+                CHECK_CUDA(cudaMemcpy(dest, src.data(), src.size(), cudaMemcpyDefault));
+            }
+        }
+
+        // ------------------------------------------------------------------------------------
+
+        void memcpy(
+            cuda_data_void_ptr& dest,
+            const void* src
+        )
+        {
+            if (dest.size() != 0)
+            {
+                CHECK_CUDA(cudaMemcpy(dest.data(), src, dest.size(), cudaMemcpyDefault));
+            }
+        }
+
+        // ------------------------------------------------------------------------------------
+
+    }
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuDA_DATA_PTR_CPP_
+
+
diff --git a/ml/dlib/dlib/dnn/cuda_data_ptr.h b/ml/dlib/dlib/dnn/cuda_data_ptr.h
new file mode 100644
index 000000000..7eca608a0
--- /dev/null
+++ b/ml/dlib/dlib/dnn/cuda_data_ptr.h
@@ -0,0 +1,184 @@
+// Copyright (C) 2017 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuDA_DATA_PTR_H_
+#define DLIB_DNN_CuDA_DATA_PTR_H_
+
+#ifdef DLIB_USE_CUDA
+
+#include <memory>
+#include <vector>
+
+namespace dlib
+{
+    namespace cuda
+    {
+
+    // ------------------------------------------------------------------------------------
+
+        class cuda_data_void_ptr
+        {
+            /*!
+                WHAT THIS OBJECT REPRESENTS
+                    This is a block of memory on a CUDA device.
+            !*/
+        public:
+
+            cuda_data_void_ptr() = default;
+
+            cuda_data_void_ptr(size_t n);
+            /*!
+                ensures
+                    - This object will allocate a device memory buffer of n bytes.
+                    - #size() == n
+            !*/
+
+            void* data() { return pdata.get(); }
+            const void* data() const { return pdata.get(); }
+            operator void*() { return pdata.get(); }
+            operator const void*() const { return pdata.get(); }
+
+            void reset() { pdata.reset(); }
+
+            size_t size() const { return num; }
+            /*!
+                ensures
+                    - returns the length of this buffer, in bytes.
+            !*/
+
+        private:
+
+            size_t num = 0;
+            std::shared_ptr<void> pdata;
+        };
+
+    // ------------------------------------------------------------------------------------
+
+        void memcpy(
+            void* dest,
+            const cuda_data_void_ptr& src
+        );
+        /*!
+            requires
+                - dest == a pointer to at least src.size() bytes on the host machine.
+            ensures
+                - copies the GPU data from src into dest.
+        !*/
+
+    // ------------------------------------------------------------------------------------
+
+        void memcpy(
+            cuda_data_void_ptr& dest,
+            const void* src
+        );
+        /*!
+            requires
+                - src == a pointer to at least dest.size() bytes on the host machine.
+            ensures
+                - copies the host data from src to the GPU memory buffer dest.
+        !*/
+
+    // ------------------------------------------------------------------------------------
+    // ------------------------------------------------------------------------------------
+    // ------------------------------------------------------------------------------------
+
+        template <typename T>
+        class cuda_data_ptr
+        {
+            /*!
+                WHAT THIS OBJECT REPRESENTS
+                    This is a block of memory on a CUDA device. It is just a type safe
+                    version of cuda_data_void_ptr.
+            !*/
+
+        public:
+
+            static_assert(std::is_standard_layout<T>::value, "You can only create basic standard layout types on the GPU");
+
+            cuda_data_ptr() = default;
+            cuda_data_ptr(size_t n) : num(n)
+            /*!
+                ensures
+                    - This object will allocate a device memory buffer of n T objects.
+ - #size() == n + !*/ + { + if (n == 0) + return; + + pdata = cuda_data_void_ptr(n*sizeof(T)); + } + + T* data() { return (T*)pdata.data(); } + const T* data() const { return (T*)pdata.data(); } + + operator T*() { return (T*)pdata.data(); } + operator const T*() const { return (T*)pdata.data(); } + + void reset() { pdata.reset(); } + + size_t size() const { return num; } + + + friend void memcpy( + std::vector<T>& dest, + const cuda_data_ptr& src + ) + { + dest.resize(src.size()); + if (src.size() != 0) + memcpy(dest.data(), src.pdata); + } + + friend void memcpy( + cuda_data_ptr& src, + const std::vector<T>& dest + ) + { + if (dest.size() != src.size()) + dest = cuda_data_ptr<T>(src.size()); + + if (src.size() != 0) + memcpy(src.pdata, dest.data()); + } + + private: + + size_t num = 0; + cuda_data_void_ptr pdata; + }; + + // ------------------------------------------------------------------------------------ + + class resizable_cuda_buffer + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a block of memory on a CUDA device that will be automatically + resized if requested size is larger than allocated. + !*/ + public: + cuda_data_void_ptr get(size_t size) + /*! + ensures + - This object will return the buffer of requested size of larger + - buffer.size() >= size + !*/ + { + if (buffer.size() < size) + { + buffer.reset(); + buffer = cuda_data_void_ptr(size); + } + return buffer; + } + private: + cuda_data_void_ptr buffer; + }; + + } +} + +#endif // DLIB_USE_CUDA + +#endif // DLIB_DNN_CuDA_DATA_PTR_H_ + diff --git a/ml/dlib/dlib/dnn/cuda_dlib.cu b/ml/dlib/dlib/dnn/cuda_dlib.cu new file mode 100644 index 000000000..6c37593f1 --- /dev/null +++ b/ml/dlib/dlib/dnn/cuda_dlib.cu @@ -0,0 +1,1630 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
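+
+// A minimal usage sketch for the data-pointer utilities declared above in
+// cuda_data_ptr.h.  This is illustrative only; it assumes DLIB_USE_CUDA is
+// defined and a working CUDA device is available:
+//
+//     std::vector<float> host = {1, 2, 3, 4};
+//     cuda_data_ptr<float> dev(host.size());    // allocates 4 floats on the device
+//     memcpy(dev, host);                        // host -> device
+//     memcpy(host, dev);                        // device -> host round trip
+//
+//     resizable_cuda_buffer scratch;
+//     cuda_data_void_ptr p = scratch.get(1024); // p.size() >= 1024; get() only
+//                                               // reallocates when more bytes are
+//                                               // requested than currently held.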
+ +#include "cuda_utils.h" +#include "cuda_dlib.h" + + +namespace dlib +{ + namespace cuda + { + + // ----------------------------------------------------------------------------------- + + void set_device ( + int dev + ) + { + CHECK_CUDA(cudaSetDevice(dev)); + } + + int get_device ( + ) + { + int dev = 0; + CHECK_CUDA(cudaGetDevice(&dev)); + return dev; + } + + std::string get_device_name ( + int device + ) + { + cudaDeviceProp props; + CHECK_CUDA(cudaGetDeviceProperties(&props, device)); + return props.name; + } + + void set_current_device_blocking_sync( + ) + { + CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); + } + + int get_num_devices ( + ) + { + int num_devices; + CHECK_CUDA(cudaGetDeviceCount(&num_devices)); + return num_devices; + } + + bool can_access_peer (int device_id, int peer_device_id) + { + int can_access; + CHECK_CUDA(cudaDeviceCanAccessPeer(&can_access, device_id, peer_device_id)); + return can_access != 0; + } + bool can_access_peer (const tensor& device, const tensor& peer_device) + { + return can_access_peer(device.device_id(), peer_device.device_id()); + } + + void device_synchronize (int dev) + { + raii_set_device set_dev(dev); + CHECK_CUDA(cudaDeviceSynchronize()); + } + void device_synchronize (const tensor& dev) { device_synchronize(dev.device_id()); } + + enable_peer_access:: + enable_peer_access( + int device_id, + int peer_device_id + ) : call_disable(false), device_id(device_id), peer_device_id(peer_device_id) + { + raii_set_device set_dev(device_id); + + auto err = cudaDeviceEnablePeerAccess(peer_device_id, 0); + if (err == cudaSuccess) + { + call_disable = true; + } + else if (err == cudaErrorPeerAccessAlreadyEnabled) + { + // call cudaGetLastError() to dispose of this error since we don't + // care. + auto err2 = cudaGetLastError(); + if (err2 != cudaErrorPeerAccessAlreadyEnabled) + CHECK_CUDA(err2); + } + else + { + CHECK_CUDA(err); + } + } + + + enable_peer_access:: + ~enable_peer_access() noexcept(false) + { + if (call_disable) + { + raii_set_device set_dev(device_id); + CHECK_CUDA(cudaDeviceDisablePeerAccess(peer_device_id)); + } + } + + // ----------------------------------------------------------------------------------- + // ----------------------------------------------------------------------------------- + // ----------------------------------------------------------------------------------- + + __global__ void _cuda_inverse_norms(float* invnorms, const float* data, size_t nr, size_t nc, const float eps) + { + // initialize invnorms before we begin. 
+ for (auto i : grid_stride_range_y(0, nr)) + for (auto j : grid_stride_range(0, 1)) + invnorms[i] = eps; + __syncthreads(); + + for (auto i : grid_stride_range_y(0, nr)) + { + auto p = data + i*nc; + float temp = 0; + for (auto j : grid_stride_range(0, nc)) + temp += p[j]*p[j]; + + // and store the sum into invnorms[i] + warp_reduce_atomic_add(invnorms[i], temp); + } + __syncthreads(); + + for (auto i : grid_stride_range_y(0, nr)) + for (auto j : grid_stride_range(0, 1)) + invnorms[i] = 1.0/std::sqrt(invnorms[i]); + } + + void inverse_norms ( + resizable_tensor& invnorms, + const tensor& data, + const double eps + ) + { + invnorms.set_size(data.num_samples()); + launch_kernel(_cuda_inverse_norms, max_jobs(data.size()/data.num_samples(), data.num_samples()), + invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_dot_prods(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc) + { + // initialize out before we begin. + for (auto i : grid_stride_range_y(0, nr)) + for (auto j : grid_stride_range(0, 1)) + out[i] = 0; + __syncthreads(); + + for (auto i : grid_stride_range_y(0, nr)) + { + auto l = lhs + i*nc; + auto r = rhs + i*nc; + float temp = 0; + for (auto j : grid_stride_range(0, nc)) + temp += l[j]*r[j]; + + // and store the sum into out[i] + warp_reduce_atomic_add(out[i], temp); + } + } + + __global__ void _cuda_dot_prods_add_to(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc) + { + for (auto i : grid_stride_range_y(0, nr)) + { + auto l = lhs + i*nc; + auto r = rhs + i*nc; + float temp = 0; + for (auto j : grid_stride_range(0, nc)) + temp += l[j]*r[j]; + + // and store the sum into out[i] + warp_reduce_atomic_add(out[i], temp); + } + } + + void dot_prods ( + resizable_tensor& out, + const tensor& lhs, + const tensor& rhs + ) + { + DLIB_CASSERT(have_same_dimensions(lhs,rhs)); + + out.set_size(lhs.num_samples()); + if (out.size() == 0) + return; + + const auto nr = lhs.num_samples(); + const auto nc = lhs.size()/lhs.num_samples(); + + launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc); + } + + void dot_prods ( + bool add_to, + tensor& out, + const tensor& lhs, + const tensor& rhs + ) + { + DLIB_CASSERT(have_same_dimensions(lhs,rhs)); + DLIB_CASSERT(out.k() == 1 && out.nr() == 1 && out.nc() == 1); + DLIB_CASSERT(out.size() == lhs.num_samples()); + + const auto nr = lhs.num_samples(); + const auto nc = lhs.size()/lhs.num_samples(); + + if (add_to) + launch_kernel(_cuda_dot_prods_add_to, max_jobs(nc,nr), out.device(), lhs.device(), rhs.device(), nr, nc); + else + launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_scale_columns(float* out, const float* m, const float* v, size_t nr, size_t nc) + { + for (auto j : grid_stride_range(0, nr*nc)) + { + out[j] = m[j]*v[j%nc]; + } + } + + void scale_columns ( + tensor& out, + const tensor& m, + const tensor& v + ) + { + launch_kernel(_cuda_scale_columns, max_jobs(m.size()), out.device(), m.device(), v.device(), m.num_samples(), m.size()/m.num_samples()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_scale_rows(float* out, const float* m, 
const float* v, size_t nr, size_t nc) + { + for (auto j : grid_stride_range(0, nr*nc)) + { + out[j] = m[j]*v[j/nc]; + } + } + + void scale_rows ( + tensor& out, + const tensor& m, + const tensor& v + ) + { + launch_kernel(_cuda_scale_rows, max_jobs(m.size()), out.device(), m.device(), v.device(), m.num_samples(), m.size()/m.num_samples()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_scale_rows2(float* out, const float* m1, const float* m2, const float* v1, const float* v2, size_t nr, size_t nc) + { + for (auto j : grid_stride_range(0, nr*nc)) + { + out[j] = (m1[j] - m2[j]*v1[j/nc]) * v2[j/nc]; + } + } + + __global__ void _cuda_scale_rows2_beta(const float beta, float* out, const float* m1, const float* m2, const float* v1, const float* v2, size_t nr, size_t nc) + { + for (auto j : grid_stride_range(0, nr*nc)) + { + out[j] = beta*out[j] + (m1[j] - m2[j]*v1[j/nc]) * v2[j/nc]; + } + } + + void scale_rows2 ( + float beta, + tensor& out, + const tensor& m1, + const tensor& m2, + const tensor& v1, + const tensor& v2 + ) + { + if (beta == 0) + { + launch_kernel(_cuda_scale_rows2, max_jobs(m1.size()), out.device(), + m1.device(), m2.device(), v1.device(), v2.device(), m1.num_samples(), + m1.size()/m1.num_samples()); + } + else + { + launch_kernel(_cuda_scale_rows2_beta, max_jobs(m1.size()), beta, + out.device(), m1.device(), m2.device(), v1.device(), v2.device(), + m1.num_samples(), m1.size()/m1.num_samples()); + } + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_exp(float* dest, const float* src, size_t n) + { + for (auto i : grid_stride_range(0, n)) + dest[i] = ::exp(src[i]); + } + + void exp ( + tensor& dest, + const tensor& src + ) + { + DLIB_ASSERT(dest.size() == src.size()); + launch_kernel(_cuda_exp, max_jobs(src.size()), dest.device(), src.device(), src.size()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_log(float* dest, const float* src, size_t n) + { + for (auto i : grid_stride_range(0, n)) + dest[i] = ::log(src[i]); + } + + void log ( + tensor& dest, + const tensor& src + ) + { + DLIB_ASSERT(dest.size() == src.size()); + launch_kernel(_cuda_log, max_jobs(src.size()), dest.device(), src.device(), src.size()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_log10(float* dest, const float* src, size_t n) + { + for (auto i : grid_stride_range(0, n)) + dest[i] = ::log10(src[i]); + } + + void log10 ( + tensor& dest, + const tensor& src + ) + { + DLIB_ASSERT(dest.size() == src.size()); + launch_kernel(_cuda_log10, max_jobs(src.size()), dest.device(), src.device(), src.size()); + } + + // ----------------------------------------------------------------------------------- + + __global__ void _cuda_multiply1(float* d, const float* s1, const float* s2, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = s1[i]*s2[i]; + } + } + __global__ void _cuda_multiply2(float* d, const float* s1, const float* s2, + size_t n, size_t s1_n, size_t s2_n, size_t max_size) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = 0; + for (size_t j = i; j < max_size; j += n) + d[i] += s1[j%s1_n]*s2[j%s2_n]; + } + } + + __global__ void _cuda_multiply3(float* d, const float* s1, const float* s2, + size_t n, size_t s1_n, size_t s2_n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] 
= s1[i%s1_n]*s2[i%s2_n]; + } + } + + __global__ void _cuda_multiply1_add_to(float* d, const float* s1, const float* s2, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] += s1[i]*s2[i]; + } + } + __global__ void _cuda_multiply2_add_to(float* d, const float* s1, const float* s2, + size_t n, size_t s1_n, size_t s2_n, size_t max_size) + { + for (auto i : grid_stride_range(0, n)) + { + for (size_t j = i; j < max_size; j += n) + d[i] += s1[j%s1_n]*s2[j%s2_n]; + } + } + + __global__ void _cuda_multiply3_add_to(float* d, const float* s1, const float* s2, + size_t n, size_t s1_n, size_t s2_n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] += s1[i%s1_n]*s2[i%s2_n]; + } + } + + void multiply ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + + DLIB_CASSERT(dest.k() == src1.k() && src1.k() == src2.k() && + dest.nr() == src1.nr() && src1.nr() == src2.nr() && + dest.nc() == src1.nc() && src1.nc() == src2.nc() ); + const long MD = std::max(std::max(dest.num_samples(),src1.num_samples()),src2.num_samples()); + DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) && + (src1.num_samples()==1 || src1.num_samples()==MD) && + (src2.num_samples()==1 || src2.num_samples()==MD) ); + + if (dest.size() == 0) + return; + + const size_t max_size = std::max(std::max(dest.size(),src1.size()),src2.size()); + const auto d = dest.host(); + const auto s1 = src1.host(); + const auto s2 = src2.host(); + if (dest.size() == src1.size() && src1.size() == src2.size()) + { + if (add_to) + launch_kernel(_cuda_multiply1_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size()); + else + launch_kernel(_cuda_multiply1,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size()); + } + else if (dest.num_samples() == 1) + { + if (add_to) + launch_kernel(_cuda_multiply2_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), + dest.size(), src1.size(), src2.size(), max_size); + else + launch_kernel(_cuda_multiply2,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), + dest.size(), src1.size(), src2.size(), max_size); + } + else + { + if (add_to) + launch_kernel(_cuda_multiply3_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), + dest.size(), src1.size(), src2.size()); + else + launch_kernel(_cuda_multiply3,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), + dest.size(), src1.size(), src2.size()); + } + } + + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_multiply_conv(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks) + { + for (auto i : grid_stride_range(0, n)) + { + auto k = (i/bs)%ks; + d[i] = s1[i]*s2[k]; + } + } + + __global__ void _cuda_multiply_conv2(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks) + { + // zero initialize d before we begin. 
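+            // d holds one accumulator per channel, ks in total.  One thread per y-index
+            // clears each d[i], and the __syncthreads() below keeps any thread from
+            // adding its per-plane partial sum into d before the clearing has finished.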
+ for (auto i : grid_stride_range_y(0, ks)) + for (auto j : grid_stride_range(0, 1)) + d[i] = 0; + __syncthreads(); + + // loop over all the image planes + for (auto i : grid_stride_range_y(0, n)) + { + // sum all the elements in the i-th image plane + float temp = 0; + for (auto j : grid_stride_range(i*bs, (i+1)*bs)) + temp += s1[j]*s2[j]; + auto k = i%ks; + // and store the sum into d[k] + warp_reduce_atomic_add(d[k], temp); + } + } + + __global__ void _cuda_multiply_conv_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks) + { + for (auto i : grid_stride_range(0, n)) + { + auto k = (i/bs)%ks; + d[i] += s1[i]*s2[k]; + } + } + + __global__ void _cuda_multiply_conv2_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks) + { + // loop over all the image planes + for (auto i : grid_stride_range_y(0, n)) + { + // sum all the elements in the i-th image plane + float temp = 0; + for (auto j : grid_stride_range(i*bs, (i+1)*bs)) + temp += s1[j]*s2[j]; + auto k = i%ks; + // and store the sum into d[k] + warp_reduce_atomic_add(d[k], temp); + } + } + + + void multiply_conv ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + if (have_same_dimensions(dest,src1)) + { + DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 && src2.nc() == 1 && src2.k() == src1.k()); + if (dest.size() == 0) + return; + + if (add_to) + launch_kernel(_cuda_multiply_conv_add_to,max_jobs(dest.size()), + dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k()); + else + launch_kernel(_cuda_multiply_conv,max_jobs(dest.size()), + dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k()); + } + else + { + DLIB_CASSERT(have_same_dimensions(src1,src2)); + DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 && dest.nc() == 1 && dest.k() == src1.k()); + if (dest.size() == 0) + return; + + + const auto bs = src1.nr()*src1.nc(); + const auto n = src1.num_samples()*src1.k(); + if (add_to) + launch_kernel(_cuda_multiply_conv2_add_to, max_jobs(bs,n), + dest.device(), src1.device(), n, src2.device(), bs, src1.k()); + else + launch_kernel(_cuda_multiply_conv2, max_jobs(bs,n), + dest.device(), src1.device(), n, src2.device(), bs, src1.k()); + } + + } + + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_scale_channels_add_to(float* d, const float* src, size_t n, const float* scales, size_t bs) + { + for (auto i : grid_stride_range(0, n)) + { + auto k = i/bs; + d[i] += src[i]*scales[k]; + } + } + + __global__ void _cuda_scale_channels(float* d, const float* src, size_t n, const float* scales, size_t bs) + { + for (auto i : grid_stride_range(0, n)) + { + auto k = i/bs; + d[i] = src[i]*scales[k]; + } + } + + void scale_channels ( + bool add_to, + tensor& dest, + const tensor& src, + const tensor& scales + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src) && + scales.num_samples() == src.num_samples() && + scales.k() == src.k() && + scales.nr() == 1 && + scales.nc() == 1 ); + + if (dest.size() == 0) + return; + + if (add_to) + launch_kernel(_cuda_scale_channels_add_to,max_jobs(dest.size()), + dest.device(), src.device(), src.size(), scales.device(), src.nr()*src.nc()); + else + launch_kernel(_cuda_scale_channels,max_jobs(dest.size()), + dest.device_write_only(), src.device(), src.size(), scales.device(), src.nr()*src.nc()); + } + + // ------------------------------------------------------------------------------------ + + 
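+        // scale_channels() computes, for every sample n and channel k,
+        //     dest(n,k,r,c) = src(n,k,r,c) * scales(n,k,1,1)    (+= when add_to is true)
+        // i.e. it multiplies each (r,c) plane by a single per-channel scalar.  A toy
+        // example of what that means (a sketch, not code from this translation unit):
+        //
+        //     resizable_tensor src(1,2,2,2), scales(1,2,1,1), dest(1,2,2,2);
+        //     src = 1;
+        //     scales.host()[0] = 2;  scales.host()[1] = 3;
+        //     scale_channels(false, dest, src, scales);
+        //     // dest's first 2x2 plane is now all 2s and its second all 3s.
+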
__global__ void _cuda_mult1(float* d, const float* s1, const float* s2, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = s1[i]*s2[i]; + } + } + + __global__ void _cuda_mult1_add_to(float* d, const float* s1, const float* s2, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] += s1[i]*s2[i]; + } + } + + __global__ void _cuda_mult2(float* d, const float* s1, const float* s2, + size_t dn, size_t dk, size_t dr, size_t dc, + size_t s1n, size_t s1k, size_t s1r, size_t s1c, + size_t s2n, size_t s2k, size_t s2r, size_t s2c) + { + for (auto i : grid_stride_range(0, dn*dk*dr*dc)) + { + size_t n,k,r,c; + unpack_idx(i, dk,dr,dc, n,k,r,c); + + float v1 = 0; + float v2 = 0; + + if (n < s1n && + k < s1k && + r < s1r && + c < s1c ) + { + v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)]; + } + + if (n < s2n && + k < s2k && + r < s2r && + c < s2c ) + { + v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)]; + } + + d[i] = v1*v2; + } + } + + __global__ void _cuda_mult2_add_to(float* d, const float* s1, const float* s2, + size_t dn, size_t dk, size_t dr, size_t dc, + size_t s1n, size_t s1k, size_t s1r, size_t s1c, + size_t s2n, size_t s2k, size_t s2r, size_t s2c) + { + for (auto i : grid_stride_range(0, dn*dk*dr*dc)) + { + size_t n,k,r,c; + unpack_idx(i, dk,dr,dc, n,k,r,c); + + float v1 = 0; + float v2 = 0; + + if (n < s1n && + k < s1k && + r < s1r && + c < s1c ) + { + v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)]; + } + + if (n < s2n && + k < s2k && + r < s2r && + c < s2c ) + { + v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)]; + } + + d[i] += v1*v2; + } + } + + void multiply_zero_padded ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + if (dest.size() == 0) + return; + + // Do the simple and fast version if everything has the same dimensions + if (have_same_dimensions(dest, src1) && + have_same_dimensions(dest, src2)) + { + if (add_to) + launch_kernel(_cuda_mult1_add_to,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size()); + else + launch_kernel(_cuda_mult1,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size()); + } + else + { + if (add_to) + { + // Otherwise, do the more complex version with bounds checking. + launch_kernel(_cuda_mult2_add_to,max_jobs(dest.size()), + dest.device(), src1.device(), src2.device(), + dest.num_samples(), dest.k(), dest.nr(), dest.nc(), + src1.num_samples(), src1.k(), src1.nr(), src1.nc(), + src2.num_samples(), src2.k(), src2.nr(), src2.nc() + ); + } + else + { + // Otherwise, do the more complex version with bounds checking. 
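+                // Each thread unpacks its flat index into (n,k,r,c) coordinates and
+                // reads s1/s2 only where those coordinates fall inside the respective
+                // tensor; anything outside is treated as zero padding.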
+ launch_kernel(_cuda_mult2,max_jobs(dest.size()), + dest.device(), src1.device(), src2.device(), + dest.num_samples(), dest.k(), dest.nr(), dest.nc(), + src1.num_samples(), src1.k(), src1.nr(), src1.nc(), + src2.num_samples(), src2.k(), src2.nr(), src2.nc() + ); + } + } + } + + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_add1(float* d, const float* s1, const float* s2, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = s1[i]+s2[i]; + } + } + + __global__ void _cuda_add2(float* d, const float* s1, const float* s2, + size_t dn, size_t dk, size_t dr, size_t dc, + size_t s1n, size_t s1k, size_t s1r, size_t s1c, + size_t s2n, size_t s2k, size_t s2r, size_t s2c) + { + for (auto i : grid_stride_range(0, dn*dk*dr*dc)) + { + size_t n,k,r,c; + unpack_idx(i, dk,dr,dc, n,k,r,c); + + float v1 = 0; + float v2 = 0; + + if (n < s1n && + k < s1k && + r < s1r && + c < s1c ) + { + v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)]; + } + + if (n < s2n && + k < s2k && + r < s2r && + c < s2c ) + { + v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)]; + } + + d[i] = v1+v2; + } + } + + void add ( + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + if (dest.size() == 0) + return; + + // Do the simple and fast version if everything has the same dimensions + if (have_same_dimensions(dest, src1) && + have_same_dimensions(dest, src2)) + { + launch_kernel(_cuda_add1,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size()); + } + else + { + // Otherwise, do the more complex version with bounds checking. + launch_kernel(_cuda_add2,max_jobs(dest.size()), + dest.device(), src1.device(), src2.device(), + dest.num_samples(), dest.k(), dest.nr(), dest.nc(), + src1.num_samples(), src1.k(), src1.nr(), src1.nc(), + src2.num_samples(), src2.k(), src2.nr(), src2.nc() + ); + } + + } + + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_affine_transform1(float* d, const float* s, size_t n, float A, float B) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A*s[i] + B; + } + } + + __global__ void _cuda_affine_transform1_0(float* d, const float* s, size_t n, float A) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A*s[i]; + } + } + + void affine_transform( + tensor& dest, + const tensor& src, + const float A, + const float B + ) + { + DLIB_CASSERT(dest.size()==src.size()); + if (B != 0) + launch_kernel(_cuda_affine_transform1,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A, B); + else + launch_kernel(_cuda_affine_transform1_0,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A); + } + + void affine_transform( + tensor& dest, + const tensor& src, + const float A + ) + { + DLIB_CASSERT(dest.size()==src.size()); + launch_kernel(_cuda_affine_transform1_0,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform_rect( + float* d, + const float* s1, + const float* s2, + const float* s3, + float A, + float B, + float C, + size_t start_idx, + size_t n, + size_t rect_nc, + size_t total_nc + ) + { + for (auto i : grid_stride_range(0, n)) + { + size_t r = i/rect_nc; + size_t c = i%rect_nc; + size_t idx = r*total_nc + c + start_idx; + d[idx] = A*s1[idx] + B*s2[idx] + C*s3[idx]; + } + } + + void affine_transform( + const rectangle& rect, + tensor& dest, + const tensor& src1, + const 
tensor& src2, + const tensor& src3, + float A, + float B, + float C + ) + { + DLIB_CASSERT(dest.size() == src1.size()); + DLIB_CASSERT(dest.size() == src2.size()); + DLIB_CASSERT(dest.size() == src3.size()); + DLIB_CASSERT(dest.num_samples() == src1.num_samples()); + DLIB_CASSERT(dest.num_samples() == src2.num_samples()); + DLIB_CASSERT(dest.num_samples() == src3.num_samples()); + DLIB_CASSERT(rectangle(0,0, dest.size()/dest.num_samples()-1, dest.num_samples()-1).contains(rect)); + launch_kernel(_cuda_affine_transform_rect,max_jobs(rect.area()), + dest.device(), src1.device(), src2.device(), src3.device(), A, B, C, + rect.left() + rect.top()*(dest.size()/dest.num_samples()), + rect.area(), + rect.width(), + dest.size()/dest.num_samples()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform4(float* d, const float* s1, const float* s2, size_t n, float A, float B, float C) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A*s1[i] + B*s2[i] + C; + } + } + + __global__ void _cuda_affine_transform4_0(float* d, const float* s1, const float* s2, size_t n, float A, float B) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A*s1[i] + B*s2[i]; + } + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B, + const float C + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + if (C != 0) + launch_kernel(_cuda_affine_transform4,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B, C); + else + launch_kernel(_cuda_affine_transform4_0,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B); + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + launch_kernel(_cuda_affine_transform4_0,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_add_scaled(float* d, const float* s, size_t n, float scale) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] += scale*s[i]; + } + } + + void add_scaled( + tensor& dest, + const float scale, + const tensor& src + ) + { + DLIB_CASSERT(dest.size()==src.size()); + launch_kernel(_cuda_add_scaled,max_jobs(dest.size()),dest.device(), src.device(), dest.size(), scale); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_add_cv_to_all_columns(float beta, float* dest, float alpha, const float* src, size_t size, size_t stride) + { + for (auto i : grid_stride_range(0, size)) + { + dest[i] = beta*dest[i] + alpha*src[i/stride]; + } + } + + __global__ void _cuda_add_cv_to_all_columns_no_beta(float* dest, float alpha, const float* src, size_t size, size_t stride) + { + for (auto i : grid_stride_range(0, size)) + { + dest[i] = alpha*src[i/stride]; + } + } + + void add_cv_to_all_columns( + float beta, + tensor& dest, + float alpha, + const tensor& src + ) + { + DLIB_CASSERT(dest.num_samples() == src.num_samples() && src.num_samples() == src.size()); + if (beta == 0) + launch_kernel(_cuda_add_cv_to_all_columns_no_beta, max_jobs(dest.size()), dest.device(), alpha, src.device(), dest.size(), dest.size()/dest.num_samples()); + 
else + launch_kernel(_cuda_add_cv_to_all_columns, max_jobs(dest.size()), beta, dest.device(), alpha, src.device(), dest.size(), dest.size()/dest.num_samples()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform5( + float* d, const float* s1, const float* s2, const float* s3, size_t n, float A, float B, float C, float D + ) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A*s1[i] + B*s2[i] + C*s3[i] + D; + } + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C, + const float D + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + DLIB_CASSERT(dest.size()==src3.size()); + launch_kernel(_cuda_affine_transform5,max_jobs(dest.size()),dest.device(), src1.device(), + src2.device(), src3.device(), dest.size(), A, B, C, D); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform_range( + float* d, const float* s1, const float* s2, const float* s3, size_t begin, size_t end, float A, float B, float C + ) + { + for (auto i : grid_stride_range(begin, end)) + { + d[i] = A*s1[i] + B*s2[i] + C*s3[i]; + } + } + + + void affine_transform_range( + size_t begin, + size_t end, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + DLIB_CASSERT(dest.size()==src3.size()); + DLIB_CASSERT(begin <= end && end <= dest.size()); + launch_kernel(_cuda_affine_transform_range,max_jobs(end-begin), + dest.device(), src1.device(), + src2.device(), src3.device(), begin, end, A, B, C); + } + + // ----------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform2(float* d, const float* s, size_t n, const float* A, const float* B) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A[i]*s[i] + B[i]; + } + } + __global__ void _cuda_affine_transform3(float* d, const float* s, size_t n, const float* A, const float* B, size_t bs) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A[i%bs]*s[i] + B[i%bs]; + } + } + + void affine_transform( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ) + { + DLIB_CASSERT(have_same_dimensions(dest, src)); + DLIB_CASSERT( + ((A.num_samples()==1 && B.num_samples()==1) || + (A.num_samples()==src.num_samples() && B.num_samples()==src.num_samples()))); + DLIB_CASSERT( + A.nr()==B.nr() && B.nr()==src.nr() && + A.nc()==B.nc() && B.nc()==src.nc() && + A.k() ==B.k() && B.k()==src.k(), + "\nA.nr(): " << A.nr() << "\nB.nr(): " << B.nr() << "\nsrc.nr(): " << src.nr() + <<"\nA.nc(): " << A.nc() << "\nB.nc(): " << B.nc() << "\nsrc.nc(): " << src.nc() + <<"\nA.k(): " << A.k() << "\nB.k(): " << B.k() << "\nsrc.k(): " << src.k() + ); + + if (A.num_samples() == 1) + { + launch_kernel(_cuda_affine_transform3,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A.device(), B.device(), A.size()); + } + else + { + launch_kernel(_cuda_affine_transform2,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A.device(), B.device()); + } + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_compute_adam_update( + size_t begin, + size_t 
end, + float* s, + float* m, + float* v, + const float alpha, + const float weight_decay, + const float momentum1, + const float momentum2, + const float* params, + const float* params_grad + ) + { + const float eps = 1e-8; + // The loop is equivalent to doing this: + // m = momentum1*m + (1-momentum1) * (weight_decay*params + params_grad); + // v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad); + // s = -alpha*m/(sqrt(v) + eps); + for (auto i : grid_stride_range(begin, end)) + { + float g = (weight_decay*params[i] + params_grad[i]); + m[i] = momentum1*m[i] + (1-momentum1)*g; + v[i] = momentum2*v[i] + (1-momentum2)*g*g; + s[i] = -alpha*m[i]/(std::sqrt(v[i]) + eps); + } + } + + void compute_adam_update ( + size_t begin, + size_t end, + tensor& s, + tensor& m, + tensor& v, + const float t, + const float learning_rate, + const float weight_decay, + const float momentum1, + const float momentum2, + const tensor& params, + const tensor& params_grad + ) + { + DLIB_CASSERT(s.size() == m.size() && + s.size() == v.size() && + s.size() == params.size() && + s.size() == params_grad.size()); + DLIB_CASSERT(begin <= end && end <= params.size()); + const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t)); + + launch_kernel(_cuda_compute_adam_update,max_jobs(end-begin), + begin, end, s.device(), m.device(), v.device(), alpha, weight_decay, + momentum1, momentum2, params.device(), params_grad.device()); + } + + // ----------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform_conv(float* d, const float* s, size_t n, const float* A, const float* B, size_t bs, size_t ks) + { + for (auto i : grid_stride_range(0, n)) + { + auto k = (i/bs)%ks; + d[i] = A[k]*s[i] + B[k]; + } + } + + void affine_transform_conv( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ) + { + DLIB_CASSERT(have_same_dimensions(dest, src)); + DLIB_CASSERT(have_same_dimensions(A, B)); + DLIB_CASSERT(A.num_samples() == 1 && A.nr() == 1 && A.nc() == 1 && A.k() == src.k()); + + launch_kernel(_cuda_affine_transform_conv,max_jobs(dest.size()), + dest.device(), src.device(), src.size(), A.device(), B.device(), src.nr()*src.nc(), src.k()); + } + + // ----------------------------------------------------------------------------------- + + __global__ void _add_bias_gradient(float* out, const float* in, size_t n, size_t total_n) + { + for (auto i : grid_stride_range(0, n)) + { + out[i] = in[i]; + for (size_t j = i+n; j < total_n; j+=n) + out[i] += in[j]; + } + } + + void assign_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + grad.num_samples() == 1 && + gradient_input.k() == grad.k() && + gradient_input.nr() == grad.nr() && + gradient_input.nc() == grad.nc() && + gradient_input.size() > 0); + + launch_kernel(_add_bias_gradient,max_jobs(grad.size()),grad.device(), gradient_input.device(), grad.size(), gradient_input.size()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _set_tensor(float* out, size_t n, const float val) + { + for (auto i : grid_stride_range(0, n)) + out[i] = val; + } + + void set_tensor ( + tensor& t, + float value + ) + { + launch_kernel(_set_tensor, max_jobs(t.size()), t.device(), t.size(), value); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _scale_tensor(float* out, size_t n, const float val) 
+ { + for (auto i : grid_stride_range(0, n)) + out[i] *= val; + } + + void scale_tensor ( + tensor& t, + float value + ) + { + launch_kernel(_scale_tensor, max_jobs(t.size()), t.device(), t.size(), value); + } + + // ----------------------------------------------------------------------------------- + // ----------------------------------------------------------------------------------- + + __global__ void _cuda_threshold(float* d, size_t n, float thresh) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = d[i]>thresh ? 1:0; + } + } + + void threshold ( + tensor& data, + float thresh + ) + { + launch_kernel(_cuda_threshold,max_jobs(data.size()),data.device(), data.size(), thresh); + } + + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_dot(const float* a, const float* b, size_t n, float* result) + { + // Parallel sum everything into local temp variables. + float temp = 0; + for(auto i : grid_stride_range(0, n)) + temp += a[i]*b[i]; + + // Then do the warp reduce add thing to merge into one output value. + warp_reduce_atomic_add(*result, temp); + } + + + void dot ( + const tensor& a, + const tensor& b, + tensor& result, + size_t idx + ) + { + DLIB_CASSERT(a.size() == b.size()); + DLIB_CASSERT(idx < result.size()); + + launch_kernel(_cuda_dot, max_jobs(a.size()), a.device(), b.device(), a.size(), result.device()+idx); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_prelu(const float* s, float* d, size_t n, const float* pp) + { + const float p = *pp; + for (auto i : grid_stride_range(0, n)) + { + if (s[i] > 0) + d[i] = s[i]; + else + d[i] = p*s[i]; + } + } + + void prelu ( + tensor& dest, + const tensor& src, + const tensor& param + ) + { + launch_kernel(_cuda_prelu, max_jobs(dest.size()), + src.device(), dest.device(), src.size(), param.device()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_prelu_gradient(float* out, const float* s, const float* gi, size_t n, const float* pp, float* ppgrad) + { + const float p = *pp; + float pgrad = 0; + for(auto i : grid_stride_range(0, n)) + { + if (s[i] > 0) + { + out[i] += gi[i]; + } + else + { + out[i] += p*gi[i]; + pgrad += gi[i]*s[i]; + } + } + + // Then do the warp reduce add thing to merge into one output value. 
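+            // Each warp sums its threads' pgrad values and issues one atomicAdd, so
+            // *ppgrad ends up holding the sum of gradient_input[i]*src[i] over every
+            // element with src[i] <= 0, i.e. the gradient of the loss w.r.t. p.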
+ warp_reduce_atomic_add(*ppgrad, pgrad); + } + + void prelu_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input, + const tensor& param, + tensor& params_grad + ) + { + params_grad = 0; + launch_kernel(_cuda_prelu_gradient, max_jobs(grad.size()), + grad.device(), src.device(), gradient_input.device(), grad.size(), + param.device(), params_grad.device()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_resize_bilinear(size_t dsize, size_t dchan_size, size_t dnc, float* d, + size_t schan_size, int snr, int snc, const float* s, + const float x_scale, const float y_scale) + { + for(auto i : grid_stride_range(0, dsize)) + { + const int idx = i%dchan_size; + const int channel = i/dchan_size; + const int sidx = channel*schan_size; + const int r = idx/dnc; + const int c = idx%dnc; + + const float y = r*y_scale; + const int top = static_cast<int>(::floor(y)); + const int bottom = ::min(top+1, snr-1); + const float tb_frac = y - top; + + const float x = c*x_scale; + const int left = static_cast<int>(::floor(x)); + const int right = ::min(left+1, snc-1); + const float lr_frac = x - left; + + float tl = s[sidx+top*snc+left]; + float tr = s[sidx+top*snc+right]; + float bl = s[sidx+bottom*snc+left]; + float br = s[sidx+bottom*snc+right]; + + float temp = (1-tb_frac)*((1-lr_frac)*tl + lr_frac*tr) + + tb_frac*((1-lr_frac)*bl + lr_frac*br); + + d[i] = temp; + } + } + + __global__ void _cuda_resize_bilinear_strided(size_t dsize, size_t dchan_size, size_t dnc, float* d, + size_t schan_size, int snr, int snc, const float* s, + const float x_scale, const float y_scale, + size_t dest_row_stride, size_t src_row_stride, size_t dest_chan_size_strided + ) + { + for(auto i : grid_stride_range(0, dsize)) + { + const int idx = i%dchan_size; + const int channel = i/dchan_size; + const int sidx = channel*schan_size; + const int r = idx/dnc; + const int c = idx%dnc; + const int didx = channel*dest_chan_size_strided + r*dest_row_stride+c; + + const float y = r*y_scale; + const int top = static_cast<int>(::floor(y)); + const int bottom = ::min(top+1, snr-1); + const float tb_frac = y - top; + + const float x = c*x_scale; + const int left = static_cast<int>(::floor(x)); + const int right = ::min(left+1, snc-1); + const float lr_frac = x - left; + + float tl = s[sidx+top*src_row_stride+left]; + float tr = s[sidx+top*src_row_stride+right]; + float bl = s[sidx+bottom*src_row_stride+left]; + float br = s[sidx+bottom*src_row_stride+right]; + + float temp = (1-tb_frac)*((1-lr_frac)*tl + lr_frac*tr) + + tb_frac*((1-lr_frac)*bl + lr_frac*br); + + d[didx] = temp; + } + } + + void resize_bilinear ( + tensor& dest, + long dest_row_stride, + long dest_channel_stride, + const tensor& src, + long src_row_stride, + long src_channel_stride + ) + { + DLIB_CASSERT(is_same_object(dest, src)==false); + DLIB_CASSERT(dest.num_samples() == src.num_samples()); + DLIB_CASSERT(dest.k() == src.k()); + + if (dest.size() == 0 || src.size() == 0) + return; + + const float x_scale = (src.nc()-1)/(float)std::max<long>((dest.nc()-1),1); + const float y_scale = (src.nr()-1)/(float)std::max<long>((dest.nr()-1),1); + + if (dest.nc() == dest_row_stride && dest.nr()*dest.nc()==dest_channel_stride && + src.nc() == src_row_stride && src.nr()*src.nc()==src_channel_stride) + { + launch_kernel(_cuda_resize_bilinear, + dest.size(), dest.nr()*dest.nc(), dest.nc(), dest.device(), + src.nr()*src.nc(), src.nr(), src.nc(), src.device(), + x_scale, y_scale); + } + else 
+ { + launch_kernel(_cuda_resize_bilinear_strided, + dest.size(), dest.nr()*dest.nc(), dest.nc(), dest.device(), + src_channel_stride, src.nr(), src.nc(), src.device(), + x_scale, y_scale, dest_row_stride, src_row_stride, dest_channel_stride); + } + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_resize_bilinear_gradient(size_t dsize, size_t dchan_size, size_t dnc, const float* d, + size_t schan_size, int snr, int snc, float* s, + const float x_scale, const float y_scale) + { + for(auto i : grid_stride_range(0, dsize)) + { + const float tmp = d[i]; + + const int idx = i%dchan_size; + const int channel = i/dchan_size; + const int sidx = channel*schan_size; + const int r = idx/dnc; + const int c = idx%dnc; + + const float y = r*y_scale; + const int top = static_cast<int>(::floor(y)); + const int bottom = ::min(top+1, snr-1); + const float tb_frac = y - top; + + const float x = c*x_scale; + const int left = static_cast<int>(::floor(x)); + const int right = ::min(left+1, snc-1); + const float lr_frac = x - left; + + + atomicAdd(s+sidx+top*snc+left, tmp*(1-tb_frac)*(1-lr_frac)); + atomicAdd(s+sidx+top*snc+right, tmp*(1-tb_frac)*(lr_frac)); + atomicAdd(s+sidx+bottom*snc+left, tmp*(tb_frac)*(1-lr_frac)); + atomicAdd(s+sidx+bottom*snc+right, tmp*(tb_frac)*(lr_frac)); + } + } + + __global__ void _cuda_resize_bilinear_gradient_strided(size_t dsize, size_t dchan_size, size_t dnc, const float* d, + size_t schan_size, int snr, int snc, float* s, + const float x_scale, const float y_scale, + size_t dest_row_stride, size_t src_row_stride, size_t dest_chan_size_strided + ) + { + for(auto i : grid_stride_range(0, dsize)) + { + + const int idx = i%dchan_size; + const int channel = i/dchan_size; + const int didx = channel*dest_chan_size_strided; + const int sidx = channel*schan_size; + const int r = idx/dnc; + const int c = idx%dnc; + + const float tmp = d[didx + r*dest_row_stride+c]; + + const float y = r*y_scale; + const int top = static_cast<int>(::floor(y)); + const int bottom = ::min(top+1, snr-1); + const float tb_frac = y - top; + + const float x = c*x_scale; + const int left = static_cast<int>(::floor(x)); + const int right = ::min(left+1, snc-1); + const float lr_frac = x - left; + + + atomicAdd(s+sidx+top*src_row_stride+left, tmp*(1-tb_frac)*(1-lr_frac)); + atomicAdd(s+sidx+top*src_row_stride+right, tmp*(1-tb_frac)*(lr_frac)); + atomicAdd(s+sidx+bottom*src_row_stride+left, tmp*(tb_frac)*(1-lr_frac)); + atomicAdd(s+sidx+bottom*src_row_stride+right, tmp*(tb_frac)*(lr_frac)); + } + } + + void resize_bilinear_gradient ( + tensor& grad, + long grad_row_stride, + long grad_channel_stride, + const tensor& gradient_input, + long gradient_input_row_stride, + long gradient_input_channel_stride + ) + { + DLIB_CASSERT(is_same_object(grad, gradient_input)==false); + DLIB_CASSERT(gradient_input.num_samples() == grad.num_samples()); + DLIB_CASSERT(gradient_input.k() == grad.k()); + + if (grad.size() == 0 || gradient_input.size() == 0) + return; + + const float x_scale = (grad.nc()-1)/(float)std::max<long>((gradient_input.nc()-1),1); + const float y_scale = (grad.nr()-1)/(float)std::max<long>((gradient_input.nr()-1),1); + + if (grad.nc() == grad_row_stride && grad.nr()*grad.nc()==grad_channel_stride && + gradient_input.nc() == gradient_input_row_stride && gradient_input.nr()*gradient_input.nc()==gradient_input_channel_stride) + { + launch_kernel(_cuda_resize_bilinear_gradient, + gradient_input.size(), 
gradient_input.nr()*gradient_input.nc(), gradient_input.nc(), gradient_input.device(), + grad.nr()*grad.nc(), grad.nr(), grad.nc(), grad.device(), + x_scale, y_scale); + } + else + { + launch_kernel(_cuda_resize_bilinear_gradient_strided, + gradient_input.size(), gradient_input.nr()*gradient_input.nc(), gradient_input.nc(), gradient_input.device(), + grad_channel_stride, grad.nr(), grad.nc(), grad.device(), + x_scale, y_scale, gradient_input_row_stride, grad_row_stride, gradient_input_channel_stride); + } + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_copy_tensor_add_to (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size) + { + for(auto i : grid_stride_range(0, size)) + { + size_t blk = i/block_size; + size_t j = i%block_size; + dest[blk*dest_stride + j] += src[blk*src_stride + j]; + } + } + + __global__ void _cuda_copy_tensor (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size) + { + for(auto i : grid_stride_range(0, size)) + { + size_t blk = i/block_size; + size_t j = i%block_size; + dest[blk*dest_stride + j] = src[blk*src_stride + j]; + } + } + + void copy_tensor( + bool add_to, + tensor& dest, + size_t dest_k_offset, + const tensor& src, + size_t src_k_offset, + size_t count_k + ) + { + const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k()); + const size_t src_sample_size = static_cast<size_t>(src.nc() * src.nr() * src.k()); + + const size_t block_size = count_k * dest.nc() * dest.nr(); + + DLIB_CASSERT(dest.num_samples() == src.num_samples() && + dest.nc() == src.nc() && dest.nr() == src.nr(), "All sources should fit into dest tensor size"); + DLIB_CASSERT(dest.k() - dest_k_offset >= count_k, "Not enough space in dest tensor"); + DLIB_CASSERT(src.k() - src_k_offset >= count_k, "Not enough space in src tensor"); + + float* dest_p = dest.device() + dest_k_offset * dest.nc() * dest.nr(); + const float* src_p = src.device() + src_k_offset * src.nc() * src.nr();; + + if (add_to) + { + launch_kernel(_cuda_copy_tensor_add_to, max_jobs(dest.size()), + dest_p, block_size*dest.num_samples(), + src_p, dest_sample_size, src_sample_size, block_size); + } + else + { + launch_kernel(_cuda_copy_tensor, max_jobs(dest.size()), + dest_p, block_size*dest.num_samples(), + src_p, dest_sample_size, src_sample_size, block_size); + } + } + + // ---------------------------------------------------------------------------------------- + + } +} + diff --git a/ml/dlib/dlib/dnn/cuda_dlib.h b/ml/dlib/dlib/dnn/cuda_dlib.h new file mode 100644 index 000000000..3a057ffc4 --- /dev/null +++ b/ml/dlib/dlib/dnn/cuda_dlib.h @@ -0,0 +1,469 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
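+
+// A sketch of how the copy_tensor() routine implemented above is typically used:
+// it copies count_k consecutive channels from src into dest for every sample,
+// which is the primitive a depth-concatenation layer needs.  Illustrative only:
+//
+//     // join a (N,k1,R,C) tensor A and a (N,k2,R,C) tensor B into a
+//     // (N,k1+k2,R,C) tensor dest:
+//     copy_tensor(false, dest, 0,     A, 0, A.k());
+//     copy_tensor(false, dest, A.k(), B, 0, B.k());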
+#ifndef DLIB_DNN_CuDA_H_ +#define DLIB_DNN_CuDA_H_ + + +#include "tensor.h" +#include "../geometry/rectangle.h" + +namespace dlib +{ + namespace cuda + { + + // ---------------------------------------------------------------------------------------- + + void set_device ( + int dev + ); + + int get_device ( + ); + + int get_num_devices ( + ); + + std::string get_device_name ( + int device + ); + + void set_current_device_blocking_sync( + ); + + bool can_access_peer (int device_id, int peer_device_id); + bool can_access_peer (const tensor& device, const tensor& peer_device); + + void device_synchronize (int dev); + void device_synchronize (const tensor& dev); + + + class raii_set_device + { + public: + raii_set_device() = delete; + raii_set_device(const raii_set_device&) = delete; + raii_set_device& operator=(const raii_set_device&) = delete; + + raii_set_device(int dev) + { + prev_dev = get_device(); + set_device(dev); + } + + raii_set_device(const tensor& dev) + { + prev_dev = get_device(); + set_device(dev.device_id()); + } + + void operator() (int dev) + { + set_device(dev); + } + + void operator() (const tensor& dev) + { + set_device(dev.device_id()); + } + + ~raii_set_device() noexcept(false) + { + set_device(prev_dev); + } + + private: + int prev_dev; + }; + + +#ifdef DLIB_USE_CUDA + + class enable_peer_access + { + public: + + enable_peer_access() = delete; + enable_peer_access(const enable_peer_access&) = delete; + enable_peer_access& operator=(const enable_peer_access&) = delete; + + enable_peer_access( + int device_id, + int peer_device_id + ); + + enable_peer_access( + const tensor& device, + const tensor& peer_device + ) : enable_peer_access(device.device_id(), peer_device.device_id()) + {} + + ~enable_peer_access() noexcept(false); + + private: + + bool call_disable; + int device_id; + int peer_device_id; + }; + + // ----------------------------------------------------------------------------------- + + void inverse_norms ( + resizable_tensor& invnorms, + const tensor& data, + const double eps + ); + + void dot_prods ( + resizable_tensor& out, + const tensor& lhs, + const tensor& rhs + ); + + void dot_prods ( + bool add_to, + tensor& out, + const tensor& lhs, + const tensor& rhs + ); + + void scale_columns ( + tensor& out, + const tensor& m, + const tensor& v + ); + + void scale_rows ( + tensor& out, + const tensor& m, + const tensor& v + ); + + void scale_rows2 ( + float beta, + tensor& out, + const tensor& m1, + const tensor& m2, + const tensor& v1, + const tensor& v2 + ); + + void exp ( + tensor& dest, + const tensor& src + ); + + void log ( + tensor& dest, + const tensor& src + ); + + void log10 ( + tensor& dest, + const tensor& src + ); + + // ------------------------------------------------------------------------------------ + + void set_tensor ( + tensor& t, + float value + ); + + void scale_tensor ( + tensor& t, + float value + ); + + // ------------------------------------------------------------------------------------ + + void multiply ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + + void multiply_conv ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + + void multiply_zero_padded ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + + void scale_channels ( + bool add_to, + tensor& dest, + const tensor& src, + const tensor& scales + ); + + void add ( + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + + // 
----------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const float A, + const float B + ); + + void affine_transform( + tensor& dest, + const tensor& src, + const float A + ); + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B, + const float C + ); + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B + ); + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C, + const float D + ); + + void affine_transform_range( + size_t begin, + size_t end, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C + ); + + void affine_transform( + const rectangle& rect, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + float A, + float B, + float C + ); + + // Note that this function isn't in the tt:: namespace because add_scaled() is + // called by cuda::add() so we don't need a tt:: version of add_scaled(). + void add_scaled( + tensor& dest, + const float scale, + const tensor& src + ); + + void add_cv_to_all_columns( + float beta, + tensor& dest, + float alpha, + const tensor& src + ); + + // ----------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ); + + // ----------------------------------------------------------------------------------- + + void affine_transform_conv( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ); + + // ---------------------------------------------------------------------------------------- + + void compute_adam_update ( + size_t begin, + size_t end, + tensor& s, + tensor& m, + tensor& v, + const float t, + const float learning_rate, + const float weight_decay, + const float momentum1, + const float momentum2, + const tensor& params, + const tensor& params_grad + ); + + // ----------------------------------------------------------------------------------- + + void assign_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ); + + // ----------------------------------------------------------------------------------- + + void threshold ( + tensor& data, + float thresh + ); + + // ---------------------------------------------------------------------------------------- + + void dot ( + const tensor& a, + const tensor& b, + tensor& result, + size_t idx + ); + + // ---------------------------------------------------------------------------------------- + + void prelu ( + tensor& dest, + const tensor& src, + const tensor& param + ); + + void prelu_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input, + const tensor& param, + tensor& params_grad + ); + + + // ---------------------------------------------------------------------------------------- + + void resize_bilinear ( + tensor& dest, + long dest_row_stride, + long dest_channel_stride, + const tensor& src, + long src_row_stride, + long src_channel_stride + ); + + void resize_bilinear_gradient ( + tensor& grad, + long grad_row_stride, + long grad_channel_stride, + const tensor& gradient_input, + long gradient_input_row_stride, + long gradient_input_channel_stride + ); + + inline void resize_bilinear ( + 
tensor& dest, + const tensor& src + ) { resize_bilinear(dest, dest.nc(), dest.nr()*dest.nc(), src, src.nc(), src.nr()*src.nc()); } + + inline void resize_bilinear_gradient ( + tensor& grad, + const tensor& gradient_input + ) { resize_bilinear_gradient(grad, grad.nc(), grad.nr()*grad.nc(), gradient_input, gradient_input.nc(), gradient_input.nr()*gradient_input.nc()); } + + // ---------------------------------------------------------------------------------------- + + void copy_tensor( + bool add_to, + tensor& dest, + size_t dest_k_offset, + const tensor& src, + size_t src_k_offset, + size_t count_k + ); + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + +#else // if DLIB_USE_CUDA NOT DEFINED + + inline void set_device ( + int id + ) + { + DLIB_CASSERT(id == 0, "dlib::cuda::set_device(id) called with an invalid device id."); + } + + inline int get_device ( + ){ return 0; } + + inline int get_num_devices ( + ) { return 1; } + + inline std::string get_device_name ( + int device + ) + { + DLIB_CASSERT(device == 0, "dlib::cuda::set_device(id) called with an invalid device id."); + return "CUDA_DISABLED"; + } + + inline void set_current_device_blocking_sync( + ) {} + + + inline bool can_access_peer (int , int ) + { return false; } + inline bool can_access_peer (const tensor& , const tensor& ) + { return false; } + + inline void device_synchronize (int ){} + inline void device_synchronize (const tensor& ){} + + class enable_peer_access + { + public: + enable_peer_access() = delete; + enable_peer_access(const enable_peer_access&) = delete; + enable_peer_access& operator=(const enable_peer_access&) = delete; + enable_peer_access( int, int ){} + enable_peer_access( const tensor&, const tensor& ) {} + }; + +#endif // DLIB_USE_CUDA + + } +} + + +#endif // DLIB_DNN_CuDA_H_ + diff --git a/ml/dlib/dlib/dnn/cuda_errors.h b/ml/dlib/dlib/dnn/cuda_errors.h new file mode 100644 index 000000000..fd28693c2 --- /dev/null +++ b/ml/dlib/dlib/dnn/cuda_errors.h @@ -0,0 +1,70 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_CUDA_ERRORs_H_ +#define DLIB_CUDA_ERRORs_H_ + + +#include "../error.h" + +namespace dlib +{ + struct cuda_error : public error + { + /*! + WHAT THIS OBJECT REPRESENTS + This is the exception thrown if any calls to the NVIDIA CUDA runtime + returns an error. + !*/ + + cuda_error(const std::string& message): error(message) {} + }; + + + struct cudnn_error : public cuda_error + { + /*! + WHAT THIS OBJECT REPRESENTS + This is the exception thrown if any calls to the NVIDIA cuDNN library + returns an error. + !*/ + + cudnn_error(const std::string& message): cuda_error(message) {} + }; + + struct curand_error : public cuda_error + { + /*! + WHAT THIS OBJECT REPRESENTS + This is the exception thrown if any calls to the NVIDIA cuRAND library + returns an error. + !*/ + + curand_error(const std::string& message): cuda_error(message) {} + }; + + struct cublas_error : public cuda_error + { + /*! + WHAT THIS OBJECT REPRESENTS + This is the exception thrown if any calls to the NVIDIA cuBLAS library + returns an error. 
+ !*/ + + cublas_error(const std::string& message): cuda_error(message) {} + }; + + struct cusolver_error : public cuda_error + { + /*! + WHAT THIS OBJECT REPRESENTS + This is the exception thrown if any call to the NVIDIA cuSolver library + returns an error. + !*/ + + cusolver_error(const std::string& message): cuda_error(message) {} + }; +} + + +#endif // DLIB_CUDA_ERRORs_H_ + diff --git a/ml/dlib/dlib/dnn/cuda_utils.h b/ml/dlib/dlib/dnn/cuda_utils.h new file mode 100644 index 000000000..673a4e8ad --- /dev/null +++ b/ml/dlib/dlib/dnn/cuda_utils.h @@ -0,0 +1,413 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_CUDA_UtILS_H_ +#define DLIB_CUDA_UtILS_H_ + +#ifndef DLIB_USE_CUDA +#error "This file shouldn't be #included unless DLIB_USE_CUDA is #defined" +#endif + +#include "cuda_errors.h" +#include "../algs.h" +#include <cmath> + +#include <cuda_runtime.h> +#include <sstream> +#include <iostream> +#include <memory> +#include <vector> +#include <type_traits> + + +// Check the return value of a call to the CUDA runtime for an error condition. +#define CHECK_CUDA(call) \ +do{ \ + const cudaError_t error = call; \ + if (error != cudaSuccess) \ + { \ + std::ostringstream sout; \ + sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\ + sout << "code: " << error << ", reason: " << cudaGetErrorString(error);\ + throw dlib::cuda_error(sout.str()); \ + } \ +}while(false) + +// ---------------------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +namespace dlib +{ + namespace cuda + { + + // ------------------------------------------------------------------------------------ + + __inline__ __device__ size_t pack_idx ( + size_t dim_size3, + size_t dim_size2, + size_t dim_size1, + size_t idx4, + size_t idx3, + size_t idx2, + size_t idx1 + ) + /*! + ensures + - Converts a 4D array index into a 1D index assuming row major layout. To + understand precisely what this function does, imagine we had an array + declared like this: + int ARRAY[anything][dim_size3][dim_size2][dim_size1]; + Then we could index it like this: + ARRAY[idx4][idx3][idx2][idx1] + or equivalently like this: + ((int*)ARRAY)[pack_idx(dim_size3,dim_size2,dim_size1, idx4,idx3,idx2,idx1)] + !*/ + { + return ((idx4*dim_size3 + idx3)*dim_size2 + idx2)*dim_size1 + idx1; + } + + __inline__ __device__ void unpack_idx ( + size_t idx, + size_t dim_size3, + size_t dim_size2, + size_t dim_size1, + size_t& idx4, + size_t& idx3, + size_t& idx2, + size_t& idx1 + ) + /*! + ensures + - This function computes the inverse of pack_idx(). 
Therefore, + if PACKED == pack_idx(dim_size3,dim_size2,dim_size1, idx4,idx3,idx2,idx1) + then unpack_idx(PACKED,dim_size3,dim_size2,dim_size1, IDX4,IDX3,IDX2,IDX1) + results in: + - IDX1 == idx1 + - IDX2 == idx2 + - IDX3 == idx3 + - IDX4 == idx4 + !*/ + { + idx1 = idx%dim_size1; + + idx /= dim_size1; + idx2 = idx%dim_size2; + + idx /= dim_size2; + idx3 = idx%dim_size3; + + idx /= dim_size3; + idx4 = idx; + } + + // ------------------------------------------------------------------------------------ + + // This function is from the article: + // http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/ + __inline__ __device__ float warp_reduce_sum(float val) + { + for (int offset = warpSize/2; offset > 0; offset /= 2) +#if CUDART_VERSION >= 9000 + val += __shfl_down_sync(0xFFFFFFFF,val, offset); +#else + val += __shfl_down(val, offset); +#endif + return val; + } + + __inline__ __device__ bool is_first_thread_in_warp() + { + return (threadIdx.x & (warpSize - 1)) == 0; + } + + __inline__ __device__ void warp_reduce_atomic_add( + float& out, + float val + ) + /*! + ensures + - Atomically adds all the val variables in the current warp to out. + See this page for an extended discussion: + http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/ + !*/ + { + val = warp_reduce_sum(val); + if (is_first_thread_in_warp()) + atomicAdd(&out, val); + } + + // ------------------------------------------------------------------------------------ + + struct max_jobs + { + max_jobs(int x) : num_x(x) {} + max_jobs(int x, int y) : num_x(x), num_y(y) {} + int num_x; + int num_y = 1; + }; + + template <typename Kernel, typename... T> + void launch_kernel ( + Kernel K, + T ...args + ) + /*! + ensures + - launches the given kernel K(args...). The point of this function is to + automatically set the kernel launch parameters to something reasonable + based on the properties of the kernel and the current GPU card. + !*/ + { + int num_blocks, num_threads; + CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&num_blocks,&num_threads,K)); + K<<<num_blocks,num_threads>>>(args...); + } + + template <typename Kernel, typename... T> + void launch_kernel ( + Kernel K, + max_jobs m, + T ...args + ) + /*! + ensures + - This function is just like launch_kernel(K,args...) except that you can + additionally supply a max_jobs number that tells it how many possible + total threads could be used. This is useful when launching potentially + small jobs that might not need the number of threads suggested by + launch_kernel(). + !*/ + { + if (m.num_x == 0 || m.num_y == 0) + return; + int num_blocks, num_threads; + CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&num_blocks,&num_threads,K)); + // Check if the job is really small and we don't really need to launch a kernel + // with this many blocks and threads. + if (num_blocks*num_threads > m.num_x*m.num_y) + num_blocks = (m.num_x*m.num_y+num_threads-1)/num_threads; + + if (m.num_y == 1) + { + K<<<num_blocks,num_threads>>>(args...); + } + else + { + /* + In general, the reason m.num_y!=1 (i.e. the reason you are in this + code path) is because we are using nested grid-stride loops. There are + two important things to note about what we are doing here. To + illustrate them we will talk about this little CUDA code snippet: + + // initialize out before we begin. 
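+ // (only the threads whose global x index is 0 run the j loop below, so each out[i] is zeroed exactly once)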
+ for (auto i : grid_stride_range_y(0, nr)) + for (auto j : grid_stride_range(0, 1)) + out[i] = 0; + + __syncthreads(); // synchronize threads in block + + // loop over some 2D thing and sum and store things into out. + for (auto i : grid_stride_range_y(0, nr)) + { + float temp = 0; + for (auto j : grid_stride_range(0, nc)) + temp += whatever[i*nc+j]; + + // store the sum into out[i] + warp_reduce_atomic_add(out[i], temp); + } + + First, we make sure the number of x threads is a multiple of 32 so that + you can use warp_reduce_atomic_add() inside the y loop. + + Second, we set the number of x blocks to 1 so that inter-block + synchronization is easier to get right. For example, if the number of x + blocks wasn't 1 the above code would have a race condition in it. This is + because the execution of out[i]=0 would be done by blocks with + blockIdx.x==0, but then in the second set of loops, *all* the x blocks use + out[i]. Since __syncthreads() doesn't do any synchronization between + blocks some of the blocks might begin before the out[i]=0 statements + finished and that would be super bad. + */ + + // Try to make sure that the ratio of x to y threads is reasonable given the + // respective sizes of our loops. (E.g. if num_threads is 1024 and m.num_x is + // at least 32 times m.num_y, then ratio saturates at 32 and we get a 1024x1 + // arrangement of threads.) + int x_threads = 32; + int y_threads = num_threads/32; + const int ratio = static_cast<int>(std::round(put_in_range(1, y_threads, m.num_x/(double)m.num_y))); + x_threads *= ratio; + y_threads /= ratio; + + dim3 blocks(1,num_blocks); + dim3 threads(x_threads,y_threads); + K<<<blocks,threads>>>(args...); + } + } + + // ------------------------------------------------------------------------------------ + + class grid_stride_range + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a tool for making a for loop that loops over an entire block of + memory inside a kernel, but doing so in a way that parallelizes + appropriately across all the threads in a kernel launch. For example, + the following kernel would add the vector a to the vector b and store + the output in out (assuming all vectors are of dimension n): + __global__ void add_arrays( + const float* a, + const float* b, + float* out, + size_t n + ) + { + for (auto i : grid_stride_range(0, n)) + { + out[i] = a[i]+b[i]; + } + } + !*/ + + public: + __device__ grid_stride_range( + size_t ibegin_, + size_t iend_ + ) : + ibegin(ibegin_), + iend(iend_) + {} + + class iterator + { + public: + __device__ iterator() {} + __device__ iterator(size_t pos_) : pos(pos_) {} + + __device__ size_t operator*() const + { + return pos; + } + + __device__ iterator& operator++() + { + pos += gridDim.x * blockDim.x; + return *this; + } + + __device__ bool operator!=(const iterator& item) const + { return pos < item.pos; } + + private: + size_t pos; + }; + + __device__ iterator begin() const + { + return iterator(ibegin+blockDim.x * blockIdx.x + threadIdx.x); + } + __device__ iterator end() const + { + return iterator(iend); + } + private: + + size_t ibegin; + size_t iend; + }; + + // ------------------------------------------------------------------------------------ + + class grid_stride_range_y + { + /*! + WHAT THIS OBJECT REPRESENTS + This object is just like grid_stride_range except that it looks at + CUDA's y thread index (e.g. threadIdx.y) instead of the x index. + Therefore, if you launch a cuda kernel with a statement like: + dim3 blocks(1,10); + dim3 threads(32,32); // You need to have x and y not equal to 1 to get parallelism over both loops. 
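+ // (rows are then strided over by gridDim.y*blockDim.y == 10*32 y-threads, while columns are strided over by blockDim.x == 32 x-threads)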
+ add_arrays<<<blocks,threads>>>(a,b,out,nr,nc); + You can perform a nested 2D parallel for loop rather than doing just a + 1D for loop. + + So the code in the kernel would look like this if you wanted to add two + 2D matrices: + __global__ void add_arrays( + const float* a, + const float* b, + float* out, + size_t nr, + size_t nc + ) + { + for (auto r : grid_stride_range_y(0, nr)) + { + for (auto c : grid_stride_range(0, nc)) + { + auto i = r*nc+c; + out[i] = a[i]+b[i]; + } + } + } + !*/ + + public: + __device__ grid_stride_range_y( + size_t ibegin_, + size_t iend_ + ) : + ibegin(ibegin_), + iend(iend_) + {} + + class iterator + { + public: + __device__ iterator() {} + __device__ iterator(size_t pos_) : pos(pos_) {} + + __device__ size_t operator*() const + { + return pos; + } + + __device__ iterator& operator++() + { + pos += gridDim.y * blockDim.y; + return *this; + } + + __device__ bool operator!=(const iterator& item) const + { return pos < item.pos; } + + private: + size_t pos; + }; + + __device__ iterator begin() const + { + return iterator(ibegin+blockDim.y * blockIdx.y + threadIdx.y); + } + __device__ iterator end() const + { + return iterator(iend); + } + private: + + size_t ibegin; + size_t iend; + }; + + // ------------------------------------------------------------------------------------ + + } +} + +#endif // __CUDACC__ + +// ---------------------------------------------------------------------------------------- + +#endif // DLIB_CUDA_UtILS_H_ + diff --git a/ml/dlib/dlib/dnn/cudnn_dlibapi.cpp b/ml/dlib/dlib/dnn/cudnn_dlibapi.cpp new file mode 100644 index 000000000..6926561f1 --- /dev/null +++ b/ml/dlib/dlib/dnn/cudnn_dlibapi.cpp @@ -0,0 +1,1604 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_CuDNN_CPP_ +#define DLIB_DNN_CuDNN_CPP_ + +#ifdef DLIB_USE_CUDA + +#include "cudnn_dlibapi.h" +#include "tensor.h" +#include <cudnn.h> +#include <iostream> +#include <string> +#include <vector> +#include "cuda_utils.h" +#include "cpu_dlib.h" +#include "cuda_dlib.h" +#include "tensor_tools.h" + +static const char* cudnn_get_error_string(cudnnStatus_t s) +{ + switch(s) + { + case CUDNN_STATUS_NOT_INITIALIZED: + return "CUDA Runtime API initialization failed."; + case CUDNN_STATUS_ALLOC_FAILED: + return "CUDA Resources could not be allocated."; + case CUDNN_STATUS_BAD_PARAM: + return "CUDNN_STATUS_BAD_PARAM"; + case CUDNN_STATUS_EXECUTION_FAILED: + return "CUDNN_STATUS_EXECUTION_FAILED"; + case CUDNN_STATUS_NOT_SUPPORTED: + return "CUDNN_STATUS_NOT_SUPPORTED"; + case CUDNN_STATUS_ARCH_MISMATCH: + return "CUDNN_STATUS_ARCH_MISMATCH: Your GPU is too old and not supported by cuDNN"; + default: + return "A call to cuDNN failed"; + } +} + +// Check the return value of a call to the cuDNN runtime for an error condition. +#define CHECK_CUDNN(call) \ +do{ \ + const cudnnStatus_t error = call; \ + if (error != CUDNN_STATUS_SUCCESS) \ + { \ + std::ostringstream sout; \ + sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". 
";\ + sout << "code: " << error << ", reason: " << cudnn_get_error_string(error);\ + throw dlib::cudnn_error(sout.str()); \ + } \ +}while(false) + + +namespace dlib +{ + + namespace cuda + { + + // ------------------------------------------------------------------------------------ + + static cudnnTensorDescriptor_t descriptor(const tensor& t) + { + return (const cudnnTensorDescriptor_t)t.get_cudnn_tensor_descriptor().get_handle(); + } + static cudnnTensorDescriptor_t descriptor(const tensor_descriptor& t) + { + return (const cudnnTensorDescriptor_t)t.get_handle(); + } + + // ------------------------------------------------------------------------------------ + + class cudnn_context + { + public: + // not copyable + cudnn_context(const cudnn_context&) = delete; + cudnn_context& operator=(const cudnn_context&) = delete; + + cudnn_context() + { + handles.resize(16); + } + ~cudnn_context() + { + for (auto h : handles) + { + if (h) + cudnnDestroy(h); + } + } + + cudnnHandle_t get_handle ( + ) + { + int new_device_id; + CHECK_CUDA(cudaGetDevice(&new_device_id)); + // make room for more devices if needed + if (new_device_id >= (long)handles.size()) + handles.resize(new_device_id+16); + + // If we don't have a handle already for this device then make one + if (!handles[new_device_id]) + CHECK_CUDNN(cudnnCreate(&handles[new_device_id])); + + // Finally, return the handle for the current device + return handles[new_device_id]; + } + + private: + + std::vector<cudnnHandle_t> handles; + }; + + static cudnnHandle_t context() + { + thread_local cudnn_context c; + return c.get_handle(); + } + // ------------------------------------------------------------------------------------ + + class cudnn_device_buffer + { + public: + // not copyable + cudnn_device_buffer(const cudnn_device_buffer&) = delete; + cudnn_device_buffer& operator=(const cudnn_device_buffer&) = delete; + + cudnn_device_buffer() + { + buffers.resize(16); + } + ~cudnn_device_buffer() + { + } + + std::shared_ptr<resizable_cuda_buffer> get_buffer ( + ) + { + int new_device_id; + CHECK_CUDA(cudaGetDevice(&new_device_id)); + // make room for more devices if needed + if (new_device_id >= (long)buffers.size()) + buffers.resize(new_device_id+16); + + // If we don't have a buffer already for this device then make one + std::shared_ptr<resizable_cuda_buffer> buff = buffers[new_device_id].lock(); + if (!buff) + { + buff = std::make_shared<resizable_cuda_buffer>(); + buffers[new_device_id] = buff; + } + + // Finally, return the buffer for the current device + return buff; + } + + private: + + std::vector<std::weak_ptr<resizable_cuda_buffer>> buffers; + }; + + + static std::shared_ptr<resizable_cuda_buffer> device_global_buffer() + { + thread_local cudnn_device_buffer buffer; + return buffer.get_buffer(); + } + // ------------------------------------------------------------------------------------ + + class cudnn_activation_descriptor + { + public: + // not copyable + cudnn_activation_descriptor(const cudnn_activation_descriptor&) = delete; + cudnn_activation_descriptor& operator=(const cudnn_activation_descriptor&) = delete; + + cudnn_activation_descriptor( + cudnnActivationMode_t mode, + cudnnNanPropagation_t reluNanOpt, + double reluCeiling + ) + { + CHECK_CUDNN(cudnnCreateActivationDescriptor(&handle)); + CHECK_CUDNN(cudnnSetActivationDescriptor(handle, mode, reluNanOpt, reluCeiling)); + } + + ~cudnn_activation_descriptor() + { + cudnnDestroyActivationDescriptor(handle); + } + + cudnnActivationDescriptor_t get_handle ( + ) + { + return handle; + } 
+ private: + cudnnActivationDescriptor_t handle; + }; + + static cudnnActivationDescriptor_t relu_activation_descriptor() + { + thread_local cudnn_activation_descriptor des(CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN,0); + return des.get_handle(); + } + + static cudnnActivationDescriptor_t sigmoid_activation_descriptor() + { + thread_local cudnn_activation_descriptor des(CUDNN_ACTIVATION_SIGMOID, CUDNN_PROPAGATE_NAN,0); + return des.get_handle(); + } + + static cudnnActivationDescriptor_t tanh_activation_descriptor() + { + thread_local cudnn_activation_descriptor des(CUDNN_ACTIVATION_TANH, CUDNN_PROPAGATE_NAN,0); + return des.get_handle(); + } + + // ------------------------------------------------------------------------------------ + + tensor_descriptor:: + tensor_descriptor( + ) : handle(nullptr) + { + } + + tensor_descriptor:: + ~tensor_descriptor() + { + set_size(0,0,0,0); + } + + void tensor_descriptor:: + set_size( + int n, + int k, + int nr, + int nc + ) + { + if (handle) + { + cudnnDestroyTensorDescriptor((cudnnTensorDescriptor_t)handle); + handle = nullptr; + } + + if (n != 0 && nr != 0 && nc != 0 && k != 0) + { + cudnnTensorDescriptor_t h; + CHECK_CUDNN(cudnnCreateTensorDescriptor(&h)); + handle = h; + + CHECK_CUDNN(cudnnSetTensor4dDescriptor((cudnnTensorDescriptor_t)handle, + CUDNN_TENSOR_NCHW, + CUDNN_DATA_FLOAT, + n, + k, + nr, + nc)); + } + } + + void tensor_descriptor:: + get_size ( + int& n, + int& k, + int& nr, + int& nc + ) const + { + if (handle) + { + int nStride, cStride, hStride, wStride; + cudnnDataType_t datatype; + CHECK_CUDNN(cudnnGetTensor4dDescriptor((cudnnTensorDescriptor_t)handle, + &datatype, + &n, + &k, + &nr, + &nc, + &nStride, + &cStride, + &hStride, + &wStride)); + } + else + { + n = 0; + k = 0; + nr = 0; + nc = 0; + } + } + + // ------------------------------------------------------------------------------------ + + void add( + float beta, + tensor& dest, + float alpha, + const tensor& src + ) + { + DLIB_CASSERT( + (have_same_dimensions(src, dest) || + (src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1) || + (src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc()) || + (src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc()) || + (src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1)) && + is_same_object(src,dest) == false , + "\n\t dest.num_samples(): " << dest.num_samples() + <<"\n\t dest.k(): " << dest.k() + <<"\n\t dest.nr(): " << dest.nr() + <<"\n\t dest.nc(): " << dest.nc() + <<"\n\t src.num_samples(): " << src.num_samples() + <<"\n\t src.k(): " << src.k() + <<"\n\t src.nr(): " << src.nr() + <<"\n\t src.nc(): " << src.nc() + ); + + if (dest.size() == src.size() && beta == 1) + { + // Call the dlib function in this case since it's faster than the one that + // comes with cuDNN (at least as of cuDNN v4). 
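+ // add_scaled() computes dest += alpha*src, which is exactly the beta == 1 case.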
+ add_scaled(dest, alpha, src); + return; + } + else if (src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1) + { + add_cv_to_all_columns(beta, dest, alpha, src); + return; + } + + CHECK_CUDNN(cudnnAddTensor(context(), + &alpha, + descriptor(src), + src.device(), + &beta, + descriptor(dest), + dest.device())); + } + + void assign_conv_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + grad.num_samples() == 1 && + grad.k() >= 1 && + grad.nr() == 1 && + grad.nc() == 1 && + gradient_input.k() == grad.k() && + gradient_input.size() > 0 && + is_same_object(grad,gradient_input) == false + ); + + const float alpha = 1; + const float beta = 0; + CHECK_CUDNN(cudnnConvolutionBackwardBias(context(), + &alpha, + descriptor(gradient_input), + gradient_input.device(), + &beta, + descriptor(grad), + grad.device())); + } + + // ------------------------------------------------------------------------------------ + + void batch_normalize_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ) + { + DLIB_CASSERT( + gamma.num_samples() == 1 && + gamma.nr() == src.nr() && + gamma.nc() == src.nc() && + gamma.k() == src.k() && + have_same_dimensions(gamma, beta) && + have_same_dimensions(gamma, running_means) && + have_same_dimensions(gamma, running_variances) && + eps > 0, + "\ngamma.num_samples(): " << gamma.num_samples() << + "\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\nbeta.num_samples(): " << beta.num_samples() << + "\nbeta.k(): " << beta.k() << + "\nbeta.nr(): " << beta.nr() << + "\nbeta.nc(): " << beta.nc() << + "\nrunning_means.num_samples(): " << running_means.num_samples() << + "\nrunning_means.k(): " << running_means.k() << + "\nrunning_means.nr(): " << running_means.nr() << + "\nrunning_means.nc(): " << running_means.nc() << + "\nrunning_variances.num_samples(): " << running_variances.num_samples() << + "\nrunning_variances.k(): " << running_variances.k() << + "\nrunning_variances.nr(): " << running_variances.nr() << + "\nrunning_variances.nc(): " << running_variances.nc() << + "\nsrc.k(): " << src.k() << + "\nsrc.nr(): " << src.nr() << + "\nsrc.nc(): " << src.nc() << + "\neps: " << eps + ); + const float in_scale = 1; + const float out_scale = 0; + + dest.copy_size(src); + + CHECK_CUDNN(cudnnBatchNormalizationForwardInference( + context(), + CUDNN_BATCHNORM_PER_ACTIVATION, + &in_scale, + &out_scale, + descriptor(src), + src.device(), + descriptor(dest), + dest.device(), + descriptor(gamma), + gamma.device(), + beta.device(), + running_means.device(), + running_variances.device(), + eps)); + } + + void batch_normalize ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& invstds, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ) + { + DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor); + DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means)); + DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds)); + DLIB_CASSERT( + src.num_samples() > 1 && + gamma.num_samples() == 1 && + beta.num_samples() == 1 && + gamma.nr() == beta.nr() && beta.nr() == src.nr() && + gamma.nc() == 
beta.nc() && beta.nc() == src.nc() && + gamma.k() == beta.k() && beta.k() == src.k() && + eps > 0, + "\ngamma.num_samples(): " << gamma.num_samples() << + "\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\nbeta.num_samples(): " << beta.num_samples() << + "\nbeta.k(): " << beta.k() << + "\nbeta.nr(): " << beta.nr() << + "\nbeta.nc(): " << beta.nc() << + "\nsrc.k(): " << src.k() << + "\nsrc.nr(): " << src.nr() << + "\nsrc.nc(): " << src.nc() << + "\neps: " << eps + ); + + const float in_scale = 1; + const float out_scale = 0; + + dest.copy_size(src); + means.set_size(1, src.k(), src.nr(), src.nc()); + invstds.copy_size(means); + running_means.copy_size(means); + running_variances.copy_size(means); + // cuDNN requires that running_means and running_variances be initialized to + // some valid float values even if the averaging factor would have ignored + // them. + if (averaging_factor == 1) + { + running_means = 0; + running_variances = 1; + } + + CHECK_CUDNN(cudnnBatchNormalizationForwardTraining( + context(), + CUDNN_BATCHNORM_PER_ACTIVATION, + &in_scale, + &out_scale, + descriptor(src), + src.device(), + descriptor(dest), + dest.device(), + descriptor(gamma), + gamma.device(), + beta.device(), + averaging_factor, + running_means.device(), + running_variances.device(), + eps, + means.device(), + invstds.device())); + } + + void batch_normalize_gradient( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ) + { + const long num = src.k()*src.nr()*src.nc(); + DLIB_CASSERT(src.num_samples() > 1); + DLIB_CASSERT(num == (long)means.size()); + DLIB_CASSERT(num == (long)invstds.size()); + DLIB_CASSERT(num == (long)gamma.size()); + DLIB_CASSERT(num == (long)gamma_grad.size()); + DLIB_CASSERT(num == (long)beta_grad.size()); + DLIB_CASSERT(have_same_dimensions(gradient_input, src)); + DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad)); + DLIB_CASSERT(eps > 0); + + const float in_scale = 1; + const float out_scale = 1; + const float in_scale_params = 1; + const float out_scale_params = 0; + + CHECK_CUDNN(cudnnBatchNormalizationBackward( + context(), + CUDNN_BATCHNORM_PER_ACTIVATION, + &in_scale, + &out_scale, + &in_scale_params, + &out_scale_params, + descriptor(src), + src.device(), + descriptor(gradient_input), + gradient_input.device(), + descriptor(src_grad), + src_grad.device(), + descriptor(gamma), + gamma.device(), + gamma_grad.device(), + beta_grad.device(), + eps, + means.device(), + invstds.device())); + } + + // ------------------------------------------------------------------------------------ + + void batch_normalize_conv_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ) + { + DLIB_CASSERT( + gamma.num_samples() == 1 && + gamma.nr() == 1 && + gamma.nc() == 1 && + gamma.k() == src.k() && + have_same_dimensions(gamma, beta) && + have_same_dimensions(gamma, running_means) && + have_same_dimensions(gamma, running_variances) && + eps > 0, + "\ngamma.num_samples(): " << gamma.num_samples() << + "\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\nbeta.num_samples(): " << beta.num_samples() << + "\nbeta.k(): " << beta.k() << + "\nbeta.nr(): " << beta.nr() << + "\nbeta.nc(): " << 
beta.nc() << + "\nrunning_means.num_samples(): " << running_means.num_samples() << + "\nrunning_means.k(): " << running_means.k() << + "\nrunning_means.nr(): " << running_means.nr() << + "\nrunning_means.nc(): " << running_means.nc() << + "\nrunning_variances.num_samples(): " << running_variances.num_samples() << + "\nrunning_variances.k(): " << running_variances.k() << + "\nrunning_variances.nr(): " << running_variances.nr() << + "\nrunning_variances.nc(): " << running_variances.nc() << + "\nsrc.k(): " << src.k() << + "\nsrc.nr(): " << src.nr() << + "\nsrc.nc(): " << src.nc() << + "\neps: " << eps + ); + const float in_scale = 1; + const float out_scale = 0; + + dest.copy_size(src); + + CHECK_CUDNN(cudnnBatchNormalizationForwardInference( + context(), + CUDNN_BATCHNORM_SPATIAL, + &in_scale, + &out_scale, + descriptor(src), + src.device(), + descriptor(dest), + dest.device(), + descriptor(gamma), + gamma.device(), + beta.device(), + running_means.device(), + running_variances.device(), + eps)); + } + + void batch_normalize_conv ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& invstds, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ) + { + DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor); + DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means)); + DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds)); + DLIB_CASSERT( + src.num_samples() > 1 && + gamma.num_samples() == 1 && + beta.num_samples() == 1 && + gamma.nr() == 1 && + beta.nr() == 1 && + gamma.nc() == 1 && + beta.nc() == 1 && + gamma.k() == beta.k() && beta.k() == src.k() && + eps > 0, + "\ngamma.num_samples(): " << gamma.num_samples() << + "\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\nbeta.num_samples(): " << beta.num_samples() << + "\nbeta.k(): " << beta.k() << + "\nbeta.nr(): " << beta.nr() << + "\nbeta.nc(): " << beta.nc() << + "\nsrc.k(): " << src.k() << + "\nsrc.nr(): " << src.nr() << + "\nsrc.nc(): " << src.nc() << + "\neps: " << eps + ); + const float in_scale = 1; + const float out_scale = 0; + + dest.copy_size(src); + means.set_size(1, src.k()); + invstds.copy_size(means); + running_means.copy_size(means); + running_variances.copy_size(means); + // cuDNN requires that running_means and running_variances be initialized to + // some valid float values even if the averaging factor would have ignored + // them. 
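+ // (when averaging_factor == 1 the running statistics are completely overwritten, so 0 and 1 are just convenient placeholder values)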
+ if (averaging_factor == 1) + { + running_means = 0; + running_variances = 1; + } + + CHECK_CUDNN(cudnnBatchNormalizationForwardTraining( + context(), + CUDNN_BATCHNORM_SPATIAL, + &in_scale, + &out_scale, + descriptor(src), + src.device(), + descriptor(dest), + dest.device(), + descriptor(gamma), + gamma.device(), + beta.device(), + averaging_factor, + running_means.device(), + running_variances.device(), + eps, + means.device(), + invstds.device())); + } + + void batch_normalize_conv_gradient( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ) + { + DLIB_CASSERT(src.k() == (long)means.size()); + DLIB_CASSERT(src.k() == (long)invstds.size()); + DLIB_CASSERT(src.k() == (long)gamma.size()); + DLIB_CASSERT(src.k() == (long)gamma_grad.size()); + DLIB_CASSERT(src.k() == (long)beta_grad.size()); + DLIB_CASSERT(have_same_dimensions(gradient_input, src)); + DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad)); + DLIB_CASSERT(eps > 0); + + const float in_scale = 1; + const float out_scale = 1; + const float in_scale_params = 1; + const float out_scale_params = 0; + + CHECK_CUDNN(cudnnBatchNormalizationBackward( + context(), + CUDNN_BATCHNORM_SPATIAL, + &in_scale, + &out_scale, + &in_scale_params, + &out_scale_params, + descriptor(src), + src.device(), + descriptor(gradient_input), + gradient_input.device(), + descriptor(src_grad), + src_grad.device(), + descriptor(gamma), + gamma.device(), + gamma_grad.device(), + beta_grad.device(), + eps, + means.device(), + invstds.device())); + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + tensor_conv:: + tensor_conv( + ) : + filter_handle(nullptr), + conv_handle(nullptr), + forward_algo(0), + backward_data_algo(0), + backward_filters_algo(0) + { + clear(); + } + + void tensor_conv:: + clear ( + ) + { + if (filter_handle) + cudnnDestroyFilterDescriptor((cudnnFilterDescriptor_t)filter_handle); + if (conv_handle) + cudnnDestroyConvolutionDescriptor((cudnnConvolutionDescriptor_t)conv_handle); + filter_handle = nullptr; + conv_handle = nullptr; + out_num_samples = 0; + out_k = 0; + out_nr = 0; + out_nc = 0; + + stride_y = 0; + stride_x = 0; + padding_y = 0; + padding_x = 0; + data_num_samples = 0; + data_k = 0; + data_nr = 0; + data_nc = 0; + filters_num_samples = 0; + filters_k = 0; + filters_nr = 0; + filters_nc = 0; + + forward_algo = 0; + backward_data_algo = 0; + backward_filters_algo = 0; + + forward_workspace_size_in_bytes = 0; + backward_data_workspace_size_in_bytes = 0; + backward_filters_workspace_size_in_bytes = 0; + + forward_workspace.reset(); + backward_data_workspace.reset(); + backward_filters_workspace.reset(); + workspace.reset(); + } + + void tensor_conv:: + setup( + const tensor& data, + const tensor& filters, + int stride_y_, + int stride_x_, + int padding_y_, + int padding_x_ + ) + { + DLIB_CASSERT(data.k() == filters.k()); + + // if the last call to setup gave the same exact settings then don't do + // anything. 
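+ // Recreating the cuDNN descriptors and re-running the algorithm selection below is relatively expensive, so we cache the last settings and skip the work when they repeat.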
+ if (stride_y_ == stride_y && + stride_x_ == stride_x && + padding_y_ == padding_y && + padding_x_ == padding_x && + data_num_samples == data.num_samples() && + data_k == data.k() && + data_nr == data.nr() && + data_nc == data.nc() && + filters_num_samples == filters.num_samples() && + filters_k == filters.k() && + filters_nr == filters.nr() && + filters_nc == filters.nc()) + { + return; + } + + clear(); + try + { + stride_y = stride_y_; + stride_x = stride_x_; + padding_y = padding_y_; + padding_x = padding_x_; + data_num_samples = data.num_samples(); + data_k = data.k(); + data_nr = data.nr(); + data_nc = data.nc(); + filters_num_samples = filters.num_samples(); + filters_k = filters.k(); + filters_nr = filters.nr(); + filters_nc = filters.nc(); + + CHECK_CUDNN(cudnnCreateFilterDescriptor((cudnnFilterDescriptor_t*)&filter_handle)); + CHECK_CUDNN(cudnnSetFilter4dDescriptor((cudnnFilterDescriptor_t)filter_handle, + CUDNN_DATA_FLOAT, + CUDNN_TENSOR_NCHW, + filters.num_samples(), + filters.k(), + filters.nr(), + filters.nc())); + + CHECK_CUDNN(cudnnCreateConvolutionDescriptor((cudnnConvolutionDescriptor_t*)&conv_handle)); +#if CUDNN_MAJOR >= 6 + CHECK_CUDNN(cudnnSetConvolution2dDescriptor((cudnnConvolutionDescriptor_t)conv_handle, + padding_y, // vertical padding + padding_x, // horizontal padding + stride_y, + stride_x, + 1, 1, // must be 1,1 + CUDNN_CROSS_CORRELATION, + CUDNN_DATA_FLOAT)); // could also be CUDNN_CONVOLUTION +#else + CHECK_CUDNN(cudnnSetConvolution2dDescriptor((cudnnConvolutionDescriptor_t)conv_handle, + padding_y, // vertical padding + padding_x, // horizontal padding + stride_y, + stride_x, + 1, 1, // must be 1,1 + CUDNN_CROSS_CORRELATION)); // could also be CUDNN_CONVOLUTION +#endif + + CHECK_CUDNN(cudnnGetConvolution2dForwardOutputDim( + (const cudnnConvolutionDescriptor_t)conv_handle, + descriptor(data), + (const cudnnFilterDescriptor_t)filter_handle, + &out_num_samples, + &out_k, + &out_nr, + &out_nc)); + + tensor_descriptor dest_desc; + dest_desc.set_size(out_num_samples,out_k,out_nr,out_nc); + + // Pick which forward algorithm we will use and allocate the necessary + // workspace buffer. + cudnnConvolutionFwdAlgo_t forward_best_algo; + CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithm( + context(), + descriptor(data), + (const cudnnFilterDescriptor_t)filter_handle, + (const cudnnConvolutionDescriptor_t)conv_handle, + descriptor(dest_desc), + dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_FWD_PREFER_FASTEST:CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, + std::numeric_limits<size_t>::max(), + &forward_best_algo)); + forward_algo = forward_best_algo; + CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize( + context(), + descriptor(data), + (const cudnnFilterDescriptor_t)filter_handle, + (const cudnnConvolutionDescriptor_t)conv_handle, + descriptor(dest_desc), + forward_best_algo, + &forward_workspace_size_in_bytes)); + + // Pick which backward data algorithm we will use and allocate the + // necessary workspace buffer. 
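+ // As with the forward algorithm, we ask for the fastest method when dnn_prefer_fastest_algorithms() is true and otherwise for one that needs no workspace.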
+ cudnnConvolutionBwdDataAlgo_t backward_data_best_algo; + CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithm( + context(), + (const cudnnFilterDescriptor_t)filter_handle, + descriptor(dest_desc), + (const cudnnConvolutionDescriptor_t)conv_handle, + descriptor(data), + dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST:CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE, + std::numeric_limits<size_t>::max(), + &backward_data_best_algo)); + backward_data_algo = backward_data_best_algo; + + CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize( + context(), + (const cudnnFilterDescriptor_t)filter_handle, + descriptor(dest_desc), + (const cudnnConvolutionDescriptor_t)conv_handle, + descriptor(data), + backward_data_best_algo, + &backward_data_workspace_size_in_bytes)); + + // Pick which backward filters algorithm we will use and allocate the + // necessary workspace buffer. + cudnnConvolutionBwdFilterAlgo_t backward_filters_best_algo; + CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithm( + context(), + descriptor(data), + descriptor(dest_desc), + (const cudnnConvolutionDescriptor_t)conv_handle, + (const cudnnFilterDescriptor_t)filter_handle, + dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST:CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE, + std::numeric_limits<size_t>::max(), + &backward_filters_best_algo)); + // cuDNN 5.1 has a bug that causes + // cudnnGetConvolutionBackwardFilterAlgorithm() to pick the winograd + // algorithm even for cases where cuDNN doesn't support it, leading to + // incorrect outputs. So here we check if we are in a case where winograd + // isn't supported and manually overrule + // cudnnGetConvolutionBackwardFilterAlgorithm() by picking a safe + // algorithm. + if (dnn_prefer_fastest_algorithms() && + !(stride_x == 1 && stride_y == 1 && ((filters_nr==3&&filters_nc==3) || (filters_nr==5&&filters_nc==5))) + ) + { + backward_filters_best_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; + } + backward_filters_algo = backward_filters_best_algo; + + CHECK_CUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize( + context(), + descriptor(data), + descriptor(dest_desc), + (const cudnnConvolutionDescriptor_t)conv_handle, + (const cudnnFilterDescriptor_t)filter_handle, + backward_filters_best_algo, + &backward_filters_workspace_size_in_bytes)); + + workspace = device_global_buffer(); + } + catch(...) + { + clear(); + throw; + } + } + + tensor_conv:: + ~tensor_conv ( + ) + { + clear(); + } + + void tensor_conv::operator() ( + const bool add_to_output, + resizable_tensor& output, + const tensor& data, + const tensor& filters + ) + { + DLIB_CASSERT(stride_y > 0 && stride_x > 0, "You must call setup() before calling this function"); + + output.set_size(out_num_samples, out_k, out_nr, out_nc); + (*this)(add_to_output, static_cast<tensor&>(output), data, filters); + } + + void tensor_conv::operator() ( + const bool add_to_output, + tensor& output, + const tensor& data, + const tensor& filters + ) + { + DLIB_CASSERT(is_same_object(output,data) == false); + DLIB_CASSERT(is_same_object(output,filters) == false); + DLIB_CASSERT(filters.k() == data.k()); + DLIB_CASSERT(stride_y > 0 && stride_x > 0, "You must call setup() before calling this function"); + DLIB_CASSERT(filters.nc() <= data.nc() + 2*padding_x, + "Filter windows must be small enough to fit into the padded image." 
+ << "\n\t filters.nc(): " << filters.nc() + << "\n\t data.nc(): " << data.nc() + << "\n\t padding_x: " << padding_x + ); + DLIB_CASSERT(filters.nr() <= data.nr() + 2*padding_y, + "Filter windows must be small enough to fit into the padded image." + << "\n\t filters.nr(): " << filters.nr() + << "\n\t data.nr(): " << data.nr() + << "\n\t padding_y: " << padding_y + ); + + + DLIB_CASSERT(output.num_samples() == data.num_samples(),out_num_samples << " " << data.num_samples()); + DLIB_CASSERT(output.k() == filters.num_samples()); + DLIB_CASSERT(output.nr() == 1+(data.nr()+2*padding_y-filters.nr())/stride_y); + DLIB_CASSERT(output.nc() == 1+(data.nc()+2*padding_x-filters.nc())/stride_x); + + + + const float alpha = 1; + const float beta = add_to_output ? 1 : 0; + + // Since cudnnConvolutionForward() is an asynchronous call, we need to hold a + // reference to the workspace buffer so we can be sure it isn't reallocated + // while the function is still executing on the device. But each time we come + // here, we make sure to grab the latest workspace buffer so that, globally, we + // minimize the number of such buffers. + forward_workspace = workspace->get(forward_workspace_size_in_bytes); + + CHECK_CUDNN(cudnnConvolutionForward( + context(), + &alpha, + descriptor(data), + data.device(), + (const cudnnFilterDescriptor_t)filter_handle, + filters.device(), + (const cudnnConvolutionDescriptor_t)conv_handle, + (cudnnConvolutionFwdAlgo_t)forward_algo, + forward_workspace, + forward_workspace_size_in_bytes, + &beta, + descriptor(output), + output.device())); + } + + void tensor_conv::get_gradient_for_data ( + const bool add_to_output, + const tensor& gradient_input, + const tensor& filters, + tensor& data_gradient + ) + { + const float alpha = 1; + const float beta = add_to_output ? 1 : 0; + + // Since cudnnConvolutionBackwardData() is an asynchronous call, we need to hold a + // reference to the workspace buffer so we can be sure it isn't reallocated + // while the function is still executing on the device. But each time we come + // here, we make sure to grab the latest workspace buffer so that, globally, we + // minimize the number of such buffers. + backward_data_workspace = workspace->get(backward_data_workspace_size_in_bytes); + + + CHECK_CUDNN(cudnnConvolutionBackwardData(context(), + &alpha, + (const cudnnFilterDescriptor_t)filter_handle, + filters.device(), + descriptor(gradient_input), + gradient_input.device(), + (const cudnnConvolutionDescriptor_t)conv_handle, + (cudnnConvolutionBwdDataAlgo_t)backward_data_algo, + backward_data_workspace, + backward_data_workspace_size_in_bytes, + &beta, + descriptor(data_gradient), + data_gradient.device())); + } + + void tensor_conv:: + get_gradient_for_filters ( + const bool add_to_output, + const tensor& gradient_input, + const tensor& data, + tensor& filters_gradient + ) + { + const float alpha = 1; + const float beta = add_to_output ? 1 : 0; + + // Since cudnnConvolutionBackwardFilter() is an asynchronous call, we need to hold a + // reference to the workspace buffer so we can be sure it isn't reallocated + // while the function is still executing on the device. But each time we come + // here, we make sure to grab the latest workspace buffer so that, globally, we + // minimize the number of such buffers. 
+ backward_filters_workspace = workspace->get(backward_filters_workspace_size_in_bytes); + + CHECK_CUDNN(cudnnConvolutionBackwardFilter(context(), + &alpha, + descriptor(data), + data.device(), + descriptor(gradient_input), + gradient_input.device(), + (const cudnnConvolutionDescriptor_t)conv_handle, + (cudnnConvolutionBwdFilterAlgo_t)backward_filters_algo, + backward_filters_workspace, + backward_filters_workspace_size_in_bytes, + &beta, + (const cudnnFilterDescriptor_t)filter_handle, + filters_gradient.device())); + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + pooling::pooling ( + ) : handle(nullptr),window_height(0),window_width(0),stride_y(0),stride_x(0),padding_y(0), padding_x(0) + { + } + + pooling::~pooling( + ) + { + clear(); + } + + void pooling:: + clear( + ) + { + if (handle) + cudnnDestroyPoolingDescriptor((cudnnPoolingDescriptor_t)handle); + handle = nullptr; + window_height = 0; + window_width = 0; + stride_y = 0; + stride_x = 0; + padding_y = 0; + padding_x = 0; + } + + void pooling:: + setup_max_pooling( + int window_height_, + int window_width_, + int stride_y_, + int stride_x_, + int padding_y_, + int padding_x_ + ) + { + setup(window_height_, window_width_, stride_y_, stride_x_, padding_y_, padding_x_, CUDNN_POOLING_MAX); + do_max_pooling = true; + } + + void pooling:: + setup_avg_pooling( + int window_height_, + int window_width_, + int stride_y_, + int stride_x_, + int padding_y_, + int padding_x_ + ) + { + setup(window_height_, window_width_, stride_y_, stride_x_, padding_y_, padding_x_, CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING); + do_max_pooling = false; + } + + void pooling:: + setup( + int window_height_, + int window_width_, + int stride_y_, + int stride_x_, + int padding_y_, + int padding_x_, + int pooling_mode + ) + { + DLIB_CASSERT (window_height_ > 0 && window_width_ > 0 && + stride_y_ > 0 && stride_x_ > 0 , + "window_height_: " << window_height_ + << "\t\n window_width_: " << window_width_ + << "\t\n stride_y_: " << stride_y_ + << "\t\n stride_x_: " << stride_x_ ); + DLIB_CASSERT( 0 <= padding_y_ && padding_y_ < window_height_ && + 0 <= padding_x_ && padding_x_ < window_width_, + "window_height_: " << window_height_ + << "\t\n window_width_: " << window_width_ + << "\t\n padding_y_: " << padding_y_ + << "\t\n padding_x_: " << padding_x_ ); + + if (window_height == window_height_ && + window_width == window_width_ && + stride_y == stride_y_ && + stride_x == stride_x_ && + padding_y == padding_y_ && + padding_x == padding_x_ + ) + { + return; + } + + clear(); + try + { + window_height = window_height_; + window_width = window_width_; + stride_x = stride_x_; + stride_y = stride_y_; + padding_y = padding_y_; + padding_x = padding_x_; + cudnnPoolingDescriptor_t poolingDesc; + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&poolingDesc)); + handle = poolingDesc; + + CHECK_CUDNN(cudnnSetPooling2dDescriptor(poolingDesc, + (cudnnPoolingMode_t)pooling_mode, + CUDNN_PROPAGATE_NAN, + window_height, + window_width, + padding_y, + padding_x, + stride_y, + stride_x)); + } + catch(...) + { + clear(); + throw; + } + } + + void pooling:: + operator() ( + resizable_tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(window_width <= src.nc() + 2*padding_x, + "Pooling windows must be small enough to fit into the padded image." 
+ << "\n\t window_width: " << window_width + << "\n\t src.nc(): " << src.nc() + << "\n\t padding_x: " << padding_x + ); + DLIB_CASSERT(window_height <= src.nr() + 2*padding_y, + "Pooling windows must be small enough to fit into the padded image." + << "\n\t window_height: " << window_height + << "\n\t src.nr(): " << src.nr() + << "\n\t padding_y: " << padding_y + ); + const float alpha = 1; + const float beta = 0; + int outN; + int outC; + int outH; + int outW; + CHECK_CUDNN(cudnnGetPooling2dForwardOutputDim((const cudnnPoolingDescriptor_t)handle, + descriptor(src), + &outN, + &outC, + &outH, + &outW)); + + + dest.set_size(outN,outC,outH,outW); + + DLIB_CASSERT(dest.num_samples() == src.num_samples()); + DLIB_CASSERT(dest.k() == src.k()); + DLIB_CASSERT(dest.nr() == 1 + (src.nr() + 2*padding_y - window_height)/stride_y, + "\n stride_y: " << stride_y << + "\n padding_y: " << padding_y << + "\n window_height: " << window_height << + "\n src.nr(): " << src.nr() << + "\n dest.nr(): " << dest.nr() << + "\n src.nr()/stride_y: " << src.nr()/stride_y); + DLIB_CASSERT(dest.nc() == 1 + (src.nc() + 2*padding_x - window_width)/stride_x, + "\n stride_x: " << stride_x << + "\n padding_x: " << padding_x << + "\n window_width: " << window_width << + "\n src.nc(): " << src.nc() << + "\n dest.nc(): " << dest.nc() << + "\n src.nc()/stride_x: " << src.nc()/stride_x); + + CHECK_CUDNN(cudnnPoolingForward(context(), + (const cudnnPoolingDescriptor_t)handle, + &alpha, + descriptor(src), + src.device(), + &beta, + descriptor(dest), + dest.device())); + } + + void pooling::get_gradient( + const tensor& gradient_input, + const tensor& dest, + const tensor& src, + tensor& grad + ) + { + DLIB_CASSERT(have_same_dimensions(gradient_input,dest)); + DLIB_CASSERT(have_same_dimensions(src,grad)); + + const float alpha = 1; + const float beta = 1; + CHECK_CUDNN(cudnnPoolingBackward(context(), + (const cudnnPoolingDescriptor_t)handle, + &alpha, + descriptor(dest), + dest.device(), + descriptor(gradient_input), + gradient_input.device(), + descriptor(src), + src.device(), + &beta, + descriptor(grad), + grad.device())); + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + void softmax ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + if (src.size() == 0) + return; + + const float alpha = 1; + const float beta = 0; + + CHECK_CUDNN(cudnnSoftmaxForward(context(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + descriptor(src), + src.device(), + &beta, + descriptor(dest), + dest.device())); + } + + + void softmax_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + have_same_dimensions(dest,gradient_input) == true && + have_same_dimensions(dest,grad) == true ); + if (dest.size() == 0) + return; + + const float alpha = 1; + const float beta = is_same_object(grad,gradient_input) ? 
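/* overwrite grad when computing in-place, otherwise accumulate into it; the other *_gradient functions below use the same pattern */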
0 : 1; + CHECK_CUDNN(cudnnSoftmaxBackward(context(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + descriptor(dest), + dest.device(), + descriptor(gradient_input), + gradient_input.device(), + &beta, + descriptor(grad), + grad.device())); + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + void softmax_all ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + if (src.size() == 0) + return; + + const float alpha = 1; + const float beta = 0; + + CHECK_CUDNN(cudnnSoftmaxForward(context(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_INSTANCE, + &alpha, + descriptor(src), + src.device(), + &beta, + descriptor(dest), + dest.device())); + } + + + void softmax_all_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + have_same_dimensions(dest,gradient_input) == true && + have_same_dimensions(dest,grad) == true ); + if (dest.size() == 0) + return; + + const float alpha = 1; + const float beta = is_same_object(grad,gradient_input) ? 0 : 1; + CHECK_CUDNN(cudnnSoftmaxBackward(context(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_INSTANCE, + &alpha, + descriptor(dest), + dest.device(), + descriptor(gradient_input), + gradient_input.device(), + &beta, + descriptor(grad), + grad.device())); + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + void sigmoid ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + if (src.size() == 0) + return; + + const float alpha = 1; + const float beta = 0; + CHECK_CUDNN(cudnnActivationForward(context(), + sigmoid_activation_descriptor(), + &alpha, + descriptor(src), + src.device(), + &beta, + descriptor(dest), + dest.device())); + } + + void sigmoid_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + have_same_dimensions(dest,gradient_input) == true && + have_same_dimensions(dest,grad) == true ); + if (dest.size() == 0) + return; + + const float alpha = 1; + const float beta = is_same_object(grad,gradient_input) ? 0 : 1; + CHECK_CUDNN(cudnnActivationBackward(context(), + sigmoid_activation_descriptor(), + &alpha, + descriptor(dest), + dest.device(), + descriptor(gradient_input), + gradient_input.device(), + descriptor(dest), + dest.device(), + &beta, + descriptor(grad), + grad.device())); + } + + // ------------------------------------------------------------------------------------ + + void relu ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + if (src.size() == 0) + return; + + const float alpha = 1; + const float beta = 0; + CHECK_CUDNN(cudnnActivationForward(context(), + relu_activation_descriptor(), + &alpha, + descriptor(src), + src.device(), + &beta, + descriptor(dest), + dest.device())); + } + + void relu_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + have_same_dimensions(dest,gradient_input) == true && + have_same_dimensions(dest,grad) == true ); + if (dest.size() == 0) + return; + + const float alpha = 1; + const float beta = is_same_object(grad,gradient_input) ? 
0 : 1; + CHECK_CUDNN(cudnnActivationBackward(context(), + relu_activation_descriptor(), + &alpha, + descriptor(dest), + dest.device(), + descriptor(gradient_input), + gradient_input.device(), + descriptor(dest), + dest.device(), + &beta, + descriptor(grad), + grad.device())); + } + + // ------------------------------------------------------------------------------------ + + void tanh ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + if (src.size() == 0) + return; + + const float alpha = 1; + const float beta = 0; + CHECK_CUDNN(cudnnActivationForward(context(), + tanh_activation_descriptor(), + &alpha, + descriptor(src), + src.device(), + &beta, + descriptor(dest), + dest.device())); + } + + void tanh_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + have_same_dimensions(dest,gradient_input) == true && + have_same_dimensions(dest,grad) == true); + if (dest.size() == 0) + return; + + const float alpha = 1; + const float beta = is_same_object(grad,gradient_input) ? 0 : 1; + CHECK_CUDNN(cudnnActivationBackward(context(), + tanh_activation_descriptor(), + &alpha, + descriptor(dest), + dest.device(), + descriptor(gradient_input), + gradient_input.device(), + descriptor(dest), + dest.device(), + &beta, + descriptor(grad), + grad.device())); + } + + // ------------------------------------------------------------------------------------ + } +} + +#endif // DLIB_USE_CUDA + +#endif // DLIB_DNN_CuDNN_CPP_ + + diff --git a/ml/dlib/dlib/dnn/cudnn_dlibapi.h b/ml/dlib/dlib/dnn/cudnn_dlibapi.h new file mode 100644 index 000000000..e9ffe5f6d --- /dev/null +++ b/ml/dlib/dlib/dnn/cudnn_dlibapi.h @@ -0,0 +1,518 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_CuDNN_H_ +#define DLIB_DNN_CuDNN_H_ + +#ifdef DLIB_USE_CUDA + +#include "cuda_errors.h" +#include <memory> +#include "cuda_data_ptr.h" + +namespace dlib +{ + class tensor; + class resizable_tensor; + + namespace cuda + { + + // ----------------------------------------------------------------------------------- + + class tensor_descriptor + { + /*! + Each tensor object will carry a tensor_descriptor in it when compiled with + CUDA. + !*/ + + public: + // not copyable + tensor_descriptor(const tensor_descriptor&) = delete; + tensor_descriptor& operator=(const tensor_descriptor&) = delete; + // but is movable + tensor_descriptor(tensor_descriptor&& item) : tensor_descriptor() { swap(item); } + tensor_descriptor& operator=(tensor_descriptor&& item) { swap(item); return *this; } + + tensor_descriptor(); + ~tensor_descriptor(); + + void set_size( + int n, + int k, + int nr, + int nc + ); + /*! + ensures + - if any of the arguments are 0 then they are all set to 0 in the tensor. + !*/ + + void get_size ( + int& n, + int& k, + int& nr, + int& nc + ) const; + + const void* get_handle ( + ) const { return handle; } + + private: + + void swap(tensor_descriptor& item) { std::swap(handle, item.handle); } + + void* handle; + }; + + // ------------------------------------------------------------------------------------ + + void add( + float beta, + tensor& dest, + float alpha, + const tensor& src + ); + /*! 
+ requires + - One of the following is true: + - have_same_dimensions(src, dest) + - src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1 + - src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc() + - src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc() + - src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1 + - is_same_object(src,dest) == false + ensures + - performs: dest = beta*dest + alpha*src + However, how the addition happens depends on the dimensions of src. In + particular, this function adds the scaled values of one src tensor to + dest. Each dimension of the src tensor must match the corresponding + dimension of the dest tensor or must be equal to 1. In the latter case, + the single src value along that dimension is broadcast and added to every + corresponding position in dest. + !*/ + + // ------------------------------------------------------------------------------------ + + void assign_conv_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ); + /*! + requires + - grad.num_samples() == 1 + - grad.k() >= 1 + - grad.nr() == 1 + - grad.nc() == 1 + - gradient_input.k() == grad.k() + - gradient_input.size() > 0 + - is_same_object(grad,gradient_input) == false + ensures + - let BIAS be a tensor with all dimensions equal to 1 except for k which is >= 1. + - let OUT be the output of add(1,OUT,1,BIAS) + - let f(gradient_input,BIAS) == dot(gradient_input,OUT) + - Then this function computes the gradient of f() with respect to BIAS and + assigns it to grad. + !*/ + + // ------------------------------------------------------------------------------------ + + void batch_normalize_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ); + + void batch_normalize ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& invstds, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ); + + void batch_normalize_gradient( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ); + + // ------------------------------------------------------------------------------------ + + void batch_normalize_conv_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ); + + void batch_normalize_conv ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& invstds, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ); + + void batch_normalize_conv_gradient( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ); + + // ------------------------------------------------------------------------------------ + + class tensor_conv + { + public: + tensor_conv(const tensor_conv&) = delete; + tensor_conv& operator=(const tensor_conv&) = delete; + + tensor_conv(); + + void clear( + ); + + ~tensor_conv ( + 
); + + void operator() ( + const bool add_to_output, + tensor& output, + const tensor& data, + const tensor& filters + ); + + void operator() ( + const bool add_to_output, + resizable_tensor& output, + const tensor& data, + const tensor& filters + ); + + void get_gradient_for_data ( + const bool add_to_output, + const tensor& gradient_input, + const tensor& filters, + tensor& data_gradient + ); + + void get_gradient_for_filters ( + const bool add_to_output, + const tensor& gradient_input, + const tensor& data, + tensor& filters_gradient + ); + + void setup( + const tensor& data, + const tensor& filters, + int stride_y, + int stride_x, + int padding_y, + int padding_x + ); + + private: + + // These variables record the type of data given to the last call to setup(). + int stride_y; + int stride_x; + int padding_y; + int padding_x; + long data_num_samples, data_k, data_nr, data_nc; + long filters_num_samples, filters_k, filters_nr, filters_nc; + + + void* filter_handle; + void* conv_handle; + + // dimensions of the output tensor from operator() + int out_num_samples; + int out_k; + int out_nr; + int out_nc; + + int forward_algo; + int backward_data_algo; + int backward_filters_algo; + + size_t forward_workspace_size_in_bytes; + size_t backward_data_workspace_size_in_bytes; + size_t backward_filters_workspace_size_in_bytes; + std::shared_ptr<resizable_cuda_buffer> workspace; + cuda_data_void_ptr forward_workspace; + cuda_data_void_ptr backward_data_workspace; + cuda_data_void_ptr backward_filters_workspace; + }; + + // ------------------------------------------------------------------------------------ + + class pooling + { + public: + + pooling(const pooling&) = delete; + pooling& operator=(const pooling&) = delete; + + pooling ( + ); + + ~pooling( + ); + + void clear( + ); + + void setup_max_pooling( + int window_height, + int window_width, + int stride_y, + int stride_x, + int padding_y, + int padding_x + ); + + void setup_avg_pooling( + int window_height, + int window_width, + int stride_y, + int stride_x, + int padding_y, + int padding_x + ); + + bool does_max_pooling( + ) const { return do_max_pooling; } + + void operator() ( + resizable_tensor& dest, + const tensor& src + ); + + void get_gradient( + const tensor& gradient_input, + const tensor& dest, + const tensor& src, + tensor& grad + ); + + private: + + void setup( + int window_height, + int window_width, + int stride_y, + int stride_x, + int padding_y, + int padding_x, + int pooling_mode + ); + + void* handle; + int window_height; + int window_width; + int stride_y; + int stride_x; + int padding_y; + int padding_x; + bool do_max_pooling; + }; + + // ------------------------------------------------------------------------------------ + + void softmax ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - Note that the softmax function is a vector valued function: + s(x) == exp(x)/sum(exp(x)) + - Computes the softmax function on src and writes the results to dest. The + softmax is computed per spatial location across the different channels at + each location. That is, softmax() outputs a new tensor, #dest, where + each of the spatial locations in dest (i.e. image idx, row idx, and + column idx) contains the output of s() evaluated over the channel values + at each location. + - This function supports in-place operation, i.e. 
having + is_same_object(dest, src)==true + !*/ + + void softmax_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! + requires + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + - is_same_object(grad, dest)==false + ensures + - We interpret dest as the output of softmax(dest,SRC) for some SRC tensor. + Then let f(SRC) == dot(gradient_input,dest) Then this function computes + the gradient of f() with respect to SRC and assigns it to grad. + - This function supports in-place operation, i.e. having + is_same_object(grad, gradient_input)==true + !*/ + + // ------------------------------------------------------------------------------------ + + void softmax_all ( + tensor& dest, + const tensor& src + ); + + void softmax_all_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + + // ------------------------------------------------------------------------------------ + + void sigmoid ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - for all valid i: + - #dest.host()[i] == 1/(1+std::exp(-src.host()[i])) + - This function supports in-place operation, i.e. having + is_same_object(dest, src)==true + !*/ + + void sigmoid_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! + requires + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + - is_same_object(grad,dest) == false + ensures + - Recalling that dest is the output of sigmoid(dest,SRC) for some SRC tensor, + let f(SRC) == dot(gradient_input,dest) + - Then this function computes the gradient of f() with respect to SRC and + assigns it to grad. + - This function supports in-place operation, i.e. having + is_same_object(grad, gradient_input)==true + !*/ + + // ------------------------------------------------------------------------------------ + + void relu ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - for all valid i: + - #dest.host()[i] == std::max(0,src.host()[i]) + - This function supports in-place operation, i.e. having + is_same_object(dest, src)==true + !*/ + + void relu_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! + requires + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + - is_same_object(grad,dest) == false + ensures + - Recalling that dest is the output of relu(dest,SRC) for some SRC tensor, + let f(SRC) == dot(gradient_input,dest) + - Then this function computes the gradient of f() with respect to SRC and + assigns it to grad. + - This function supports in-place operation, i.e. having + is_same_object(grad, gradient_input)==true + !*/ + + // ------------------------------------------------------------------------------------ + + void tanh ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - for all valid i: + - #dest.host()[i] == std::tanh(src.host()[i]) + - This function supports in-place operation, i.e. having + is_same_object(dest, src)==true + !*/ + + void tanh_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! 
+            requires
+                - have_same_dimensions(dest,gradient_input) == true
+                - have_same_dimensions(dest,grad) == true
+                - is_same_object(grad,dest) == false
+            ensures
+                - Recalling that dest is the output of tanh(dest,SRC) for some SRC tensor,
+                  let f(SRC) == dot(gradient_input,dest)
+                - Then this function computes the gradient of f() with respect to SRC and
+                  assigns it to grad.
+                - This function supports in-place operation, i.e. having
+                  is_same_object(grad, gradient_input)==true
+        !*/
+
+
+
+    // ------------------------------------------------------------------------------------
+
+    }
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuDNN_H_
+
diff --git a/ml/dlib/dlib/dnn/curand_dlibapi.cpp b/ml/dlib/dlib/dnn/curand_dlibapi.cpp
new file mode 100644
index 000000000..67828e664
--- /dev/null
+++ b/ml/dlib/dlib/dnn/curand_dlibapi.cpp
@@ -0,0 +1,113 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuRAND_CPP_
+#define DLIB_DNN_CuRAND_CPP_
+
+#ifdef DLIB_USE_CUDA
+
+#include "curand_dlibapi.h"
+#include <curand.h>
+#include "../string.h"
+
+static const char* curand_get_error_string(curandStatus_t s)
+{
+    switch(s)
+    {
+        case CURAND_STATUS_NOT_INITIALIZED:
+            return "CUDA Runtime API initialization failed.";
+        case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+            return "The requested length must be a multiple of two.";
+        default:
+            return "A call to cuRAND failed.";
+    }
+}
+
+// Check the return value of a call to the cuRAND runtime for an error condition.
+#define CHECK_CURAND(call)                                                     \
+do{                                                                            \
+    const curandStatus_t error = call;                                         \
+    if (error != CURAND_STATUS_SUCCESS)                                        \
+    {                                                                          \
+        std::ostringstream sout;                                               \
+        sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
+        sout << "code: " << error << ", reason: " << curand_get_error_string(error);\
+        throw dlib::curand_error(sout.str());                                  \
+    }                                                                          \
+}while(false)
+
+namespace dlib
+{
+    namespace cuda
+    {
+
+    // ----------------------------------------------------------------------------------------
+
+        curand_generator::
+        curand_generator(
+            unsigned long long seed
+        ) : handle(nullptr)
+        {
+            curandGenerator_t gen;
+            CHECK_CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
+            handle = gen;
+
+            CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(gen, seed));
+        }
+
+        curand_generator::
+        ~curand_generator()
+        {
+            if (handle)
+            {
+                curandDestroyGenerator((curandGenerator_t)handle);
+            }
+        }
+
+        void curand_generator::
+        fill_gaussian (
+            tensor& data,
+            float mean,
+            float stddev
+        )
+        {
+            if (data.size() == 0)
+                return;
+
+            CHECK_CURAND(curandGenerateNormal((curandGenerator_t)handle,
+                                        data.device(),
+                                        data.size(),
+                                        mean,
+                                        stddev));
+        }
+
+        void curand_generator::
+        fill_uniform (
+            tensor& data
+        )
+        {
+            if (data.size() == 0)
+                return;
+
+            CHECK_CURAND(curandGenerateUniform((curandGenerator_t)handle, data.device(), data.size()));
+        }
+
+        void curand_generator::
+        fill (
+            cuda_data_ptr<unsigned int>& data
+        )
+        {
+            if (data.size() == 0)
+                return;
+
+            CHECK_CURAND(curandGenerate((curandGenerator_t)handle, data, data.size()));
+        }
+
+    // -----------------------------------------------------------------------------------
+
+    }
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuRAND_CPP_
+
diff --git a/ml/dlib/dlib/dnn/curand_dlibapi.h b/ml/dlib/dlib/dnn/curand_dlibapi.h
new file mode 100644
index 000000000..cd51fecee
--- /dev/null
+++ b/ml/dlib/dlib/dnn/curand_dlibapi.h
@@ -0,0 +1,75 @@
+// Copyright (C) 2015 Davis E.
King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_CuRAND_H_ +#define DLIB_DNN_CuRAND_H_ + +#ifdef DLIB_USE_CUDA + +#include "tensor.h" +#include "cuda_errors.h" +#include "cuda_data_ptr.h" + +namespace dlib +{ + namespace cuda + { + + // ----------------------------------------------------------------------------------- + + class curand_generator + { + public: + // not copyable + curand_generator(const curand_generator&) = delete; + curand_generator& operator=(const curand_generator&) = delete; + + curand_generator() : curand_generator(0) {} + curand_generator(unsigned long long seed); + ~curand_generator(); + + void fill ( + cuda_data_ptr<unsigned int>& data + ); + /*! + ensures + - Fills data with random 32-bit unsigned integers. + !*/ + + void fill_gaussian ( + tensor& data, + float mean = 0, + float stddev = 1 + ); + /*! + requires + - data.size()%2 == 0 + - stddev >= 0 + ensures + - Fills data with random numbers drawn from a Gaussian distribution + with the given mean and standard deviation. + !*/ + + void fill_uniform ( + tensor& data + ); + /*! + ensures + - Fills data with uniform random numbers in the range (0.0, 1.0]. + !*/ + + private: + + void* handle; + }; + + // ----------------------------------------------------------------------------------- + + } +} + +#endif // DLIB_USE_CUDA + +#endif // DLIB_DNN_CuRAND_H_ + + + diff --git a/ml/dlib/dlib/dnn/cusolver_dlibapi.cu b/ml/dlib/dlib/dnn/cusolver_dlibapi.cu new file mode 100644 index 000000000..942613134 --- /dev/null +++ b/ml/dlib/dlib/dnn/cusolver_dlibapi.cu @@ -0,0 +1,204 @@ +// Copyright (C) 2017 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_CuSOLVER_CU_ +#define DLIB_DNN_CuSOLVER_CU_ + +#ifdef DLIB_USE_CUDA + +#include "cusolver_dlibapi.h" +#include <cublas_v2.h> +#include <cusolverDn.h> +#include "cuda_utils.h" + +// ---------------------------------------------------------------------------------------- + +static const char* cusolver_get_error_string(cusolverStatus_t s) +{ + switch(s) + { + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUDA Runtime API initialization failed."; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUDA Resources could not be allocated."; + default: + return "A call to cuSolver failed"; + } +} + +// Check the return value of a call to the cuSolver runtime for an error condition. +#define CHECK_CUSOLVER(call) \ +do{ \ + const cusolverStatus_t error = call; \ + if (error != CUSOLVER_STATUS_SUCCESS) \ + { \ + std::ostringstream sout; \ + sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". 
";\ + sout << "code: " << error << ", reason: " << cusolver_get_error_string(error);\ + throw dlib::cusolver_error(sout.str()); \ + } \ +}while(false) + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + +namespace dlib +{ + namespace cuda + { + + // ----------------------------------------------------------------------------------- + + class cusolver_context + { + public: + // not copyable + cusolver_context(const cusolver_context&) = delete; + cusolver_context& operator=(const cusolver_context&) = delete; + + cusolver_context() + { + handles.resize(16); + } + ~cusolver_context() + { + for (auto h : handles) + { + if (h) + cusolverDnDestroy(h); + } + } + + cusolverDnHandle_t get_handle ( + ) + { + int new_device_id; + CHECK_CUDA(cudaGetDevice(&new_device_id)); + // make room for more devices if needed + if (new_device_id >= (long)handles.size()) + handles.resize(new_device_id+16); + + // If we don't have a handle already for this device then make one + if (!handles[new_device_id]) + CHECK_CUSOLVER(cusolverDnCreate(&handles[new_device_id])); + + // Finally, return the handle for the current device + return handles[new_device_id]; + } + + private: + + std::vector<cusolverDnHandle_t> handles; + }; + + static cusolverDnHandle_t context() + { + thread_local cusolver_context c; + return c.get_handle(); + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_set_to_identity_matrix(float* m, size_t nr) + { + for (auto j : grid_stride_range(0, nr*nr)) + { + if (j%(nr+1) == 0) + m[j] = 1; + else + m[j] = 0; + } + } + + void set_to_identity_matrix ( + tensor& m + ) + { + DLIB_CASSERT(m.size() == m.num_samples()*m.num_samples()); + launch_kernel(_cuda_set_to_identity_matrix, max_jobs(m.size()), m.device(), m.num_samples()); + } + + // ------------------------------------------------------------------------------------ + + inv::~inv() + { + sync_if_needed(); + } + + // ------------------------------------------------------------------------------------ + + void inv:: + operator() ( + const tensor& m_, + resizable_tensor& out + ) + { + DLIB_CASSERT(m_.size() == m_.num_samples()*m_.num_samples(), "Input matrix must be square if you want to invert it."); + m = m_; + + out.copy_size(m); + set_to_identity_matrix(out); + + const int nc = m.num_samples(); + int Lwork; + CHECK_CUSOLVER(cusolverDnSgetrf_bufferSize(context(), nc , nc, m.device(), nc, &Lwork)); + + if (Lwork > (int)workspace.size()) + { + sync_if_needed(); + workspace = cuda_data_ptr<float>(Lwork); + } + if (nc > (int)Ipiv.size()) + { + sync_if_needed(); + Ipiv = cuda_data_ptr<int>(nc); + } + if (info.size() != 1) + { + info = cuda_data_ptr<int>(1); + } + + CHECK_CUSOLVER(cusolverDnSgetrf(context(), nc, nc, m.device(), nc, workspace, Ipiv, info)); + CHECK_CUSOLVER(cusolverDnSgetrs(context(), CUBLAS_OP_N, nc, nc, m.device(), nc, Ipiv, out.device(), nc, info)); + did_work_lately = true; + } + + // ------------------------------------------------------------------------------------ + + int inv:: + get_last_status( + ) + { + std::vector<int> linfo; + memcpy(linfo, info); + if (linfo.size() != 0) + return linfo[0]; + else + return 0; + } + + // 
------------------------------------------------------------------------------------ + + void inv:: + sync_if_needed() + { + if (did_work_lately) + { + did_work_lately = false; + // make sure we wait until any previous kernel launches have finished + // before we do something like deallocate the GPU memory. + cudaDeviceSynchronize(); + } + } + + // ------------------------------------------------------------------------------------ + + } +} + +#endif // DLIB_USE_CUDA + +#endif // DLIB_DNN_CuSOLVER_CU_ + + diff --git a/ml/dlib/dlib/dnn/cusolver_dlibapi.h b/ml/dlib/dlib/dnn/cusolver_dlibapi.h new file mode 100644 index 000000000..e5c77c151 --- /dev/null +++ b/ml/dlib/dlib/dnn/cusolver_dlibapi.h @@ -0,0 +1,75 @@ +// Copyright (C) 2017 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_CuSOLVER_H_ +#define DLIB_DNN_CuSOLVER_H_ + +#ifdef DLIB_USE_CUDA + +#include "tensor.h" +#include "cuda_errors.h" +#include "cuda_data_ptr.h" +#include "../noncopyable.h" + +namespace dlib +{ + namespace cuda + { + + // ----------------------------------------------------------------------------------- + + class inv : noncopyable + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a functor for doing matrix inversion on the GPU. The only + reason it's an object is to avoid the reallocation of some GPU memory + blocks if you want to do a bunch of matrix inversions in a row. + !*/ + + public: + + inv() = default; + ~inv(); + + void operator() ( + const tensor& m, + resizable_tensor& out + ); + /*! + requires + - m.size() == m.num_samples()*m.num_samples() + (i.e. mat(m) must be a square matrix) + ensures + - out == inv(mat(m)); + !*/ + + int get_last_status( + ); + /*! + ensures + - returns 0 if the last matrix inversion was successful and != 0 + otherwise. + !*/ + + private: + + void sync_if_needed(); + + bool did_work_lately = false; + resizable_tensor m; + cuda_data_ptr<float> workspace; + cuda_data_ptr<int> Ipiv; + cuda_data_ptr<int> info; + }; + + // ------------------------------------------------------------------------------------ + + } +} + +#endif // DLIB_USE_CUDA + +#endif // DLIB_DNN_CuSOLVER_H_ + + + diff --git a/ml/dlib/dlib/dnn/gpu_data.cpp b/ml/dlib/dlib/dnn/gpu_data.cpp new file mode 100644 index 000000000..6e7cec6be --- /dev/null +++ b/ml/dlib/dlib/dnn/gpu_data.cpp @@ -0,0 +1,228 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_GPU_DaTA_CPP_ +#define DLIB_GPU_DaTA_CPP_ + +// Only things that require CUDA are declared in this cpp file. Everything else is in the +// gpu_data.h header so that it can operate as "header-only" code when using just the CPU. 
+#ifdef DLIB_USE_CUDA + +#include "gpu_data.h" +#include <iostream> +#include "cuda_utils.h" +#include <cstring> + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + void memcpy ( + gpu_data& dest, + const gpu_data& src + ) + { + DLIB_CASSERT(dest.size() == src.size()); + if (src.size() == 0 || &dest == &src) + return; + + memcpy(dest,0, src, 0, src.size()); + } + + void memcpy ( + gpu_data& dest, + size_t dest_offset, + const gpu_data& src, + size_t src_offset, + size_t num + ) + { + DLIB_CASSERT(dest_offset + num <= dest.size()); + DLIB_CASSERT(src_offset + num <= src.size()); + if (num == 0) + return; + + // if there is aliasing + if (&dest == &src && std::max(dest_offset, src_offset) < std::min(dest_offset,src_offset)+num) + { + // if they perfectly alias each other then there is nothing to do + if (dest_offset == src_offset) + return; + else + std::memmove(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num); + } + else + { + // if we write to the entire thing then we can use device_write_only() + if (dest_offset == 0 && num == dest.size()) + { + // copy the memory efficiently based on which copy is current in each object. + if (src.device_ready()) + CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice)); + else + CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice)); + } + else + { + // copy the memory efficiently based on which copy is current in each object. + if (dest.device_ready() && src.device_ready()) + CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice)); + else if (!dest.device_ready() && src.device_ready()) + CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToHost)); + else if (dest.device_ready() && !src.device_ready()) + CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice)); + else + CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToHost)); + } + } + } +// ---------------------------------------------------------------------------------------- + + void gpu_data:: + wait_for_transfer_to_finish() const + { + if (have_active_transfer) + { + CHECK_CUDA(cudaStreamSynchronize((cudaStream_t)cuda_stream.get())); + have_active_transfer = false; + // Check for errors. These calls to cudaGetLastError() are what help us find + // out if our kernel launches have been failing. + CHECK_CUDA(cudaGetLastError()); + } + } + + void gpu_data:: + copy_to_device() const + { + // We want transfers to the device to always be concurrent with any device + // computation. So we use our non-default stream to do the transfer. + async_copy_to_device(); + wait_for_transfer_to_finish(); + } + + void gpu_data:: + copy_to_host() const + { + if (!host_current) + { + wait_for_transfer_to_finish(); + CHECK_CUDA(cudaMemcpy(data_host.get(), data_device.get(), data_size*sizeof(float), cudaMemcpyDeviceToHost)); + host_current = true; + // At this point we know our RAM block isn't in use because cudaMemcpy() + // implicitly syncs with the device. + device_in_use = false; + // Check for errors. These calls to cudaGetLastError() are what help us find + // out if our kernel launches have been failing. 
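+            // (Kernel launches are asynchronous, so a failed launch only becomes
+            // visible through a later error query such as this one.)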
+ CHECK_CUDA(cudaGetLastError()); + } + } + + void gpu_data:: + async_copy_to_device() const + { + if (!device_current) + { + if (device_in_use) + { + // Wait for any possible CUDA kernels that might be using our memory block to + // complete before we overwrite the memory. + CHECK_CUDA(cudaStreamSynchronize(0)); + device_in_use = false; + } + CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get())); + have_active_transfer = true; + device_current = true; + } + } + + void gpu_data:: + set_size( + size_t new_size + ) + { + if (new_size == 0) + { + if (device_in_use) + { + // Wait for any possible CUDA kernels that might be using our memory block to + // complete before we free the memory. + CHECK_CUDA(cudaStreamSynchronize(0)); + device_in_use = false; + } + wait_for_transfer_to_finish(); + data_size = 0; + host_current = true; + device_current = true; + device_in_use = false; + data_host.reset(); + data_device.reset(); + } + else if (new_size != data_size) + { + if (device_in_use) + { + // Wait for any possible CUDA kernels that might be using our memory block to + // complete before we free the memory. + CHECK_CUDA(cudaStreamSynchronize(0)); + device_in_use = false; + } + wait_for_transfer_to_finish(); + data_size = new_size; + host_current = true; + device_current = true; + device_in_use = false; + + try + { + CHECK_CUDA(cudaGetDevice(&the_device_id)); + + // free memory blocks before we allocate new ones. + data_host.reset(); + data_device.reset(); + + void* data; + CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float))); + // Note that we don't throw exceptions since the free calls are invariably + // called in destructors. They also shouldn't fail anyway unless someone + // is resetting the GPU card in the middle of their program. + data_host.reset((float*)data, [](float* ptr){ + auto err = cudaFreeHost(ptr); + if(err!=cudaSuccess) + std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl; + }); + + CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float))); + data_device.reset((float*)data, [](float* ptr){ + auto err = cudaFree(ptr); + if(err!=cudaSuccess) + std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl; + }); + + if (!cuda_stream) + { + cudaStream_t cstream; + CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking)); + cuda_stream.reset(cstream, [](void* ptr){ + auto err = cudaStreamDestroy((cudaStream_t)ptr); + if(err!=cudaSuccess) + std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl; + }); + } + + } + catch(...) + { + set_size(0); + throw; + } + } + } + +// ---------------------------------------------------------------------------------------- +} + +#endif // DLIB_USE_CUDA + +#endif // DLIB_GPU_DaTA_CPP_ + diff --git a/ml/dlib/dlib/dnn/gpu_data.h b/ml/dlib/dlib/dnn/gpu_data.h new file mode 100644 index 000000000..022a05f71 --- /dev/null +++ b/ml/dlib/dlib/dnn/gpu_data.h @@ -0,0 +1,266 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_GPU_DaTA_H_ +#define DLIB_GPU_DaTA_H_ + +#include "gpu_data_abstract.h" +#include <memory> +#include <cstring> +#include "cuda_errors.h" +#include "../serialize.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class gpu_data + { + /*! 
+ CONVENTION + - if (size() != 0) then + - data_host == a pointer to size() floats in CPU memory. + - if (data_device) then + - data_device == a pointer to size() floats in device memory. + + - if (there might be an active async transfer from host to device) then + - have_active_transfer == true + + - We use the host_current and device_current bools to keep track of which + copy of the data (or both) are most current. e.g. if the CPU has + modified the data and it hasn't been copied to the device yet then + host_current==true and device_current==false. + + Similarly, we use device_in_use==true to indicate that device() has been + called and no operation to wait for all CUDA kernel completion has been + executed. So if device_in_use==true then there might be a CUDA kernel + executing that is using the device memory block contained in this object. + + !*/ + public: + + gpu_data( + ) : data_size(0), host_current(true), device_current(true),have_active_transfer(false),device_in_use(false), the_device_id(0) + { + } + + // Not copyable + gpu_data(const gpu_data&) = delete; + gpu_data& operator=(const gpu_data&) = delete; + + // but is movable + gpu_data(gpu_data&& item) : gpu_data() { swap(item); } + gpu_data& operator=(gpu_data&& item) { swap(item); return *this; } + + int device_id() const { return the_device_id; } + +#ifdef DLIB_USE_CUDA + void async_copy_to_device() const; + void set_size(size_t new_size); +#else + // Note that calls to host() or device() will block until any async transfers are complete. + void async_copy_to_device() const{} + + void set_size(size_t new_size) + { + if (new_size == 0) + { + data_size = 0; + host_current = true; + device_current = true; + device_in_use = false; + data_host.reset(); + data_device.reset(); + } + else if (new_size != data_size) + { + data_size = new_size; + host_current = true; + device_current = true; + device_in_use = false; + data_host.reset(new float[new_size], std::default_delete<float[]>()); + data_device.reset(); + } + } +#endif + + const float* host() const + { + copy_to_host(); + return data_host.get(); + } + + float* host() + { + copy_to_host(); + device_current = false; + return data_host.get(); + } + + float* host_write_only() + { + host_current = true; + device_current = false; + return data_host.get(); + } + + const float* device() const + { +#ifndef DLIB_USE_CUDA + DLIB_CASSERT(false, "CUDA NOT ENABLED"); +#endif + copy_to_device(); + device_in_use = true; + return data_device.get(); + } + + float* device() + { +#ifndef DLIB_USE_CUDA + DLIB_CASSERT(false, "CUDA NOT ENABLED"); +#endif + copy_to_device(); + host_current = false; + device_in_use = true; + return data_device.get(); + } + + float* device_write_only() + { +#ifndef DLIB_USE_CUDA + DLIB_CASSERT(false, "CUDA NOT ENABLED"); +#endif + wait_for_transfer_to_finish(); + host_current = false; + device_current = true; + device_in_use = true; + return data_device.get(); + } + + bool host_ready ( + ) const { return host_current; } + + bool device_ready ( + ) const { return device_current && !have_active_transfer; } + + size_t size() const { return data_size; } + + void swap (gpu_data& item) + { + std::swap(data_size, item.data_size); + std::swap(host_current, item.host_current); + std::swap(device_current, item.device_current); + std::swap(have_active_transfer, item.have_active_transfer); + std::swap(data_host, item.data_host); + std::swap(data_device, item.data_device); + std::swap(cuda_stream, item.cuda_stream); + std::swap(the_device_id, item.the_device_id); + } + + private: + 
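+        // Host/device synchronization helpers.  CUDA builds define these in
+        // gpu_data.cpp; CPU-only builds make them no-ops below since there is
+        // only ever a host copy of the data.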
+#ifdef DLIB_USE_CUDA + void copy_to_device() const; + void copy_to_host() const; + void wait_for_transfer_to_finish() const; +#else + void copy_to_device() const{} + void copy_to_host() const{} + void wait_for_transfer_to_finish() const{} +#endif + + + size_t data_size; + mutable bool host_current; + mutable bool device_current; + mutable bool have_active_transfer; + mutable bool device_in_use; + + std::shared_ptr<float> data_host; + std::shared_ptr<float> data_device; + std::shared_ptr<void> cuda_stream; + int the_device_id; + }; + + inline void serialize(const gpu_data& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.size(), out); + auto data = item.host(); + for (size_t i = 0; i < item.size(); ++i) + serialize(data[i], out); + } + + inline void deserialize(gpu_data& item, std::istream& in) + { + int version; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::gpu_data."); + size_t s; + deserialize(s, in); + item.set_size(s); + auto data = item.host(); + for (size_t i = 0; i < item.size(); ++i) + deserialize(data[i], in); + } + +#ifdef DLIB_USE_CUDA + void memcpy (gpu_data& dest, const gpu_data& src); + + void memcpy ( + gpu_data& dest, + size_t dest_offset, + const gpu_data& src, + size_t src_offset, + size_t num + ); + +#else + + inline void memcpy (gpu_data& dest, const gpu_data& src) + { + DLIB_CASSERT(dest.size() == src.size()); + if (src.size() == 0 || &dest == &src) + return; + std::memcpy(dest.host_write_only(), src.host(), sizeof(float)*src.size()); + } + + inline void memcpy ( + gpu_data& dest, + size_t dest_offset, + const gpu_data& src, + size_t src_offset, + size_t num + ) + { + DLIB_CASSERT(dest_offset + num <= dest.size()); + DLIB_CASSERT(src_offset + num <= src.size()); + if (num == 0) + return; + if (&dest == &src && std::max(dest_offset, src_offset) < std::min(dest_offset,src_offset)+num) + { + // if they perfectly alias each other then there is nothing to do + if (dest_offset == src_offset) + return; + else + std::memmove(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num); + } + else + { + // if we write to the entire thing then we can use host_write_only() + if (dest_offset == 0 && num == dest.size()) + std::memcpy(dest.host_write_only(), src.host()+src_offset, sizeof(float)*num); + else + std::memcpy(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num); + } + } +#endif + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_GPU_DaTA_H_ + diff --git a/ml/dlib/dlib/dnn/gpu_data_abstract.h b/ml/dlib/dlib/dnn/gpu_data_abstract.h new file mode 100644 index 000000000..f2423dee1 --- /dev/null +++ b/ml/dlib/dlib/dnn/gpu_data_abstract.h @@ -0,0 +1,266 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_GPU_DaTA_ABSTRACT_H_ +#ifdef DLIB_GPU_DaTA_ABSTRACT_H_ + +#include "cuda_errors.h" +#include "../serialize.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class gpu_data + { + /*! + WHAT THIS OBJECT REPRESENTS + This object is a block of size() floats, all stored contiguously in memory. + Importantly, it keeps two copies of the floats, one on the host CPU side + and another on the GPU device side. It automatically performs the necessary + host/device transfers to keep these two copies of the data in sync. 
+ + All transfers to the device happen asynchronously with respect to the + default CUDA stream so that CUDA kernel computations can overlap with data + transfers. However, any transfers from the device to the host happen + synchronously in the default CUDA stream. Therefore, you should perform + all your CUDA kernel launches on the default stream so that transfers back + to the host do not happen before the relevant computations have completed. + + If DLIB_USE_CUDA is not #defined then this object will not use CUDA at all. + Instead, it will simply store one host side memory block of floats. + + THREAD SAFETY + Instances of this object are not thread-safe. So don't touch one from + multiple threads at the same time. + !*/ + public: + + gpu_data( + ); + /*! + ensures + - #size() == 0 + - #host() == nullptr + - #device() == nullptr + - #host_ready() == true + - #device_ready() == true + - #device_id() == 0 + !*/ + + // This object is not copyable, however, it is movable. + gpu_data(const gpu_data&) = delete; + gpu_data& operator=(const gpu_data&) = delete; + gpu_data(gpu_data&& item); + gpu_data& operator=(gpu_data&& item); + + int device_id( + ) const; + /*! + ensures + - returns the ID of the CUDA device that allocated this memory. I.e. the + number returned by cudaGetDevice() when the memory was allocated. + - If CUDA is not being used then this function always returns 0. + !*/ + + void async_copy_to_device( + ); + /*! + ensures + - if (!device_ready()) then + - Begins asynchronously copying host data to the device once it is safe + to do so. I.e. This function will wait until any previously + scheduled CUDA kernels, which are using the device() memory block, + have completed before transferring the new data to the device. + - A call to device() that happens before the transfer completes will + block until the transfer is complete. That is, it is safe to call + async_copy_to_device() and then immediately call device(). + !*/ + + void set_size( + size_t new_size + ); + /*! + ensures + - #size() == new_size + !*/ + + bool host_ready ( + ) const; + /*! + ensures + - returns true if and only if the host's copy of the data is current. The + host's data is current if there aren't any modifications to the data + which were made on the device side that have yet to be copied to the + host. + !*/ + + bool device_ready ( + ) const; + /*! + ensures + - returns true if and only if the device's copy of the data is current. + The device's data is current if there aren't any modifications to the + data which were made on the host side that have yet to be copied to the + device. + !*/ + + const float* host( + ) const; + /*! + ensures + - returns a pointer to the host memory block of size() contiguous float + values or nullptr if size()==0. + - if (!host_ready()) then + - copies the data from the device to the host, while this is happening + the call to host() blocks. + - #host_ready() == true + !*/ + + float* host( + ); + /*! + ensures + - returns a pointer to the host memory block of size() contiguous float + values or nullptr if size()==0. + - if (!host_ready()) then + - copies the data from the device to the host, while this is happening + the call to host() blocks. + - #host_ready() == true + - #device_ready() == false + I.e. Marks the device side data as out of date so that the next call to + device() will perform a host to device transfer. If you want to begin + the transfer immediately then you can call async_copy_to_device() after + calling host(). + !*/ + + float* host_write_only( + ); + /*! 
+ ensures + - This function returns the same pointer as host(), except that it never + performs a device to host memory copy. Instead, it immediately marks the + device side data as out of date, effectively discarding it. Therefore, + the values in the data pointed to by host_write_only() are undefined and + you should only call host_write_only() if you are going to assign to + every memory location in the returned memory block. + - #host_ready() == true + - #device_ready() == false + !*/ + + const float* device( + ) const; + /*! + requires + - DLIB_USE_CUDA is #defined + ensures + - returns a pointer to the device memory block of size() contiguous float + values or nullptr if size()==0. + - if (!device_ready()) then + - copies the data from the host to the device, while this is happening + the call to device() blocks. + - #device_ready() == true + !*/ + + float* device( + ); + /*! + requires + - DLIB_USE_CUDA is #defined + ensures + - returns a pointer to the device memory block of size() contiguous float + values or nullptr if size()==0. + - if (!device_ready()) then + - copies the data from the host to the device, while this is happening + the call to device() blocks. + - #host_ready() == false + - #device_ready() == true + !*/ + + float* device_write_only( + ); + /*! + requires + - DLIB_USE_CUDA is #defined + ensures + - This function returns the same pointer as device(), except that it never + performs a host to device memory copy. Instead, it immediately marks the + host side data as out of date, effectively discarding it. Therefore, the + values in the data pointed to by device_write_only() are undefined and + you should only call device_write_only() if you are going to assign to + every memory location in the returned memory block. + - #host_ready() == false + - #device_ready() == true + !*/ + + + size_t size( + ) const; + /*! + ensures + - returns the number of floats contained in this object. + !*/ + + void swap ( + gpu_data& item + ); + /*! + ensures + - swaps the state of *this and item + !*/ + + }; + + void serialize(const gpu_data& item, std::ostream& out); + void deserialize(gpu_data& item, std::istream& in); + /*! + provides serialization support + !*/ + + void memcpy ( + gpu_data& dest, + const gpu_data& src + ); + /*! + requires + - dest.size() == src.size() + ensures + - Copies the data in src to dest. If the device data is current (i.e. + device_ready()==true) on both src and dest then the copy will happen entirely + on the device side. + - It doesn't matter what GPU device is selected by cudaSetDevice(). You can + always copy gpu_data objects to and from each other regardless. + - This function blocks until the copy has completed. + !*/ + + void memcpy ( + gpu_data& dest, + size_t dest_offset, + const gpu_data& src, + size_t src_offset, + size_t num + ); + /*! + requires + - dest_offset + num <= dest.size() + - src_offset + num <= src.size() + ensures + - Copies the data in src to dest, but only copies data in the range + [src.host()+src_offset, src.host()+src_offset+num) to + [dest.host()+dest_offset, dest.host()+dest_offset+num). Therefore, it is + just like the above memcpy() except that you can specify some subset of data + in a gpu_data object to be copied. + - Like the above version of memcpy(), the copy will happen in the most + efficient way, automatically using the appropriate type of host/device + transfers based on where data is currently resident. + - It doesn't matter what GPU device is selected by cudaSetDevice(). 
You can + always copy gpu_data objects to and from each other regardless. + - This function blocks until the copy has completed. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_GPU_DaTA_ABSTRACT_H_ + diff --git a/ml/dlib/dlib/dnn/input.h b/ml/dlib/dlib/dnn/input.h new file mode 100644 index 000000000..3b5c954e6 --- /dev/null +++ b/ml/dlib/dlib/dnn/input.h @@ -0,0 +1,808 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNn_INPUT_H_ +#define DLIB_DNn_INPUT_H_ + +#include "input_abstract.h" +#include "../matrix.h" +#include "../array2d.h" +#include "../pixel.h" +#include "../image_processing.h" +#include <sstream> +#include <array> +#include "tensor_tools.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template <typename T> + class input + { + const static bool always_false = sizeof(T)!=sizeof(T); + static_assert(always_false, "Unsupported type given to input<>. input<> only supports " + "dlib::matrix and dlib::array2d objects."); + }; + +// ---------------------------------------------------------------------------------------- + + template <size_t NR, size_t NC=NR> + class input_rgb_image_sized; + + class input_rgb_image + { + public: + typedef matrix<rgb_pixel> input_type; + + input_rgb_image ( + ) : + avg_red(122.782), + avg_green(117.001), + avg_blue(104.298) + { + } + + input_rgb_image ( + float avg_red_, + float avg_green_, + float avg_blue_ + ) : avg_red(avg_red_), avg_green(avg_green_), avg_blue(avg_blue_) + {} + + template <size_t NR, size_t NC> + inline input_rgb_image ( + const input_rgb_image_sized<NR,NC>& item + ); + + float get_avg_red() const { return avg_red; } + float get_avg_green() const { return avg_green; } + float get_avg_blue() const { return avg_blue; } + + bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); } + drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; } + drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + DLIB_CASSERT(std::distance(ibegin,iend) > 0); + const auto nr = ibegin->nr(); + const auto nc = ibegin->nc(); + // make sure all the input matrices have the same dimensions + for (auto i = ibegin; i != iend; ++i) + { + DLIB_CASSERT(i->nr()==nr && i->nc()==nc, + "\t input_rgb_image::to_tensor()" + << "\n\t All matrices given to to_tensor() must have the same dimensions." + << "\n\t nr: " << nr + << "\n\t nc: " << nc + << "\n\t i->nr(): " << i->nr() + << "\n\t i->nc(): " << i->nc() + ); + } + + + // initialize data to the right size to contain the stuff in the iterator range. 
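+            // The tensor uses a planar (sample, channel, row, column) layout, so the
+            // red, green, and blue values of one pixel end up nr*nc floats apart.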
+ data.set_size(std::distance(ibegin,iend), 3, nr, nc); + + + const size_t offset = nr*nc; + auto ptr = data.host(); + for (auto i = ibegin; i != iend; ++i) + { + for (long r = 0; r < nr; ++r) + { + for (long c = 0; c < nc; ++c) + { + rgb_pixel temp = (*i)(r,c); + auto p = ptr++; + *p = (temp.red-avg_red)/256.0; + p += offset; + *p = (temp.green-avg_green)/256.0; + p += offset; + *p = (temp.blue-avg_blue)/256.0; + p += offset; + } + } + ptr += offset*(data.k()-1); + } + + } + + friend void serialize(const input_rgb_image& item, std::ostream& out) + { + serialize("input_rgb_image", out); + serialize(item.avg_red, out); + serialize(item.avg_green, out); + serialize(item.avg_blue, out); + } + + friend void deserialize(input_rgb_image& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "input_rgb_image" && version != "input_rgb_image_sized") + throw serialization_error("Unexpected version found while deserializing dlib::input_rgb_image."); + deserialize(item.avg_red, in); + deserialize(item.avg_green, in); + deserialize(item.avg_blue, in); + + // read and discard the sizes if this was really a sized input layer. + if (version == "input_rgb_image_sized") + { + size_t nr, nc; + deserialize(nr, in); + deserialize(nc, in); + } + } + + friend std::ostream& operator<<(std::ostream& out, const input_rgb_image& item) + { + out << "input_rgb_image("<<item.avg_red<<","<<item.avg_green<<","<<item.avg_blue<<")"; + return out; + } + + friend void to_xml(const input_rgb_image& item, std::ostream& out) + { + out << "<input_rgb_image r='"<<item.avg_red<<"' g='"<<item.avg_green<<"' b='"<<item.avg_blue<<"'/>"; + } + + private: + float avg_red; + float avg_green; + float avg_blue; + }; + +// ---------------------------------------------------------------------------------------- + + template <size_t NR, size_t NC> + class input_rgb_image_sized + { + public: + static_assert(NR != 0 && NC != 0, "The input image can't be empty."); + + typedef matrix<rgb_pixel> input_type; + + input_rgb_image_sized ( + ) : + avg_red(122.782), + avg_green(117.001), + avg_blue(104.298) + { + } + + input_rgb_image_sized ( + const input_rgb_image& item + ) : avg_red(item.get_avg_red()), + avg_green(item.get_avg_green()), + avg_blue(item.get_avg_blue()) + {} + + input_rgb_image_sized ( + float avg_red_, + float avg_green_, + float avg_blue_ + ) : avg_red(avg_red_), avg_green(avg_green_), avg_blue(avg_blue_) + {} + + float get_avg_red() const { return avg_red; } + float get_avg_green() const { return avg_green; } + float get_avg_blue() const { return avg_blue; } + + bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); } + drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; } + drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + DLIB_CASSERT(std::distance(ibegin,iend) > 0); + // make sure all input images have the correct size + for (auto i = ibegin; i != iend; ++i) + { + DLIB_CASSERT(i->nr()==NR && i->nc()==NC, + "\t input_rgb_image_sized::to_tensor()" + << "\n\t All input images must have "<<NR<<" rows and "<<NC<< " columns, but we got one with "<<i->nr()<<" rows and "<<i->nc()<<" columns." 
+ ); + } + + + // initialize data to the right size to contain the stuff in the iterator range. + data.set_size(std::distance(ibegin,iend), 3, NR, NC); + + + const size_t offset = NR*NC; + auto ptr = data.host(); + for (auto i = ibegin; i != iend; ++i) + { + for (size_t r = 0; r < NR; ++r) + { + for (size_t c = 0; c < NC; ++c) + { + rgb_pixel temp = (*i)(r,c); + auto p = ptr++; + *p = (temp.red-avg_red)/256.0; + p += offset; + *p = (temp.green-avg_green)/256.0; + p += offset; + *p = (temp.blue-avg_blue)/256.0; + p += offset; + } + } + ptr += offset*(data.k()-1); + } + + } + + friend void serialize(const input_rgb_image_sized& item, std::ostream& out) + { + serialize("input_rgb_image_sized", out); + serialize(item.avg_red, out); + serialize(item.avg_green, out); + serialize(item.avg_blue, out); + serialize(NR, out); + serialize(NC, out); + } + + friend void deserialize(input_rgb_image_sized& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "input_rgb_image_sized") + throw serialization_error("Unexpected version found while deserializing dlib::input_rgb_image_sized."); + deserialize(item.avg_red, in); + deserialize(item.avg_green, in); + deserialize(item.avg_blue, in); + size_t nr, nc; + deserialize(nr, in); + deserialize(nc, in); + if (nr != NR || nc != NC) + { + std::ostringstream sout; + sout << "Wrong image dimensions found while deserializing dlib::input_rgb_image_sized.\n"; + sout << "Expected "<<NR<<" rows and "<<NC<< " columns, but found "<<nr<<" rows and "<<nc<<" columns."; + throw serialization_error(sout.str()); + } + } + + friend std::ostream& operator<<(std::ostream& out, const input_rgb_image_sized& item) + { + out << "input_rgb_image_sized("<<item.avg_red<<","<<item.avg_green<<","<<item.avg_blue<<") nr="<<NR<<" nc="<<NC; + return out; + } + + friend void to_xml(const input_rgb_image_sized& item, std::ostream& out) + { + out << "<input_rgb_image_sized r='"<<item.avg_red<<"' g='"<<item.avg_green<<"' b='"<<item.avg_blue<<"' nr='"<<NR<<"' nc='"<<NC<<"'/>"; + } + + private: + float avg_red; + float avg_green; + float avg_blue; + }; + +// ---------------------------------------------------------------------------------------- + + template <size_t NR, size_t NC> + input_rgb_image:: + input_rgb_image ( + const input_rgb_image_sized<NR,NC>& item + ) : avg_red(item.get_avg_red()), + avg_green(item.get_avg_green()), + avg_blue(item.get_avg_blue()) + {} + +// ---------------------------------------------------------------------------------------- + + template <typename T, long NR, long NC, typename MM, typename L> + class input<matrix<T,NR,NC,MM,L>> + { + public: + typedef matrix<T,NR,NC,MM,L> input_type; + + input() {} + input(const input&) {} + + template <typename mm> + input(const input<array2d<T,mm>>&) {} + + bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); } + drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; } + drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + DLIB_CASSERT(std::distance(ibegin,iend) > 0); + const auto nr = ibegin->nr(); + const auto nc = ibegin->nc(); + // make sure all the input matrices have the same dimensions + for (auto i = ibegin; i != iend; ++i) + { + DLIB_CASSERT(i->nr()==nr && 
i->nc()==nc, + "\t input::to_tensor()" + << "\n\t All matrices given to to_tensor() must have the same dimensions." + << "\n\t nr: " << nr + << "\n\t nc: " << nc + << "\n\t i->nr(): " << i->nr() + << "\n\t i->nc(): " << i->nc() + ); + } + + + // initialize data to the right size to contain the stuff in the iterator range. + data.set_size(std::distance(ibegin,iend), pixel_traits<T>::num, nr, nc); + + typedef typename pixel_traits<T>::basic_pixel_type bptype; + + const size_t offset = nr*nc; + auto ptr = data.host(); + for (auto i = ibegin; i != iend; ++i) + { + for (long r = 0; r < nr; ++r) + { + for (long c = 0; c < nc; ++c) + { + auto temp = pixel_to_vector<float>((*i)(r,c)); + auto p = ptr++; + for (long j = 0; j < temp.size(); ++j) + { + if (is_same_type<bptype,unsigned char>::value) + *p = temp(j)/256.0; + else + *p = temp(j); + p += offset; + } + } + } + ptr += offset*(data.k()-1); + } + + } + + friend void serialize(const input& /*item*/, std::ostream& out) + { + serialize("input<matrix>", out); + } + + friend void deserialize(input& /*item*/, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "input<matrix>") + throw serialization_error("Unexpected version found while deserializing dlib::input."); + } + + friend std::ostream& operator<<(std::ostream& out, const input& /*item*/) + { + out << "input<matrix>"; + return out; + } + + friend void to_xml(const input& /*item*/, std::ostream& out) + { + out << "<input/>"; + } + }; + +// ---------------------------------------------------------------------------------------- + + template <typename T, long NR, long NC, typename MM, typename L, size_t K> + class input<std::array<matrix<T,NR,NC,MM,L>,K>> + { + public: + typedef std::array<matrix<T,NR,NC,MM,L>,K> input_type; + + input() {} + input(const input&) {} + + bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); } + drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; } + drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + DLIB_CASSERT(std::distance(ibegin,iend) > 0); + DLIB_CASSERT(ibegin->size() != 0, "When using std::array<matrix> inputs you can't give 0 sized arrays."); + const auto nr = (*ibegin)[0].nr(); + const auto nc = (*ibegin)[0].nc(); + // make sure all the input matrices have the same dimensions + for (auto i = ibegin; i != iend; ++i) + { + for (size_t k = 0; k < K; ++k) + { + const auto& arr = *i; + DLIB_CASSERT(arr[k].nr()==nr && arr[k].nc()==nc, + "\t input::to_tensor()" + << "\n\t When using std::array<matrix> as input, all matrices in a batch must have the same dimensions." + << "\n\t nr: " << nr + << "\n\t nc: " << nc + << "\n\t k: " << k + << "\n\t arr[k].nr(): " << arr[k].nr() + << "\n\t arr[k].nc(): " << arr[k].nc() + ); + } + } + + + // initialize data to the right size to contain the stuff in the iterator range. 
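+            // Each element of the std::array becomes one channel plane, so k == K here.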
+ data.set_size(std::distance(ibegin,iend), K, nr, nc); + + auto ptr = data.host(); + for (auto i = ibegin; i != iend; ++i) + { + for (size_t k = 0; k < K; ++k) + { + for (long r = 0; r < nr; ++r) + { + for (long c = 0; c < nc; ++c) + { + if (is_same_type<T,unsigned char>::value) + *ptr++ = (*i)[k](r,c)/256.0; + else + *ptr++ = (*i)[k](r,c); + } + } + } + } + + } + + friend void serialize(const input& /*item*/, std::ostream& out) + { + serialize("input<array<matrix>>", out); + } + + friend void deserialize(input& /*item*/, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "input<array<matrix>>") + throw serialization_error("Unexpected version found while deserializing dlib::input<array<matrix>>."); + } + + friend std::ostream& operator<<(std::ostream& out, const input& /*item*/) + { + out << "input<array<matrix>>"; + return out; + } + + friend void to_xml(const input& /*item*/, std::ostream& out) + { + out << "<input/>"; + } + }; + +// ---------------------------------------------------------------------------------------- + + template <typename T, typename MM> + class input<array2d<T,MM>> + { + public: + typedef array2d<T,MM> input_type; + + input() {} + input(const input&) {} + + template <long NR, long NC, typename mm, typename L> + input(const input<matrix<T,NR,NC,mm,L>>&) {} + + bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); } + drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; } + drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + DLIB_CASSERT(std::distance(ibegin,iend) > 0); + const auto nr = ibegin->nr(); + const auto nc = ibegin->nc(); + // make sure all the input matrices have the same dimensions + for (auto i = ibegin; i != iend; ++i) + { + DLIB_CASSERT(i->nr()==nr && i->nc()==nc, + "\t input::to_tensor()" + << "\n\t All array2d objects given to to_tensor() must have the same dimensions." + << "\n\t nr: " << nr + << "\n\t nc: " << nc + << "\n\t i->nr(): " << i->nr() + << "\n\t i->nc(): " << i->nc() + ); + } + + + // initialize data to the right size to contain the stuff in the iterator range. 
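+            // pixel_traits<T>::num is the number of channels in the pixel type T
+            // (e.g. 3 for rgb_pixel, 1 for grayscale pixel types).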
+ data.set_size(std::distance(ibegin,iend), pixel_traits<T>::num, nr, nc); + typedef typename pixel_traits<T>::basic_pixel_type bptype; + + const size_t offset = nr*nc; + auto ptr = data.host(); + for (auto i = ibegin; i != iend; ++i) + { + for (long r = 0; r < nr; ++r) + { + for (long c = 0; c < nc; ++c) + { + auto temp = pixel_to_vector<float>((*i)[r][c]); + auto p = ptr++; + for (long j = 0; j < temp.size(); ++j) + { + if (is_same_type<bptype,unsigned char>::value) + *p = temp(j)/256.0; + else + *p = temp(j); + p += offset; + } + } + } + ptr += offset*(data.k()-1); + } + + } + + friend void serialize(const input& item, std::ostream& out) + { + serialize("input<array2d>", out); + } + + friend void deserialize(input& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "input<array2d>") + throw serialization_error("Unexpected version found while deserializing dlib::input."); + } + friend std::ostream& operator<<(std::ostream& out, const input& item) + { + out << "input<array2d>"; + return out; + } + + friend void to_xml(const input& item, std::ostream& out) + { + out << "<input/>"; + } + }; + +// ---------------------------------------------------------------------------------------- + + template <typename PYRAMID_TYPE> + class input_rgb_image_pyramid + { + public: + typedef matrix<rgb_pixel> input_type; + typedef PYRAMID_TYPE pyramid_type; + + input_rgb_image_pyramid ( + ) : + avg_red(122.782), + avg_green(117.001), + avg_blue(104.298) + { + } + + input_rgb_image_pyramid ( + float avg_red_, + float avg_green_, + float avg_blue_ + ) : avg_red(avg_red_), avg_green(avg_green_), avg_blue(avg_blue_) + {} + + float get_avg_red() const { return avg_red; } + float get_avg_green() const { return avg_green; } + float get_avg_blue() const { return avg_blue; } + + unsigned long get_pyramid_padding () const { return pyramid_padding; } + void set_pyramid_padding (unsigned long value) { pyramid_padding = value; } + + unsigned long get_pyramid_outer_padding () const { return pyramid_outer_padding; } + void set_pyramid_outer_padding (unsigned long value) { pyramid_outer_padding = value; } + + bool image_contained_point ( + const tensor& data, + const point& p + ) const + { + auto&& rects = any_cast<std::vector<rectangle>>(data.annotation()); + DLIB_CASSERT(rects.size() > 0); + return rects[0].contains(p+rects[0].tl_corner()); + } + + drectangle tensor_space_to_image_space ( + const tensor& data, + drectangle r + ) const + { + auto&& rects = any_cast<std::vector<rectangle>>(data.annotation()); + return tiled_pyramid_to_image<pyramid_type>(rects, r); + } + + drectangle image_space_to_tensor_space ( + const tensor& data, + double scale, + drectangle r + ) const + { + DLIB_CASSERT(0 < scale && scale <= 1 , "scale: "<< scale); + auto&& rects = any_cast<std::vector<rectangle>>(data.annotation()); + return image_to_tiled_pyramid<pyramid_type>(rects, scale, r); + } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + DLIB_CASSERT(std::distance(ibegin,iend) > 0); + auto nr = ibegin->nr(); + auto nc = ibegin->nc(); + // make sure all the input matrices have the same dimensions + for (auto i = ibegin; i != iend; ++i) + { + DLIB_CASSERT(i->nr()==nr && i->nc()==nc, + "\t input_rgb_image_pyramid::to_tensor()" + << "\n\t All matrices given to to_tensor() must have the same dimensions." 
+ << "\n\t nr: " << nr + << "\n\t nc: " << nc + << "\n\t i->nr(): " << i->nr() + << "\n\t i->nc(): " << i->nc() + ); + } + + long NR, NC; + pyramid_type pyr; + auto& rects = data.annotation().get<std::vector<rectangle>>(); + impl::compute_tiled_image_pyramid_details(pyr, nr, nc, pyramid_padding, pyramid_outer_padding, rects, NR, NC); + + // initialize data to the right size to contain the stuff in the iterator range. + data.set_size(std::distance(ibegin,iend), 3, NR, NC); + + // We need to zero the image before doing the pyramid, since the pyramid + // creation code doesn't write to all parts of the image. We also take + // care to avoid triggering any device to hosts copies. + auto ptr = data.host_write_only(); + for (size_t i = 0; i < data.size(); ++i) + ptr[i] = 0; + + if (rects.size() == 0) + return; + + // copy the first raw image into the top part of the tiled pyramid. We need to + // do this for each of the input images/samples in the tensor. + for (auto i = ibegin; i != iend; ++i) + { + auto& img = *i; + ptr += rects[0].top()*data.nc(); + for (long r = 0; r < img.nr(); ++r) + { + auto p = ptr+rects[0].left(); + for (long c = 0; c < img.nc(); ++c) + p[c] = (img(r,c).red-avg_red)/256.0; + ptr += data.nc(); + } + ptr += data.nc()*(data.nr()-rects[0].bottom()-1); + + ptr += rects[0].top()*data.nc(); + for (long r = 0; r < img.nr(); ++r) + { + auto p = ptr+rects[0].left(); + for (long c = 0; c < img.nc(); ++c) + p[c] = (img(r,c).green-avg_green)/256.0; + ptr += data.nc(); + } + ptr += data.nc()*(data.nr()-rects[0].bottom()-1); + + ptr += rects[0].top()*data.nc(); + for (long r = 0; r < img.nr(); ++r) + { + auto p = ptr+rects[0].left(); + for (long c = 0; c < img.nc(); ++c) + p[c] = (img(r,c).blue-avg_blue)/256.0; + ptr += data.nc(); + } + ptr += data.nc()*(data.nr()-rects[0].bottom()-1); + } + + // now build the image pyramid into data. This does the same thing as + // create_tiled_pyramid(), except we use the GPU if one is available. 
+ for (size_t i = 1; i < rects.size(); ++i) + { + alias_tensor src(data.num_samples(),data.k(),rects[i-1].height(),rects[i-1].width()); + alias_tensor dest(data.num_samples(),data.k(),rects[i].height(),rects[i].width()); + + auto asrc = src(data, data.nc()*rects[i-1].top() + rects[i-1].left()); + auto adest = dest(data, data.nc()*rects[i].top() + rects[i].left()); + + tt::resize_bilinear(adest, data.nc(), data.nr()*data.nc(), + asrc, data.nc(), data.nr()*data.nc()); + } + } + + friend void serialize(const input_rgb_image_pyramid& item, std::ostream& out) + { + serialize("input_rgb_image_pyramid2", out); + serialize(item.avg_red, out); + serialize(item.avg_green, out); + serialize(item.avg_blue, out); + serialize(item.pyramid_padding, out); + serialize(item.pyramid_outer_padding, out); + } + + friend void deserialize(input_rgb_image_pyramid& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "input_rgb_image_pyramid" && version != "input_rgb_image_pyramid2") + throw serialization_error("Unexpected version found while deserializing dlib::input_rgb_image_pyramid."); + deserialize(item.avg_red, in); + deserialize(item.avg_green, in); + deserialize(item.avg_blue, in); + if (version == "input_rgb_image_pyramid2") + { + deserialize(item.pyramid_padding, in); + deserialize(item.pyramid_outer_padding, in); + } + else + { + item.pyramid_padding = 10; + item.pyramid_outer_padding = 11; + } + } + + friend std::ostream& operator<<(std::ostream& out, const input_rgb_image_pyramid& item) + { + out << "input_rgb_image_pyramid("<<item.avg_red<<","<<item.avg_green<<","<<item.avg_blue<<")"; + out << " pyramid_padding="<<item.pyramid_padding; + out << " pyramid_outer_padding="<<item.pyramid_outer_padding; + return out; + } + + friend void to_xml(const input_rgb_image_pyramid& item, std::ostream& out) + { + out << "<input_rgb_image_pyramid r='"<<item.avg_red<<"' g='"<<item.avg_green + <<"' b='"<<item.avg_blue + <<"' pyramid_padding='"<<item.pyramid_padding + <<"' pyramid_outer_padding='"<<item.pyramid_outer_padding + <<"'/>"; + } + + private: + float avg_red; + float avg_green; + float avg_blue; + unsigned long pyramid_padding = 10; + unsigned long pyramid_outer_padding = 11; + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_INPUT_H_ + diff --git a/ml/dlib/dlib/dnn/input_abstract.h b/ml/dlib/dlib/dnn/input_abstract.h new file mode 100644 index 000000000..7130efb17 --- /dev/null +++ b/ml/dlib/dlib/dnn/input_abstract.h @@ -0,0 +1,467 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_DNn_INPUT_ABSTRACT_H_ +#ifdef DLIB_DNn_INPUT_ABSTRACT_H_ + +#include "../matrix.h" +#include "../pixel.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class EXAMPLE_INPUT_LAYER + { + /*! + WHAT THIS OBJECT REPRESENTS + Each deep neural network model in dlib begins with an input layer. The job + of the input layer is to convert an input_type into a tensor. Nothing more + and nothing less. + + Note that there is no dlib::EXAMPLE_INPUT_LAYER type. It is shown here + purely to document the interface that an input layer object must implement. + If you are using some kind of image or matrix object as your input_type + then you can use the provided dlib::input layer defined below. Otherwise, + you need to define your own custom input layer. 
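+
+                For illustration only, a bare-bones custom input layer that feeds
+                std::vector<float> samples into a network could follow this sketch
+                (vector_input is a hypothetical name, not a dlib type, and the
+                required serialization and printing functions are omitted):
+
+                    class vector_input
+                    {
+                    public:
+                        typedef std::vector<float> input_type;
+
+                        template <typename forward_iterator>
+                        void to_tensor (
+                            forward_iterator ibegin,
+                            forward_iterator iend,
+                            resizable_tensor& data
+                        ) const
+                        {
+                            // One sample per input vector, laid out as k()==dim
+                            // channels with nr()==nc()==1.
+                            const long dim = ibegin->size();
+                            data.set_size(std::distance(ibegin,iend), dim, 1, 1);
+                            float* ptr = data.host();
+                            for (auto i = ibegin; i != iend; ++i)
+                                ptr = std::copy(i->begin(), i->end(), ptr);
+                        }
+                    };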
+ + THREAD SAFETY + to_tensor() must be thread safe. That is, multiple threads must be able to + make calls to to_tensor() on a single instance of this object at the same + time. + !*/ + public: + + EXAMPLE_INPUT_LAYER( + ); + /*! + ensures + - Default constructs this object. This function is not required to do + anything in particular but it must exist, that is, it is required that + layer objects be default constructable. + !*/ + + EXAMPLE_INPUT_LAYER ( + const EXAMPLE_INPUT_LAYER& item + ); + /*! + ensures + - EXAMPLE_INPUT_LAYER objects are copy constructable + !*/ + + EXAMPLE_INPUT_LAYER( + const some_other_input_layer_type& item + ); + /*! + ensures + - Constructs this object from item. This form of constructor is optional + but it allows you to provide a conversion from one input layer type to + another. For example, the following code is valid only if my_input_layer2 can + be constructed from my_input_layer1: + relu<fc<relu<fc<my_input_layer1>>>> my_dnn1; + relu<fc<relu<fc<my_input_layer2>>>> my_dnn2(my_dnn1); + This kind of pattern is useful if you want to use one type of input layer + during training but a different type of layer during testing since it + allows you to easily convert between related deep neural network types. + !*/ + + typedef whatever_type_to_tensor_expects input_type; + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const; + /*! + requires + - [ibegin, iend) is an iterator range over input_type objects. + - std::distance(ibegin,iend) > 0 + ensures + - Converts the iterator range into a tensor and stores it into #data. + - #data.num_samples()%distance(ibegin,iend) == 0. + Normally you would have #data.num_samples() == distance(ibegin,iend) but + you can also expand the output by some integer factor so long as the loss + you use can deal with it correctly. + - The data in the ith sample of #data corresponds to the input_type object + *(ibegin+i/sample_expansion_factor). + where sample_expansion_factor==#data.num_samples()/distance(ibegin,iend). + !*/ + }; + + std::ostream& operator<<(std::ostream& out, const EXAMPLE_INPUT_LAYER& item); + /*! + print a string describing this layer. + !*/ + + void to_xml(const EXAMPLE_INPUT_LAYER& item, std::ostream& out); + /*! + This function is optional, but required if you want to print your networks with + net_to_xml(). Therefore, to_xml() prints a layer as XML. + !*/ + + void serialize(const EXAMPLE_INPUT_LAYER& item, std::ostream& out); + void deserialize(EXAMPLE_INPUT_LAYER& item, std::istream& in); + /*! + provides serialization support + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename T + > + class input + { + /*! + REQUIREMENTS ON T + One of the following must be true: + - T is a matrix or array2d object and it must contain some kind of + pixel type. I.e. pixel_traits<T::type> must be defined. + - T is a std::array<matrix<U>> where U is any built in scalar type like + float, double, or unsigned char. + + WHAT THIS OBJECT REPRESENTS + This is a basic input layer that simply copies images into a tensor. + !*/ + + public: + typedef T input_type; + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const; + /*! + requires + - [ibegin, iend) is an iterator range over input_type objects. 
+ - std::distance(ibegin,iend) > 0 + - The input range should contain image objects that all have the same + dimensions. + ensures + - Converts the iterator range into a tensor and stores it into #data. In + particular, if the input images have R rows, C columns, and K channels + (where K is given by pixel_traits::num or std::array::size() if + std::array inputs are used) then we will have: + - #data.num_samples() == std::distance(ibegin,iend) + - #data.nr() == R + - #data.nc() == C + - #data.k() == K + For example, a matrix<float,3,3> would turn into a tensor with 3 rows, 3 + columns, and k()==1. Or a matrix<rgb_pixel,4,5> would turn into a tensor + with 4 rows, 5 columns, and k()==3 (since rgb_pixels have 3 channels). + Or a std::array<matrix<float,3,3>,5> would turn into a tensor with 3 rows + and columns, and k()==5 channels. + - If the input data contains pixels of type unsigned char, rgb_pixel, or + other pixel types with a basic_pixel_type of unsigned char then each + value written to the output tensor is first divided by 256.0 so that the + resulting outputs are all in the range [0,1]. + !*/ + + // Provided for compatibility with input_rgb_image_pyramid's interface + bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); } + drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; } + drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; } + }; + +// ---------------------------------------------------------------------------------------- + + class input_rgb_image + { + /*! + WHAT THIS OBJECT REPRESENTS + This input layer works with RGB images of type matrix<rgb_pixel>. It is + very similar to the dlib::input layer except that it allows you to subtract + the average color value from each color channel when converting an image to + a tensor. + !*/ + public: + typedef matrix<rgb_pixel> input_type; + + input_rgb_image ( + ); + /*! + ensures + - #get_avg_red() == 122.782 + - #get_avg_green() == 117.001 + - #get_avg_blue() == 104.298 + !*/ + + input_rgb_image ( + float avg_red, + float avg_green, + float avg_blue + ); + /*! + ensures + - #get_avg_red() == avg_red + - #get_avg_green() == avg_green + - #get_avg_blue() == avg_blue + !*/ + + float get_avg_red( + ) const; + /*! + ensures + - returns the value subtracted from the red color channel. + !*/ + + float get_avg_green( + ) const; + /*! + ensures + - returns the value subtracted from the green color channel. + !*/ + + float get_avg_blue( + ) const; + /*! + ensures + - returns the value subtracted from the blue color channel. + !*/ + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const; + /*! + requires + - [ibegin, iend) is an iterator range over input_type objects. + - std::distance(ibegin,iend) > 0 + - The input range should contain images that all have the same + dimensions. + ensures + - Converts the iterator range into a tensor and stores it into #data. In + particular, if the input images have R rows, C columns then we will have: + - #data.num_samples() == std::distance(ibegin,iend) + - #data.nr() == R + - #data.nc() == C + - #data.k() == 3 + Moreover, each color channel is normalized by having its average value + subtracted (according to get_avg_red(), get_avg_green(), or + get_avg_blue()) and then is divided by 256.0. 
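+                    For example, with the default averages a pure red input pixel
+                    (255,0,0) becomes roughly ((255-122.782)/256.0, (0-117.001)/256.0,
+                    (0-104.298)/256.0) == (0.516, -0.457, -0.407) in the red, green,
+                    and blue channels of #data.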
+ !*/ + + + // Provided for compatibility with input_rgb_image_pyramid's interface + bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); } + drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; } + drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; } + }; + +// ---------------------------------------------------------------------------------------- + + template <size_t NR, size_t NC=NR> + class input_rgb_image_sized + { + /*! + WHAT THIS OBJECT REPRESENTS + This layer has an interface and behavior identical to input_rgb_image + except that it requires input images to have NR rows and NC columns. This + is checked by a DLIB_CASSERT inside to_tensor(). + + You can also convert between input_rgb_image and input_rgb_image_sized by + copy construction or assignment. + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename PYRAMID_TYPE + > + class input_rgb_image_pyramid + { + /*! + REQUIREMENTS ON PYRAMID_TYPE + PYRAMID_TYPE must be an instance of the dlib::pyramid_down template. + + WHAT THIS OBJECT REPRESENTS + This input layer works with RGB images of type matrix<rgb_pixel>. It is + identical to input_rgb_image except that it outputs a tensor containing a + tiled image pyramid of each input image rather than a simple copy of each + image. The tiled image pyramid is created using create_tiled_pyramid(). + !*/ + + public: + + typedef matrix<rgb_pixel> input_type; + typedef PYRAMID_TYPE pyramid_type; + + input_rgb_image_pyramid ( + ); + /*! + ensures + - #get_avg_red() == 122.782 + - #get_avg_green() == 117.001 + - #get_avg_blue() == 104.298 + - #get_pyramid_padding() == 10 + - #get_pyramid_outer_padding() == 11 + !*/ + + input_rgb_image_pyramid ( + float avg_red, + float avg_green, + float avg_blue + ); + /*! + ensures + - #get_avg_red() == avg_red + - #get_avg_green() == avg_green + - #get_avg_blue() == avg_blue + - #get_pyramid_padding() == 10 + - #get_pyramid_outer_padding() == 11 + !*/ + + float get_avg_red( + ) const; + /*! + ensures + - returns the value subtracted from the red color channel. + !*/ + + float get_avg_green( + ) const; + /*! + ensures + - returns the value subtracted from the green color channel. + !*/ + + float get_avg_blue( + ) const; + /*! + ensures + - returns the value subtracted from the blue color channel. + !*/ + + unsigned long get_pyramid_padding ( + ) const; + /*! + ensures + - When this object creates a pyramid it will call create_tiled_pyramid() and + set create_tiled_pyramid's pyramid_padding parameter to get_pyramid_padding(). + !*/ + void set_pyramid_padding ( + unsigned long value + ); + /*! + ensures + - #get_pyramid_padding() == value + !*/ + + unsigned long get_pyramid_outer_padding ( + ) const; + /*! + ensures + - When this object creates a pyramid it will call create_tiled_pyramid() + and set create_tiled_pyramid's pyramid_outer_padding parameter to + get_pyramid_outer_padding(). + !*/ + void set_pyramid_outer_padding ( + unsigned long value + ); + /*! + ensures + - #get_pyramid_outer_padding() == value + !*/ + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const; + /*! + requires + - [ibegin, iend) is an iterator range over input_type objects. 
+ - std::distance(ibegin,iend) > 0 + - The input range should contain images that all have the same + dimensions. + ensures + - Converts the iterator range into a tensor and stores it into #data. In + particular, we will have: + - #data.num_samples() == std::distance(ibegin,iend) + - #data.k() == 3 + - Each sample in #data contains a tiled image pyramid of the + corresponding input image. The tiled pyramid is created by + create_tiled_pyramid(). + Moreover, each color channel is normalized by having its average value + subtracted (according to get_avg_red(), get_avg_green(), or + get_avg_blue()) and then is divided by 256.0. + !*/ + + bool image_contained_point ( + const tensor& data, + const point& p + ) const; + /*! + requires + - data is a tensor that was produced by this->to_tensor() + ensures + - Since data is a tensor that is built from a bunch of identically sized + images, we can ask if those images were big enough to contain the point + p. This function returns the answer to that question. + !*/ + + drectangle image_space_to_tensor_space ( + const tensor& data, + double scale, + drectangle r + ) const; + /*! + requires + - data is a tensor that was produced by this->to_tensor() + - 0 < scale <= 1 + ensures + - This function maps from to_tensor()'s input image space to its output + tensor space. Therefore, given that data is a tensor produced by + to_tensor(), image_space_to_tensor_space() allows you to ask for the + rectangle in data that corresponds to a rectangle in the original image + space. + + Note that since the output tensor contains an image pyramid, there are + multiple points in the output tensor that correspond to any input + location. So you must also specify a scale so we know what level of the + pyramid is needed. So given a rectangle r in an input image, you can + ask, what rectangle in data corresponds to r when things are scale times + smaller? That rectangle is returned by this function. + - A scale of 1 means we don't move anywhere in the pyramid scale space relative + to the input image while smaller values of scale mean we move down the + pyramid. + !*/ + + drectangle tensor_space_to_image_space ( + const tensor& data, + drectangle r + ) const; + /*! + requires + - data is a tensor that was produced by this->to_tensor() + ensures + - This function maps from to_tensor()'s output tensor space to its input + image space. Therefore, given that data is a tensor produced by + to_tensor(), tensor_space_to_image_space() allows you to ask for the + rectangle in the input image that corresponds to a rectangle in data. + - It should be noted that this function isn't always an inverse of + image_space_to_tensor_space(). This is because you can ask + image_space_to_tensor_space() for the coordinates of points outside the input + image and they will be mapped to somewhere that doesn't have an inverse. + But for points actually inside the input image this function performs an + approximate inverse mapping. I.e. when image_contained_point(data,center(r))==true + there is an approximate inverse. + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_INPUT_ABSTRACT_H_ + diff --git a/ml/dlib/dlib/dnn/layers.h b/ml/dlib/dlib/dnn/layers.h new file mode 100644 index 000000000..91436f635 --- /dev/null +++ b/ml/dlib/dlib/dnn/layers.h @@ -0,0 +1,3244 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
+#ifndef DLIB_DNn_LAYERS_H_ +#define DLIB_DNn_LAYERS_H_ + +#include "layers_abstract.h" +#include "tensor.h" +#include "core.h" +#include <iostream> +#include <string> +#include "../rand.h" +#include "../string.h" +#include "tensor_tools.h" +#include "../vectorstream.h" +#include "utilities.h" +#include <sstream> + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + struct num_con_outputs + { + num_con_outputs(unsigned long n) : num_outputs(n) {} + unsigned long num_outputs; + }; + + template < + long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y = _stride_y!=1? 0 : _nr/2, + int _padding_x = _stride_x!=1? 0 : _nc/2 + > + class con_ + { + public: + + static_assert(_num_filters > 0, "The number of filters must be > 0"); + static_assert(_nr >= 0, "The number of rows in a filter must be >= 0"); + static_assert(_nc >= 0, "The number of columns in a filter must be >= 0"); + static_assert(_stride_y > 0, "The filter stride must be > 0"); + static_assert(_stride_x > 0, "The filter stride must be > 0"); + static_assert(_nr==0 || (0 <= _padding_y && _padding_y < _nr), "The padding must be smaller than the filter size."); + static_assert(_nc==0 || (0 <= _padding_x && _padding_x < _nc), "The padding must be smaller than the filter size."); + static_assert(_nr!=0 || 0 == _padding_y, "If _nr==0 then the padding must be set to 0 as well."); + static_assert(_nc!=0 || 0 == _padding_x, "If _nc==0 then the padding must be set to 0 as well."); + + con_( + num_con_outputs o + ) : + learning_rate_multiplier(1), + weight_decay_multiplier(1), + bias_learning_rate_multiplier(1), + bias_weight_decay_multiplier(0), + num_filters_(o.num_outputs), + padding_y_(_padding_y), + padding_x_(_padding_x) + { + DLIB_CASSERT(num_filters_ > 0); + } + + con_() : con_(num_con_outputs(_num_filters)) {} + + long num_filters() const { return num_filters_; } + long nr() const + { + if (_nr==0) + return filters.nr(); + else + return _nr; + } + long nc() const + { + if (_nc==0) + return filters.nc(); + else + return _nc; + } + long stride_y() const { return _stride_y; } + long stride_x() const { return _stride_x; } + long padding_y() const { return padding_y_; } + long padding_x() const { return padding_x_; } + + void set_num_filters(long num) + { + DLIB_CASSERT(num > 0); + if (num != num_filters_) + { + DLIB_CASSERT(get_layer_params().size() == 0, + "You can't change the number of filters in con_ if the parameter tensor has already been allocated."); + num_filters_ = num; + } + } + + double get_learning_rate_multiplier () const { return learning_rate_multiplier; } + double get_weight_decay_multiplier () const { return weight_decay_multiplier; } + void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; } + void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; } + + double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; } + double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; } + void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; } + void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; } + + inline dpoint map_input_to_output ( + dpoint p + ) const + { + p.x() = (p.x()+padding_x()-nc()/2)/stride_x(); + p.y() = (p.y()+padding_y()-nr()/2)/stride_y(); + return p; + } + + inline dpoint map_output_to_input ( + dpoint p + ) const + { + p.x() = 
p.x()*stride_x() - padding_x() + nc()/2; + p.y() = p.y()*stride_y() - padding_y() + nr()/2; + return p; + } + + con_ ( + const con_& item + ) : + params(item.params), + filters(item.filters), + biases(item.biases), + learning_rate_multiplier(item.learning_rate_multiplier), + weight_decay_multiplier(item.weight_decay_multiplier), + bias_learning_rate_multiplier(item.bias_learning_rate_multiplier), + bias_weight_decay_multiplier(item.bias_weight_decay_multiplier), + num_filters_(item.num_filters_), + padding_y_(item.padding_y_), + padding_x_(item.padding_x_) + { + // this->conv is non-copyable and basically stateless, so we have to write our + // own copy to avoid trying to copy it and getting an error. + } + + con_& operator= ( + const con_& item + ) + { + if (this == &item) + return *this; + + // this->conv is non-copyable and basically stateless, so we have to write our + // own copy to avoid trying to copy it and getting an error. + params = item.params; + filters = item.filters; + biases = item.biases; + padding_y_ = item.padding_y_; + padding_x_ = item.padding_x_; + learning_rate_multiplier = item.learning_rate_multiplier; + weight_decay_multiplier = item.weight_decay_multiplier; + bias_learning_rate_multiplier = item.bias_learning_rate_multiplier; + bias_weight_decay_multiplier = item.bias_weight_decay_multiplier; + num_filters_ = item.num_filters_; + return *this; + } + + template <typename SUBNET> + void setup (const SUBNET& sub) + { + const long filt_nr = _nr!=0 ? _nr : sub.get_output().nr(); + const long filt_nc = _nc!=0 ? _nc : sub.get_output().nc(); + + long num_inputs = filt_nr*filt_nc*sub.get_output().k(); + long num_outputs = num_filters_; + // allocate params for the filters and also for the filter bias values. + params.set_size(num_inputs*num_filters_ + num_filters_); + + dlib::rand rnd(std::rand()); + randomize_parameters(params, num_inputs+num_outputs, rnd); + + filters = alias_tensor(num_filters_, sub.get_output().k(), filt_nr, filt_nc); + biases = alias_tensor(1,num_filters_); + + // set the initial bias values to zero + biases(params,filters.size()) = 0; + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + conv.setup(sub.get_output(), + filters(params,0), + _stride_y, + _stride_x, + padding_y_, + padding_x_); + conv(false, output, + sub.get_output(), + filters(params,0)); + + tt::add(1,output,1,biases(params,filters.size())); + } + + template <typename SUBNET> + void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad) + { + conv.get_gradient_for_data (true, gradient_input, filters(params,0), sub.get_gradient_input()); + // no point computing the parameter gradients if they won't be used.
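+            // When it is non-zero we also accumulate the filter gradient (computed
+            // from the layer's input and gradient_input) and the bias gradient, which
+            // is gradient_input summed over everything except the channel dimension.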
+ if (learning_rate_multiplier != 0) + { + auto filt = filters(params_grad,0); + conv.get_gradient_for_filters (false, gradient_input, sub.get_output(), filt); + auto b = biases(params_grad, filters.size()); + tt::assign_conv_bias_gradient(b, gradient_input); + } + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const con_& item, std::ostream& out) + { + serialize("con_4", out); + serialize(item.params, out); + serialize(item.num_filters_, out); + serialize(_nr, out); + serialize(_nc, out); + serialize(_stride_y, out); + serialize(_stride_x, out); + serialize(item.padding_y_, out); + serialize(item.padding_x_, out); + serialize(item.filters, out); + serialize(item.biases, out); + serialize(item.learning_rate_multiplier, out); + serialize(item.weight_decay_multiplier, out); + serialize(item.bias_learning_rate_multiplier, out); + serialize(item.bias_weight_decay_multiplier, out); + } + + friend void deserialize(con_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + long nr; + long nc; + int stride_y; + int stride_x; + if (version == "con_4") + { + deserialize(item.params, in); + deserialize(item.num_filters_, in); + deserialize(nr, in); + deserialize(nc, in); + deserialize(stride_y, in); + deserialize(stride_x, in); + deserialize(item.padding_y_, in); + deserialize(item.padding_x_, in); + deserialize(item.filters, in); + deserialize(item.biases, in); + deserialize(item.learning_rate_multiplier, in); + deserialize(item.weight_decay_multiplier, in); + deserialize(item.bias_learning_rate_multiplier, in); + deserialize(item.bias_weight_decay_multiplier, in); + if (item.padding_y_ != _padding_y) throw serialization_error("Wrong padding_y found while deserializing dlib::con_"); + if (item.padding_x_ != _padding_x) throw serialization_error("Wrong padding_x found while deserializing dlib::con_"); + if (nr != _nr) throw serialization_error("Wrong nr found while deserializing dlib::con_"); + if (nc != _nc) throw serialization_error("Wrong nc found while deserializing dlib::con_"); + if (stride_y != _stride_y) throw serialization_error("Wrong stride_y found while deserializing dlib::con_"); + if (stride_x != _stride_x) throw serialization_error("Wrong stride_x found while deserializing dlib::con_"); + } + else + { + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::con_."); + } + } + + + friend std::ostream& operator<<(std::ostream& out, const con_& item) + { + out << "con\t (" + << "num_filters="<<item.num_filters_ + << ", nr="<<item.nr() + << ", nc="<<item.nc() + << ", stride_y="<<_stride_y + << ", stride_x="<<_stride_x + << ", padding_y="<<item.padding_y_ + << ", padding_x="<<item.padding_x_ + << ")"; + out << " learning_rate_mult="<<item.learning_rate_multiplier; + out << " weight_decay_mult="<<item.weight_decay_multiplier; + out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier; + out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier; + return out; + } + + friend void to_xml(const con_& item, std::ostream& out) + { + out << "<con" + << " num_filters='"<<item.num_filters_<<"'" + << " nr='"<<item.nr()<<"'" + << " nc='"<<item.nc()<<"'" + << " stride_y='"<<_stride_y<<"'" + << " stride_x='"<<_stride_x<<"'" + << " padding_y='"<<item.padding_y_<<"'" + << " padding_x='"<<item.padding_x_<<"'" + << " learning_rate_mult='"<<item.learning_rate_multiplier<<"'" + << " 
weight_decay_mult='"<<item.weight_decay_multiplier<<"'" + << " bias_learning_rate_mult='"<<item.bias_learning_rate_multiplier<<"'" + << " bias_weight_decay_mult='"<<item.bias_weight_decay_multiplier<<"'>\n"; + out << mat(item.params); + out << "</con>"; + } + + private: + + resizable_tensor params; + alias_tensor filters, biases; + + tt::tensor_conv conv; + double learning_rate_multiplier; + double weight_decay_multiplier; + double bias_learning_rate_multiplier; + double bias_weight_decay_multiplier; + long num_filters_; + + // These are here only because older versions of con (which you might encounter + // serialized to disk) used different padding settings. + int padding_y_; + int padding_x_; + + }; + + template < + long num_filters, + long nr, + long nc, + int stride_y, + int stride_x, + typename SUBNET + > + using con = add_layer<con_<num_filters,nr,nc,stride_y,stride_x>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y = _stride_y!=1? 0 : _nr/2, + int _padding_x = _stride_x!=1? 0 : _nc/2 + > + class cont_ + { + public: + + static_assert(_num_filters > 0, "The number of filters must be > 0"); + static_assert(_nr > 0, "The number of rows in a filter must be > 0"); + static_assert(_nc > 0, "The number of columns in a filter must be > 0"); + static_assert(_stride_y > 0, "The filter stride must be > 0"); + static_assert(_stride_x > 0, "The filter stride must be > 0"); + static_assert(0 <= _padding_y && _padding_y < _nr, "The padding must be smaller than the filter size."); + static_assert(0 <= _padding_x && _padding_x < _nc, "The padding must be smaller than the filter size."); + + cont_( + num_con_outputs o + ) : + learning_rate_multiplier(1), + weight_decay_multiplier(1), + bias_learning_rate_multiplier(1), + bias_weight_decay_multiplier(0), + num_filters_(o.num_outputs), + padding_y_(_padding_y), + padding_x_(_padding_x) + { + DLIB_CASSERT(num_filters_ > 0); + } + + cont_() : cont_(num_con_outputs(_num_filters)) {} + + long num_filters() const { return num_filters_; } + long nr() const { return _nr; } + long nc() const { return _nc; } + long stride_y() const { return _stride_y; } + long stride_x() const { return _stride_x; } + long padding_y() const { return padding_y_; } + long padding_x() const { return padding_x_; } + + void set_num_filters(long num) + { + DLIB_CASSERT(num > 0); + if (num != num_filters_) + { + DLIB_CASSERT(get_layer_params().size() == 0, + "You can't change the number of filters in cont_ if the parameter tensor has already been allocated."); + num_filters_ = num; + } + } + + double get_learning_rate_multiplier () const { return learning_rate_multiplier; } + double get_weight_decay_multiplier () const { return weight_decay_multiplier; } + void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; } + void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; } + + double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; } + double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; } + void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; } + void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; } + + inline dpoint map_output_to_input ( + dpoint p + ) const + { + p.x() = (p.x()+padding_x()-nc()/2)/stride_x(); + p.y() = 
(p.y()+padding_y()-nr()/2)/stride_y(); + return p; + } + + inline dpoint map_input_to_output ( + dpoint p + ) const + { + p.x() = p.x()*stride_x() - padding_x() + nc()/2; + p.y() = p.y()*stride_y() - padding_y() + nr()/2; + return p; + } + + cont_ ( + const cont_& item + ) : + params(item.params), + filters(item.filters), + biases(item.biases), + learning_rate_multiplier(item.learning_rate_multiplier), + weight_decay_multiplier(item.weight_decay_multiplier), + bias_learning_rate_multiplier(item.bias_learning_rate_multiplier), + bias_weight_decay_multiplier(item.bias_weight_decay_multiplier), + num_filters_(item.num_filters_), + padding_y_(item.padding_y_), + padding_x_(item.padding_x_) + { + // this->conv is non-copyable and basically stateless, so we have to write our + // own copy to avoid trying to copy it and getting an error. + } + + cont_& operator= ( + const cont_& item + ) + { + if (this == &item) + return *this; + + // this->conv is non-copyable and basically stateless, so we have to write our + // own copy to avoid trying to copy it and getting an error. + params = item.params; + filters = item.filters; + biases = item.biases; + padding_y_ = item.padding_y_; + padding_x_ = item.padding_x_; + learning_rate_multiplier = item.learning_rate_multiplier; + weight_decay_multiplier = item.weight_decay_multiplier; + bias_learning_rate_multiplier = item.bias_learning_rate_multiplier; + bias_weight_decay_multiplier = item.bias_weight_decay_multiplier; + num_filters_ = item.num_filters_; + return *this; + } + + template <typename SUBNET> + void setup (const SUBNET& sub) + { + long num_inputs = _nr*_nc*sub.get_output().k(); + long num_outputs = num_filters_; + // allocate params for the filters and also for the filter bias values. + params.set_size(num_inputs*num_filters_ + num_filters_); + + dlib::rand rnd(std::rand()); + randomize_parameters(params, num_inputs+num_outputs, rnd); + + filters = alias_tensor(sub.get_output().k(), num_filters_, _nr, _nc); + biases = alias_tensor(1,num_filters_); + + // set the initial bias values to zero + biases(params,filters.size()) = 0; + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + auto filt = filters(params,0); + unsigned int gnr = _stride_y * (sub.get_output().nr() - 1) + filt.nr() - 2 * padding_y_; + unsigned int gnc = _stride_x * (sub.get_output().nc() - 1) + filt.nc() - 2 * padding_x_; + unsigned int gnsamps = sub.get_output().num_samples(); + unsigned int gk = filt.k(); + output.set_size(gnsamps,gk,gnr,gnc); + conv.setup(output,filt,_stride_y,_stride_x,padding_y_,padding_x_); + conv.get_gradient_for_data(false, sub.get_output(),filt,output); + tt::add(1,output,1,biases(params,filters.size())); + } + + template <typename SUBNET> + void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad) + { + auto filt = filters(params,0); + conv(true, sub.get_gradient_input(),gradient_input, filt); + // no point computing the parameter gradients if they won't be used. 
+ if (learning_rate_multiplier != 0) + { + auto filt = filters(params_grad,0); + conv.get_gradient_for_filters (false, sub.get_output(),gradient_input, filt); + auto b = biases(params_grad, filters.size()); + tt::assign_conv_bias_gradient(b, gradient_input); + } + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const cont_& item, std::ostream& out) + { + serialize("cont_1", out); + serialize(item.params, out); + serialize(item.num_filters_, out); + serialize(_nr, out); + serialize(_nc, out); + serialize(_stride_y, out); + serialize(_stride_x, out); + serialize(item.padding_y_, out); + serialize(item.padding_x_, out); + serialize(item.filters, out); + serialize(item.biases, out); + serialize(item.learning_rate_multiplier, out); + serialize(item.weight_decay_multiplier, out); + serialize(item.bias_learning_rate_multiplier, out); + serialize(item.bias_weight_decay_multiplier, out); + } + + friend void deserialize(cont_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + long nr; + long nc; + int stride_y; + int stride_x; + if (version == "cont_1") + { + deserialize(item.params, in); + deserialize(item.num_filters_, in); + deserialize(nr, in); + deserialize(nc, in); + deserialize(stride_y, in); + deserialize(stride_x, in); + deserialize(item.padding_y_, in); + deserialize(item.padding_x_, in); + deserialize(item.filters, in); + deserialize(item.biases, in); + deserialize(item.learning_rate_multiplier, in); + deserialize(item.weight_decay_multiplier, in); + deserialize(item.bias_learning_rate_multiplier, in); + deserialize(item.bias_weight_decay_multiplier, in); + if (item.padding_y_ != _padding_y) throw serialization_error("Wrong padding_y found while deserializing dlib::cont_"); + if (item.padding_x_ != _padding_x) throw serialization_error("Wrong padding_x found while deserializing dlib::cont_"); + if (nr != _nr) throw serialization_error("Wrong nr found while deserializing dlib::cont_"); + if (nc != _nc) throw serialization_error("Wrong nc found while deserializing dlib::cont_"); + if (stride_y != _stride_y) throw serialization_error("Wrong stride_y found while deserializing dlib::cont_"); + if (stride_x != _stride_x) throw serialization_error("Wrong stride_x found while deserializing dlib::cont_"); + } + else + { + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::cont_."); + } + } + + + friend std::ostream& operator<<(std::ostream& out, const cont_& item) + { + out << "cont\t (" + << "num_filters="<<item.num_filters_ + << ", nr="<<_nr + << ", nc="<<_nc + << ", stride_y="<<_stride_y + << ", stride_x="<<_stride_x + << ", padding_y="<<item.padding_y_ + << ", padding_x="<<item.padding_x_ + << ")"; + out << " learning_rate_mult="<<item.learning_rate_multiplier; + out << " weight_decay_mult="<<item.weight_decay_multiplier; + out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier; + out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier; + return out; + } + + friend void to_xml(const cont_& item, std::ostream& out) + { + out << "<cont" + << " num_filters='"<<item.num_filters_<<"'" + << " nr='"<<_nr<<"'" + << " nc='"<<_nc<<"'" + << " stride_y='"<<_stride_y<<"'" + << " stride_x='"<<_stride_x<<"'" + << " padding_y='"<<item.padding_y_<<"'" + << " padding_x='"<<item.padding_x_<<"'" + << " learning_rate_mult='"<<item.learning_rate_multiplier<<"'" + << " weight_decay_mult='"<<item.weight_decay_multiplier<<"'" + << " 
bias_learning_rate_mult='"<<item.bias_learning_rate_multiplier<<"'" + << " bias_weight_decay_mult='"<<item.bias_weight_decay_multiplier<<"'>\n"; + out << mat(item.params); + out << "</cont>"; + } + + private: + + resizable_tensor params; + alias_tensor filters, biases; + + tt::tensor_conv conv; + double learning_rate_multiplier; + double weight_decay_multiplier; + double bias_learning_rate_multiplier; + double bias_weight_decay_multiplier; + long num_filters_; + + int padding_y_; + int padding_x_; + + }; + + template < + long num_filters, + long nr, + long nc, + int stride_y, + int stride_x, + typename SUBNET + > + using cont = add_layer<cont_<num_filters,nr,nc,stride_y,stride_x>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + int scale_y, + int scale_x + > + class upsample_ + { + public: + static_assert(scale_y >= 1, "upsampling scale factor can't be less than 1."); + static_assert(scale_x >= 1, "upsampling scale factor can't be less than 1."); + + upsample_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + output.set_size( + sub.get_output().num_samples(), + sub.get_output().k(), + scale_y*sub.get_output().nr(), + scale_x*sub.get_output().nc()); + tt::resize_bilinear(output, sub.get_output()); + } + + template <typename SUBNET> + void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + tt::resize_bilinear_gradient(sub.get_gradient_input(), gradient_input); + } + + inline dpoint map_input_to_output (dpoint p) const + { + p.x() = p.x()*scale_x; + p.y() = p.y()*scale_y; + return p; + } + inline dpoint map_output_to_input (dpoint p) const + { + p.x() = p.x()/scale_x; + p.y() = p.y()/scale_y; + return p; + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const upsample_& , std::ostream& out) + { + serialize("upsample_", out); + serialize(scale_y, out); + serialize(scale_x, out); + } + + friend void deserialize(upsample_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "upsample_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::upsample_."); + + int _scale_y; + int _scale_x; + deserialize(_scale_y, in); + deserialize(_scale_x, in); + if (_scale_y != scale_y || _scale_x != scale_x) + throw serialization_error("Wrong scale found while deserializing dlib::upsample_"); + } + + friend std::ostream& operator<<(std::ostream& out, const upsample_& ) + { + out << "upsample\t (" + << "scale_y="<<scale_y + << ", scale_x="<<scale_x + << ")"; + return out; + } + + friend void to_xml(const upsample_& /*item*/, std::ostream& out) + { + out << "<upsample" + << " scale_y='"<<scale_y<<"'" + << " scale_x='"<<scale_x<<"'/>\n"; + } + + private: + resizable_tensor params; + }; + + template < + int scale, + typename SUBNET + > + using upsample = add_layer<upsample_<scale,scale>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y = _stride_y!=1? 0 : _nr/2, + int _padding_x = _stride_x!=1? 
0 : _nc/2 + > + class max_pool_ + { + static_assert(_nr >= 0, "The number of rows in a filter must be >= 0"); + static_assert(_nc >= 0, "The number of columns in a filter must be >= 0"); + static_assert(_stride_y > 0, "The filter stride must be > 0"); + static_assert(_stride_x > 0, "The filter stride must be > 0"); + static_assert(0 <= _padding_y && ((_nr==0 && _padding_y == 0) || (_nr!=0 && _padding_y < _nr)), + "The padding must be smaller than the filter size, unless the filter size is 0."); + static_assert(0 <= _padding_x && ((_nc==0 && _padding_x == 0) || (_nc!=0 && _padding_x < _nc)), + "The padding must be smaller than the filter size, unless the filter size is 0."); + public: + + + max_pool_( + ) : + padding_y_(_padding_y), + padding_x_(_padding_x) + {} + + long nr() const { return _nr; } + long nc() const { return _nc; } + long stride_y() const { return _stride_y; } + long stride_x() const { return _stride_x; } + long padding_y() const { return padding_y_; } + long padding_x() const { return padding_x_; } + + inline dpoint map_input_to_output ( + dpoint p + ) const + { + p.x() = (p.x()+padding_x()-nc()/2)/stride_x(); + p.y() = (p.y()+padding_y()-nr()/2)/stride_y(); + return p; + } + + inline dpoint map_output_to_input ( + dpoint p + ) const + { + p.x() = p.x()*stride_x() - padding_x() + nc()/2; + p.y() = p.y()*stride_y() - padding_y() + nr()/2; + return p; + } + + max_pool_ ( + const max_pool_& item + ) : + padding_y_(item.padding_y_), + padding_x_(item.padding_x_) + { + // this->mp is non-copyable so we have to write our own copy to avoid trying to + // copy it and getting an error. + } + + max_pool_& operator= ( + const max_pool_& item + ) + { + if (this == &item) + return *this; + + padding_y_ = item.padding_y_; + padding_x_ = item.padding_x_; + + // this->mp is non-copyable so we have to write our own copy to avoid trying to + // copy it and getting an error.
+ return *this; + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + mp.setup_max_pooling(_nr!=0?_nr:sub.get_output().nr(), + _nc!=0?_nc:sub.get_output().nc(), + _stride_y, _stride_x, padding_y_, padding_x_); + + mp(output, sub.get_output()); + } + + template <typename SUBNET> + void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + mp.setup_max_pooling(_nr!=0?_nr:sub.get_output().nr(), + _nc!=0?_nc:sub.get_output().nc(), + _stride_y, _stride_x, padding_y_, padding_x_); + + mp.get_gradient(gradient_input, computed_output, sub.get_output(), sub.get_gradient_input()); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const max_pool_& item, std::ostream& out) + { + serialize("max_pool_2", out); + serialize(_nr, out); + serialize(_nc, out); + serialize(_stride_y, out); + serialize(_stride_x, out); + serialize(item.padding_y_, out); + serialize(item.padding_x_, out); + } + + friend void deserialize(max_pool_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + long nr; + long nc; + int stride_y; + int stride_x; + if (version == "max_pool_2") + { + deserialize(nr, in); + deserialize(nc, in); + deserialize(stride_y, in); + deserialize(stride_x, in); + deserialize(item.padding_y_, in); + deserialize(item.padding_x_, in); + } + else + { + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::max_pool_."); + } + + if (item.padding_y_ != _padding_y) throw serialization_error("Wrong padding_y found while deserializing dlib::max_pool_"); + if (item.padding_x_ != _padding_x) throw serialization_error("Wrong padding_x found while deserializing dlib::max_pool_"); + if (_nr != nr) throw serialization_error("Wrong nr found while deserializing dlib::max_pool_"); + if (_nc != nc) throw serialization_error("Wrong nc found while deserializing dlib::max_pool_"); + if (_stride_y != stride_y) throw serialization_error("Wrong stride_y found while deserializing dlib::max_pool_"); + if (_stride_x != stride_x) throw serialization_error("Wrong stride_x found while deserializing dlib::max_pool_"); + } + + friend std::ostream& operator<<(std::ostream& out, const max_pool_& item) + { + out << "max_pool (" + << "nr="<<_nr + << ", nc="<<_nc + << ", stride_y="<<_stride_y + << ", stride_x="<<_stride_x + << ", padding_y="<<item.padding_y_ + << ", padding_x="<<item.padding_x_ + << ")"; + return out; + } + + friend void to_xml(const max_pool_& item, std::ostream& out) + { + out << "<max_pool" + << " nr='"<<_nr<<"'" + << " nc='"<<_nc<<"'" + << " stride_y='"<<_stride_y<<"'" + << " stride_x='"<<_stride_x<<"'" + << " padding_y='"<<item.padding_y_<<"'" + << " padding_x='"<<item.padding_x_<<"'" + << "/>\n"; + } + + + private: + + + tt::pooling mp; + resizable_tensor params; + + int padding_y_; + int padding_x_; + }; + + template < + long nr, + long nc, + int stride_y, + int stride_x, + typename SUBNET + > + using max_pool = add_layer<max_pool_<nr,nc,stride_y,stride_x>, SUBNET>; + + template < + typename SUBNET + > + using max_pool_everything = add_layer<max_pool_<0,0,1,1>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y = _stride_y!=1? 
0 : _nr/2, + int _padding_x = _stride_x!=1? 0 : _nc/2 + > + class avg_pool_ + { + public: + static_assert(_nr >= 0, "The number of rows in a filter must be >= 0"); + static_assert(_nc >= 0, "The number of columns in a filter must be >= 0"); + static_assert(_stride_y > 0, "The filter stride must be > 0"); + static_assert(_stride_x > 0, "The filter stride must be > 0"); + static_assert(0 <= _padding_y && ((_nr==0 && _padding_y == 0) || (_nr!=0 && _padding_y < _nr)), + "The padding must be smaller than the filter size, unless the filter size is 0."); + static_assert(0 <= _padding_x && ((_nc==0 && _padding_x == 0) || (_nc!=0 && _padding_x < _nc)), + "The padding must be smaller than the filter size, unless the filter size is 0."); + + avg_pool_( + ) : + padding_y_(_padding_y), + padding_x_(_padding_x) + {} + + long nr() const { return _nr; } + long nc() const { return _nc; } + long stride_y() const { return _stride_y; } + long stride_x() const { return _stride_x; } + long padding_y() const { return padding_y_; } + long padding_x() const { return padding_x_; } + + inline dpoint map_input_to_output ( + dpoint p + ) const + { + p.x() = (p.x()+padding_x()-nc()/2)/stride_x(); + p.y() = (p.y()+padding_y()-nr()/2)/stride_y(); + return p; + } + + inline dpoint map_output_to_input ( + dpoint p + ) const + { + p.x() = p.x()*stride_x() - padding_x() + nc()/2; + p.y() = p.y()*stride_y() - padding_y() + nr()/2; + return p; + } + + avg_pool_ ( + const avg_pool_& item + ) : + padding_y_(item.padding_y_), + padding_x_(item.padding_x_) + { + // this->ap is non-copyable so we have to write our own copy to avoid trying to + // copy it and getting an error. + } + + avg_pool_& operator= ( + const avg_pool_& item + ) + { + if (this == &item) + return *this; + + padding_y_ = item.padding_y_; + padding_x_ = item.padding_x_; + + // this->ap is non-copyable so we have to write our own copy to avoid trying to + // copy it and getting an error.
+ return *this; + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + ap.setup_avg_pooling(_nr!=0?_nr:sub.get_output().nr(), + _nc!=0?_nc:sub.get_output().nc(), + _stride_y, _stride_x, padding_y_, padding_x_); + + ap(output, sub.get_output()); + } + + template <typename SUBNET> + void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + ap.setup_avg_pooling(_nr!=0?_nr:sub.get_output().nr(), + _nc!=0?_nc:sub.get_output().nc(), + _stride_y, _stride_x, padding_y_, padding_x_); + + ap.get_gradient(gradient_input, computed_output, sub.get_output(), sub.get_gradient_input()); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const avg_pool_& item, std::ostream& out) + { + serialize("avg_pool_2", out); + serialize(_nr, out); + serialize(_nc, out); + serialize(_stride_y, out); + serialize(_stride_x, out); + serialize(item.padding_y_, out); + serialize(item.padding_x_, out); + } + + friend void deserialize(avg_pool_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + + long nr; + long nc; + int stride_y; + int stride_x; + if (version == "avg_pool_2") + { + deserialize(nr, in); + deserialize(nc, in); + deserialize(stride_y, in); + deserialize(stride_x, in); + deserialize(item.padding_y_, in); + deserialize(item.padding_x_, in); + } + else + { + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::avg_pool_."); + } + + if (item.padding_y_ != _padding_y) throw serialization_error("Wrong padding_y found while deserializing dlib::avg_pool_"); + if (item.padding_x_ != _padding_x) throw serialization_error("Wrong padding_x found while deserializing dlib::avg_pool_"); + if (_nr != nr) throw serialization_error("Wrong nr found while deserializing dlib::avg_pool_"); + if (_nc != nc) throw serialization_error("Wrong nc found while deserializing dlib::avg_pool_"); + if (_stride_y != stride_y) throw serialization_error("Wrong stride_y found while deserializing dlib::avg_pool_"); + if (_stride_x != stride_x) throw serialization_error("Wrong stride_x found while deserializing dlib::avg_pool_"); + } + + friend std::ostream& operator<<(std::ostream& out, const avg_pool_& item) + { + out << "avg_pool (" + << "nr="<<_nr + << ", nc="<<_nc + << ", stride_y="<<_stride_y + << ", stride_x="<<_stride_x + << ", padding_y="<<item.padding_y_ + << ", padding_x="<<item.padding_x_ + << ")"; + return out; + } + + friend void to_xml(const avg_pool_& item, std::ostream& out) + { + out << "<avg_pool" + << " nr='"<<_nr<<"'" + << " nc='"<<_nc<<"'" + << " stride_y='"<<_stride_y<<"'" + << " stride_x='"<<_stride_x<<"'" + << " padding_y='"<<item.padding_y_<<"'" + << " padding_x='"<<item.padding_x_<<"'" + << "/>\n"; + } + private: + + tt::pooling ap; + resizable_tensor params; + + int padding_y_; + int padding_x_; + }; + + template < + long nr, + long nc, + int stride_y, + int stride_x, + typename SUBNET + > + using avg_pool = add_layer<avg_pool_<nr,nc,stride_y,stride_x>, SUBNET>; + + template < + typename SUBNET + > + using avg_pool_everything = add_layer<avg_pool_<0,0,1,1>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + enum layer_mode + { + CONV_MODE = 0, + FC_MODE = 1 + }; + + const double DEFAULT_BATCH_NORM_EPS = 0.0001; + + template < + 
layer_mode mode + > + class bn_ + { + public: + explicit bn_( + unsigned long window_size, + double eps_ = DEFAULT_BATCH_NORM_EPS + ) : + num_updates(0), + running_stats_window_size(window_size), + learning_rate_multiplier(1), + weight_decay_multiplier(0), + bias_learning_rate_multiplier(1), + bias_weight_decay_multiplier(1), + eps(eps_) + { + DLIB_CASSERT(window_size > 0, "The batch normalization running stats window size can't be 0."); + } + + bn_() : bn_(100) {} + + layer_mode get_mode() const { return mode; } + unsigned long get_running_stats_window_size () const { return running_stats_window_size; } + void set_running_stats_window_size (unsigned long new_window_size ) + { + DLIB_CASSERT(new_window_size > 0, "The batch normalization running stats window size can't be 0."); + running_stats_window_size = new_window_size; + } + double get_eps() const { return eps; } + + double get_learning_rate_multiplier () const { return learning_rate_multiplier; } + double get_weight_decay_multiplier () const { return weight_decay_multiplier; } + void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; } + void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; } + + double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; } + double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; } + void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; } + void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + + template <typename SUBNET> + void setup (const SUBNET& sub) + { + if (mode == FC_MODE) + { + gamma = alias_tensor(1, + sub.get_output().k(), + sub.get_output().nr(), + sub.get_output().nc()); + } + else + { + gamma = alias_tensor(1, sub.get_output().k()); + } + beta = gamma; + + params.set_size(gamma.size()+beta.size()); + + gamma(params,0) = 1; + beta(params,gamma.size()) = 0; + + running_means.copy_size(gamma(params,0)); + running_variances.copy_size(gamma(params,0)); + running_means = 0; + running_variances = 1; + num_updates = 0; + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + auto g = gamma(params,0); + auto b = beta(params,gamma.size()); + if (sub.get_output().num_samples() > 1) + { + const double decay = 1.0 - num_updates/(num_updates+1.0); + ++num_updates; + if (num_updates > running_stats_window_size) + num_updates = running_stats_window_size; + + if (mode == FC_MODE) + tt::batch_normalize(eps, output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b); + else + tt::batch_normalize_conv(eps, output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b); + } + else // we are running in testing mode so we just linearly scale the input tensor. 
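+            // That is, batch_normalize_inference() computes, element-wise,
+            //   output = gamma*(input - running_mean)/sqrt(running_variance + eps) + beta
+            // using the running statistics accumulated during training rather than
+            // statistics of the current batch.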
+ { + if (mode == FC_MODE) + tt::batch_normalize_inference(eps, output, sub.get_output(), g, b, running_means, running_variances); + else + tt::batch_normalize_conv_inference(eps, output, sub.get_output(), g, b, running_means, running_variances); + } + } + + template <typename SUBNET> + void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad) + { + auto g = gamma(params,0); + auto g_grad = gamma(params_grad, 0); + auto b_grad = beta(params_grad, gamma.size()); + if (mode == FC_MODE) + tt::batch_normalize_gradient(eps, gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad ); + else + tt::batch_normalize_conv_gradient(eps, gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad ); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const bn_& item, std::ostream& out) + { + if (mode == CONV_MODE) + serialize("bn_con2", out); + else // if FC_MODE + serialize("bn_fc2", out); + serialize(item.params, out); + serialize(item.gamma, out); + serialize(item.beta, out); + serialize(item.means, out); + serialize(item.invstds, out); + serialize(item.running_means, out); + serialize(item.running_variances, out); + serialize(item.num_updates, out); + serialize(item.running_stats_window_size, out); + serialize(item.learning_rate_multiplier, out); + serialize(item.weight_decay_multiplier, out); + serialize(item.bias_learning_rate_multiplier, out); + serialize(item.bias_weight_decay_multiplier, out); + serialize(item.eps, out); + } + + friend void deserialize(bn_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (mode == CONV_MODE) + { + if (version != "bn_con2") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::bn_."); + } + else // must be in FC_MODE + { + if (version != "bn_fc2") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::bn_."); + } + + deserialize(item.params, in); + deserialize(item.gamma, in); + deserialize(item.beta, in); + deserialize(item.means, in); + deserialize(item.invstds, in); + deserialize(item.running_means, in); + deserialize(item.running_variances, in); + deserialize(item.num_updates, in); + deserialize(item.running_stats_window_size, in); + deserialize(item.learning_rate_multiplier, in); + deserialize(item.weight_decay_multiplier, in); + deserialize(item.bias_learning_rate_multiplier, in); + deserialize(item.bias_weight_decay_multiplier, in); + deserialize(item.eps, in); + } + + friend std::ostream& operator<<(std::ostream& out, const bn_& item) + { + if (mode == CONV_MODE) + out << "bn_con "; + else + out << "bn_fc "; + out << " eps="<<item.eps; + out << " running_stats_window_size="<<item.running_stats_window_size; + out << " learning_rate_mult="<<item.learning_rate_multiplier; + out << " weight_decay_mult="<<item.weight_decay_multiplier; + out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier; + out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier; + return out; + } + + friend void to_xml(const bn_& item, std::ostream& out) + { + if (mode==CONV_MODE) + out << "<bn_con"; + else + out << "<bn_fc"; + + out << " eps='"<<item.eps<<"'"; + out << " running_stats_window_size='"<<item.running_stats_window_size<<"'"; + out << " learning_rate_mult='"<<item.learning_rate_multiplier<<"'"; + out << " 
weight_decay_mult='"<<item.weight_decay_multiplier<<"'"; + out << " bias_learning_rate_mult='"<<item.bias_learning_rate_multiplier<<"'"; + out << " bias_weight_decay_mult='"<<item.bias_weight_decay_multiplier<<"'"; + out << ">\n"; + + out << mat(item.params); + + if (mode==CONV_MODE) + out << "</bn_con>\n"; + else + out << "</bn_fc>\n"; + } + + private: + + friend class affine_; + + resizable_tensor params; + alias_tensor gamma, beta; + resizable_tensor means, running_means; + resizable_tensor invstds, running_variances; + unsigned long num_updates; + unsigned long running_stats_window_size; + double learning_rate_multiplier; + double weight_decay_multiplier; + double bias_learning_rate_multiplier; + double bias_weight_decay_multiplier; + double eps; + }; + + template <typename SUBNET> + using bn_con = add_layer<bn_<CONV_MODE>, SUBNET>; + template <typename SUBNET> + using bn_fc = add_layer<bn_<FC_MODE>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + class visitor_bn_running_stats_window_size + { + public: + + visitor_bn_running_stats_window_size(unsigned long new_window_size_) : new_window_size(new_window_size_) {} + + template <typename T> + void set_window_size(T&) const + { + // ignore other layer detail types + } + + template < layer_mode mode > + void set_window_size(bn_<mode>& l) const + { + l.set_running_stats_window_size(new_window_size); + } + + template<typename input_layer_type> + void operator()(size_t , input_layer_type& ) const + { + // ignore other layers + } + + template <typename T, typename U, typename E> + void operator()(size_t , add_layer<T,U,E>& l) const + { + set_window_size(l.layer_details()); + } + + private: + + unsigned long new_window_size; + }; + } + + template <typename net_type> + void set_all_bn_running_stats_window_sizes ( + net_type& net, + unsigned long new_window_size + ) + { + visit_layers(net, impl::visitor_bn_running_stats_window_size(new_window_size)); + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + enum fc_bias_mode + { + FC_HAS_BIAS = 0, + FC_NO_BIAS = 1 + }; + + struct num_fc_outputs + { + num_fc_outputs(unsigned long n) : num_outputs(n) {} + unsigned long num_outputs; + }; + + template < + unsigned long num_outputs_, + fc_bias_mode bias_mode + > + class fc_ + { + static_assert(num_outputs_ > 0, "The number of outputs from a fc_ layer must be > 0"); + + public: + fc_(num_fc_outputs o) : num_outputs(o.num_outputs), num_inputs(0), + learning_rate_multiplier(1), + weight_decay_multiplier(1), + bias_learning_rate_multiplier(1), + bias_weight_decay_multiplier(0) + {} + + fc_() : fc_(num_fc_outputs(num_outputs_)) {} + + double get_learning_rate_multiplier () const { return learning_rate_multiplier; } + double get_weight_decay_multiplier () const { return weight_decay_multiplier; } + void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; } + void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; } + + double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; } + double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; } + void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; } + void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; } + + unsigned 
long get_num_outputs (
+        ) const { return num_outputs; }
+
+        void set_num_outputs(long num)
+        {
+            DLIB_CASSERT(num > 0);
+            if (num != (long)num_outputs)
+            {
+                DLIB_CASSERT(get_layer_params().size() == 0,
+                    "You can't change the number of outputs in fc_ if the parameter tensor has already been allocated.");
+                num_outputs = num;
+            }
+        }
+
+        fc_bias_mode get_bias_mode (
+        ) const { return bias_mode; }
+
+        template <typename SUBNET>
+        void setup (const SUBNET& sub)
+        {
+            num_inputs = sub.get_output().nr()*sub.get_output().nc()*sub.get_output().k();
+            if (bias_mode == FC_HAS_BIAS)
+                params.set_size(num_inputs+1, num_outputs);
+            else
+                params.set_size(num_inputs, num_outputs);
+
+            dlib::rand rnd(std::rand());
+            randomize_parameters(params, num_inputs+num_outputs, rnd);
+
+            weights = alias_tensor(num_inputs, num_outputs);
+
+            if (bias_mode == FC_HAS_BIAS)
+            {
+                biases = alias_tensor(1,num_outputs);
+                // set the initial bias values to zero
+                biases(params,weights.size()) = 0;
+            }
+        }
+
+        template <typename SUBNET>
+        void forward(const SUBNET& sub, resizable_tensor& output)
+        {
+            DLIB_CASSERT((long)num_inputs == sub.get_output().nr()*sub.get_output().nc()*sub.get_output().k(),
+                "The size of the input tensor to this fc layer doesn't match the size the fc layer was trained with.");
+            output.set_size(sub.get_output().num_samples(), num_outputs);
+
+            auto w = weights(params, 0);
+            tt::gemm(0,output, 1,sub.get_output(),false, w,false);
+            if (bias_mode == FC_HAS_BIAS)
+            {
+                auto b = biases(params, weights.size());
+                tt::add(1,output,1,b);
+            }
+        }
+
+        template <typename SUBNET>
+        void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
+        {
+            // no point computing the parameter gradients if they won't be used.
+            if (learning_rate_multiplier != 0)
+            {
+                // compute the gradient of the weight parameters.
+                auto pw = weights(params_grad, 0);
+                tt::gemm(0,pw, 1,sub.get_output(),true, gradient_input,false);
+
+                if (bias_mode == FC_HAS_BIAS)
+                {
+                    // compute the gradient of the bias parameters.
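+                    // Since each output row is computed as input*w + b, the gradient
+                    // of the loss with respect to b is just gradient_input summed
+                    // over the sample dimension, which is what
+                    // tt::assign_bias_gradient() computes below.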
+ auto pb = biases(params_grad, weights.size()); + tt::assign_bias_gradient(pb, gradient_input); + } + } + + // compute the gradient for the data + auto w = weights(params, 0); + tt::gemm(1,sub.get_gradient_input(), 1,gradient_input,false, w,true); + } + + alias_tensor_instance get_weights() + { + return weights(params, 0); + } + + alias_tensor_const_instance get_weights() const + { + return weights(params, 0); + } + + alias_tensor_instance get_biases() + { + static_assert(bias_mode == FC_HAS_BIAS, "This fc_ layer doesn't have a bias vector " + "to be retrieved, as per template parameter 'bias_mode'."); + return biases(params, weights.size()); + } + + alias_tensor_const_instance get_biases() const + { + static_assert(bias_mode == FC_HAS_BIAS, "This fc_ layer doesn't have a bias vector " + "to be retrieved, as per template parameter 'bias_mode'."); + return biases(params, weights.size()); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const fc_& item, std::ostream& out) + { + serialize("fc_2", out); + serialize(item.num_outputs, out); + serialize(item.num_inputs, out); + serialize(item.params, out); + serialize(item.weights, out); + serialize(item.biases, out); + serialize((int)bias_mode, out); + serialize(item.learning_rate_multiplier, out); + serialize(item.weight_decay_multiplier, out); + serialize(item.bias_learning_rate_multiplier, out); + serialize(item.bias_weight_decay_multiplier, out); + } + + friend void deserialize(fc_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "fc_2") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::fc_."); + + deserialize(item.num_outputs, in); + deserialize(item.num_inputs, in); + deserialize(item.params, in); + deserialize(item.weights, in); + deserialize(item.biases, in); + int bmode = 0; + deserialize(bmode, in); + if (bias_mode != (fc_bias_mode)bmode) throw serialization_error("Wrong fc_bias_mode found while deserializing dlib::fc_"); + deserialize(item.learning_rate_multiplier, in); + deserialize(item.weight_decay_multiplier, in); + deserialize(item.bias_learning_rate_multiplier, in); + deserialize(item.bias_weight_decay_multiplier, in); + } + + friend std::ostream& operator<<(std::ostream& out, const fc_& item) + { + if (bias_mode == FC_HAS_BIAS) + { + out << "fc\t (" + << "num_outputs="<<item.num_outputs + << ")"; + out << " learning_rate_mult="<<item.learning_rate_multiplier; + out << " weight_decay_mult="<<item.weight_decay_multiplier; + out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier; + out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier; + } + else + { + out << "fc_no_bias (" + << "num_outputs="<<item.num_outputs + << ")"; + out << " learning_rate_mult="<<item.learning_rate_multiplier; + out << " weight_decay_mult="<<item.weight_decay_multiplier; + } + return out; + } + + friend void to_xml(const fc_& item, std::ostream& out) + { + if (bias_mode==FC_HAS_BIAS) + { + out << "<fc" + << " num_outputs='"<<item.num_outputs<<"'" + << " learning_rate_mult='"<<item.learning_rate_multiplier<<"'" + << " weight_decay_mult='"<<item.weight_decay_multiplier<<"'" + << " bias_learning_rate_mult='"<<item.bias_learning_rate_multiplier<<"'" + << " bias_weight_decay_mult='"<<item.bias_weight_decay_multiplier<<"'"; + out << ">\n"; + out << mat(item.params); + out << "</fc>\n"; + } + else + { + out << "<fc_no_bias" + << " 
num_outputs='"<<item.num_outputs<<"'" + << " learning_rate_mult='"<<item.learning_rate_multiplier<<"'" + << " weight_decay_mult='"<<item.weight_decay_multiplier<<"'"; + out << ">\n"; + out << mat(item.params); + out << "</fc_no_bias>\n"; + } + } + + private: + + unsigned long num_outputs; + unsigned long num_inputs; + resizable_tensor params; + alias_tensor weights, biases; + double learning_rate_multiplier; + double weight_decay_multiplier; + double bias_learning_rate_multiplier; + double bias_weight_decay_multiplier; + }; + + template < + unsigned long num_outputs, + typename SUBNET + > + using fc = add_layer<fc_<num_outputs,FC_HAS_BIAS>, SUBNET>; + + template < + unsigned long num_outputs, + typename SUBNET + > + using fc_no_bias = add_layer<fc_<num_outputs,FC_NO_BIAS>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class dropout_ + { + public: + explicit dropout_( + float drop_rate_ = 0.5 + ) : + drop_rate(drop_rate_), + rnd(std::rand()) + { + DLIB_CASSERT(0 <= drop_rate && drop_rate <= 1); + } + + // We have to add a copy constructor and assignment operator because the rnd object + // is non-copyable. + dropout_( + const dropout_& item + ) : drop_rate(item.drop_rate), mask(item.mask), rnd(std::rand()) + {} + + dropout_& operator= ( + const dropout_& item + ) + { + if (this == &item) + return *this; + + drop_rate = item.drop_rate; + mask = item.mask; + return *this; + } + + float get_drop_rate ( + ) const { return drop_rate; } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + void forward_inplace(const tensor& input, tensor& output) + { + // create a random mask and use it to filter the data + mask.copy_size(input); + rnd.fill_uniform(mask); + tt::threshold(mask, drop_rate); + tt::multiply(false, output, input, mask); + } + + void backward_inplace( + const tensor& gradient_input, + tensor& data_grad, + tensor& /*params_grad*/ + ) + { + if (is_same_object(gradient_input, data_grad)) + tt::multiply(false, data_grad, mask, gradient_input); + else + tt::multiply(true, data_grad, mask, gradient_input); + } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const dropout_& item, std::ostream& out) + { + serialize("dropout_", out); + serialize(item.drop_rate, out); + serialize(item.mask, out); + } + + friend void deserialize(dropout_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "dropout_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::dropout_."); + deserialize(item.drop_rate, in); + deserialize(item.mask, in); + } + + void clean( + ) + { + mask.clear(); + } + + friend std::ostream& operator<<(std::ostream& out, const dropout_& item) + { + out << "dropout\t (" + << "drop_rate="<<item.drop_rate + << ")"; + return out; + } + + friend void to_xml(const dropout_& item, std::ostream& out) + { + out << "<dropout" + << " drop_rate='"<<item.drop_rate<<"'"; + out << "/>\n"; + } + + private: + float drop_rate; + resizable_tensor mask; + + tt::tensor_rand rnd; + resizable_tensor params; // unused + }; + + + template <typename SUBNET> + using dropout = add_layer<dropout_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class 
multiply_ + { + public: + explicit multiply_( + float val_ = 0.5 + ) : + val(val_) + { + } + + multiply_ ( + const dropout_& item + ) : val(1-item.get_drop_rate()) {} + + float get_multiply_value ( + ) const { return val; } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + void forward_inplace(const tensor& input, tensor& output) + { + tt::affine_transform(output, input, val); + } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + void backward_inplace( + const tensor& gradient_input, + tensor& data_grad, + tensor& /*params_grad*/ + ) + { + if (is_same_object(gradient_input, data_grad)) + tt::affine_transform(data_grad, gradient_input, val); + else + tt::affine_transform(data_grad, data_grad, gradient_input, 1, val); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const multiply_& item, std::ostream& out) + { + serialize("multiply_", out); + serialize(item.val, out); + } + + friend void deserialize(multiply_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version == "dropout_") + { + // Since we can build a multiply_ from a dropout_ we check if that's what + // is in the stream and if so then just convert it right here. + unserialize sin(version, in); + dropout_ temp; + deserialize(temp, sin); + item = temp; + return; + } + + if (version != "multiply_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::multiply_."); + deserialize(item.val, in); + } + + friend std::ostream& operator<<(std::ostream& out, const multiply_& item) + { + out << "multiply (" + << "val="<<item.val + << ")"; + return out; + } + + friend void to_xml(const multiply_& item, std::ostream& out) + { + out << "<multiply" + << " val='"<<item.val<<"'"; + out << "/>\n"; + } + private: + float val; + resizable_tensor params; // unused + }; + + template <typename SUBNET> + using multiply = add_layer<multiply_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class affine_ + { + public: + affine_( + ) : mode(FC_MODE) + { + } + + affine_( + layer_mode mode_ + ) : mode(mode_) + { + } + + template < + layer_mode bnmode + > + affine_( + const bn_<bnmode>& item + ) + { + gamma = item.gamma; + beta = item.beta; + mode = bnmode; + + params.copy_size(item.params); + + auto g = gamma(params,0); + auto b = beta(params,gamma.size()); + + resizable_tensor temp(item.params); + auto sg = gamma(temp,0); + auto sb = beta(temp,gamma.size()); + + g = pointwise_multiply(mat(sg), 1.0f/sqrt(mat(item.running_variances)+item.get_eps())); + b = mat(sb) - pointwise_multiply(mat(g), mat(item.running_means)); + } + + layer_mode get_mode() const { return mode; } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + template <typename SUBNET> + void setup (const SUBNET& sub) + { + if (mode == FC_MODE) + { + gamma = alias_tensor(1, + sub.get_output().k(), + sub.get_output().nr(), + sub.get_output().nc()); + } + else + { + gamma = alias_tensor(1, sub.get_output().k()); + } + beta = gamma; + + params.set_size(gamma.size()+beta.size()); + + gamma(params,0) = 1; + beta(params,gamma.size()) = 0; + } + + void forward_inplace(const tensor& input, tensor& output) + { + auto g = gamma(params,0); + auto b = 
beta(params,gamma.size()); + if (mode == FC_MODE) + tt::affine_transform(output, input, g, b); + else + tt::affine_transform_conv(output, input, g, b); + } + + void backward_inplace( + const tensor& gradient_input, + tensor& data_grad, + tensor& /*params_grad*/ + ) + { + auto g = gamma(params,0); + auto b = beta(params,gamma.size()); + + // We are computing the gradient of dot(gradient_input, computed_output*g + b) + if (mode == FC_MODE) + { + if (is_same_object(gradient_input, data_grad)) + tt::multiply(false, data_grad, gradient_input, g); + else + tt::multiply(true, data_grad, gradient_input, g); + } + else + { + if (is_same_object(gradient_input, data_grad)) + tt::multiply_conv(false, data_grad, gradient_input, g); + else + tt::multiply_conv(true, data_grad, gradient_input, g); + } + } + + const tensor& get_layer_params() const { return empty_params; } + tensor& get_layer_params() { return empty_params; } + + friend void serialize(const affine_& item, std::ostream& out) + { + serialize("affine_", out); + serialize(item.params, out); + serialize(item.gamma, out); + serialize(item.beta, out); + serialize((int)item.mode, out); + } + + friend void deserialize(affine_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version == "bn_con2") + { + // Since we can build an affine_ from a bn_ we check if that's what is in + // the stream and if so then just convert it right here. + unserialize sin(version, in); + bn_<CONV_MODE> temp; + deserialize(temp, sin); + item = temp; + return; + } + else if (version == "bn_fc2") + { + // Since we can build an affine_ from a bn_ we check if that's what is in + // the stream and if so then just convert it right here. + unserialize sin(version, in); + bn_<FC_MODE> temp; + deserialize(temp, sin); + item = temp; + return; + } + + if (version != "affine_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::affine_."); + deserialize(item.params, in); + deserialize(item.gamma, in); + deserialize(item.beta, in); + int mode; + deserialize(mode, in); + item.mode = (layer_mode)mode; + } + + friend std::ostream& operator<<(std::ostream& out, const affine_& ) + { + out << "affine"; + return out; + } + + friend void to_xml(const affine_& item, std::ostream& out) + { + if (item.mode==CONV_MODE) + out << "<affine_con>\n"; + else + out << "<affine_fc>\n"; + + out << mat(item.params); + + if (item.mode==CONV_MODE) + out << "</affine_con>\n"; + else + out << "</affine_fc>\n"; + } + + private: + resizable_tensor params, empty_params; + alias_tensor gamma, beta; + layer_mode mode; + }; + + template <typename SUBNET> + using affine = add_layer<affine_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + template<typename> class tag + > + class add_prev_ + { + public: + const static unsigned long id = tag_id<tag>::id; + + add_prev_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + auto&& t1 = sub.get_output(); + auto&& t2 = layer<tag>(sub).get_output(); + output.set_size(std::max(t1.num_samples(),t2.num_samples()), + std::max(t1.k(),t2.k()), + std::max(t1.nr(),t2.nr()), + std::max(t1.nc(),t2.nc())); + tt::add(output, t1, t2); + } + + template <typename SUBNET> + void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + // The gradient just flows backwards to the two layers 
that forward() added + // together. + tt::add(sub.get_gradient_input(), sub.get_gradient_input(), gradient_input); + tt::add(layer<tag>(sub).get_gradient_input(), layer<tag>(sub).get_gradient_input(), gradient_input); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + friend void serialize(const add_prev_& , std::ostream& out) + { + serialize("add_prev_", out); + } + + friend void deserialize(add_prev_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "add_prev_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::add_prev_."); + } + + friend std::ostream& operator<<(std::ostream& out, const add_prev_& item) + { + out << "add_prev"<<id; + return out; + } + + friend void to_xml(const add_prev_& item, std::ostream& out) + { + out << "<add_prev tag='"<<id<<"'/>\n"; + } + + private: + resizable_tensor params; + }; + + template < + template<typename> class tag, + typename SUBNET + > + using add_prev = add_layer<add_prev_<tag>, SUBNET>; + + template <typename SUBNET> using add_prev1 = add_prev<tag1, SUBNET>; + template <typename SUBNET> using add_prev2 = add_prev<tag2, SUBNET>; + template <typename SUBNET> using add_prev3 = add_prev<tag3, SUBNET>; + template <typename SUBNET> using add_prev4 = add_prev<tag4, SUBNET>; + template <typename SUBNET> using add_prev5 = add_prev<tag5, SUBNET>; + template <typename SUBNET> using add_prev6 = add_prev<tag6, SUBNET>; + template <typename SUBNET> using add_prev7 = add_prev<tag7, SUBNET>; + template <typename SUBNET> using add_prev8 = add_prev<tag8, SUBNET>; + template <typename SUBNET> using add_prev9 = add_prev<tag9, SUBNET>; + template <typename SUBNET> using add_prev10 = add_prev<tag10, SUBNET>; + + using add_prev1_ = add_prev_<tag1>; + using add_prev2_ = add_prev_<tag2>; + using add_prev3_ = add_prev_<tag3>; + using add_prev4_ = add_prev_<tag4>; + using add_prev5_ = add_prev_<tag5>; + using add_prev6_ = add_prev_<tag6>; + using add_prev7_ = add_prev_<tag7>; + using add_prev8_ = add_prev_<tag8>; + using add_prev9_ = add_prev_<tag9>; + using add_prev10_ = add_prev_<tag10>; + +// ---------------------------------------------------------------------------------------- + + template < + template<typename> class tag + > + class mult_prev_ + { + public: + const static unsigned long id = tag_id<tag>::id; + + mult_prev_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + auto&& t1 = sub.get_output(); + auto&& t2 = layer<tag>(sub).get_output(); + output.set_size(std::max(t1.num_samples(),t2.num_samples()), + std::max(t1.k(),t2.k()), + std::max(t1.nr(),t2.nr()), + std::max(t1.nc(),t2.nc())); + tt::multiply_zero_padded(false, output, t1, t2); + } + + template <typename SUBNET> + void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + auto&& t1 = sub.get_output(); + auto&& t2 = layer<tag>(sub).get_output(); + // The gradient just flows backwards to the two layers that forward() + // multiplied together. 
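+            // By the product rule, the gradient with respect to t1 is t2 times
+            // gradient_input, and the gradient with respect to t2 is t1 times
+            // gradient_input.  The leading true flag makes both accumulate into the
+            // existing gradient tensors rather than overwrite them.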
+ tt::multiply_zero_padded(true, sub.get_gradient_input(), t2, gradient_input); + tt::multiply_zero_padded(true, layer<tag>(sub).get_gradient_input(), t1, gradient_input); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const mult_prev_& , std::ostream& out) + { + serialize("mult_prev_", out); + } + + friend void deserialize(mult_prev_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "mult_prev_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::mult_prev_."); + } + + friend std::ostream& operator<<(std::ostream& out, const mult_prev_& item) + { + out << "mult_prev"<<id; + return out; + } + + friend void to_xml(const mult_prev_& item, std::ostream& out) + { + out << "<mult_prev tag='"<<id<<"'/>\n"; + } + + private: + resizable_tensor params; + }; + + template < + template<typename> class tag, + typename SUBNET + > + using mult_prev = add_layer<mult_prev_<tag>, SUBNET>; + + template <typename SUBNET> using mult_prev1 = mult_prev<tag1, SUBNET>; + template <typename SUBNET> using mult_prev2 = mult_prev<tag2, SUBNET>; + template <typename SUBNET> using mult_prev3 = mult_prev<tag3, SUBNET>; + template <typename SUBNET> using mult_prev4 = mult_prev<tag4, SUBNET>; + template <typename SUBNET> using mult_prev5 = mult_prev<tag5, SUBNET>; + template <typename SUBNET> using mult_prev6 = mult_prev<tag6, SUBNET>; + template <typename SUBNET> using mult_prev7 = mult_prev<tag7, SUBNET>; + template <typename SUBNET> using mult_prev8 = mult_prev<tag8, SUBNET>; + template <typename SUBNET> using mult_prev9 = mult_prev<tag9, SUBNET>; + template <typename SUBNET> using mult_prev10 = mult_prev<tag10, SUBNET>; + + using mult_prev1_ = mult_prev_<tag1>; + using mult_prev2_ = mult_prev_<tag2>; + using mult_prev3_ = mult_prev_<tag3>; + using mult_prev4_ = mult_prev_<tag4>; + using mult_prev5_ = mult_prev_<tag5>; + using mult_prev6_ = mult_prev_<tag6>; + using mult_prev7_ = mult_prev_<tag7>; + using mult_prev8_ = mult_prev_<tag8>; + using mult_prev9_ = mult_prev_<tag9>; + using mult_prev10_ = mult_prev_<tag10>; + +// ---------------------------------------------------------------------------------------- + + template < + template<typename> class tag + > + class scale_ + { + public: + const static unsigned long id = tag_id<tag>::id; + + scale_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + auto&& scales = sub.get_output(); + auto&& src = layer<tag>(sub).get_output(); + DLIB_CASSERT(scales.num_samples() == src.num_samples() && + scales.k() == src.k() && + scales.nr() == 1 && + scales.nc() == 1, + "scales.k(): " << scales.k() << + "\nsrc.k(): " << src.k() + ); + + output.copy_size(src); + tt::scale_channels(false, output, src, scales); + } + + template <typename SUBNET> + void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + auto&& scales = sub.get_output(); + auto&& src = layer<tag>(sub).get_output(); + // The gradient just flows backwards to the two layers that forward() + // read from. 
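+            // The gradient for src scales each channel of gradient_input by the
+            // corresponding per-channel scale value, while the gradient for each
+            // scale is the dot product of that channel of src with the matching
+            // channel of gradient_input (computed below via the reshaped aliases).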
+ tt::scale_channels(true, layer<tag>(sub).get_gradient_input(), gradient_input, scales); + + if (reshape_src.num_samples() != src.num_samples()) + { + reshape_scales = alias_tensor(src.num_samples()*src.k()); + reshape_src = alias_tensor(src.num_samples()*src.k(),src.nr()*src.nc()); + } + + auto&& scales_grad = sub.get_gradient_input(); + auto sgrad = reshape_scales(scales_grad); + tt::dot_prods(true, sgrad, reshape_src(src), reshape_src(gradient_input)); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const scale_& item, std::ostream& out) + { + serialize("scale_", out); + serialize(item.reshape_scales, out); + serialize(item.reshape_src, out); + } + + friend void deserialize(scale_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "scale_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::scale_."); + deserialize(item.reshape_scales, in); + deserialize(item.reshape_src, in); + } + + friend std::ostream& operator<<(std::ostream& out, const scale_& item) + { + out << "scale"<<id; + return out; + } + + friend void to_xml(const scale_& item, std::ostream& out) + { + out << "<scale tag='"<<id<<"'/>\n"; + } + + private: + alias_tensor reshape_scales; + alias_tensor reshape_src; + resizable_tensor params; + }; + + template < + template<typename> class tag, + typename SUBNET + > + using scale = add_layer<scale_<tag>, SUBNET>; + + template <typename SUBNET> using scale1 = scale<tag1, SUBNET>; + template <typename SUBNET> using scale2 = scale<tag2, SUBNET>; + template <typename SUBNET> using scale3 = scale<tag3, SUBNET>; + template <typename SUBNET> using scale4 = scale<tag4, SUBNET>; + template <typename SUBNET> using scale5 = scale<tag5, SUBNET>; + template <typename SUBNET> using scale6 = scale<tag6, SUBNET>; + template <typename SUBNET> using scale7 = scale<tag7, SUBNET>; + template <typename SUBNET> using scale8 = scale<tag8, SUBNET>; + template <typename SUBNET> using scale9 = scale<tag9, SUBNET>; + template <typename SUBNET> using scale10 = scale<tag10, SUBNET>; + + using scale1_ = scale_<tag1>; + using scale2_ = scale_<tag2>; + using scale3_ = scale_<tag3>; + using scale4_ = scale_<tag4>; + using scale5_ = scale_<tag5>; + using scale6_ = scale_<tag6>; + using scale7_ = scale_<tag7>; + using scale8_ = scale_<tag8>; + using scale9_ = scale_<tag9>; + using scale10_ = scale_<tag10>; + +// ---------------------------------------------------------------------------------------- + + class relu_ + { + public: + relu_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + void forward_inplace(const tensor& input, tensor& output) + { + tt::relu(output, input); + } + + void backward_inplace( + const tensor& computed_output, + const tensor& gradient_input, + tensor& data_grad, + tensor& + ) + { + tt::relu_gradient(data_grad, computed_output, gradient_input); + } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const relu_& , std::ostream& out) + { + serialize("relu_", out); + } + + friend void deserialize(relu_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "relu_") + throw serialization_error("Unexpected 
version '"+version+"' found while deserializing dlib::relu_."); + } + + friend std::ostream& operator<<(std::ostream& out, const relu_& ) + { + out << "relu"; + return out; + } + + friend void to_xml(const relu_& /*item*/, std::ostream& out) + { + out << "<relu/>\n"; + } + + private: + resizable_tensor params; + }; + + + template <typename SUBNET> + using relu = add_layer<relu_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class prelu_ + { + public: + explicit prelu_( + float initial_param_value_ = 0.25 + ) : initial_param_value(initial_param_value_) + { + } + + float get_initial_param_value ( + ) const { return initial_param_value; } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + params.set_size(1); + params = initial_param_value; + } + + template <typename SUBNET> + void forward( + const SUBNET& sub, + resizable_tensor& data_output + ) + { + data_output.copy_size(sub.get_output()); + tt::prelu(data_output, sub.get_output(), params); + } + + template <typename SUBNET> + void backward( + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ) + { + tt::prelu_gradient(sub.get_gradient_input(), sub.get_output(), + gradient_input, params, params_grad); + } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const prelu_& item, std::ostream& out) + { + serialize("prelu_", out); + serialize(item.params, out); + serialize(item.initial_param_value, out); + } + + friend void deserialize(prelu_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "prelu_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::prelu_."); + deserialize(item.params, in); + deserialize(item.initial_param_value, in); + } + + friend std::ostream& operator<<(std::ostream& out, const prelu_& item) + { + out << "prelu\t (" + << "initial_param_value="<<item.initial_param_value + << ")"; + return out; + } + + friend void to_xml(const prelu_& item, std::ostream& out) + { + out << "<prelu initial_param_value='"<<item.initial_param_value<<"'>\n"; + out << mat(item.params); + out << "</prelu>\n"; + } + + private: + resizable_tensor params; + float initial_param_value; + }; + + template <typename SUBNET> + using prelu = add_layer<prelu_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class sig_ + { + public: + sig_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + void forward_inplace(const tensor& input, tensor& output) + { + tt::sigmoid(output, input); + } + + void backward_inplace( + const tensor& computed_output, + const tensor& gradient_input, + tensor& data_grad, + tensor& + ) + { + tt::sigmoid_gradient(data_grad, computed_output, gradient_input); + } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const sig_& , std::ostream& out) + { + serialize("sig_", out); + } + + friend void deserialize(sig_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if 
(version != "sig_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::sig_."); + } + + friend std::ostream& operator<<(std::ostream& out, const sig_& ) + { + out << "sig"; + return out; + } + + friend void to_xml(const sig_& /*item*/, std::ostream& out) + { + out << "<sig/>\n"; + } + + + private: + resizable_tensor params; + }; + + + template <typename SUBNET> + using sig = add_layer<sig_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class htan_ + { + public: + htan_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + void forward_inplace(const tensor& input, tensor& output) + { + tt::tanh(output, input); + } + + void backward_inplace( + const tensor& computed_output, + const tensor& gradient_input, + tensor& data_grad, + tensor& + ) + { + tt::tanh_gradient(data_grad, computed_output, gradient_input); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const htan_& , std::ostream& out) + { + serialize("htan_", out); + } + + friend void deserialize(htan_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "htan_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::htan_."); + } + + friend std::ostream& operator<<(std::ostream& out, const htan_& ) + { + out << "htan"; + return out; + } + + friend void to_xml(const htan_& /*item*/, std::ostream& out) + { + out << "<htan/>\n"; + } + + + private: + resizable_tensor params; + }; + + + template <typename SUBNET> + using htan = add_layer<htan_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class softmax_ + { + public: + softmax_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + void forward_inplace(const tensor& input, tensor& output) + { + tt::softmax(output, input); + } + + void backward_inplace( + const tensor& computed_output, + const tensor& gradient_input, + tensor& data_grad, + tensor& + ) + { + tt::softmax_gradient(data_grad, computed_output, gradient_input); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const softmax_& , std::ostream& out) + { + serialize("softmax_", out); + } + + friend void deserialize(softmax_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "softmax_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::softmax_."); + } + + friend std::ostream& operator<<(std::ostream& out, const softmax_& ) + { + out << "softmax"; + return out; + } + + friend void to_xml(const softmax_& /*item*/, std::ostream& out) + { + out << "<softmax/>\n"; + } + + private: + resizable_tensor params; + }; + + template <typename SUBNET> + using softmax = add_layer<softmax_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class softmax_all_ + { + public: + softmax_all_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + void forward_inplace(const tensor& input, tensor& output) + { + tt::softmax_all(output, input); + } 
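+
+        // Note that, unlike softmax_ above, which normalizes across the k channels
+        // at each spatial location, softmax_all_ normalizes over all the elements
+        // of each sample, so each sample's entire output sums to 1.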
+ + void backward_inplace( + const tensor& computed_output, + const tensor& gradient_input, + tensor& data_grad, + tensor& + ) + { + tt::softmax_all_gradient(data_grad, computed_output, gradient_input); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const softmax_all_& , std::ostream& out) + { + serialize("softmax_all_", out); + } + + friend void deserialize(softmax_all_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "softmax_all_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::softmax_all_."); + } + + friend std::ostream& operator<<(std::ostream& out, const softmax_all_& ) + { + out << "softmax_all"; + return out; + } + + friend void to_xml(const softmax_all_& /*item*/, std::ostream& out) + { + out << "<softmax_all/>\n"; + } + + private: + resizable_tensor params; + }; + + template <typename SUBNET> + using softmax_all = add_layer<softmax_all_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <template<typename> class TAG_TYPE, template<typename> class... TAG_TYPES> + struct concat_helper_impl{ + + constexpr static size_t tag_count() {return 1 + concat_helper_impl<TAG_TYPES...>::tag_count();} + static void list_tags(std::ostream& out) + { + out << tag_id<TAG_TYPE>::id << (tag_count() > 1 ? "," : ""); + concat_helper_impl<TAG_TYPES...>::list_tags(out); + } + + template<typename SUBNET> + static void resize_out(resizable_tensor& out, const SUBNET& sub, long sum_k) + { + auto& t = layer<TAG_TYPE>(sub).get_output(); + concat_helper_impl<TAG_TYPES...>::resize_out(out, sub, sum_k + t.k()); + } + template<typename SUBNET> + static void concat(tensor& out, const SUBNET& sub, size_t k_offset) + { + auto& t = layer<TAG_TYPE>(sub).get_output(); + tt::copy_tensor(false, out, k_offset, t, 0, t.k()); + k_offset += t.k(); + concat_helper_impl<TAG_TYPES...>::concat(out, sub, k_offset); + } + template<typename SUBNET> + static void split(const tensor& input, SUBNET& sub, size_t k_offset) + { + auto& t = layer<TAG_TYPE>(sub).get_gradient_input(); + tt::copy_tensor(true, t, 0, input, k_offset, t.k()); + k_offset += t.k(); + concat_helper_impl<TAG_TYPES...>::split(input, sub, k_offset); + } + }; + template <template<typename> class TAG_TYPE> + struct concat_helper_impl<TAG_TYPE>{ + constexpr static size_t tag_count() {return 1;} + static void list_tags(std::ostream& out) + { + out << tag_id<TAG_TYPE>::id; + } + + template<typename SUBNET> + static void resize_out(resizable_tensor& out, const SUBNET& sub, long sum_k) + { + auto& t = layer<TAG_TYPE>(sub).get_output(); + out.set_size(t.num_samples(), t.k() + sum_k, t.nr(), t.nc()); + } + template<typename SUBNET> + static void concat(tensor& out, const SUBNET& sub, size_t k_offset) + { + auto& t = layer<TAG_TYPE>(sub).get_output(); + tt::copy_tensor(false, out, k_offset, t, 0, t.k()); + } + template<typename SUBNET> + static void split(const tensor& input, SUBNET& sub, size_t k_offset) + { + auto& t = layer<TAG_TYPE>(sub).get_gradient_input(); + tt::copy_tensor(true, t, 0, input, k_offset, t.k()); + } + }; + } + // concat layer + template< + template<typename> class... 
TAG_TYPES
+        >
+    class concat_
+    {
+        static void list_tags(std::ostream& out) { impl::concat_helper_impl<TAG_TYPES...>::list_tags(out);};
+
+    public:
+        constexpr static size_t tag_count() {return impl::concat_helper_impl<TAG_TYPES...>::tag_count();};
+
+        template <typename SUBNET>
+        void setup (const SUBNET&)
+        {
+            // do nothing
+        }
+        template <typename SUBNET>
+        void forward(const SUBNET& sub, resizable_tensor& output)
+        {
+            // The total depth of the result is the sum of the depths from all the tagged layers.
+            impl::concat_helper_impl<TAG_TYPES...>::resize_out(output, sub, 0);
+
+            // Copy the output of each tagged layer into its own part of the result.
+            impl::concat_helper_impl<TAG_TYPES...>::concat(output, sub, 0);
+        }
+
+        template <typename SUBNET>
+        void backward(const tensor& gradient_input, SUBNET& sub, tensor&)
+        {
+            // The gradient is split into parts, one for each tagged layer.
+            impl::concat_helper_impl<TAG_TYPES...>::split(gradient_input, sub, 0);
+        }
+
+        dpoint map_input_to_output(dpoint p) const { return p; }
+        dpoint map_output_to_input(dpoint p) const { return p; }
+
+        const tensor& get_layer_params() const { return params; }
+        tensor& get_layer_params() { return params; }
+
+        friend void serialize(const concat_& item, std::ostream& out)
+        {
+            serialize("concat_", out);
+            size_t count = tag_count();
+            serialize(count, out);
+        }
+
+        friend void deserialize(concat_& item, std::istream& in)
+        {
+            std::string version;
+            deserialize(version, in);
+            if (version != "concat_")
+                throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::concat_.");
+            size_t count_tags;
+            deserialize(count_tags, in);
+            if (count_tags != tag_count())
+                throw serialization_error("Invalid count of tags "+ std::to_string(count_tags) +", expecting " +
+                                          std::to_string(tag_count()) +
+                                          " found while deserializing dlib::concat_.");
+        }
+
+        friend std::ostream& operator<<(std::ostream& out, const concat_& item)
+        {
+            out << "concat\t (";
+            list_tags(out);
+            out << ")";
+            return out;
+        }
+
+        friend void to_xml(const concat_& item, std::ostream& out)
+        {
+            out << "<concat tags='";
+            list_tags(out);
+            out << "'/>\n";
+        }
+
+    private:
+        resizable_tensor params; // unused
+    };
+
+
+    // concat layer definitions
+    template <template<typename> class TAG1,
+              template<typename> class TAG2,
+              typename SUBNET>
+    using concat2 = add_layer<concat_<TAG1, TAG2>, SUBNET>;
+
+    template <template<typename> class TAG1,
+              template<typename> class TAG2,
+              template<typename> class TAG3,
+              typename SUBNET>
+    using concat3 = add_layer<concat_<TAG1, TAG2, TAG3>, SUBNET>;
+
+    template <template<typename> class TAG1,
+              template<typename> class TAG2,
+              template<typename> class TAG3,
+              template<typename> class TAG4,
+              typename SUBNET>
+    using concat4 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4>, SUBNET>;
+
+    template <template<typename> class TAG1,
+              template<typename> class TAG2,
+              template<typename> class TAG3,
+              template<typename> class TAG4,
+              template<typename> class TAG5,
+              typename SUBNET>
+    using concat5 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4, TAG5>, SUBNET>;
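+
+    // As a usage sketch (illustrative only, assuming the con layer defined earlier
+    // in this file and the tag/skip layers from core.h): tag two branches that both
+    // read from a common tagged input, then stack their outputs along k, e.g.
+    //
+    //    using block = concat2<tag1, tag2,
+    //                  tag1<con<16,3,3,1,1,
+    //                  skip3<tag2<con<16,5,5,1,1,
+    //                  tag3<input<matrix<float>>>>>>>>>;
+    //
+    // runs 3x3 and 5x5 convolutions over the same input and produces a tensor with
+    // 16+16 == 32 channels.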
+    // The inception layer uses tags internally.  If the user also uses tags,
+    // conflicts are possible.  To avoid them, here are new tags reserved
+    // specifically for the inception layers.
+    template <typename SUBNET> using itag0  = add_tag_layer< 1000 + 0, SUBNET>;
+    template <typename SUBNET> using itag1  = add_tag_layer< 1000 + 1, SUBNET>;
+    template <typename SUBNET> using itag2  = add_tag_layer< 1000 + 2, SUBNET>;
+    template <typename SUBNET> using itag3  = add_tag_layer< 1000 + 3, SUBNET>;
+    template <typename SUBNET> using itag4  = add_tag_layer< 1000 + 4, SUBNET>;
+    template <typename SUBNET> using itag5  = add_tag_layer< 1000 + 5, SUBNET>;
+    // skip to the inception input
+    template <typename SUBNET> using iskip  = add_skip_layer< itag0, SUBNET>;
+
+    // here are some templates to be used for creating inception layer groups
+    template <template<typename>class B1,
+              template<typename>class B2,
+              typename SUBNET>
+    using inception2 = concat2<itag1, itag2, itag1<B1<iskip< itag2<B2< itag0<SUBNET>>>>>>>;
+
+    template <template<typename>class B1,
+              template<typename>class B2,
+              template<typename>class B3,
+              typename SUBNET>
+    using inception3 = concat3<itag1, itag2, itag3, itag1<B1<iskip< itag2<B2<iskip< itag3<B3< itag0<SUBNET>>>>>>>>>>;
+
+    template <template<typename>class B1,
+              template<typename>class B2,
+              template<typename>class B3,
+              template<typename>class B4,
+              typename SUBNET>
+    using inception4 = concat4<itag1, itag2, itag3, itag4,
+                itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip< itag4<B4< itag0<SUBNET>>>>>>>>>>>>>;
+
+    template <template<typename>class B1,
+              template<typename>class B2,
+              template<typename>class B3,
+              template<typename>class B4,
+              template<typename>class B5,
+              typename SUBNET>
+    using inception5 = concat5<itag1, itag2, itag3, itag4, itag5,
+                itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip< itag4<B4<iskip< itag5<B5< itag0<SUBNET>>>>>>>>>>>>>>>>;
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    const double DEFAULT_L2_NORM_EPS = 1e-5;
+
+    class l2normalize_
+    {
+    public:
+        explicit l2normalize_(
+            double eps_ = DEFAULT_L2_NORM_EPS
+        ) :
+            eps(eps_)
+        {
+        }
+
+        double get_eps() const { return eps; }
+
+        template <typename SUBNET>
+        void setup (const SUBNET& /*sub*/)
+        {
+        }
+
+        void forward_inplace(const tensor& input, tensor& output)
+        {
+            tt::inverse_norms(norm, input, eps);
+            tt::scale_rows(output, input, norm);
+        }
+
+        void backward_inplace(
+            const tensor& computed_output,
+            const tensor& gradient_input,
+            tensor& data_grad,
+            tensor& /*params_grad*/
+        )
+        {
+            if (is_same_object(gradient_input, data_grad))
+            {
+                tt::dot_prods(temp, gradient_input, computed_output);
+                tt::scale_rows2(0, data_grad, gradient_input, computed_output, temp, norm);
+            }
+            else
+            {
+                tt::dot_prods(temp, gradient_input, computed_output);
+                tt::scale_rows2(1, data_grad, gradient_input, computed_output, temp, norm);
+            }
+        }
+
+        const tensor& get_layer_params() const { return params; }
+        tensor& get_layer_params() { return params; }
+
+        friend void serialize(const l2normalize_& item, std::ostream& out)
+        {
+            serialize("l2normalize_", out);
+            serialize(item.eps, out);
+        }
+
+        friend void deserialize(l2normalize_& item, std::istream& in)
+        {
+            std::string version;
+            deserialize(version, in);
+            if (version != "l2normalize_")
+                throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::l2normalize_.");
+            deserialize(item.eps, in);
+        }
+
+        friend std::ostream& operator<<(std::ostream& out, const
l2normalize_& item) + { + out << "l2normalize"; + out << " eps="<<item.eps; + return out; + } + + friend void to_xml(const l2normalize_& item, std::ostream& out) + { + out << "<l2normalize"; + out << " eps='"<<item.eps<<"'"; + out << "/>\n"; + } + private: + double eps; + + resizable_tensor params; // unused + // Here only to avoid reallocation and as a cache between forward/backward + // functions. + resizable_tensor norm; + resizable_tensor temp; + }; + + template <typename SUBNET> + using l2normalize = add_layer<l2normalize_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + long _offset, + long _k, + long _nr, + long _nc + > + class extract_ + { + static_assert(_offset >= 0, "The offset must be >= 0."); + static_assert(_k > 0, "The number of channels must be > 0."); + static_assert(_nr > 0, "The number of rows must be > 0."); + static_assert(_nc > 0, "The number of columns must be > 0."); + public: + extract_( + ) + { + } + + template <typename SUBNET> + void setup (const SUBNET& sub) + { + DLIB_CASSERT((long)sub.get_output().size() >= sub.get_output().num_samples()*(_offset+_k*_nr*_nc), + "The tensor we are trying to extract from the input tensor is too big to fit into the input tensor."); + + aout = alias_tensor(sub.get_output().num_samples(), _k*_nr*_nc); + ain = alias_tensor(sub.get_output().num_samples(), sub.get_output().size()/sub.get_output().num_samples()); + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + if (aout.num_samples() != sub.get_output().num_samples()) + { + aout = alias_tensor(sub.get_output().num_samples(), _k*_nr*_nc); + ain = alias_tensor(sub.get_output().num_samples(), sub.get_output().size()/sub.get_output().num_samples()); + } + + output.set_size(sub.get_output().num_samples(), _k, _nr, _nc); + auto out = aout(output,0); + auto in = ain(sub.get_output(),0); + tt::copy_tensor(false, out, 0, in, _offset, _k*_nr*_nc); + } + + template <typename SUBNET> + void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + auto out = ain(sub.get_gradient_input(),0); + auto in = aout(gradient_input,0); + tt::copy_tensor(true, out, _offset, in, 0, _k*_nr*_nc); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const extract_& item, std::ostream& out) + { + serialize("extract_", out); + serialize(_offset, out); + serialize(_k, out); + serialize(_nr, out); + serialize(_nc, out); + } + + friend void deserialize(extract_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "extract_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::extract_."); + + long offset; + long k; + long nr; + long nc; + deserialize(offset, in); + deserialize(k, in); + deserialize(nr, in); + deserialize(nc, in); + + if (offset != _offset) throw serialization_error("Wrong offset found while deserializing dlib::extract_"); + if (k != _k) throw serialization_error("Wrong k found while deserializing dlib::extract_"); + if (nr != _nr) throw serialization_error("Wrong nr found while deserializing dlib::extract_"); + if (nc != _nc) throw serialization_error("Wrong nc found while deserializing dlib::extract_"); + } + + friend std::ostream& operator<<(std::ostream& out, const extract_& item) + { + out << "extract\t (" + << "offset="<<_offset + << ", k="<<_k + << ", nr="<<_nr + << ", 
nc="<<_nc + << ")"; + return out; + } + + friend void to_xml(const extract_& item, std::ostream& out) + { + out << "<extract"; + out << " offset='"<<_offset<<"'"; + out << " k='"<<_k<<"'"; + out << " nr='"<<_nr<<"'"; + out << " nc='"<<_nc<<"'"; + out << "/>\n"; + } + private: + alias_tensor aout, ain; + + resizable_tensor params; // unused + }; + + template < + long offset, + long k, + long nr, + long nc, + typename SUBNET + > + using extract = add_layer<extract_<offset,k,nr,nc>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_LAYERS_H_ + + diff --git a/ml/dlib/dlib/dnn/layers_abstract.h b/ml/dlib/dlib/dnn/layers_abstract.h new file mode 100644 index 000000000..f07025ff8 --- /dev/null +++ b/ml/dlib/dlib/dnn/layers_abstract.h @@ -0,0 +1,2631 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_DNn_LAYERS_ABSTRACT_H_ +#ifdef DLIB_DNn_LAYERS_ABSTRACT_H_ + +#include "tensor_abstract.h" +#include "core_abstract.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class SUBNET + { + /*! + WHAT THIS OBJECT REPRESENTS + This object represents a deep neural network. In particular, it is + the simplified interface through which layer objects interact with their + subnetworks. A layer's two important tasks are to (1) take outputs from its + subnetwork and forward propagate them through itself and (2) to backwards + propagate an error gradient through itself and onto its subnetwork. + The idea of a subnetwork is illustrated in the following diagram: + + +---------------------------------------------------------+ + | loss <-- layer1 <-- layer2 <-- ... <-- layern <-- input | + +---------------------------------------------------------+ + ^ ^ + \__ subnetwork for layer1 __/ + + Therefore, by "subnetwork" we mean the part of the network closer to the + input. + + Note that there is no dlib::SUBNET type. It is shown here purely to + document the interface layer objects expect to see when they interact + with a network. + !*/ + + public: + // You aren't allowed to copy subnetworks from inside a layer. + SUBNET(const SUBNET&) = delete; + SUBNET& operator=(const SUBNET&) = delete; + + const tensor& get_output( + ) const; + /*! + ensures + - returns the output of this subnetwork. This is the data that the next + layer in the network will take as input. + - have_same_dimensions(#get_gradient_input(), get_output()) == true + !*/ + + tensor& get_gradient_input( + ); + /*! + ensures + - returns the error gradient for this subnetwork. That is, this is the + error gradient that this network will use to update itself. Therefore, + when performing back propagation, layers that sit on top of this + subnetwork write their back propagated error gradients into + get_gradient_input(). Or to put it another way, during back propagation, + layers take the contents of their get_gradient_input() and back propagate + it through themselves and store the results into their subnetwork's + get_gradient_input(). + !*/ + + const NEXT_SUBNET& subnet( + ) const; + /*! + ensures + - returns the subnetwork of *this network. With respect to the diagram + above, if *this was layer1 then subnet() would return the network that + begins with layer2. + !*/ + + NEXT_SUBNET& subnet( + ); + /*! + ensures + - returns the subnetwork of *this network. 
With respect to the diagram + above, if *this was layer1 then subnet() would return the network that + begins with layer2. + !*/ + + const layer_details_type& layer_details( + ) const; + /*! + ensures + - returns the layer_details_type instance that defines the behavior of the + layer at the top of this network. I.e. returns the layer details that + defines the behavior of the layer nearest to the network output rather + than the input layer. For computational layers, this is the object + implementing the EXAMPLE_COMPUTATIONAL_LAYER_ interface that defines the + layer's behavior. + !*/ + + unsigned int sample_expansion_factor ( + ) const; + /*! + ensures + - When to_tensor() is invoked on this network's input layer it converts N + input objects into M samples, all stored inside a resizable_tensor. It + is always the case that M is some integer multiple of N. + sample_expansion_factor() returns the value of this multiplier. To be + very specific, it is always true that M==I*N where I is some integer. + This integer I is what is returned by sample_expansion_factor(). + + It should be noted that computational layers likely do not care about the + sample expansion factor. It is only really of concern inside a loss + layer where you need to know its value so that tensor samples can be + matched against truth objects. Moreover, in most cases the sample + expansion factor is 1. + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + + class EXAMPLE_COMPUTATIONAL_LAYER_ + { + /*! + WHAT THIS OBJECT REPRESENTS + Each computational layer in a deep neural network can be thought of as a + function, f(data,parameters), that takes in a data tensor, some parameters, + and produces an output tensor. You create an entire deep network by + composing these functions. Importantly, you are able to use a wide range + of different functions to accommodate the task you are trying to + accomplish. Therefore, dlib includes a number of common layer types but if + you want to define your own then you simply implement a class with the same + interface as EXAMPLE_COMPUTATIONAL_LAYER_. + + Note that there is no dlib::EXAMPLE_COMPUTATIONAL_LAYER_ type. It is shown + here purely to document the interface that a layer object must implement. + + The central work of defining a layer is implementing the forward and backward + methods. When you do this you have four options: + - Implement the forward() and backward() methods according to the + specification shown below. Do not implement forward_inplace() and + backward_inplace(). + - Implement the forward() and backward() methods according to the + specification shown below, except exclude the computed_output + parameter from backward(). Doing this will allow dlib to make some + layers execute in-place and therefore run a little faster and use + less memory. Do not implement forward_inplace() and + backward_inplace(). + - Implement the forward_inplace() and backward_inplace() methods + according to the specification shown below. Do not implement + forward() and backward(). These in-place methods allow some types of + layers to be implemented more efficiently. + - Implement the forward_inplace() and backward_inplace() methods + according to the specification shown below, except exclude the + computed_output parameter from backward_inplace(). Doing this will + allow dlib to make some layers execute in-place and therefore run a + little faster and use less memory. Do not implement forward() and + backward(). 
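+
+            For illustration, a minimal layer taking the third option might look
+            like the sketch below.  (my_scale_ is a made up name, not a dlib layer,
+            and serialization, printing, and to_xml are omitted for brevity.)
+
+                class my_scale_
+                {
+                public:
+                    template <typename SUBNET> void setup (const SUBNET&) {}
+
+                    void forward_inplace(const tensor& input, tensor& output)
+                    {
+                        // output = 2*input, computed elementwise.
+                        tt::affine_transform(output, input, 2);
+                    }
+
+                    void backward_inplace(const tensor& computed_output,
+                        const tensor& gradient_input, tensor& data_grad, tensor& params_grad)
+                    {
+                        // computed_output and params_grad are unused since the layer
+                        // has no parameters.  The derivative of 2*x is 2, so assign
+                        // when running in-place, otherwise accumulate into data_grad,
+                        // mirroring the in-place layers in layers.h.
+                        if (is_same_object(gradient_input, data_grad))
+                            tt::affine_transform(data_grad, gradient_input, 2);
+                        else
+                            tt::affine_transform(data_grad, data_grad, gradient_input, 1, 2);
+                    }
+
+                    const tensor& get_layer_params() const { return params; }
+                    tensor& get_layer_params() { return params; }
+                private:
+                    resizable_tensor params;  // this layer has no learnable parameters
+                };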
+ + + It should also be noted that layers may define additional layer specific + fields and the solvers can use these fields as they see fit. For example, + some layers define get_learning_rate_multiplier() and + get_weight_decay_multiplier() methods. The solvers that come with dlib + look at these methods, if they exist, and adjust the learning rate or + weight decay for that layer according to the multiplier. Therefore, you + can add these methods to your layer types if you want, or even define new + fields and new solvers that use those fields in some way. + !*/ + + public: + + EXAMPLE_COMPUTATIONAL_LAYER_( + ); + /*! + ensures + - Default constructs this object. This function is not required to do + anything in particular but it must exist, that is, it is required that + layer objects be default constructable. + !*/ + + EXAMPLE_COMPUTATIONAL_LAYER_ ( + const EXAMPLE_COMPUTATIONAL_LAYER_& item + ); + /*! + ensures + - EXAMPLE_COMPUTATIONAL_LAYER_ objects are copy constructable + !*/ + + EXAMPLE_COMPUTATIONAL_LAYER_( + const some_other_layer_type& item + ); + /*! + ensures + - Constructs this object from item. This form of constructor is optional + but it allows you to provide a conversion from one layer type to another. + For example, the following code is valid only if my_layer2 can be + constructed from my_layer1: + relu<fc<my_layer1<fc<input<matrix<float>>>>>> my_dnn1; + relu<fc<my_layer2<fc<input<matrix<float>>>>>> my_dnn2(my_dnn1); + This kind of pattern is useful if you want to use one type of layer + during training but a different type of layer during testing since it + allows you to easily convert between related deep neural network types. + + Additionally, if you provide a constructor to build a layer from another + layer type you should also write your layer's deserialize() routine such + that it can read that other layer's serialized data in addition to your + own serialized data. + !*/ + + template <typename SUBNET> + void setup ( + const SUBNET& sub + ); + /*! + requires + - SUBNET implements the SUBNET interface defined at the top of this file. + ensures + - performs any necessary initial memory allocations and/or sets parameters + to their initial values prior to learning. Therefore, calling setup + destroys any previously learned parameters. Also, typically setup() + would look at the dimensions of the outputs of sub and configure the + number of parameters in *this accordingly. + !*/ + + template <typename SUBNET> + void forward( + const SUBNET& sub, + resizable_tensor& data_output + ); + /*! + requires + - SUBNET implements the SUBNET interface defined at the top of this file. + - setup() has been called. + ensures + - Runs the output of the subnetwork through this layer and stores the + results into #data_output. In particular, forward() can use any of the + outputs in sub (e.g. sub.get_output(), sub.subnet().get_output(), etc.) + to compute whatever it wants. + !*/ + + template <typename SUBNET> + void backward( + const tensor& computed_output, // this parameter is optional + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ); + /*! + requires + - SUBNET implements the SUBNET interface defined at the top of this file. + - setup() has been called. + - computed_output is the tensor resulting from calling forward(sub,computed_output). + Moreover, this was the most recent call to forward(). This means that + forward() is allowed to cache intermediate results so they can be used + during the backward computation. 
+ - have_same_dimensions(gradient_input, computed_output) == true + - have_same_dimensions(sub.get_gradient_input(), sub.get_output()) == true + - have_same_dimensions(params_grad, get_layer_params()) == true + ensures + - This function outputs the gradients of this layer with respect to the + input data from sub and also with respect to this layer's parameters. + These gradients are stored into #sub and #params_grad, respectively. To be + precise, the gradients are taken of a function f(sub,get_layer_params()) + which is defined thusly: + - Recalling that computed_output is a function of both sub and get_layer_params(), + since it is the result of calling forward(sub,computed_output): + let f(sub,get_layer_params()) == dot(computed_output, gradient_input) + Then we define the following gradient vectors: + - PARAMETER_GRADIENT == gradient of f(sub,get_layer_params()) with + respect to get_layer_params(). + - for all valid I: + - DATA_GRADIENT_I == gradient of f(sub,get_layer_params()) with + respect to layer<I>(sub).get_output() (recall that forward() can + draw inputs from the immediate sub layer, sub.subnet(), or + any earlier layer. So you must consider the gradients with + respect to all inputs drawn from sub) + Finally, backward() outputs these gradients by performing: + - params_grad = PARAMETER_GRADIENT + - for all valid I: + - layer<I>(sub).get_gradient_input() += DATA_GRADIENT_I + !*/ + + void forward_inplace( + const tensor& data_input, + tensor& data_output + ); + /*! + requires + - have_same_dimensions(data_input,data_output) == true + - setup() has been called. + ensures + - Runs the data_input tensor through this layer and stores the output into + #data_output. + - This function supports in-place operation, i.e. having + is_same_object(data_input, data_output)==true + !*/ + + void backward_inplace( + const tensor& computed_output, // this parameter is optional + const tensor& gradient_input, + tensor& data_grad, + tensor& params_grad + ); + /*! + requires + - setup() has been called. + - computed_output is the tensor resulting from the most recent call to + forward_inplace(). This means that forward_inplace() is allowed to cache + intermediate results so they can be used during the backward computation. + - have_same_dimensions(gradient_input, data_grad) == true + - have_same_dimensions(gradient_input, computed_output) == true + - have_same_dimensions(params_grad, get_layer_params()) == true + ensures + - This function supports in-place operation, i.e. having + is_same_object(gradient_input, data_grad)==true + - This function outputs the gradients of this layer with respect to the + input data from a sublayer and also with respect to this layer's parameters. + These gradients are stored into #data_grad and #params_grad, respectively. To be + precise, the gradients are taken of a function f(data_input,get_layer_params()) + which is defined thusly: + - Recalling that computed_output is a function of both the input to + forward_inplace() and get_layer_params(), since it is the result of + calling forward_inplace(data_input,computed_output): + let f(data_input,get_layer_params()) == dot(computed_output, gradient_input) + Then we define the following gradient vectors: + - PARAMETER_GRADIENT == gradient of f(data_input,get_layer_params()) with + respect to get_layer_params(). + - DATA_GRADIENT == gradient of f(data_input,get_layer_params()) with respect + to data_input. 
+ Finally, backward_inplace() outputs these gradients by performing: + - params_grad = PARAMETER_GRADIENT + - if (is_same_object(gradient_input, data_grad)) then + - data_grad = DATA_GRADIENT + - else + - data_grad += DATA_GRADIENT + !*/ + + const tensor& get_layer_params( + ) const; + /*! + ensures + - returns the parameters that define the behavior of forward(). + !*/ + + tensor& get_layer_params( + ); + /*! + ensures + - returns the parameters that define the behavior of forward(). + !*/ + + + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + /*! + These two functions are optional. If provided, they should map between + (column,row) coordinates in input and output tensors of forward(). Providing + these functions allows you to use global utility functions like + input_tensor_to_output_tensor(). + !*/ + + void clean ( + ); + /*! + Implementing this function is optional. If you don't need it then you don't + have to provide a clean(). But if you do provide it then it must behave as + follows: + + ensures + - calling clean() causes this object to forget about everything except its + parameters. This is useful if your layer caches information between + forward and backward passes and you want to clean out that cache + information before saving the network to disk. + !*/ + + }; + + std::ostream& operator<<(std::ostream& out, const EXAMPLE_COMPUTATIONAL_LAYER_& item); + /*! + prints a string describing this layer. + !*/ + + void to_xml(const EXAMPLE_COMPUTATIONAL_LAYER_& item, std::ostream& out); + /*! + This function is optional, but required if you want to print your networks with + net_to_xml(). Therefore, to_xml() prints a layer as XML. + !*/ + + void serialize(const EXAMPLE_COMPUTATIONAL_LAYER_& item, std::ostream& out); + void deserialize(EXAMPLE_COMPUTATIONAL_LAYER_& item, std::istream& in); + /*! + provides serialization support + !*/ + + // For each layer you define, always define an add_layer template so that layers can be + // easily composed. Moreover, the convention is that the layer class ends with an _ + // while the add_layer template has the same name but without the trailing _. + template <typename SUBNET> + using EXAMPLE_COMPUTATIONAL_LAYER = add_layer<EXAMPLE_COMPUTATIONAL_LAYER_, SUBNET>; + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + enum fc_bias_mode + { + FC_HAS_BIAS = 0, + FC_NO_BIAS = 1 + }; + + struct num_fc_outputs + { + num_fc_outputs(unsigned long n) : num_outputs(n) {} + unsigned long num_outputs; + }; + + template < + unsigned long num_outputs, + fc_bias_mode bias_mode + > + class fc_ + { + /*! + REQUIREMENTS ON num_outputs + num_outputs > 0 + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a fully connected layer that + takes an input tensor and multiplies it by a weight matrix and outputs the + results. + + The dimensions of the tensors output by this layer are as follows (letting + IN be the input tensor and OUT the output tensor): + - OUT.num_samples() == IN.num_samples() + - OUT.k() == get_num_outputs() + - OUT.nr() == 1 + - OUT.nc() == 1 + !*/ + + public: + + fc_( + ); + /*!
+ ensures + - #get_num_outputs() == num_outputs + - #get_bias_mode() == bias_mode + - #get_learning_rate_multiplier() == 1 + - #get_weight_decay_multiplier() == 1 + - #get_bias_learning_rate_multiplier() == 1 + - #get_bias_weight_decay_multiplier() == 0 + !*/ + + fc_( + num_fc_outputs o + ); + /*! + ensures + - #get_num_outputs() == o.num_outputs + - #get_bias_mode() == bias_mode + - #get_learning_rate_multiplier() == 1 + - #get_weight_decay_multiplier() == 1 + - #get_bias_learning_rate_multiplier() == 1 + - #get_bias_weight_decay_multiplier() == 0 + !*/ + + unsigned long get_num_outputs ( + ) const; + /*! + ensures + - This layer outputs column vectors that contain get_num_outputs() + elements. That is, the output tensor T from forward() will be such that: + - T.num_samples() == however many samples were given to forward(). + - T.k() == get_num_outputs() + - The rest of the dimensions of T will be 1. + !*/ + + void set_num_outputs( + long num + ); + /*! + requires + - num > 0 + - get_layer_params().size() == 0 || get_num_outputs() == num + (i.e. You can't change the number of outputs in fc_ if the parameter + tensor has already been allocated.) + ensures + - #get_num_outputs() == num + !*/ + + fc_bias_mode get_bias_mode ( + ) const; + /*! + ensures + - returns the bias mode which determines if this layer includes bias terms. + That is, if the bias mode is FC_HAS_BIAS then a different constant scalar + is added to each of the outputs of this layer. + !*/ + + double get_learning_rate_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the learning rate used to optimize its parameters be + multiplied by get_learning_rate_multiplier(). + !*/ + + double get_weight_decay_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the weight decay used to optimize its parameters be + multiplied by get_weight_decay_multiplier(). + !*/ + + void set_learning_rate_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_learning_rate_multiplier() == val + !*/ + + void set_weight_decay_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_weight_decay_multiplier() == val + !*/ + + double get_bias_learning_rate_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the learning rate used to optimize its bias parameters be + multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier(). + !*/ + + double get_bias_weight_decay_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the weight decay used to optimize its bias parameters be + multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier(). + !*/ + + void set_bias_learning_rate_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_bias_learning_rate_multiplier() == val + !*/ + + void set_bias_weight_decay_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_bias_weight_decay_multiplier() == val + !*/ + + alias_tensor_const_instance get_weights( + ) const; + /*! + ensures + - returns an alias of get_layer_params(), containing the weights matrix of + the fully connected layer. + - #get_weights().num_samples() is the number of elements in input sample, + i.e. sublayer's output's k * nc * nr. 
+ - #get_weights().k() == #get_num_outputs() + - if get_bias_mode() == FC_HAS_BIAS: + - #get_layer_params().size() == (#get_weights().size() + #get_biases().size()) + - else: + - #get_layer_params().size() == #get_weights().size() + !*/ + + alias_tensor_instance get_weights( + ); + /*! + ensures + - returns an alias of get_layer_params(), containing the weights matrix of + the fully connected layer. + - #get_weights().num_samples() is the number of elements in input sample, + i.e. sublayer's output's k * nc * nr. + - #get_weights().k() == #get_num_outputs() + - if get_bias_mode() == FC_HAS_BIAS: + - #get_layer_params().size() == (#get_weights().size() + #get_biases().size()) + - else: + - #get_layer_params().size() == #get_weights().size() + !*/ + + alias_tensor_const_instance get_biases( + ) const; + /*! + requires + - #get_bias_mode() == FC_HAS_BIAS + ensures + - returns an alias of get_layer_params(), containing the bias vector of + the fully connected layer. + - #get_biases().num_samples() == 1 + - #get_biases().k() == #get_num_outputs() + - #get_layer_params().size() == (#get_weights().size() + #get_biases().size()) + !*/ + + alias_tensor_instance get_biases( + ); + /*! + requires + - #get_bias_mode() == FC_HAS_BIAS + ensures + - returns an alias of get_layer_params(), containing the bias vector of + the fully connected layer. + - #get_biases().num_samples() == 1 + - #get_biases().k() == #get_num_outputs() + - #get_layer_params().size() == (#get_weights().size() + #get_biases().size()) + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + + }; + + template < + unsigned long num_outputs, + typename SUBNET + > + using fc = add_layer<fc_<num_outputs,FC_HAS_BIAS>, SUBNET>; + + template < + unsigned long num_outputs, + typename SUBNET + > + using fc_no_bias = add_layer<fc_<num_outputs,FC_NO_BIAS>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + struct num_con_outputs + { + num_con_outputs(unsigned long n) : num_outputs(n) {} + unsigned long num_outputs; + }; + + template < + long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y = _stride_y!=1? 0 : _nr/2, + int _padding_x = _stride_x!=1? 0 : _nc/2 + > + class con_ + { + /*! + REQUIREMENTS ON TEMPLATE ARGUMENTS + - _num_filters > 0 + - _nr >= 0 + - _nc >= 0 + - _stride_y > 0 + - _stride_x > 0 + - _padding_y >= 0 + - _padding_x >= 0 + - Also, we require that: + - if (_nr == 0) then + - _padding_y == 0 + - else + - _padding_y < _nr + - if (_nc == 0) then + - _padding_x == 0 + - else + - _padding_x < _nc + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a convolution layer that takes an + input tensor (nominally representing an image) and convolves it with a set + of filters and then outputs the results.
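+
+            For example (an editorial illustration, not additional API), a tiny
+            network that applies 32 3x3 filters with stride 1 to an RGB image and
+            passes the result through a rectified linear unit could be declared as:
+                using simple_net = relu<con<32,3,3,1,1, input<matrix<rgb_pixel>>>>;
+            Since the stride is 1 the default padding is _nr/2 == 1, so this
+            particular layer preserves the spatial dimensions of its input.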
+ + The dimensions of the tensors output by this layer are as follows (letting + IN be the input tensor and OUT the output tensor): + - OUT.num_samples() == IN.num_samples() + - OUT.k() == num_filters() + - OUT.nr() == 1+(IN.nr() + 2*padding_y() - nr())/stride_y() + - OUT.nc() == 1+(IN.nc() + 2*padding_x() - nc())/stride_x() + + Note also that setting _nr or _nc to 0 has a special meaning of "set the + filter size equal to the input image size". Specifically, it means: + - if (_nr == 0) then + - nr() == IN.nr() + - OUT.nr() == 1 + - if (_nc == 0) then + - nc() == IN.nc() + - OUT.nc() == 1 + !*/ + + public: + con_( + ); + /*! + ensures + - #num_filters() == _num_filters + - #nr() == _nr + - #nc() == _nc + - #stride_y() == _stride_y + - #stride_x() == _stride_x + - #padding_y() == _padding_y + - #padding_x() == _padding_x + - #get_learning_rate_multiplier() == 1 + - #get_weight_decay_multiplier() == 1 + - #get_bias_learning_rate_multiplier() == 1 + - #get_bias_weight_decay_multiplier() == 0 + !*/ + + con_( + num_con_outputs o + ); + /*! + ensures + - #num_filters() == o.num_outputs + - #nr() == _nr + - #nc() == _nc + - #stride_y() == _stride_y + - #stride_x() == _stride_x + - #padding_y() == _padding_y + - #padding_x() == _padding_x + - #get_learning_rate_multiplier() == 1 + - #get_weight_decay_multiplier() == 1 + - #get_bias_learning_rate_multiplier() == 1 + - #get_bias_weight_decay_multiplier() == 0 + !*/ + + long num_filters( + ) const; + /*! + ensures + - returns the number of filters contained in this layer. The k dimension + of the output tensors produced by this layer will be equal to the number + of filters. + !*/ + + void set_num_filters( + long num + ); + /*! + requires + - num > 0 + - get_layer_params().size() == 0 || num_filters() == num + (i.e. You can't change the number of filters in con_ if the parameter + tensor has already been allocated.) + ensures + - #num_filters() == num + !*/ + + long nr( + ) const; + /*! + ensures + - returns the number of rows in the filters in this layer. Note that if + nr()==0 then it means the size of the filter is not yet assigned, but + once setup() is called nr() will be set to the input tensor's nr(). + Therefore, nr()==0 has the special interpretation of "be the same size as + the input tensor". + !*/ + + long nc( + ) const; + /*! + ensures + - returns the number of columns in the filters in this layer. Note that if + nc()==0 then it means the size of the filter is not yet assigned, but + once setup() is called nc() will be set to the input tensor's nc(). + Therefore, nc()==0 has the special interpretation of "be the same size as + the input tensor". + !*/ + + long stride_y( + ) const; + /*! + ensures + - returns the vertical stride used when convolving the filters over an + image. That is, each filter will be moved stride_y() pixels down at a + time when it moves over the image. + !*/ + + long stride_x( + ) const; + /*! + ensures + - returns the horizontal stride used when convolving the filters over an + image. That is, each filter will be moved stride_x() pixels right at a + time when it moves over the image. + !*/ + + long padding_y( + ) const; + /*! + ensures + - returns the number of pixels of zero padding added to the top and bottom + sides of the image. + !*/ + + long padding_x( + ) const; + /*! + ensures + - returns the number of pixels of zero padding added to the left and right + sides of the image. + !*/ + + double get_learning_rate_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. 
The interpretation is that this object is + requesting that the learning rate used to optimize its parameters be + multiplied by get_learning_rate_multiplier(). + !*/ + + double get_weight_decay_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the weight decay used to optimize its parameters be + multiplied by get_weight_decay_multiplier(). + !*/ + + void set_learning_rate_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_learning_rate_multiplier() == val + !*/ + + void set_weight_decay_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_weight_decay_multiplier() == val + !*/ + + double get_bias_learning_rate_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the learning rate used to optimize its bias parameters be + multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier(). + !*/ + + double get_bias_weight_decay_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the weight decay used to optimize its bias parameters be + multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier(). + !*/ + + void set_bias_learning_rate_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_bias_learning_rate_multiplier() == val + !*/ + + void set_bias_weight_decay_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_bias_weight_decay_multiplier() == val + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + + }; + + template < + long num_filters, + long nr, + long nc, + int stride_y, + int stride_x, + typename SUBNET + > + using con = add_layer<con_<num_filters,nr,nc,stride_y,stride_x>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y = _stride_y!=1? 0 : _nr/2, + int _padding_x = _stride_x!=1? 0 : _nc/2 + > + class cont_ + { + /*! + REQUIREMENTS ON TEMPLATE ARGUMENTS + All of them must be > 0. + Also, we require that: + - 0 <= _padding_y && _padding_y < _nr + - 0 <= _padding_x && _padding_x < _nc + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a transposed convolution layer + that takes an input tensor and transpose convolves (sometimes called + "deconvolution") it with a set of filters and then outputs the results. + + This is essentially a convolutional layer that allows fractional strides. + Therefore, you can make output tensors that are larger than the input + tensors using this layer type. 
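+
+            For example (a worked instance of the formulas below, with K and SUBNET
+            standing for the desired filter count and the downstream network):
+            cont_<K,2,2,2,2> gets the default padding of 0 since its stride is not 1,
+            so an input plane with NR rows becomes an output plane with
+            2*(NR-1) + 2 - 0 == 2*NR rows, and likewise for columns. That is,
+                using upsampler = cont<K,2,2,2,2, SUBNET>;
+            exactly doubles the spatial dimensions of its input, which is a common
+            way to upsample inside a network.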
+ + + The dimensions of the tensors output by this layer are as follows (letting + IN be the input tensor and OUT the output tensor): + - OUT.num_samples() == IN.num_samples() + - OUT.k() == num_filters() + - OUT.nr() == stride_y()*(IN.nr()-1) + nr() - 2*padding_y() + - OUT.nc() == stride_x()*(IN.nc()-1) + nc() - 2*padding_x() + !*/ + + public: + cont_( + ); + /*! + ensures + - #num_filters() == _num_filters + - #nr() == _nr + - #nc() == _nc + - #stride_y() == _stride_y + - #stride_x() == _stride_x + - #padding_y() == _padding_y + - #padding_x() == _padding_x + - #get_learning_rate_multiplier() == 1 + - #get_weight_decay_multiplier() == 1 + - #get_bias_learning_rate_multiplier() == 1 + - #get_bias_weight_decay_multiplier() == 0 + !*/ + + cont_( + num_con_outputs o + ); + /*! + ensures + - #num_filters() == o.num_outputs + - #nr() == _nr + - #nc() == _nc + - #stride_y() == _stride_y + - #stride_x() == _stride_x + - #padding_y() == _padding_y + - #padding_x() == _padding_x + - #get_learning_rate_multiplier() == 1 + - #get_weight_decay_multiplier() == 1 + - #get_bias_learning_rate_multiplier() == 1 + - #get_bias_weight_decay_multiplier() == 0 + !*/ + + long num_filters( + ) const; + /*! + ensures + - returns the number of filters contained in this layer. The k dimension + of the output tensors produced by this layer will be equal to the number + of filters. + !*/ + + void set_num_filters( + long num + ); + /*! + requires + - num > 0 + - get_layer_params().size() == 0 || num_filters() == num + (i.e. You can't change the number of filters in cont_ if the parameter + tensor has already been allocated.) + ensures + - #num_filters() == num + !*/ + + long nr( + ) const; + /*! + ensures + - returns the number of rows in the filters in this layer. + !*/ + + long nc( + ) const; + /*! + ensures + - returns the number of columns in the filters in this layer. + !*/ + + long stride_y( + ) const; + /*! + ensures + - returns the vertical stride used when convolving the filters over an + image. That is, each filter will be moved 1.0/stride_y() pixels down at + a time when it moves over the image. + !*/ + + long stride_x( + ) const; + /*! + ensures + - returns the horizontal stride used when convolving the filters over an + image. That is, each filter will be moved 1.0/stride_x() pixels right at + a time when it moves over the image. + !*/ + + long padding_y( + ) const; + /*! + ensures + - returns the number of pixels of zero padding added to the top and bottom + sides of the image. + !*/ + + long padding_x( + ) const; + /*! + ensures + - returns the number of pixels of zero padding added to the left and right + sides of the image. + !*/ + + double get_learning_rate_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the learning rate used to optimize its parameters be + multiplied by get_learning_rate_multiplier(). + !*/ + + double get_weight_decay_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the weight decay used to optimize its parameters be + multiplied by get_weight_decay_multiplier(). + !*/ + + void set_learning_rate_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_learning_rate_multiplier() == val + !*/ + + void set_weight_decay_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_weight_decay_multiplier() == val + !*/ + + double get_bias_learning_rate_multiplier( + ) const; + /*! 
+ ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the learning rate used to optimize its bias parameters be + multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier(). + !*/ + + double get_bias_weight_decay_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the weight decay used to optimize its bias parameters be + multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier(). + !*/ + + void set_bias_learning_rate_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_bias_learning_rate_multiplier() == val + !*/ + + void set_bias_weight_decay_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_bias_weight_decay_multiplier() == val + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + + }; + + template < + long num_filters, + long nr, + long nc, + int stride_y, + int stride_x, + typename SUBNET + > + using cont = add_layer<cont_<num_filters,nr,nc,stride_y,stride_x>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + int scale_y, + int scale_x + > + class upsample_ + { + /*! + REQUIREMENTS ON TEMPLATE ARGUMENTS + All of them must be >= 1. + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it allows you to upsample a layer using + bilinear interpolation. To be very specific, it upsamples each of the + channels in an input tensor. Therefore, if IN is the input tensor to this + layer and OUT the output tensor, then we will have: + - OUT.num_samples() == IN.num_samples() + - OUT.k() == IN.k() + - OUT.nr() == IN.nr()*scale_y + - OUT.nc() == IN.nc()*scale_x + - for all valid i,k: image_plane(OUT,i,k) is a copy of + image_plane(IN,i,k) that has been bilinearly interpolated to fit into + the shape of image_plane(OUT,i,k). + !*/ + public: + + upsample_( + ); + /*! + ensures + - This object has no state, so the constructor does nothing, aside from + providing default constructability. + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + template < + int scale, + typename SUBNET + > + using upsample = add_layer<upsample_<scale,scale>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class dropout_ + { + /*!
+ WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a dropout layer. Therefore, it + passes its inputs through the stochastic function f(x) which outputs either + 0 or x. The probability of 0 being output is given by the drop_rate + argument to this object's constructor. + + Note that, after you finish training a network with dropout, it is a good + idea to replace each dropout_ layer with a multiply_ layer because the + multiply_ layer is faster and deterministic. + !*/ + + public: + + explicit dropout_( + float drop_rate = 0.5 + ); + /*! + requires + - 0 <= drop_rate <= 1 + ensures + - #get_drop_rate() == drop_rate + !*/ + + float get_drop_rate ( + ) const; + /*! + ensures + - returns the probability that an individual input value to this layer will + be replaced with 0. + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + template <typename SUBNET> + using dropout = add_layer<dropout_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class multiply_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a basic layer that just + multiplies its input tensor with a constant value and returns the result. + It therefore has no learnable parameters. + !*/ + + public: + explicit multiply_( + float val = 0.5 + ); + /*! + ensures + - #get_multiply_value() == val + !*/ + + multiply_ ( + const dropout_& item + ); + /*! + ensures + - #get_multiply_value() == 1-item.get_drop_rate() + (i.e. We construct the multiply_ layer so that it is essentially a + deterministic version of the given dropout_ layer) + !*/ + + float get_multiply_value ( + ) const; + /*! + ensures + - this layer simply multiplies its input tensor by get_multiply_value() and + produces the result as output. + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + template <typename SUBNET> + using multiply = add_layer<multiply_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + enum layer_mode + { + CONV_MODE = 0, // convolutional mode + FC_MODE = 1 // fully connected mode + }; + + const double DEFAULT_BATCH_NORM_EPS = 0.0001; + + template < + layer_mode mode + > + class bn_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. 
In particular, it defines a batch normalization layer that + implements the method described in the paper: + Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift by Sergey Ioffe and Christian Szegedy + + In particular, this layer produces output tensors with the same + dimensionality as the input tensors, except that the mean and variances of + the elements have been standardized to 0 and 1 respectively. + + It should also be noted that when tensors with a num_samples() dimension of + 1 are passed to this layer it doesn't perform batch normalization. + Instead, it runs in "inference mode" where the learned linear normalizing + transformation is used to transform the tensor. + + Finally, after you finish training a batch normalized network, it is a good + idea to replace each bn_ layer with an affine_ layer because the affine_ + layer is faster and will never surprise you by performing batch + normalization on tensors that have a num_samples() dimension > 1. This allows + you to run large mini-batches of samples through your final network without + batch normalization executing at all. + !*/ + + public: + bn_( + ); + /*! + ensures + - #get_mode() == mode + - #get_running_stats_window_size() == 100 + - #get_learning_rate_multiplier() == 1 + - #get_weight_decay_multiplier() == 0 + - #get_bias_learning_rate_multiplier() == 1 + - #get_bias_weight_decay_multiplier() == 1 + - #get_eps() == tt::DEFAULT_BATCH_NORM_EPS + !*/ + + explicit bn_( + unsigned long window_size, + double eps = tt::DEFAULT_BATCH_NORM_EPS + ); + /*! + requires + - eps > 0 + - window_size > 0 + ensures + - #get_mode() == mode + - #get_running_stats_window_size() == window_size + - #get_learning_rate_multiplier() == 1 + - #get_weight_decay_multiplier() == 0 + - #get_bias_learning_rate_multiplier() == 1 + - #get_bias_weight_decay_multiplier() == 1 + - #get_eps() == eps + !*/ + + layer_mode get_mode( + ) const; + /*! + ensures + - returns the mode of this layer, either CONV_MODE or FC_MODE. + If the mode is FC_MODE then the normalization is applied across the + samples in a tensor (i.e. k()*nr()*nc() different things will be + normalized). Otherwise, normalization is applied across everything + except for the k() dimension, resulting in there being only k() + normalization equations that are applied spatially over the tensor. + + Therefore, if you are putting batch normalization after a fully connected + layer you should use FC_MODE. Otherwise, if you are putting batch + normalization after a convolutional layer you should use CONV_MODE. + !*/ + + double get_eps( + ) const; + /*! + ensures + - When doing batch normalization, we are dividing by the standard + deviation. This epsilon value returned by this function is added to the + variance to prevent the division from dividing by zero. + !*/ + + unsigned long get_running_stats_window_size ( + ) const; + /*! + ensures + - Just as recommended in the batch normalization paper, this object keeps a + running average of the mean and standard deviations of the features. + These averages are used during "inference mode" so you can run a single + object through a batch normalized network. They are also what is used to + initialize an affine_ layer that is constructed from a bn_ layer. This + function returns the effective number of recent samples used to compute + the running average. + !*/ + + void set_running_stats_window_size ( + unsigned long new_window_size + ); + /*! 
+ requires + - new_window_size > 0 + ensures + - #get_running_stats_window_size() == new_window_size + !*/ + + double get_learning_rate_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the learning rate used to optimize its parameters be + multiplied by get_learning_rate_multiplier(). + !*/ + + double get_weight_decay_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the weight decay used to optimize its parameters be + multiplied by get_weight_decay_multiplier(). + !*/ + + void set_learning_rate_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_learning_rate_multiplier() == val + !*/ + + void set_weight_decay_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_weight_decay_multiplier() == val + !*/ + + double get_bias_learning_rate_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the learning rate used to optimize its bias parameters be + multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier(). + !*/ + + double get_bias_weight_decay_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the weight decay used to optimize its bias parameters be + multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier(). + !*/ + + void set_bias_learning_rate_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_bias_learning_rate_multiplier() == val + !*/ + + void set_bias_weight_decay_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_bias_weight_decay_multiplier() == val + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + template <typename SUBNET> + using bn_con = add_layer<bn_<CONV_MODE>, SUBNET>; + template <typename SUBNET> + using bn_fc = add_layer<bn_<FC_MODE>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template <typename net_type> + void set_all_bn_running_stats_window_sizes ( + const net_type& net, + unsigned long new_window_size + ); + /*! + requires + - new_window_size > 0 + - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or + add_tag_layer. + ensures + - Sets the get_running_stats_window_size() field of all bn_ layers in net to + new_window_size. + !*/ + +// ---------------------------------------------------------------------------------------- + + class affine_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it applies a simple pointwise linear + transformation to an input tensor. You can think of it as having two + parameter tensors, A and B. 
If the input tensor is called INPUT then the + output of this layer is: + A*INPUT+B + where all operations are performed element wise and each sample in the + INPUT tensor is processed separately. + + Moreover, this object has two modes that affect the dimensionalities of A + and B and how they are applied to compute A*INPUT+B. If + get_mode()==FC_MODE then A and B each have the same dimensionality as the + input tensor, except their num_samples() dimensions are 1. If + get_mode()==CONV_MODE then A and B have all their dimensions set to 1 + except for k(), which is equal to INPUT.k(). + + In either case, the computation of A*INPUT+B is performed pointwise over all + the elements of INPUT using either: + OUTPUT(n,k,r,c) == A(1,k,r,c)*INPUT(n,k,r,c)+B(1,k,r,c) + or + OUTPUT(n,k,r,c) == A(1,k,1,1)*INPUT(n,k,r,c)+B(1,k,1,1) + as appropriate. + + + Finally, note that the parameters of this layer are not learnable and + therefore not modified during network updates. Instead, the layer will + perform the identity transformation unless it is initialized with a bn_ + layer, in which case it will perform whatever transformation the bn_ layer + has learned. + !*/ + + public: + + affine_( + ); + /*! + ensures + - #get_mode() == FC_MODE + !*/ + + affine_( + layer_mode mode + ); + /*! + ensures + - #get_mode() == mode + !*/ + + template < + layer_mode mode + > + affine_( + const bn_<mode>& layer + ); + /*! + ensures + - Constructs affine_ so that it performs the same transformation as the + supplied batch normalization layer. You would want to do this after you + finish training a network with bn_ layers because the affine_ layer will + execute faster. + - #get_mode() == layer.get_mode() + !*/ + + layer_mode get_mode( + ) const; + /*! + ensures + - returns the mode of this layer, either CONV_MODE or FC_MODE. + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the + EXAMPLE_COMPUTATIONAL_LAYER_ interface. Also note that get_layer_params() + always returns an empty tensor since there are no learnable parameters in this + object. + !*/ + + }; + + template <typename SUBNET> + using affine = add_layer<affine_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y = _stride_y!=1? 0 : _nr/2, + int _padding_x = _stride_x!=1? 0 : _nc/2 + > + class max_pool_ + { + /*! + REQUIREMENTS ON TEMPLATE ARGUMENTS + - _nr >= 0 + - _nc >= 0 + - _stride_y > 0 + - _stride_x > 0 + - _padding_y >= 0 + - _padding_x >= 0 + - if (_nr != 0) then + - _padding_y < _nr + - else + - _padding_y == 0 + - if (_nc != 0) then + - _padding_x < _nc + - else + - _padding_x == 0 + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a max pooling layer that takes an + input tensor and downsamples it. It does this by sliding a window over the + images in an input tensor and outputting, for each channel, the maximum + element within the window.
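+
+            For example (a worked instance of the formulas below): max_pool_<2,2,2,2>
+            gets the default padding of 0, so a 32x32 input plane becomes a
+            1+(32-2)/2 == 16 row by 16 column output plane. That is,
+                using downsampler = max_pool<2,2,2,2, SUBNET>;
+            is the usual "halve the spatial dimensions" pooling layer (SUBNET again
+            standing for the downstream network).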
+ + If _nr == 0 then it means the filter size covers all the rows in the input + tensor, similarly for the _nc parameter. To be precise, if we call the + input tensor IN and the output tensor OUT, then OUT is defined as follows: + - let FILT_NR == (nr()==0) ? IN.nr() : nr() + - let FILT_NC == (nc()==0) ? IN.nc() : nc() + - OUT.num_samples() == IN.num_samples() + - OUT.k() == IN.k() + - OUT.nr() == 1+(IN.nr() + 2*padding_y() - FILT_NR)/stride_y() + - OUT.nc() == 1+(IN.nc() + 2*padding_x() - FILT_NC)/stride_x() + - for all valid s, k, r, and c: + - image_plane(OUT,s,k)(r,c) == max(subm_clipped(image_plane(IN,s,k), + centered_rect(c*stride_x() + FILT_NC/2 - padding_x(), + r*stride_y() + FILT_NR/2 - padding_y(), + FILT_NC, + FILT_NR))) + !*/ + + public: + + max_pool_ ( + ); + /*! + ensures + - #nr() == _nr + - #nc() == _nc + - #stride_y() == _stride_y + - #stride_x() == _stride_x + - #padding_y() == _padding_y + - #padding_x() == _padding_x + !*/ + + long nr( + ) const; + /*! + ensures + - returns the number of rows in the pooling window or 0 if the window size + is "the entire input tensor". + !*/ + + long nc( + ) const; + /*! + ensures + - returns the number of columns in the pooling window or 0 if the window size + is "the entire input tensor". + !*/ + + long stride_y( + ) const; + /*! + ensures + - returns the vertical stride used when scanning the max pooling window + over an image. That is, each window will be moved stride_y() pixels down + at a time when it moves over the image. + !*/ + + long stride_x( + ) const; + /*! + ensures + - returns the horizontal stride used when scanning the max pooling window + over an image. That is, each window will be moved stride_x() pixels right + at a time when it moves over the image. + !*/ + + long padding_y( + ) const; + /*! + ensures + - returns the number of pixels of zero padding added to the top and bottom + sides of the image. + !*/ + + long padding_x( + ) const; + /*! + ensures + - returns the number of pixels of zero padding added to the left and right + sides of the image. + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ + interface. Note that this layer doesn't have any parameters, so the tensor + returned by get_layer_params() is always empty. + !*/ + }; + + template < + long nr, + long nc, + int stride_y, + int stride_x, + typename SUBNET + > + using max_pool = add_layer<max_pool_<nr,nc,stride_y,stride_x>, SUBNET>; + + template < + typename SUBNET + > + using max_pool_everything = add_layer<max_pool_<0,0,1,1>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y = _stride_y!=1? 0 : _nr/2, + int _padding_x = _stride_x!=1? 0 : _nc/2 + > + class avg_pool_ + { + /*!
+ REQUIREMENTS ON TEMPLATE ARGUMENTS + - _nr >= 0 + - _nc >= 0 + - _stride_y > 0 + - _stride_x > 0 + - _padding_y >= 0 + - _padding_x >= 0 + - if (_nr != 0) then + - _padding_y < _nr + - else + - _padding_y == 0 + - if (_nc != 0) then + - _padding_x < _nc + - else + - _padding_x == 0 + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines an average pooling layer that + takes an input tensor and downsamples it. It does this by sliding a window + over the images in an input tensor and outputting, for each channel, the + average element within the window. + + If _nr == 0 then it means the filter size covers all the rows in the input + tensor, similarly for the _nc parameter. To be precise, if we call the + input tensor IN and the output tensor OUT, then OUT is defined as follows: + - let FILT_NR == (nr()==0) ? IN.nr() : nr() + - let FILT_NC == (nc()==0) ? IN.nc() : nc() + - OUT.num_samples() == IN.num_samples() + - OUT.k() == IN.k() + - OUT.nr() == 1+(IN.nr() + 2*padding_y() - FILT_NR)/stride_y() + - OUT.nc() == 1+(IN.nc() + 2*padding_x() - FILT_NC)/stride_x() + - for all valid s, k, r, and c: + - image_plane(OUT,s,k)(r,c) == mean(subm_clipped(image_plane(IN,s,k), + centered_rect(c*stride_x() + FILT_NC/2 - padding_x(), + r*stride_y() + FILT_NR/2 - padding_y(), + FILT_NC, + FILT_NR))) + !*/ + + public: + + avg_pool_ ( + ); + /*! + ensures + - #nr() == _nr + - #nc() == _nc + - #stride_y() == _stride_y + - #stride_x() == _stride_x + - #padding_y() == _padding_y + - #padding_x() == _padding_x + !*/ + + long nr( + ) const; + /*! + ensures + - returns the number of rows in the pooling window or 0 if the window size + is "the entire input tensor". + !*/ + + long nc( + ) const; + /*! + ensures + - returns the number of columns in the pooling window or 0 if the window size + is "the entire input tensor". + !*/ + + long stride_y( + ) const; + /*! + ensures + - returns the vertical stride used when scanning the pooling window + over an image. That is, each window will be moved stride_y() pixels down + at a time when it moves over the image. + !*/ + + long stride_x( + ) const; + /*! + ensures + - returns the horizontal stride used when scanning the pooling window + over an image. That is, each window will be moved stride_x() pixels right + at a time when it moves over the image. + !*/ + + long padding_y( + ) const; + /*! + ensures + - returns the number of pixels of zero padding added to the top and bottom + sides of the image. + !*/ + + long padding_x( + ) const; + /*! + ensures + - returns the number of pixels of zero padding added to the left and right + sides of the image. + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ + interface. Note that this layer doesn't have any parameters, so the tensor + returned by get_layer_params() is always empty.
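+
+            As with max_pool_, here is a worked instance of the output size formulas
+            above (an editorial example): avg_pool_<2,2,2,2> gets the default padding
+            of 0 and maps a 32x32 input plane to a 1+(32-2)/2 == 16 row by 16 column
+            output plane, replacing each 2x2 window with its average.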
+ !*/ + + }; + + template < + long nr, + long nc, + int stride_y, + int stride_x, + typename SUBNET + > + using avg_pool = add_layer<avg_pool_<nr,nc,stride_y,stride_x>, SUBNET>; + + template < + typename SUBNET + > + using avg_pool_everything = add_layer<avg_pool_<0,0,1,1>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class relu_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a rectified linear layer. + Therefore, it passes its inputs through the function + f(x)=max(x,0) + where f() is applied pointwise across the input tensor. + !*/ + + public: + + relu_( + ); + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ + interface. Note that this layer doesn't have any parameters, so the tensor + returned by get_layer_params() is always empty. + !*/ + }; + + template <typename SUBNET> + using relu = add_layer<relu_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class prelu_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a parametric rectified linear + layer. Therefore, it passes its inputs through the function + f(x) = x>0 ? x : p*x + where f() is applied pointwise across the input tensor and p is a scalar + parameter learned by this layer. + + + This is the layer type introduced in the paper: + He, Kaiming, et al. "Delving deep into rectifiers: Surpassing + human-level performance on imagenet classification." Proceedings of the + IEEE International Conference on Computer Vision. 2015. + !*/ + + public: + + explicit prelu_( + float initial_param_value = 0.25 + ); + /*! + ensures + - The p parameter will be initialized with initial_param_value. + - #get_initial_param_value() == initial_param_value. + !*/ + + float get_initial_param_value ( + ) const; + /*! + ensures + - returns the initial value of the prelu parameter. + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + template <typename SUBNET> + using prelu = add_layer<prelu_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class sig_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a sigmoid layer. 
Therefore, it + passes its inputs through the function + f(x)=1/(1+exp(-x)) + where f() is applied pointwise across the input tensor. + !*/ + + public: + + sig_( + ); + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ + interface. Note that this layer doesn't have any parameters, so the tensor + returned by get_layer_params() is always empty. + !*/ + }; + + template <typename SUBNET> + using sig = add_layer<sig_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class htan_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a hyperbolic tangent layer. + Therefore, it passes its inputs through the function + f(x)=std::tanh(x) + where f() is applied pointwise across the input tensor. + !*/ + + public: + + htan_( + ); + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ + interface. Note that this layer doesn't have any parameters, so the tensor + returned by get_layer_params() is always empty. + !*/ + }; + + template <typename SUBNET> + using htan = add_layer<htan_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class softmax_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a softmax layer. To be precise, + we define the softmax function s(x) as: + s(x) == exp(x)/sum(exp(x)) + where x is a vector. Then this layer treats its input tensor as a + collection of multi-channel images and applies s() to each spatial location + in each image. In each application, the tensor::k() channel elements at + each position are input to s() and then replaced by the outputs of s(). + + This means that, for example, if you collapsed each output image to a 1 + channel image by adding the channels then you would end up with images + where each pixel value was 1. This is because the sum of the outputs of + s() will always be equal to 1. + !*/ + + public: + + softmax_( + ); + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ + interface. Note that this layer doesn't have any parameters, so the tensor + returned by get_layer_params() is always empty. 
+ !*/ + }; + + template <typename SUBNET> + using softmax = add_layer<softmax_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class softmax_all_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a softmax layer. To be precise, + we define the softmax function s(x) as: + s(x) == exp(x)/sum(exp(x)) + where x is a vector. Then this layer treats its input tensor as a + collection of tensor::num_samples() vectors and applies s() to each vector + in the tensor. Therefore, there are logically tensor::num_samples() + invocations of s(). + !*/ + + public: + + softmax_all_( + ); + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ + interface. Note that this layer doesn't have any parameters, so the tensor + returned by get_layer_params() is always empty. + !*/ + }; + + template <typename SUBNET> + using softmax_all = add_layer<softmax_all_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + template<typename> class tag + > + class add_prev_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. This layer simply adds the output of two previous layers. + In particular, it adds the tensor from its immediate predecessor layer, + sub.get_output(), with the tensor from a deeper layer, + layer<tag>(sub).get_output(). + + Therefore, you supply a tag via add_prev_'s template argument that tells it + what layer to add to the output of the previous layer. The result of this + addition is output by add_prev_. Finally, the addition happens pointwise + according to 4D tensor arithmetic. If the dimensions don't match then + missing elements are presumed to be equal to 0. Moreover, each dimension + of the output tensor is equal to the maximum dimension of either of the + inputs. That is, if the tensors A and B are being added to produce C then: + - C.num_samples() == max(A.num_samples(), B.num_samples()) + - C.k() == max(A.k(), B.k()) + - C.nr() == max(A.nr(), B.nr()) + - C.nc() == max(A.nc(), B.nc()) + !*/ + + public: + add_prev_( + ); + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + + template < + template<typename> class tag, + typename SUBNET + > + using add_prev = add_layer<add_prev_<tag>, SUBNET>; + + // Here we add some convenient aliases for using add_prev_ with the tag layers. 
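+    // For example (an illustrative sketch, not an alias defined in dlib), a
+    // ResNet-style residual unit can be built with add_prev1 by tagging the
+    // block's input and adding it back after two convolutions:
+    //
+    //    template <long N, typename SUBNET>
+    //    using residual_unit = relu<add_prev1<con<N,3,3,1,1,relu<con<N,3,3,1,1,tag1<SUBNET>>>>>>;
+    //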
+ template <typename SUBNET> using add_prev1 = add_prev<tag1, SUBNET>; + template <typename SUBNET> using add_prev2 = add_prev<tag2, SUBNET>; + template <typename SUBNET> using add_prev3 = add_prev<tag3, SUBNET>; + template <typename SUBNET> using add_prev4 = add_prev<tag4, SUBNET>; + template <typename SUBNET> using add_prev5 = add_prev<tag5, SUBNET>; + template <typename SUBNET> using add_prev6 = add_prev<tag6, SUBNET>; + template <typename SUBNET> using add_prev7 = add_prev<tag7, SUBNET>; + template <typename SUBNET> using add_prev8 = add_prev<tag8, SUBNET>; + template <typename SUBNET> using add_prev9 = add_prev<tag9, SUBNET>; + template <typename SUBNET> using add_prev10 = add_prev<tag10, SUBNET>; + using add_prev1_ = add_prev_<tag1>; + using add_prev2_ = add_prev_<tag2>; + using add_prev3_ = add_prev_<tag3>; + using add_prev4_ = add_prev_<tag4>; + using add_prev5_ = add_prev_<tag5>; + using add_prev6_ = add_prev_<tag6>; + using add_prev7_ = add_prev_<tag7>; + using add_prev8_ = add_prev_<tag8>; + using add_prev9_ = add_prev_<tag9>; + using add_prev10_ = add_prev_<tag10>; + +// ---------------------------------------------------------------------------------------- + + template < + template<typename> class tag + > + class mult_prev_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. This layer simply multiplies the output of two previous + layers. In particular, it multiplies the tensor from its immediate + predecessor layer, sub.get_output(), with the tensor from a deeper layer, + layer<tag>(sub).get_output(). + + Therefore, you supply a tag via mult_prev_'s template argument that tells + it what layer to multiply with the output of the previous layer. The + result of this multiplication is output by mult_prev_. Finally, the + multiplication happens pointwise according to 4D tensor arithmetic. If the + dimensions don't match then missing elements are presumed to be equal to 0. + Moreover, each dimension of the output tensor is equal to the maximum + dimension of either of the inputs. That is, if the tensors A and B are + being multiplied to produce C then: + - C.num_samples() == max(A.num_samples(), B.num_samples()) + - C.k() == max(A.k(), B.k()) + - C.nr() == max(A.nr(), B.nr()) + - C.nc() == max(A.nc(), B.nc()) + !*/ + + public: + mult_prev_( + ); + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + + template < + template<typename> class tag, + typename SUBNET + > + using mult_prev = add_layer<mult_prev_<tag>, SUBNET>; + + // Here we add some convenient aliases for using mult_prev_ with the tag layers. 
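+    // For example (an illustrative sketch, not an alias defined in dlib), a
+    // sigmoid-gated unit that pointwise multiplies a learned gate with the
+    // tagged input could be written as:
+    //
+    //    template <typename SUBNET>
+    //    using gated_unit = mult_prev1<sig<con<16,3,3,1,1,tag1<SUBNET>>>>;
+    //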
+ template <typename SUBNET> using mult_prev1 = mult_prev<tag1, SUBNET>; + template <typename SUBNET> using mult_prev2 = mult_prev<tag2, SUBNET>; + template <typename SUBNET> using mult_prev3 = mult_prev<tag3, SUBNET>; + template <typename SUBNET> using mult_prev4 = mult_prev<tag4, SUBNET>; + template <typename SUBNET> using mult_prev5 = mult_prev<tag5, SUBNET>; + template <typename SUBNET> using mult_prev6 = mult_prev<tag6, SUBNET>; + template <typename SUBNET> using mult_prev7 = mult_prev<tag7, SUBNET>; + template <typename SUBNET> using mult_prev8 = mult_prev<tag8, SUBNET>; + template <typename SUBNET> using mult_prev9 = mult_prev<tag9, SUBNET>; + template <typename SUBNET> using mult_prev10 = mult_prev<tag10, SUBNET>; + using mult_prev1_ = mult_prev_<tag1>; + using mult_prev2_ = mult_prev_<tag2>; + using mult_prev3_ = mult_prev_<tag3>; + using mult_prev4_ = mult_prev_<tag4>; + using mult_prev5_ = mult_prev_<tag5>; + using mult_prev6_ = mult_prev_<tag6>; + using mult_prev7_ = mult_prev_<tag7>; + using mult_prev8_ = mult_prev_<tag8>; + using mult_prev9_ = mult_prev_<tag9>; + using mult_prev10_ = mult_prev_<tag10>; + +// ---------------------------------------------------------------------------------------- + + template < + template<typename> class tag + > + class scale_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. This layer scales the output channels of the tagged layer + by multiplying it with the output of the previous layer. To be specific: + - Let INPUT == layer<tag>(sub).get_output() + - Let SCALES == sub.get_output() + - This layer takes INPUT and SCALES as input. + - The output of this layer has the same dimensions as INPUT. + - This layer requires: + - SCALES.num_samples() == INPUT.num_samples() + - SCALES.k() == INPUT.k() + - SCALES.nr() == 1 + - SCALES.nc() == 1 + - The output tensor is produced by pointwise multiplying SCALES with + INPUT at each spatial location. Therefore, if OUT is the output of + this layer then we would have: + OUT(n,k,r,c) == INPUT(n,k,r,c)*SCALES(n,k) + !*/ + + public: + scale_( + ); + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + + template < + template<typename> class tag, + typename SUBNET + > + using scale = add_layer<scale_<tag>, SUBNET>; + + // Here we add some convenient aliases for using scale_ with the tag layers. 
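+    // For example (an illustrative sketch, not an alias defined in dlib), a
+    // squeeze-and-excitation style block for a 64 channel input could compute
+    // the per-channel SCALES tensor with a global average pool followed by a
+    // sigmoid-activated fc layer, which yields the required nr()==nc()==1 shape:
+    //
+    //    template <typename SUBNET>
+    //    using se_block = scale1<sig<fc<64,avg_pool_everything<tag1<SUBNET>>>>>;
+    //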
+ template <typename SUBNET> using scale1 = scale<tag1, SUBNET>; + template <typename SUBNET> using scale2 = scale<tag2, SUBNET>; + template <typename SUBNET> using scale3 = scale<tag3, SUBNET>; + template <typename SUBNET> using scale4 = scale<tag4, SUBNET>; + template <typename SUBNET> using scale5 = scale<tag5, SUBNET>; + template <typename SUBNET> using scale6 = scale<tag6, SUBNET>; + template <typename SUBNET> using scale7 = scale<tag7, SUBNET>; + template <typename SUBNET> using scale8 = scale<tag8, SUBNET>; + template <typename SUBNET> using scale9 = scale<tag9, SUBNET>; + template <typename SUBNET> using scale10 = scale<tag10, SUBNET>; + using scale1_ = scale_<tag1>; + using scale2_ = scale_<tag2>; + using scale3_ = scale_<tag3>; + using scale4_ = scale_<tag4>; + using scale5_ = scale_<tag5>; + using scale6_ = scale_<tag6>; + using scale7_ = scale_<tag7>; + using scale8_ = scale_<tag8>; + using scale9_ = scale_<tag9>; + using scale10_ = scale_<tag10>; + +// ---------------------------------------------------------------------------------------- + + template< + template<typename> class... TAG_TYPES + > + class concat_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. This layer simply concatenates the output of tagged layers. + Importantly, each input layer must have the same dimensions (i.e. + num_samples, nr, and nc) except for the k channel, which may vary. This is + because the concatenation happens along the k dimension. That is, the + output of this network is a tensor, OUT, that is the concatenation of the + tensors: + for each (tag in TAG_TYPES) + layer<tag>(subnet).get_output() + Therefore, out.num_samples(), out.nr(), and out.nc() match the dimensions + of the input tensors while OUT.k() is the sum of the input layer's k() + dimensions. + !*/ + + public: + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + + // concat layer definitions + template <template<typename> class TAG1, + template<typename> class TAG2, + typename SUBNET> + using concat2 = add_layer<concat_<TAG1, TAG2>, SUBNET>; + + template <template<typename> class TAG1, + template<typename> class TAG2, + template<typename> class TAG3, + typename SUBNET> + using concat3 = add_layer<concat_<TAG1, TAG2, TAG3>, SUBNET>; + + template <template<typename> class TAG1, + template<typename> class TAG2, + template<typename> class TAG3, + template<typename> class TAG4, + typename SUBNET> + using concat4 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4>, SUBNET>; + + template <template<typename> class TAG1, + template<typename> class TAG2, + template<typename> class TAG3, + template<typename> class TAG4, + template<typename> class TAG5, + typename SUBNET> + using concat5 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4, TAG5>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + /*!A inception layer definitions !*/ + + // Now define inception layer tag types. 
These layer aliases allow creating + // the networks described in the paper: + // Szegedy, Christian, et al. "Going deeper with convolutions." Proceedings of + // the IEEE Conference on Computer Vision and Pattern Recognition. 2015. + // See the dnn_inception_ex.cpp example for a complete example of their use. Note also + // that we use tag ID numbers >= 1000 to avoid conflict with user's tag layers. + template <typename SUBNET> using itag0 = add_tag_layer< 1000 + 0, SUBNET>; + template <typename SUBNET> using itag1 = add_tag_layer< 1000 + 1, SUBNET>; + template <typename SUBNET> using itag2 = add_tag_layer< 1000 + 2, SUBNET>; + template <typename SUBNET> using itag3 = add_tag_layer< 1000 + 3, SUBNET>; + template <typename SUBNET> using itag4 = add_tag_layer< 1000 + 4, SUBNET>; + template <typename SUBNET> using itag5 = add_tag_layer< 1000 + 5, SUBNET>; + // skip to inception input + template <typename SUBNET> using iskip = add_skip_layer< itag0, SUBNET>; + + // here are some templates to be used for creating inception layer groups + template <template<typename>class B1, + template<typename>class B2, + typename SUBNET> + using inception2 = concat2<itag1, itag2, itag1<B1<iskip< itag2<B2< itag0<SUBNET>>>>>>>; + + template <template<typename>class B1, + template<typename>class B2, + template<typename>class B3, + typename SUBNET> + using inception3 = concat3<itag1, itag2, itag3, itag1<B1<iskip< itag2<B2<iskip< itag3<B3< itag0<SUBNET>>>>>>>>>>; + + template <template<typename>class B1, + template<typename>class B2, + template<typename>class B3, + template<typename>class B4, + typename SUBNET> + using inception4 = concat4<itag1, itag2, itag3, itag4, + itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip< itag4<B4< itag0<SUBNET>>>>>>>>>>>>>; + + template <template<typename>class B1, + template<typename>class B2, + template<typename>class B3, + template<typename>class B4, + template<typename>class B5, + typename SUBNET> + using inception5 = concat5<itag1, itag2, itag3, itag4, itag5, + itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip< itag4<B4<iskip< itag5<B5< itag0<SUBNET>>>>>>>>>>>>>>>>; + +// ---------------------------------------------------------------------------------------- + + const double DEFAULT_L2_NORM_EPS = 1e-5; + + class l2normalize_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. It takes tensors as input and L2 normalizes them. In particular, + it has the following properties: + - The output tensors from this layer have the same dimensions as the + input tensors. + - If you think of each input tensor as a set of tensor::num_samples() + vectors, then the output tensor contains the same vectors except they + have been length normalized so that their L2 norms are all 1. I.e. + for each vector v we will have ||v||==1. + !*/ + + public: + + explicit l2normalize_( + double eps = tt::DEFAULT_L2_NORM_EPS + ); + /*! + requires + - eps > 0 + ensures + - #get_eps() == eps + !*/ + + double get_eps( + ) const; + /*! + ensures + - When we normalize a vector we divide it by its L2 norm. However, the + get_eps() value is added to the squared norm prior to division to avoid + ever dividing by zero. 
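+                In other words, each input vector v is mapped to
+                    v/sqrt(dot(v,v) + get_eps())
+                so the output is well defined even for an all-zero v.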
+ !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + +// ---------------------------------------------------------------------------------------- + + template < + long _offset, + long _k, + long _nr, + long _nc + > + class extract_ + { + /*! + REQUIREMENTS ON TEMPLATE ARGUMENTS + - 0 <= _offset + - 0 < _k + - 0 < _nr + - 0 < _nc + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, the output of this layer is simply a copy of + the input tensor. However, you can configure the extract layer to output + only some subset of the input tensor and also to reshape it. Therefore, + the dimensions of the tensor output by this layer are as follows (letting + IN be the input tensor and OUT the output tensor): + - OUT.num_samples() == IN.num_samples() + - OUT.k() == _k + - OUT.nr() == _nr + - OUT.nc() == _nc + + So the output will always have the same number of samples as the input, but + within each sample (the k,nr,nc part) we will copy only a subset of the + values. Moreover, the _offset parameter controls which part of each sample + we take. To be very precise, we will have: + - let IN_SIZE = IN.k()*IN.nr()*IN.nc() + - let OUT_SIZE = _k*_nr*_nc + - for i in range[0,IN.num_samples()) and j in range[0,OUT_SIZE): + - OUT.host()[i*OUT_SIZE+j] == IN.host()[i*IN_SIZE+_offset+j] + + + Finally, all this means that the input tensor to this layer must have a big + enough size to accommodate taking a _k*_nr*_nc slice from each of its + samples. + !*/ + + public: + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + template < + long offset, + long k, + long nr, + long nc, + typename SUBNET + > + using extract = add_layer<extract_<offset,k,nr,nc>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_LAYERS_ABSTRACT_H_ + diff --git a/ml/dlib/dlib/dnn/loss.h b/ml/dlib/dlib/dnn/loss.h new file mode 100644 index 000000000..1b09b85c3 --- /dev/null +++ b/ml/dlib/dlib/dnn/loss.h @@ -0,0 +1,2870 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
+#ifndef DLIB_DNn_LOSS_H_ +#define DLIB_DNn_LOSS_H_ + +#include "loss_abstract.h" +#include "core.h" +#include "../matrix.h" +#include "tensor_tools.h" +#include "../geometry.h" +#include "../image_processing/box_overlap_testing.h" +#include "../image_processing/full_object_detection.h" +#include "../svm/ranking_tools.h" +#include <sstream> +#include <map> + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class loss_binary_hinge_ + { + public: + + typedef float training_label_type; + typedef float output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + + const tensor& output_tensor = sub.get_output(); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + *iter++ = out_data[i]; + } + } + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + + // The loss we output is the average loss over the mini-batch. 
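+            // For a sample with label y in {+1,-1} and network output f, the hinge
+            // loss is max(0, 1-y*f), whose derivative with respect to f is -y when
+            // 1-y*f > 0 and 0 otherwise. The loop below accumulates exactly that,
+            // with everything multiplied by the 1/num_samples scale.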
+ const double scale = 1.0/output_tensor.num_samples(); + double loss = 0; + const float* out_data = output_tensor.host(); + float* g = grad.host_write_only(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + const float y = *truth++; + DLIB_CASSERT(y == +1 || y == -1, "y: " << y); + const float temp = 1-y*out_data[i]; + if (temp > 0) + { + loss += scale*temp; + g[i] = -scale*y; + } + else + { + g[i] = 0; + } + } + return loss; + } + + friend void serialize(const loss_binary_hinge_& , std::ostream& out) + { + serialize("loss_binary_hinge_", out); + } + + friend void deserialize(loss_binary_hinge_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_binary_hinge_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_binary_hinge_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_binary_hinge_& ) + { + out << "loss_binary_hinge"; + return out; + } + + friend void to_xml(const loss_binary_hinge_& /*item*/, std::ostream& out) + { + out << "<loss_binary_hinge/>"; + } + + }; + + template <typename SUBNET> + using loss_binary_hinge = add_loss_layer<loss_binary_hinge_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_binary_log_ + { + public: + + typedef float training_label_type; + typedef float output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + + const tensor& output_tensor = sub.get_output(); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + *iter++ = out_data[i]; + } + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(grad.nr() == 1 && + grad.nc() == 1 && + grad.k() == 1); + + tt::sigmoid(grad, output_tensor); + + // The loss we output is the average loss over the mini-batch. 
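+            // For label y in {+1,-1} and output f this is the logistic loss
+            // log(1+exp(-y*f)). Writing s = sigmoid(f) (already stored in g by
+            // tt::sigmoid() above), the gradient with respect to f is s-1 when
+            // y=+1 and s when y=-1, which is what the loop below writes into g.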
+ const double scale = 1.0/output_tensor.num_samples(); + double loss = 0; + float* g = grad.host(); + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + const float y = *truth++; + DLIB_CASSERT(y == +1 || y == -1, "y: " << y); + float temp; + if (y > 0) + { + temp = log1pexp(-out_data[i]); + loss += scale*temp; + g[i] = scale*(g[i]-1); + } + else + { + temp = -(-out_data[i]-log1pexp(-out_data[i])); + loss += scale*temp; + g[i] = scale*g[i]; + } + } + return loss; + } + + friend void serialize(const loss_binary_log_& , std::ostream& out) + { + serialize("loss_binary_log_", out); + } + + friend void deserialize(loss_binary_log_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_binary_log_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_binary_log_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_binary_log_& ) + { + out << "loss_binary_log"; + return out; + } + + friend void to_xml(const loss_binary_log_& /*item*/, std::ostream& out) + { + out << "<loss_binary_log/>"; + } + + }; + + template <typename T> + T safe_log(T input, T epsilon = 1e-10) + { + // Prevent trying to calculate the logarithm of a very small number (let alone zero) + return std::log(std::max(input, epsilon)); + } + + template <typename SUBNET> + using loss_binary_log = add_loss_layer<loss_binary_log_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_multiclass_log_ + { + public: + + typedef unsigned long training_label_type; + typedef unsigned long output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + const tensor& output_tensor = sub.get_output(); + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 ); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + + // Note that output_tensor.k() should match the number of labels. + + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + // The index of the largest output for this sample is the label. + *iter++ = index_of_max(rowm(mat(output_tensor),i)); + } + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1); + DLIB_CASSERT(grad.nr() == 1 && + grad.nc() == 1); + + tt::softmax(grad, output_tensor); + + // The loss we output is the average loss over the mini-batch. + const double scale = 1.0/output_tensor.num_samples(); + double loss = 0; + float* g = grad.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + const long y = (long)*truth++; + // The network must produce a number of outputs that is equal to the number + // of labels when using this type of loss. 
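+                // This is standard softmax cross-entropy: tt::softmax() above turned
+                // g into class probabilities p, the per-sample loss is -log(p[y]) for
+                // true class y, and the gradient with respect to the logits is
+                // p[k] - (k==y ? 1 : 0), all scaled by 1/num_samples below.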
+ DLIB_CASSERT(y < output_tensor.k(), "y: " << y << ", output_tensor.k(): " << output_tensor.k()); + for (long k = 0; k < output_tensor.k(); ++k) + { + const unsigned long idx = i*output_tensor.k()+k; + if (k == y) + { + loss += scale*-safe_log(g[idx]); + g[idx] = scale*(g[idx]-1); + } + else + { + g[idx] = scale*g[idx]; + } + } + } + return loss; + } + + friend void serialize(const loss_multiclass_log_& , std::ostream& out) + { + serialize("loss_multiclass_log_", out); + } + + friend void deserialize(loss_multiclass_log_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_multiclass_log_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_multiclass_log_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_multiclass_log_& ) + { + out << "loss_multiclass_log"; + return out; + } + + friend void to_xml(const loss_multiclass_log_& /*item*/, std::ostream& out) + { + out << "<loss_multiclass_log/>"; + } + + }; + + template <typename SUBNET> + using loss_multiclass_log = add_loss_layer<loss_multiclass_log_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_multimulticlass_log_ + { + + public: + + loss_multimulticlass_log_ () = default; + + loss_multimulticlass_log_ ( + const std::map<std::string,std::vector<std::string>>& labels + ) + { + for (auto& l : labels) + { + possible_labels[l.first] = std::make_shared<decltype(l.second)>(l.second); + DLIB_CASSERT(l.second.size() >= 2, "Each classifier must have at least two possible labels."); + + for (size_t i = 0; i < l.second.size(); ++i) + { + label_idx_lookup[l.first][l.second[i]] = i; + ++total_num_labels; + } + } + } + + unsigned long number_of_labels() const { return total_num_labels; } + + unsigned long number_of_classifiers() const { return possible_labels.size(); } + + std::map<std::string,std::vector<std::string>> get_labels ( + ) const + { + std::map<std::string,std::vector<std::string>> info; + for (auto& i : possible_labels) + { + for (auto& label : *i.second) + info[i.first].emplace_back(label); + } + return info; + } + + class classifier_output + { + + public: + classifier_output() = default; + + size_t num_classes() const { return class_probs.size(); } + + double probability_of_class ( + size_t i + ) const + { + DLIB_CASSERT(i < num_classes()); + return class_probs(i); + } + + const std::string& label( + size_t i + ) const + { + DLIB_CASSERT(i < num_classes()); + return (*_labels)[i]; + } + + operator std::string( + ) const + { + DLIB_CASSERT(num_classes() != 0); + return (*_labels)[index_of_max(class_probs)]; + } + + friend std::ostream& operator<< (std::ostream& out, const classifier_output& item) + { + DLIB_ASSERT(item.num_classes() != 0); + out << static_cast<std::string>(item); + return out; + } + + private: + + friend class loss_multimulticlass_log_; + + template <typename EXP> + classifier_output( + const matrix_exp<EXP>& class_probs, + const std::shared_ptr<std::vector<std::string>>& _labels + ) : + class_probs(class_probs), + _labels(_labels) + { + } + + matrix<float,1,0> class_probs; + std::shared_ptr<std::vector<std::string>> _labels; + }; + + typedef std::map<std::string,std::string> training_label_type; + typedef std::map<std::string,classifier_output> output_label_type; + + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter_begin + ) const + { + 
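+            // Each classifier owns a contiguous block of channels in the output
+            // tensor. Below we copy each classifier's block into scratch, softmax
+            // it, and give every sample a classifier_output holding the resulting
+            // per-label probabilities.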
const tensor& output_tensor = sub.get_output(); + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 ); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + DLIB_CASSERT(number_of_labels() != 0, "You must give the loss_multimulticlass_log_'s constructor label data before you can use it!"); + DLIB_CASSERT(output_tensor.k() == (long)number_of_labels(), "The output tensor must have " << number_of_labels() << " channels."); + + + long k_offset = 0; + for (auto& l : possible_labels) + { + auto iter = iter_begin; + const std::string& classifier_name = l.first; + const auto& labels = (*l.second); + scratch.set_size(output_tensor.num_samples(), labels.size()); + tt::copy_tensor(false, scratch, 0, output_tensor, k_offset, labels.size()); + + tt::softmax(scratch, scratch); + + for (long i = 0; i < scratch.num_samples(); ++i) + (*iter++)[classifier_name] = classifier_output(rowm(mat(scratch),i), l.second); + + k_offset += labels.size(); + } + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth_begin, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1); + DLIB_CASSERT(grad.nr() == 1 && + grad.nc() == 1); + DLIB_CASSERT(number_of_labels() != 0, "You must give the loss_multimulticlass_log_'s constructor label data before you can use it!"); + DLIB_CASSERT(output_tensor.k() == (long)number_of_labels(), "The output tensor must have " << number_of_labels() << " channels."); + + // The loss we output is the average loss over the mini-batch. 
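+            // Conceptually this is number_of_classifiers() independent softmax
+            // cross-entropy losses, one per classifier, each computed over that
+            // classifier's own slice of channels and summed into one loss value.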
+ const double scale = 1.0/output_tensor.num_samples(); + double loss = 0; + long k_offset = 0; + for (auto& l : label_idx_lookup) + { + const std::string& classifier_name = l.first; + const auto& int_labels = l.second; + scratch.set_size(output_tensor.num_samples(), int_labels.size()); + tt::copy_tensor(false, scratch, 0, output_tensor, k_offset, int_labels.size()); + + tt::softmax(scratch, scratch); + + + auto truth = truth_begin; + float* g = scratch.host(); + for (long i = 0; i < scratch.num_samples(); ++i) + { + const long y = int_labels.at(truth->at(classifier_name)); + ++truth; + + for (long k = 0; k < scratch.k(); ++k) + { + const unsigned long idx = i*scratch.k()+k; + if (k == y) + { + loss += scale*-std::log(g[idx]); + g[idx] = scale*(g[idx]-1); + } + else + { + g[idx] = scale*g[idx]; + } + } + } + + tt::copy_tensor(false, grad, k_offset, scratch, 0, int_labels.size()); + + k_offset += int_labels.size(); + } + return loss; + } + + + friend void serialize(const loss_multimulticlass_log_& item, std::ostream& out) + { + serialize("loss_multimulticlass_log_", out); + serialize(item.get_labels(), out); + } + + friend void deserialize(loss_multimulticlass_log_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_multimulticlass_log_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_multimulticlass_log_."); + + std::map<std::string,std::vector<std::string>> info; + deserialize(info, in); + item = loss_multimulticlass_log_(info); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_multimulticlass_log_& item) + { + out << "loss_multimulticlass_log, labels={"; + for (auto i = item.possible_labels.begin(); i != item.possible_labels.end(); ) + { + auto& category = i->first; + auto& labels = *(i->second); + out << category << ":("; + for (size_t j = 0; j < labels.size(); ++j) + { + out << labels[j]; + if (j+1 < labels.size()) + out << ","; + } + + out << ")"; + if (++i != item.possible_labels.end()) + out << ", "; + } + out << "}"; + return out; + } + + friend void to_xml(const loss_multimulticlass_log_& item, std::ostream& out) + { + out << "<loss_multimulticlass_log>\n"; + out << item; + out << "\n</loss_multimulticlass_log>"; + } + + private: + + std::map<std::string,std::shared_ptr<std::vector<std::string>>> possible_labels; + unsigned long total_num_labels = 0; + + // We make it true that: possible_labels[classifier][label_idx_lookup[classifier][label]] == label + std::map<std::string, std::map<std::string,long>> label_idx_lookup; + + + // Scratch doesn't logically contribute to the state of this object. It's just + // temporary scratch space used by this class. 
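+        // It is marked mutable so the const methods to_label() and
+        // compute_loss_value_and_gradient() can reuse the buffer across calls
+        // instead of reallocating it every time.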
+ mutable resizable_tensor scratch; + + + }; + + template <typename SUBNET> + using loss_multimulticlass_log = add_loss_layer<loss_multimulticlass_log_, SUBNET>; + + inline bool operator== (const std::string& lhs, const loss_multimulticlass_log_::classifier_output& rhs) + { return lhs == static_cast<const std::string&>(rhs); } + inline bool operator== (const loss_multimulticlass_log_::classifier_output& lhs, const std::string& rhs) + { return rhs == static_cast<const std::string&>(lhs); } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + enum class use_image_pyramid : uint8_t + { + no, + yes + }; + + struct mmod_options + { + public: + + struct detector_window_details + { + detector_window_details() = default; + detector_window_details(unsigned long w, unsigned long h) : width(w), height(h) {} + detector_window_details(unsigned long w, unsigned long h, const std::string& l) : width(w), height(h), label(l) {} + + unsigned long width = 0; + unsigned long height = 0; + std::string label; + + friend inline void serialize(const detector_window_details& item, std::ostream& out) + { + int version = 2; + serialize(version, out); + serialize(item.width, out); + serialize(item.height, out); + serialize(item.label, out); + } + + friend inline void deserialize(detector_window_details& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1 && version != 2) + throw serialization_error("Unexpected version found while deserializing dlib::mmod_options::detector_window_details"); + deserialize(item.width, in); + deserialize(item.height, in); + if (version == 2) + deserialize(item.label, in); + } + + }; + + mmod_options() = default; + + std::vector<detector_window_details> detector_windows; + double loss_per_false_alarm = 1; + double loss_per_missed_target = 1; + double truth_match_iou_threshold = 0.5; + test_box_overlap overlaps_nms = test_box_overlap(0.4); + test_box_overlap overlaps_ignore; + + use_image_pyramid assume_image_pyramid = use_image_pyramid::yes; + + mmod_options ( + const std::vector<std::vector<mmod_rect>>& boxes, + const unsigned long target_size, // We want the length of the longest dimension of the detector window to be this. + const unsigned long min_target_size, // But we require that the smallest dimension of the detector window be at least this big. + const double min_detector_window_overlap_iou = 0.75 + ) + { + DLIB_CASSERT(0 < min_target_size && min_target_size <= target_size); + DLIB_CASSERT(0.5 < min_detector_window_overlap_iou && min_detector_window_overlap_iou < 1); + + // Figure out what detector windows we will need. 
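+            // For each label we find a small set of aspect ratios that covers the
+            // training boxes, then size a window for each ratio so its longest side
+            // is target_size, growing the window if its short side would otherwise
+            // drop below min_target_size. E.g. with target_size=40 and
+            // min_target_size=10, an aspect ratio of 0.5 yields a 20x40 window,
+            // while a ratio of 0.2 gets bumped up to a 10x50 window.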
+ for (auto& label : get_labels(boxes)) + { + for (auto ratio : find_covering_aspect_ratios(boxes, test_box_overlap(min_detector_window_overlap_iou), label)) + { + double detector_width; + double detector_height; + if (ratio < 1) + { + detector_height = target_size; + detector_width = ratio*target_size; + if (detector_width < min_target_size) + { + detector_height = min_target_size/ratio; + detector_width = min_target_size; + } + } + else + { + detector_width = target_size; + detector_height = target_size/ratio; + if (detector_height < min_target_size) + { + detector_width = min_target_size*ratio; + detector_height = min_target_size; + } + } + + detector_window_details p((unsigned long)std::round(detector_width), (unsigned long)std::round(detector_height), label); + detector_windows.push_back(p); + } + } + + DLIB_CASSERT(detector_windows.size() != 0, "You can't call mmod_options's constructor with a set of boxes that is empty (or only contains ignored boxes)."); + + set_overlap_nms(boxes); + } + + mmod_options( + use_image_pyramid assume_image_pyramid, + const std::vector<std::vector<mmod_rect>>& boxes, + const double min_detector_window_overlap_iou = 0.75 + ) + : assume_image_pyramid(assume_image_pyramid) + { + DLIB_CASSERT(assume_image_pyramid == use_image_pyramid::no); + DLIB_CASSERT(0.5 < min_detector_window_overlap_iou && min_detector_window_overlap_iou < 1); + + // Figure out what detector windows we will need. + for (auto& label : get_labels(boxes)) + { + for (auto rectangle : find_covering_rectangles(boxes, test_box_overlap(min_detector_window_overlap_iou), label)) + { + detector_windows.push_back(detector_window_details(rectangle.width(), rectangle.height(), label)); + } + } + + DLIB_CASSERT(detector_windows.size() != 0, "You can't call mmod_options's constructor with a set of boxes that is empty (or only contains ignored boxes)."); + + set_overlap_nms(boxes); + } + + private: + + void set_overlap_nms(const std::vector<std::vector<mmod_rect>>& boxes) + { + // Convert from mmod_rect to rectangle so we can call + // find_tight_overlap_tester(). + std::vector<std::vector<rectangle>> temp; + for (auto&& bi : boxes) + { + std::vector<rectangle> rtemp; + for (auto&& b : bi) + { + if (b.ignore) + continue; + rtemp.push_back(b.rect); + } + temp.push_back(std::move(rtemp)); + } + overlaps_nms = find_tight_overlap_tester(temp); + // Relax the non-max-suppression a little so that it doesn't accidentally make + // it impossible for the detector to output boxes matching the training data. + // This could be a problem with the tightest possible nms test since there is + // some small variability in how boxes get positioned between the training data + // and the coordinate system used by the detector when it runs. So relaxing it + // here takes care of that. 
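+            // advance_toward_1() nudges each threshold 10% of the way toward 1, so
+            // e.g. an IoU threshold of 0.40 is relaxed to 0.46, and 0.90 to 0.91.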
+            auto iou_thresh             = advance_toward_1(overlaps_nms.get_iou_thresh());
+            auto percent_covered_thresh = advance_toward_1(overlaps_nms.get_percent_covered_thresh());
+            overlaps_nms = test_box_overlap(iou_thresh, percent_covered_thresh);
+        }
+
+        static double advance_toward_1 (
+            double val
+        )
+        {
+            if (val < 1)
+                val += (1-val)*0.1;
+            return val;
+        }
+
+        static size_t count_overlaps (
+            const std::vector<rectangle>& rects,
+            const test_box_overlap& overlaps,
+            const rectangle& ref_box
+        )
+        {
+            size_t cnt = 0;
+            for (auto& b : rects)
+            {
+                if (overlaps(b, ref_box))
+                    ++cnt;
+            }
+            return cnt;
+        }
+
+        static std::vector<rectangle> find_rectangles_overlapping_all_others (
+            std::vector<rectangle> rects,
+            const test_box_overlap& overlaps
+        )
+        {
+            std::vector<rectangle> exemplars;
+            dlib::rand rnd;
+
+            while(rects.size() > 0)
+            {
+                // Pick boxes at random and see if they overlap a lot of other boxes. We will try
+                // 500 different boxes each iteration and select whichever hits the most others to
+                // add to our exemplar set.
+                rectangle best_ref_box;
+                size_t best_cnt = 0;
+                for (int iter = 0; iter < 500; ++iter)
+                {
+                    rectangle ref_box = rects[rnd.get_random_64bit_number()%rects.size()];
+                    size_t cnt = count_overlaps(rects, overlaps, ref_box);
+                    if (cnt >= best_cnt)
+                    {
+                        best_cnt = cnt;
+                        best_ref_box = ref_box;
+                    }
+                }
+
+                // Now remove all the boxes the new ref box hit, so we don't consider them again.
+                for (size_t i = 0; i < rects.size(); ++i)
+                {
+                    if (overlaps(rects[i], best_ref_box))
+                    {
+                        // remove box from rects so we don't hit it again later
+                        swap(rects[i], rects.back());
+                        rects.pop_back();
+                        --i;
+                    }
+                }
+
+                exemplars.push_back(best_ref_box);
+            }
+
+            return exemplars;
+        }
+
+        static std::set<std::string> get_labels (
+            const std::vector<std::vector<mmod_rect>>& rects
+        )
+        {
+            std::set<std::string> labels;
+            for (auto& rr : rects)
+            {
+                for (auto& r : rr)
+                    labels.insert(r.label);
+            }
+            return labels;
+        }
+
+        static std::vector<double> find_covering_aspect_ratios (
+            const std::vector<std::vector<mmod_rect>>& rects,
+            const test_box_overlap& overlaps,
+            const std::string& label
+        )
+        {
+            std::vector<rectangle> boxes;
+            // Make sure all the boxes have the same size and position, so that the only thing our
+            // checks for overlap will care about is aspect ratio (i.e. scale and x,y position are
+            // ignored).
+            for (auto& bb : rects)
+            {
+                for (auto&& b : bb)
+                {
+                    if (!b.ignore && b.label == label)
+                        boxes.push_back(move_rect(set_rect_area(b.rect,400*400), point(0,0)));
+                }
+            }
+
+            std::vector<double> ratios;
+            for (auto r : find_rectangles_overlapping_all_others(boxes, overlaps))
+                ratios.push_back(r.width()/(double)r.height());
+            return ratios;
+        }
+
+        static std::vector<dlib::rectangle> find_covering_rectangles (
+            const std::vector<std::vector<mmod_rect>>& rects,
+            const test_box_overlap& overlaps,
+            const std::string& label
+        )
+        {
+            std::vector<rectangle> boxes;
+            // Make sure all the boxes have the same position, so that we only check for
+            // width and height.
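+            // (rectangle(w,h) anchors the box at the origin, so position drops out
+            // of the overlap test and only the shapes get compared.)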
+ for (auto& bb : rects) + { + for (auto&& b : bb) + { + if (!b.ignore && b.label == label) + boxes.push_back(rectangle(b.rect.width(), b.rect.height())); + } + } + + return find_rectangles_overlapping_all_others(boxes, overlaps); + } + }; + + inline void serialize(const mmod_options& item, std::ostream& out) + { + int version = 3; + + serialize(version, out); + serialize(item.detector_windows, out); + serialize(item.loss_per_false_alarm, out); + serialize(item.loss_per_missed_target, out); + serialize(item.truth_match_iou_threshold, out); + serialize(item.overlaps_nms, out); + serialize(item.overlaps_ignore, out); + serialize(static_cast<uint8_t>(item.assume_image_pyramid), out); + } + + inline void deserialize(mmod_options& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 3 && version != 2 && version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::mmod_options"); + if (version == 1) + { + unsigned long width; + unsigned long height; + deserialize(width, in); + deserialize(height, in); + item.detector_windows = {mmod_options::detector_window_details(width, height)}; + } + else + { + deserialize(item.detector_windows, in); + } + deserialize(item.loss_per_false_alarm, in); + deserialize(item.loss_per_missed_target, in); + deserialize(item.truth_match_iou_threshold, in); + deserialize(item.overlaps_nms, in); + deserialize(item.overlaps_ignore, in); + item.assume_image_pyramid = use_image_pyramid::yes; + if (version >= 3) + { + uint8_t assume_image_pyramid = 0; + deserialize(assume_image_pyramid, in); + item.assume_image_pyramid = static_cast<use_image_pyramid>(assume_image_pyramid); + } + } + +// ---------------------------------------------------------------------------------------- + + class loss_mmod_ + { + struct intermediate_detection + { + intermediate_detection() = default; + + intermediate_detection( + rectangle rect_ + ) : rect(rect_) {} + + intermediate_detection( + rectangle rect_, + double detection_confidence_, + size_t tensor_offset_, + long channel + ) : rect(rect_), detection_confidence(detection_confidence_), tensor_offset(tensor_offset_), tensor_channel(channel) {} + + rectangle rect; + double detection_confidence = 0; + size_t tensor_offset = 0; + long tensor_channel = 0; + + bool operator<(const intermediate_detection& item) const { return detection_confidence < item.detection_confidence; } + }; + + public: + + typedef std::vector<mmod_rect> training_label_type; + typedef std::vector<mmod_rect> output_label_type; + + loss_mmod_() {} + + loss_mmod_(mmod_options options_) : options(options_) {} + + const mmod_options& get_options ( + ) const { return options; } + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter, + double adjust_threshold = 0 + ) const + { + const tensor& output_tensor = sub.get_output(); + DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(sub.sample_expansion_factor() == 1, sub.sample_expansion_factor()); + + std::vector<intermediate_detection> dets_accum; + output_label_type final_dets; + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + tensor_to_dets(input_tensor, output_tensor, i, dets_accum, adjust_threshold, sub); + + // Do non-max suppression + final_dets.clear(); + for (unsigned long i = 0; i < dets_accum.size(); ++i) + { + if 
(overlaps_any_box_nms(final_dets, dets_accum[i].rect)) + continue; + + final_dets.push_back(mmod_rect(dets_accum[i].rect, + dets_accum[i].detection_confidence, + options.detector_windows[dets_accum[i].tensor_channel].label)); + } + + *iter++ = std::move(final_dets); + } + } + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size()); + + double det_thresh_speed_adjust = 0; + + + // we will scale the loss so that it doesn't get really huge + const double scale = 1.0/output_tensor.size(); + double loss = 0; + + float* g = grad.host_write_only(); + for (size_t i = 0; i < grad.size(); ++i) + g[i] = 0; + + const float* out_data = output_tensor.host(); + + std::vector<size_t> truth_idxs; truth_idxs.reserve(truth->size()); + std::vector<intermediate_detection> dets; + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + tensor_to_dets(input_tensor, output_tensor, i, dets, -options.loss_per_false_alarm + det_thresh_speed_adjust, sub); + + const unsigned long max_num_dets = 50 + truth->size()*5; + // Prevent calls to tensor_to_dets() from running for a really long time + // due to the production of an obscene number of detections. + const unsigned long max_num_initial_dets = max_num_dets*100; + if (dets.size() >= max_num_initial_dets) + { + det_thresh_speed_adjust = std::max(det_thresh_speed_adjust,dets[max_num_initial_dets].detection_confidence + options.loss_per_false_alarm); + } + + + // The loss will measure the number of incorrect detections. A detection is + // incorrect if it doesn't hit a truth rectangle or if it is a duplicate detection + // on a truth rectangle. + loss += truth->size()*options.loss_per_missed_target; + for (auto&& x : *truth) + { + if (!x.ignore) + { + size_t k; + point p; + if(image_rect_to_feat_coord(p, input_tensor, x, x.label, sub, k, options.assume_image_pyramid)) + { + // Ignore boxes that can't be detected by the CNN. + loss -= options.loss_per_missed_target; + continue; + } + const size_t idx = (k*output_tensor.nr() + p.y())*output_tensor.nc() + p.x(); + loss -= out_data[idx]; + // compute gradient + g[idx] = -scale; + truth_idxs.push_back(idx); + } + else + { + // This box was ignored so shouldn't have been counted in the loss. + loss -= options.loss_per_missed_target; + truth_idxs.push_back(0); + } + } + + // Measure the loss augmented score for the detections which hit a truth rect. + std::vector<double> truth_score_hits(truth->size(), 0); + + // keep track of which truth boxes we have hit so far. + std::vector<bool> hit_truth_table(truth->size(), false); + + std::vector<intermediate_detection> final_dets; + // The point of this loop is to fill out the truth_score_hits array. 
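+                // Every detection that hits a truth box adds its score to that box's
+                // entry, and any hit after the first also adds loss_per_false_alarm
+                // since a duplicate detection counts as a false alarm. These totals
+                // let the next stage decide, per truth box, whether keeping its
+                // detections raises the loss-augmented objective more than treating
+                // the box as missed.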
+ for (unsigned long i = 0; i < dets.size() && final_dets.size() < max_num_dets; ++i) + { + if (overlaps_any_box_nms(final_dets, dets[i].rect)) + continue; + + const auto& det_label = options.detector_windows[dets[i].tensor_channel].label; + + const std::pair<double,unsigned int> hittruth = find_best_match(*truth, dets[i].rect, det_label); + + final_dets.push_back(dets[i].rect); + + const double truth_match = hittruth.first; + // if hit truth rect + if (truth_match > options.truth_match_iou_threshold) + { + // if this is the first time we have seen a detect which hit (*truth)[hittruth.second] + const double score = dets[i].detection_confidence; + if (hit_truth_table[hittruth.second] == false) + { + hit_truth_table[hittruth.second] = true; + truth_score_hits[hittruth.second] += score; + } + else + { + truth_score_hits[hittruth.second] += score + options.loss_per_false_alarm; + } + } + } + + // Check if any of the truth boxes are unobtainable because the NMS is + // killing them. If so, automatically set those unobtainable boxes to + // ignore and print a warning message to the user. + for (size_t i = 0; i < hit_truth_table.size(); ++i) + { + if (!hit_truth_table[i] && !(*truth)[i].ignore) + { + // So we didn't hit this truth box. Is that because there is + // another, different truth box, that overlaps it according to NMS? + const std::pair<double,unsigned int> hittruth = find_best_match(*truth, (*truth)[i], i); + if (hittruth.second == i || (*truth)[hittruth.second].ignore) + continue; + rectangle best_matching_truth_box = (*truth)[hittruth.second]; + if (options.overlaps_nms(best_matching_truth_box, (*truth)[i])) + { + const size_t idx = truth_idxs[i]; + // We are ignoring this box so we shouldn't have counted it in the + // loss in the first place. So we subtract out the loss values we + // added for it in the code above. + loss -= options.loss_per_missed_target-out_data[idx]; + g[idx] = 0; + std::cout << "Warning, ignoring object. We encountered a truth rectangle located at " << (*truth)[i].rect; + std::cout << " that is suppressed by non-max-suppression "; + std::cout << "because it is overlapped by another truth rectangle located at " << best_matching_truth_box + << " (IoU:"<< box_intersection_over_union(best_matching_truth_box,(*truth)[i]) <<", Percent covered:" + << box_percent_covered(best_matching_truth_box,(*truth)[i]) << ")." << std::endl; + } + } + } + + hit_truth_table.assign(hit_truth_table.size(), false); + final_dets.clear(); + + + // Now figure out which detections jointly maximize the loss and detection score sum. We + // need to take into account the fact that allowing a true detection in the output, while + // initially reducing the loss, may allow us to increase the loss later with many duplicate + // detections. 
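+                // Concretely, a truth box's detections are kept only when
+                // truth_score_hits for it exceeds loss_per_missed_target, i.e. only
+                // when the score mass on the box outweighs the penalty for missing
+                // it. Otherwise the maximizing labeling treats the box as missed and
+                // its detections are suppressed.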
+ for (unsigned long i = 0; i < dets.size() && final_dets.size() < max_num_dets; ++i) + { + if (overlaps_any_box_nms(final_dets, dets[i].rect)) + continue; + + const auto& det_label = options.detector_windows[dets[i].tensor_channel].label; + + const std::pair<double,unsigned int> hittruth = find_best_match(*truth, dets[i].rect, det_label); + + const double truth_match = hittruth.first; + if (truth_match > options.truth_match_iou_threshold) + { + if (truth_score_hits[hittruth.second] > options.loss_per_missed_target) + { + if (!hit_truth_table[hittruth.second]) + { + hit_truth_table[hittruth.second] = true; + final_dets.push_back(dets[i]); + loss -= options.loss_per_missed_target; + } + else + { + final_dets.push_back(dets[i]); + loss += options.loss_per_false_alarm; + } + } + } + else if (!overlaps_ignore_box(*truth, dets[i].rect)) + { + // didn't hit anything + final_dets.push_back(dets[i]); + loss += options.loss_per_false_alarm; + } + } + + for (auto&& x : final_dets) + { + loss += out_data[x.tensor_offset]; + g[x.tensor_offset] += scale; + } + + ++truth; + g += output_tensor.k()*output_tensor.nr()*output_tensor.nc(); + out_data += output_tensor.k()*output_tensor.nr()*output_tensor.nc(); + } // END for (long i = 0; i < output_tensor.num_samples(); ++i) + + + // Here we scale the loss so that it's roughly equal to the number of mistakes + // in an image. Note that this scaling is different than the scaling we + // applied to the gradient but it doesn't matter since the loss value isn't + // used to update parameters. It's used only for display and to check if we + // have converged. So it doesn't matter that they are scaled differently and + // this way the loss that is displayed is readily interpretable to the user. + return loss/output_tensor.num_samples(); + } + + + friend void serialize(const loss_mmod_& item, std::ostream& out) + { + serialize("loss_mmod_", out); + serialize(item.options, out); + } + + friend void deserialize(loss_mmod_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_mmod_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_mmod_."); + deserialize(item.options, in); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_mmod_& item) + { + out << "loss_mmod\t ("; + + out << "detector_windows:("; + auto& opts = item.options; + for (size_t i = 0; i < opts.detector_windows.size(); ++i) + { + out << opts.detector_windows[i].width << "x" << opts.detector_windows[i].height; + if (i+1 < opts.detector_windows.size()) + out << ","; + } + out << ")"; + out << ", loss per FA:" << opts.loss_per_false_alarm; + out << ", loss per miss:" << opts.loss_per_missed_target; + out << ", truth match IOU thresh:" << opts.truth_match_iou_threshold; + out << ", overlaps_nms:("<<opts.overlaps_nms.get_iou_thresh()<<","<<opts.overlaps_nms.get_percent_covered_thresh()<<")"; + out << ", overlaps_ignore:("<<opts.overlaps_ignore.get_iou_thresh()<<","<<opts.overlaps_ignore.get_percent_covered_thresh()<<")"; + + out << ")"; + return out; + } + + friend void to_xml(const loss_mmod_& /*item*/, std::ostream& out) + { + // TODO, add options fields + out << "<loss_mmod/>"; + } + + private: + + template <typename net_type> + void tensor_to_dets ( + const tensor& input_tensor, + const tensor& output_tensor, + long i, + std::vector<intermediate_detection>& dets_accum, + double adjust_threshold, + const net_type& net + ) const + { + DLIB_CASSERT(net.sample_expansion_factor() == 
1,net.sample_expansion_factor()); + DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size()); + const float* out_data = output_tensor.host() + output_tensor.k()*output_tensor.nr()*output_tensor.nc()*i; + // scan the final layer and output the positive scoring locations + dets_accum.clear(); + for (long k = 0; k < output_tensor.k(); ++k) + { + for (long r = 0; r < output_tensor.nr(); ++r) + { + for (long c = 0; c < output_tensor.nc(); ++c) + { + double score = out_data[(k*output_tensor.nr() + r)*output_tensor.nc() + c]; + if (score > adjust_threshold) + { + dpoint p = output_tensor_to_input_tensor(net, point(c,r)); + drectangle rect = centered_drect(p, options.detector_windows[k].width, options.detector_windows[k].height); + rect = input_layer(net).tensor_space_to_image_space(input_tensor,rect); + + dets_accum.push_back(intermediate_detection(rect, score, (k*output_tensor.nr() + r)*output_tensor.nc() + c, k)); + } + } + } + } + std::sort(dets_accum.rbegin(), dets_accum.rend()); + } + + size_t find_best_detection_window ( + rectangle rect, + const std::string& label, + use_image_pyramid assume_image_pyramid + ) const + { + if (assume_image_pyramid == use_image_pyramid::yes) + { + rect = move_rect(set_rect_area(rect, 400*400), point(0,0)); + } + else + { + rect = rectangle(rect.width(), rect.height()); + } + + // Figure out which detection window in options.detector_windows is most similar to rect + // (in terms of aspect ratio, if assume_image_pyramid == use_image_pyramid::yes). + size_t best_i = 0; + double best_ratio_diff = -std::numeric_limits<double>::infinity(); + for (size_t i = 0; i < options.detector_windows.size(); ++i) + { + if (options.detector_windows[i].label != label) + continue; + + rectangle det_window; + + if (options.assume_image_pyramid == use_image_pyramid::yes) + { + det_window = centered_rect(point(0,0), options.detector_windows[i].width, options.detector_windows[i].height); + det_window = move_rect(set_rect_area(det_window, 400*400), point(0,0)); + } + else + { + det_window = rectangle(options.detector_windows[i].width, options.detector_windows[i].height); + } + + double iou = box_intersection_over_union(rect, det_window); + if (iou > best_ratio_diff) + { + best_ratio_diff = iou; + best_i = i; + } + } + return best_i; + } + + template <typename net_type> + bool image_rect_to_feat_coord ( + point& tensor_p, + const tensor& input_tensor, + const rectangle& rect, + const std::string& label, + const net_type& net, + size_t& det_idx, + use_image_pyramid assume_image_pyramid + ) const + { + using namespace std; + if (!input_layer(net).image_contained_point(input_tensor,center(rect))) + { + std::ostringstream sout; + sout << "Encountered a truth rectangle located at " << rect << " that is outside the image." << endl; + sout << "The center of each truth rectangle must be within the image." << endl; + throw impossible_labeling_error(sout.str()); + } + + det_idx = find_best_detection_window(rect,label,assume_image_pyramid); + + double scale = 1.0; + if (options.assume_image_pyramid == use_image_pyramid::yes) + { + // Compute the scale we need to be at to get from rect to our detection window. + // Note that we compute the scale as the max of two numbers. It doesn't + // actually matter which one we pick, because if they are very different then + // it means the box can't be matched by the sliding window. But picking the + // max causes the right error message to be selected in the logic below. 
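+ // For instance (illustrative numbers only): a 200x100 truth box matched
+ // against a 50x50 detector window gives scale = max(50/200, 50/100) = 0.5,
+ // meaning the box is matchable at the pyramid level that halves the image.
+ // The IoU test further below then decides whether the aspect ratios agree
+ // well enough for the match to be usable.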
+ scale = std::max(options.detector_windows[det_idx].width/(double)rect.width(), options.detector_windows[det_idx].height/(double)rect.height());
+ }
+ else
+ {
+ // We don't want invariance to scale.
+ scale = 1.0;
+ }
+
+ const rectangle mapped_rect = input_layer(net).image_space_to_tensor_space(input_tensor, std::min(1.0,scale), rect);
+
+ // compute the detection window that we would use at this position.
+ tensor_p = center(mapped_rect);
+ rectangle det_window = centered_rect(tensor_p, options.detector_windows[det_idx].width,options.detector_windows[det_idx].height);
+ det_window = input_layer(net).tensor_space_to_image_space(input_tensor, det_window);
+
+ // make sure the rect can actually be represented by the image pyramid we are
+ // using.
+ if (box_intersection_over_union(rect, det_window) <= options.truth_match_iou_threshold)
+ {
+ std::cout << "Warning, ignoring object. We encountered a truth rectangle with a width and height of " << rect.width() << " and " << rect.height() << ". ";
+ std::cout << "The image pyramid and sliding windows can't output a rectangle of this shape. ";
+ const double detector_area = options.detector_windows[det_idx].width*options.detector_windows[det_idx].height;
+ if (mapped_rect.area()/detector_area <= options.truth_match_iou_threshold)
+ {
+ std::cout << "This is because the rectangle is smaller than the best matching detection window, which has a width ";
+ std::cout << "and height of " << options.detector_windows[det_idx].width << " and " << options.detector_windows[det_idx].height << "." << std::endl;
+ }
+ else
+ {
+ std::cout << "This is either because (1) the final layer's features have too large a stride across the image, limiting the possible locations the sliding window can search, ";
+ std::cout << "or (2) because the rectangle's aspect ratio is too different from that of the best matching detection window, ";
+ std::cout << "which has a width and height of " << options.detector_windows[det_idx].width << " and " << options.detector_windows[det_idx].height << "." << std::endl;
+ }
+ return true;
+ }
+
+ // now map through the CNN to the output layer.
+ tensor_p = input_tensor_to_output_tensor(net,tensor_p);
+
+ const tensor& output_tensor = net.get_output();
+ if (!get_rect(output_tensor).contains(tensor_p))
+ {
+ std::cout << "Warning, ignoring object. We encountered a truth rectangle located at " << rect << " that is too close to the edge ";
+ std::cout << "of the image to be captured by the CNN features."
<< std::endl; + return true; + } + + return false; + } + + + bool overlaps_ignore_box ( + const std::vector<mmod_rect>& boxes, + const rectangle& rect + ) const + { + for (auto&& b : boxes) + { + if (b.ignore && options.overlaps_ignore(b, rect)) + return true; + } + return false; + } + + std::pair<double,unsigned int> find_best_match( + const std::vector<mmod_rect>& boxes, + const rectangle& rect, + const std::string& label + ) const + { + double match = 0; + unsigned int best_idx = 0; + for (unsigned long i = 0; i < boxes.size(); ++i) + { + if (boxes[i].ignore || boxes[i].label != label) + continue; + + const double new_match = box_intersection_over_union(rect, boxes[i]); + if (new_match > match) + { + match = new_match; + best_idx = i; + } + } + + return std::make_pair(match,best_idx); + } + + std::pair<double,unsigned int> find_best_match( + const std::vector<mmod_rect>& boxes, + const rectangle& rect, + const size_t excluded_idx + ) const + { + double match = 0; + unsigned int best_idx = 0; + for (unsigned long i = 0; i < boxes.size(); ++i) + { + if (boxes[i].ignore || excluded_idx == i) + continue; + + const double new_match = box_intersection_over_union(rect, boxes[i]); + if (new_match > match) + { + match = new_match; + best_idx = i; + } + } + + return std::make_pair(match,best_idx); + } + + template <typename T> + inline bool overlaps_any_box_nms ( + const std::vector<T>& rects, + const rectangle& rect + ) const + { + for (auto&& r : rects) + { + if (options.overlaps_nms(r.rect, rect)) + return true; + } + return false; + } + + + mmod_options options; + + }; + + template <typename SUBNET> + using loss_mmod = add_loss_layer<loss_mmod_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_metric_ + { + public: + + typedef unsigned long training_label_type; + typedef matrix<float,0,1> output_label_type; + + loss_metric_() = default; + + loss_metric_( + float margin_, + float dist_thresh_ + ) : margin(margin_), dist_thresh(dist_thresh_) + { + DLIB_CASSERT(margin_ > 0); + DLIB_CASSERT(dist_thresh_ > 0); + } + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + const tensor& output_tensor = sub.get_output(); + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1); + + const float* p = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + *iter = mat(p,output_tensor.k(),1); + + ++iter; + p += output_tensor.k(); + } + } + + + float get_margin() const { return margin; } + float get_distance_threshold() const { return dist_thresh; } + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == 
output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1); + DLIB_CASSERT(grad.nr() == 1 && + grad.nc() == 1); + + + + temp.set_size(output_tensor.num_samples(), output_tensor.num_samples()); + grad_mul.copy_size(temp); + + tt::gemm(0, temp, 1, output_tensor, false, output_tensor, true); + + + std::vector<double> temp_threshs; + const float* d = temp.host(); + double loss = 0; + double num_pos_samps = 0.0001; + double num_neg_samps = 0.0001; + for (long r = 0; r < temp.num_samples(); ++r) + { + auto xx = d[r*temp.num_samples() + r]; + const auto x_label = *(truth + r); + for (long c = r+1; c < temp.num_samples(); ++c) + { + const auto y_label = *(truth + c); + if (x_label == y_label) + { + ++num_pos_samps; + } + else + { + ++num_neg_samps; + + // Figure out what distance threshold, when applied to the negative pairs, + // causes there to be an equal number of positive and negative pairs. + auto yy = d[c*temp.num_samples() + c]; + auto xy = d[r*temp.num_samples() + c]; + // compute the distance between x and y samples. + auto d2 = xx + yy - 2*xy; + if (d2 < 0) + d2 = 0; + temp_threshs.push_back(d2); + } + } + } + // The whole objective function is multiplied by this to scale the loss + // relative to the number of things in the mini-batch. + const double scale = 0.5/num_pos_samps; + DLIB_CASSERT(num_pos_samps>=1, "Make sure each mini-batch contains both positive pairs and negative pairs"); + DLIB_CASSERT(num_neg_samps>=1, "Make sure each mini-batch contains both positive pairs and negative pairs"); + + std::sort(temp_threshs.begin(), temp_threshs.end()); + const float neg_thresh = std::sqrt(temp_threshs[std::min(num_pos_samps,num_neg_samps)-1]); + + // loop over all the pairs of training samples and compute the loss and + // gradients. Note that we only use the hardest negative pairs and that in + // particular we pick the number of negative pairs equal to the number of + // positive pairs so everything is balanced. + float* gm = grad_mul.host(); + for (long r = 0; r < temp.num_samples(); ++r) + { + gm[r*temp.num_samples() + r] = 0; + const auto x_label = *(truth + r); + auto xx = d[r*temp.num_samples() + r]; + for (long c = 0; c < temp.num_samples(); ++c) + { + if (r==c) + continue; + const auto y_label = *(truth + c); + auto yy = d[c*temp.num_samples() + c]; + auto xy = d[r*temp.num_samples() + c]; + + // compute the distance between x and y samples. + auto d2 = xx + yy - 2*xy; + if (d2 <= 0) + d2 = 0; + else + d2 = std::sqrt(d2); + + // It should be noted that the derivative of length(x-y) with respect + // to the x vector is the unit vector (x-y)/length(x-y). If you stare + // at the code below long enough you will see that it's just an + // application of this formula. + + if (x_label == y_label) + { + // Things with the same label should have distances < dist_thresh between + // them. If not then we experience non-zero loss. + if (d2 < dist_thresh-margin) + { + gm[r*temp.num_samples() + c] = 0; + } + else + { + loss += scale*(d2 - (dist_thresh-margin)); + gm[r*temp.num_samples() + r] += scale/d2; + gm[r*temp.num_samples() + c] = -scale/d2; + } + } + else + { + // Things with different labels should have distances > dist_thresh between + // them. If not then we experience non-zero loss. 
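+ // Concretely, a negative pair only incurs loss if it is among the
+ // hardest negatives (d2 <= neg_thresh) and violates the margin
+ // (d2 <= dist_thresh+margin), in which case the hinge term
+ // (dist_thresh+margin) - d2 is added, scaled by 0.5/num_pos_samps.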
+ if (d2 > dist_thresh+margin || d2 > neg_thresh) + { + gm[r*temp.num_samples() + c] = 0; + } + else + { + loss += scale*((dist_thresh+margin) - d2); + // don't divide by zero (or a really small number) + d2 = std::max(d2, 0.001f); + gm[r*temp.num_samples() + r] -= scale/d2; + gm[r*temp.num_samples() + c] = scale/d2; + } + } + } + } + + + tt::gemm(0, grad, 1, grad_mul, false, output_tensor, false); + + return loss; + } + + friend void serialize(const loss_metric_& item, std::ostream& out) + { + serialize("loss_metric_2", out); + serialize(item.margin, out); + serialize(item.dist_thresh, out); + } + + friend void deserialize(loss_metric_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version == "loss_metric_") + { + // These values used to be hard coded, so for this version of the metric + // learning loss we just use these values. + item.margin = 0.1; + item.dist_thresh = 0.75; + return; + } + else if (version == "loss_metric_2") + { + deserialize(item.margin, in); + deserialize(item.dist_thresh, in); + } + else + { + throw serialization_error("Unexpected version found while deserializing dlib::loss_metric_. Instead found " + version); + } + } + + friend std::ostream& operator<<(std::ostream& out, const loss_metric_& item ) + { + out << "loss_metric (margin="<<item.margin<<", distance_threshold="<<item.dist_thresh<<")"; + return out; + } + + friend void to_xml(const loss_metric_& item, std::ostream& out) + { + out << "<loss_metric margin='"<<item.margin<<"' distance_threshold='"<<item.dist_thresh<<"'/>"; + } + + private: + float margin = 0.04; + float dist_thresh = 0.6; + + + // These variables are only here to avoid being reallocated over and over in + // compute_loss_value_and_gradient() + mutable resizable_tensor temp, grad_mul; + + }; + + template <typename SUBNET> + using loss_metric = add_loss_layer<loss_metric_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_ranking_ + { + public: + + typedef float training_label_type; // nominally +1/-1 + typedef float output_label_type; // ranking score + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + + const tensor& output_tensor = sub.get_output(); + + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + *iter++ = out_data[i]; + } + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(grad.nr() == 1 && + grad.nc() == 1 && + grad.k() == 1); + + + std::vector<double> 
rel_scores; + std::vector<double> nonrel_scores; + std::vector<long> rel_idx, nonrel_idx; + + const float* out_data = output_tensor.host(); + float* g = grad.host_write_only(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + const float y = *truth++; + if (y > 0) + { + rel_scores.push_back(out_data[i]-y); + rel_idx.push_back(i); + } + else if (y < 0) + { + nonrel_scores.push_back(out_data[i]-y); + nonrel_idx.push_back(i); + } + else + { + g[i] = 0; + } + } + + + std::vector<unsigned long> rel_counts; + std::vector<unsigned long> nonrel_counts; + count_ranking_inversions(rel_scores, nonrel_scores, rel_counts, nonrel_counts); + const unsigned long total_pairs = rel_scores.size()*nonrel_scores.size(); + DLIB_CASSERT(total_pairs > 0, "You can't give a ranking mini-batch that contains only one class. Both classes must be represented."); + const double scale = 1.0/total_pairs; + + + double loss = 0; + for (unsigned long k = 0; k < rel_counts.size(); ++k) + { + loss -= rel_counts[k]*rel_scores[k]; + g[rel_idx[k]] = -1.0*rel_counts[k]*scale; + } + + for (unsigned long k = 0; k < nonrel_counts.size(); ++k) + { + loss += nonrel_counts[k]*nonrel_scores[k]; + g[nonrel_idx[k]] = nonrel_counts[k]*scale; + } + + return loss*scale; + } + + friend void serialize(const loss_ranking_& , std::ostream& out) + { + serialize("loss_ranking_", out); + } + + friend void deserialize(loss_ranking_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_ranking_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_ranking_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_ranking_& ) + { + out << "loss_ranking"; + return out; + } + + friend void to_xml(const loss_ranking_& /*item*/, std::ostream& out) + { + out << "<loss_ranking/>"; + } + + }; + + template <typename SUBNET> + using loss_ranking = add_loss_layer<loss_ranking_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_mean_squared_ + { + public: + + typedef float training_label_type; + typedef float output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + + const tensor& output_tensor = sub.get_output(); + + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + *iter++ = out_data[i]; + } + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(grad.nr() == 1 && + grad.nc() == 1 && + grad.k() == 1); + + 
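+ // The computation below is plain mean squared error: each sample
+ // contributes scale*(y - out)^2 to the loss and the gradient written is
+ // -scale*(y - out). (The factor of 2 from differentiating the square is
+ // omitted; that only rescales the gradient, not the reported loss.)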
// The loss we output is the average loss over the mini-batch. + const double scale = 1.0/output_tensor.num_samples(); + double loss = 0; + float* g = grad.host_write_only(); + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + const float y = *truth++; + const float temp1 = y - out_data[i]; + const float temp2 = scale*temp1; + loss += temp2*temp1; + g[i] = -temp2; + + } + return loss; + } + + friend void serialize(const loss_mean_squared_& , std::ostream& out) + { + serialize("loss_mean_squared_", out); + } + + friend void deserialize(loss_mean_squared_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_mean_squared_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_mean_squared_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_mean_squared_& ) + { + out << "loss_mean_squared"; + return out; + } + + friend void to_xml(const loss_mean_squared_& /*item*/, std::ostream& out) + { + out << "<loss_mean_squared/>"; + } + + }; + + template <typename SUBNET> + using loss_mean_squared = add_loss_layer<loss_mean_squared_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_epsilon_insensitive_ + { + public: + + typedef float training_label_type; + typedef float output_label_type; + + loss_epsilon_insensitive_() = default; + loss_epsilon_insensitive_(double eps) : eps(eps) + { + DLIB_CASSERT(eps >= 0, "You can't set a negative error epsilon."); + } + + double get_epsilon () const { return eps; } + void set_epsilon(double e) + { + DLIB_CASSERT(e >= 0, "You can't set a negative error epsilon."); + eps = e; + } + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + + const tensor& output_tensor = sub.get_output(); + + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + *iter++ = out_data[i]; + } + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(grad.nr() == 1 && + grad.nc() == 1 && + grad.k() == 1); + + // The loss we output is the average loss over the mini-batch. 
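+ // The per-sample loss is max(0, |out - y| - eps): errors inside the
+ // eps-tube cost nothing, while outside it the loss grows linearly with
+ // gradient +/-1 (times the mini-batch scaling below).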
+ const double scale = 1.0/output_tensor.num_samples(); + double loss = 0; + float* g = grad.host_write_only(); + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + const float y = *truth++; + const float err = out_data[i]-y; + if (err > eps) + { + loss += scale*(err-eps); + g[i] = scale; + } + else if (err < -eps) + { + loss += scale*(eps-err); + g[i] = -scale; + } + } + return loss; + } + + friend void serialize(const loss_epsilon_insensitive_& item, std::ostream& out) + { + serialize("loss_epsilon_insensitive_", out); + serialize(item.eps, out); + } + + friend void deserialize(loss_epsilon_insensitive_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_epsilon_insensitive_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_epsilon_insensitive_."); + deserialize(item.eps, in); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_epsilon_insensitive_& item) + { + out << "loss_epsilon_insensitive epsilon: " << item.eps; + return out; + } + + friend void to_xml(const loss_epsilon_insensitive_& item, std::ostream& out) + { + out << "<loss_epsilon_insensitive_ epsilon='" << item.eps << "'/>"; + } + + private: + double eps = 1; + + }; + + template <typename SUBNET> + using loss_epsilon_insensitive = add_loss_layer<loss_epsilon_insensitive_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_mean_squared_multioutput_ + { + public: + + typedef matrix<float> training_label_type; + typedef matrix<float> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + + const tensor& output_tensor = sub.get_output(); + + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1) + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + *iter++ = mat(out_data, output_tensor.k(), 1); + out_data += output_tensor.k(); + } + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1); + DLIB_CASSERT(grad.nr() == 1 && + grad.nc() == 1); + DLIB_CASSERT(grad.k() == output_tensor.k()); + const long k = output_tensor.k(); + for (long idx = 0; idx < output_tensor.num_samples(); ++idx) + { + const_label_iterator truth_matrix_ptr = (truth + idx); + DLIB_CASSERT((*truth_matrix_ptr).nr() == k && + (*truth_matrix_ptr).nc() == 1); + } + + // The loss we output is the average loss over the mini-batch. 
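+ // This is the same squared-error computation as loss_mean_squared_,
+ // except that it is summed over all k outputs of each sample, with each
+ // truth label given as a k x 1 matrix.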
+ const double scale = 1.0/output_tensor.num_samples(); + double loss = 0; + float* g = grad.host_write_only(); + const float* out_data = output_tensor.host(); + matrix<float> ytrue; + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + ytrue = *truth++; + for (long j = 0; j < output_tensor.k(); ++j) + { + const float y = ytrue(j, 0); + const float temp1 = y - *out_data++; + const float temp2 = scale*temp1; + loss += temp2*temp1; + *g = -temp2; + ++g; + } + + } + return loss; + } + + friend void serialize(const loss_mean_squared_multioutput_& , std::ostream& out) + { + serialize("loss_mean_squared_multioutput_", out); + } + + friend void deserialize(loss_mean_squared_multioutput_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_mean_squared_multioutput_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_mean_squared_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_mean_squared_multioutput_& ) + { + out << "loss_mean_squared_multioutput"; + return out; + } + + friend void to_xml(const loss_mean_squared_multioutput_& /*item*/, std::ostream& out) + { + out << "<loss_mean_squared_multioutput/>"; + } + + }; + + template <typename SUBNET> + using loss_mean_squared_multioutput = add_loss_layer<loss_mean_squared_multioutput_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_multiclass_log_per_pixel_ + { + public: + + // In semantic segmentation, if you don't know the ground-truth of some pixel, + // set the label of that pixel to this value. When you do so, the pixel will be + // ignored when computing gradients. + static const uint16_t label_to_ignore = std::numeric_limits<uint16_t>::max(); + + + // In semantic segmentation, 65535 classes ought to be enough for anybody. + typedef matrix<uint16_t> training_label_type; + typedef matrix<uint16_t> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + static void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) + { + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + + const tensor& output_tensor = sub.get_output(); + + DLIB_CASSERT(output_tensor.k() >= 1); // Note that output_tensor.k() should match the number of labels. + DLIB_CASSERT(output_tensor.k() < std::numeric_limits<uint16_t>::max()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const float* const out_data = output_tensor.host(); + + // The index of the largest output for each element is the label. + const auto find_label = [&](long sample, long r, long c) + { + uint16_t label = 0; + float max_value = out_data[tensor_index(output_tensor, sample, 0, r, c)]; + for (long k = 1; k < output_tensor.k(); ++k) + { + const float value = out_data[tensor_index(output_tensor, sample, k, r, c)]; + if (value > max_value) + { + label = static_cast<uint16_t>(k); + max_value = value; + } + } + return label; + }; + + for (long i = 0; i < output_tensor.num_samples(); ++i, ++iter) + { + iter->set_size(output_tensor.nr(), output_tensor.nc()); + for (long r = 0; r < output_tensor.nr(); ++r) + { + for (long c = 0; c < output_tensor.nc(); ++c) + { + // The index of the largest output for this element is the label. 
+ iter->operator()(r, c) = find_label(i, r, c); + } + } + } + } + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.k() >= 1); + DLIB_CASSERT(output_tensor.k() < std::numeric_limits<uint16_t>::max()); + DLIB_CASSERT(output_tensor.nr() == grad.nr() && + output_tensor.nc() == grad.nc() && + output_tensor.k() == grad.k()); + for (long idx = 0; idx < output_tensor.num_samples(); ++idx) + { + const_label_iterator truth_matrix_ptr = (truth + idx); + DLIB_CASSERT(truth_matrix_ptr->nr() == output_tensor.nr() && + truth_matrix_ptr->nc() == output_tensor.nc(), + "truth size = " << truth_matrix_ptr->nr() << " x " << truth_matrix_ptr->nc() << ", " + "output size = " << output_tensor.nr() << " x " << output_tensor.nc()); + } + + tt::softmax(grad, output_tensor); + + // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output. + const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc()); + double loss = 0; + float* const g = grad.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth) + { + for (long r = 0; r < output_tensor.nr(); ++r) + { + for (long c = 0; c < output_tensor.nc(); ++c) + { + const uint16_t y = truth->operator()(r, c); + // The network must produce a number of outputs that is equal to the number + // of labels when using this type of loss. 
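+ // (That requirement is enforced by the assert just below.) After the
+ // softmax above, g holds the class probabilities p_k for each pixel;
+ // each labeled pixel then contributes -log(p_y) to the loss, and the
+ // gradient takes the usual softmax cross-entropy form p_k - [k == y],
+ // multiplied by scale.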
+ DLIB_CASSERT(static_cast<long>(y) < output_tensor.k() || y == label_to_ignore, + "y: " << y << ", output_tensor.k(): " << output_tensor.k()); + for (long k = 0; k < output_tensor.k(); ++k) + { + const size_t idx = tensor_index(output_tensor, i, k, r, c); + if (k == y) + { + loss += scale*-safe_log(g[idx]); + g[idx] = scale*(g[idx] - 1); + } + else if (y == label_to_ignore) + { + g[idx] = 0.f; + } + else + { + g[idx] = scale*g[idx]; + } + } + } + } + } + return loss; + } + + friend void serialize(const loss_multiclass_log_per_pixel_& , std::ostream& out) + { + serialize("loss_multiclass_log_per_pixel_", out); + } + + friend void deserialize(loss_multiclass_log_per_pixel_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_multiclass_log_per_pixel_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_multiclass_log_per_pixel_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_multiclass_log_per_pixel_& ) + { + out << "loss_multiclass_log_per_pixel"; + return out; + } + + friend void to_xml(const loss_multiclass_log_per_pixel_& /*item*/, std::ostream& out) + { + out << "<loss_multiclass_log_per_pixel/>"; + } + + private: + static size_t tensor_index(const tensor& t, long sample, long k, long row, long column) + { + // See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38 + return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column; + } + + }; + + template <typename SUBNET> + using loss_multiclass_log_per_pixel = add_loss_layer<loss_multiclass_log_per_pixel_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_multiclass_log_per_pixel_weighted_ + { + public: + + struct weighted_label + { + weighted_label() + {} + + weighted_label(uint16_t label, float weight = 1.f) + : label(label), weight(weight) + {} + + // In semantic segmentation, 65536 classes ought to be enough for anybody. 
+ uint16_t label = 0; + float weight = 1.f; + }; + + typedef matrix<weighted_label> training_label_type; + typedef matrix<uint16_t> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + static void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) + { + loss_multiclass_log_per_pixel_::to_label(input_tensor, sub, iter); + } + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.k() >= 1); + DLIB_CASSERT(output_tensor.k() < std::numeric_limits<uint16_t>::max()); + DLIB_CASSERT(output_tensor.nr() == grad.nr() && + output_tensor.nc() == grad.nc() && + output_tensor.k() == grad.k()); + for (long idx = 0; idx < output_tensor.num_samples(); ++idx) + { + const_label_iterator truth_matrix_ptr = (truth + idx); + DLIB_CASSERT(truth_matrix_ptr->nr() == output_tensor.nr() && + truth_matrix_ptr->nc() == output_tensor.nc(), + "truth size = " << truth_matrix_ptr->nr() << " x " << truth_matrix_ptr->nc() << ", " + "output size = " << output_tensor.nr() << " x " << output_tensor.nc()); + } + + tt::softmax(grad, output_tensor); + + // The loss we output is the weighted average loss over the mini-batch, and also over each element of the matrix output. + const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc()); + double loss = 0; + float* const g = grad.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth) + { + for (long r = 0; r < output_tensor.nr(); ++r) + { + for (long c = 0; c < output_tensor.nc(); ++c) + { + const weighted_label& weighted_label = truth->operator()(r, c); + const uint16_t y = weighted_label.label; + const float weight = weighted_label.weight; + // The network must produce a number of outputs that is equal to the number + // of labels when using this type of loss. 
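+ // (Enforced by the assert just below, where a weight of 0 exempts the
+ // pixel.) The computation mirrors loss_multiclass_log_per_pixel_, except
+ // that every pixel's loss and gradient are multiplied by its weight, so
+ // weight == 0 behaves like label_to_ignore in the unweighted version.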
+ DLIB_CASSERT(static_cast<long>(y) < output_tensor.k() || weight == 0.f, + "y: " << y << ", output_tensor.k(): " << output_tensor.k()); + for (long k = 0; k < output_tensor.k(); ++k) + { + const size_t idx = tensor_index(output_tensor, i, k, r, c); + if (k == y) + { + loss += weight*scale*-safe_log(g[idx]); + g[idx] = weight*scale*(g[idx] - 1); + } + else + { + g[idx] = weight*scale*g[idx]; + } + } + } + } + } + return loss; + } + + friend void serialize(const loss_multiclass_log_per_pixel_weighted_& , std::ostream& out) + { + serialize("loss_multiclass_log_per_pixel_weighted_", out); + } + + friend void deserialize(loss_multiclass_log_per_pixel_weighted_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_multiclass_log_per_pixel_weighted_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_multiclass_log_per_pixel_weighted_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_multiclass_log_per_pixel_weighted_& ) + { + out << "loss_multiclass_log_per_pixel_weighted"; + return out; + } + + friend void to_xml(const loss_multiclass_log_per_pixel_weighted_& /*item*/, std::ostream& out) + { + out << "<loss_multiclass_log_per_pixel_weighted/>"; + } + + private: + static size_t tensor_index(const tensor& t, long sample, long k, long row, long column) + { + // See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38 + return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column; + } + + }; + + template <typename SUBNET> + using loss_multiclass_log_per_pixel_weighted = add_loss_layer<loss_multiclass_log_per_pixel_weighted_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_mean_squared_per_pixel_ + { + public: + + typedef matrix<float> training_label_type; + typedef matrix<float> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + + const tensor& output_tensor = sub.get_output(); + + DLIB_CASSERT(output_tensor.k() == 1, "output k = " << output_tensor.k()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i, ++iter) + { + iter->set_size(output_tensor.nr(), output_tensor.nc()); + for (long r = 0; r < output_tensor.nr(); ++r) + { + for (long c = 0; c < output_tensor.nc(); ++c) + { + iter->operator()(r, c) = out_data[tensor_index(output_tensor, i, 0, r, c)]; + } + } + } + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples() % sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.k() >= 1); + DLIB_CASSERT(output_tensor.k() < std::numeric_limits<uint16_t>::max()); + DLIB_CASSERT(output_tensor.nr() == grad.nr() && + 
output_tensor.nc() == grad.nc() && + output_tensor.k() == grad.k()); + for (long idx = 0; idx < output_tensor.num_samples(); ++idx) + { + const_label_iterator truth_matrix_ptr = (truth + idx); + DLIB_CASSERT(truth_matrix_ptr->nr() == output_tensor.nr() && + truth_matrix_ptr->nc() == output_tensor.nc(), + "truth size = " << truth_matrix_ptr->nr() << " x " << truth_matrix_ptr->nc() << ", " + "output size = " << output_tensor.nr() << " x " << output_tensor.nc()); + } + + // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output. + const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc()); + double loss = 0; + float* const g = grad.host(); + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth) + { + for (long r = 0; r < output_tensor.nr(); ++r) + { + for (long c = 0; c < output_tensor.nc(); ++c) + { + const float y = truth->operator()(r, c); + const size_t idx = tensor_index(output_tensor, i, 0, r, c); + const float temp1 = y - out_data[idx]; + const float temp2 = scale*temp1; + loss += temp2*temp1; + g[idx] = -temp2; + } + } + } + return loss; + } + + friend void serialize(const loss_mean_squared_per_pixel_& , std::ostream& out) + { + serialize("loss_mean_squared_per_pixel_", out); + } + + friend void deserialize(loss_mean_squared_per_pixel_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_mean_squared_per_pixel_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_mean_squared_per_pixel_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_mean_squared_per_pixel_& ) + { + out << "loss_mean_squared_per_pixel"; + return out; + } + + friend void to_xml(const loss_mean_squared_per_pixel_& /*item*/, std::ostream& out) + { + out << "<loss_mean_squared_per_pixel/>"; + } + + private: + static size_t tensor_index(const tensor& t, long sample, long k, long row, long column) + { + // See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38 + return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column; + } + }; + + template <typename SUBNET> + using loss_mean_squared_per_pixel = add_loss_layer<loss_mean_squared_per_pixel_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_dot_ + { + public: + + typedef matrix<float,0,1> training_label_type; + typedef matrix<float,0,1> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + const tensor& output_tensor = sub.get_output(); + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + for (long i = 0; i < output_tensor.num_samples(); ++i) + *iter++ = trans(rowm(mat(output_tensor),i)); + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + 
DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const long network_output_dims = output_tensor.size()/output_tensor.num_samples(); + + + // The loss we output is the average loss over the mini-batch. + const double scale = 1.0/output_tensor.num_samples(); + double loss = 0; + float* g = grad.host(); + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + DLIB_CASSERT(truth->size() == network_output_dims, "The network must output a vector with the same dimensionality as the training labels. " + << "\ntruth->size(): " << truth->size() + << "\nnetwork_output_dims: " << network_output_dims); + + const float* t = &(*truth++)(0); + + for (long j = 0; j < network_output_dims; ++j) + { + g[j] = -t[j]*scale; + loss -= out_data[j]*t[j]; + } + + g += network_output_dims; + out_data += network_output_dims; + } + return loss*scale; + } + + friend void serialize(const loss_dot_& , std::ostream& out) + { + serialize("loss_dot_", out); + } + + friend void deserialize(loss_dot_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_dot_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_dot_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_dot_& ) + { + out << "loss_dot"; + return out; + } + + friend void to_xml(const loss_dot_& /*item*/, std::ostream& out) + { + out << "<loss_dot/>"; + } + + }; + + template <typename SUBNET> + using loss_dot = add_loss_layer<loss_dot_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_LOSS_H_ + diff --git a/ml/dlib/dlib/dnn/loss_abstract.h b/ml/dlib/dlib/dnn/loss_abstract.h new file mode 100644 index 000000000..0dd043677 --- /dev/null +++ b/ml/dlib/dlib/dnn/loss_abstract.h @@ -0,0 +1,1542 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_DNn_LOSS_ABSTRACT_H_ +#ifdef DLIB_DNn_LOSS_ABSTRACT_H_ + +#include "core_abstract.h" +#include "../image_processing/full_object_detection_abstract.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class EXAMPLE_LOSS_LAYER_ + { + /*! + WHAT THIS OBJECT REPRESENTS + A loss layer is the final layer in a deep neural network. It computes the + task loss. That is, it computes a number that tells us how well the + network is performing on some task, such as predicting a binary label. + + You can use one of the loss layers that comes with dlib (defined below). + But importantly, you are able to define your own loss layers to suit your + needs. You do this by creating a class that defines an interface matching + the one described by this EXAMPLE_LOSS_LAYER_ class. Note that there is no + dlib::EXAMPLE_LOSS_LAYER_ type. It is shown here purely to document the + interface that a loss layer must implement. + + A loss layer can optionally provide a to_label() method that converts the + output of a network into a user defined type. If to_label() is not + provided then the operator() methods of add_loss_layer will not be + available, but otherwise everything will function as normal. 
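+
+ For a rough sense of the shape of such a class, a minimal supervised
+ loss might be declared like this (a sketch only, not a real dlib type;
+ method bodies elided):
+ class my_loss_
+ {
+ public:
+ typedef float training_label_type;
+ typedef float output_label_type;
+ template <typename SUB_TYPE, typename label_iterator>
+ void to_label(const tensor&, const SUB_TYPE&, label_iterator) const;
+ template <typename const_label_iterator, typename SUBNET>
+ double compute_loss_value_and_gradient(const tensor&, const_label_iterator, SUBNET&) const;
+ };
+ template <typename SUBNET>
+ using my_loss = add_loss_layer<my_loss_, SUBNET>;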
+
+ Finally, note that there are two broad flavors of loss layer, supervised
+ and unsupervised. The EXAMPLE_LOSS_LAYER_ as shown here is a supervised
+ layer. To make an unsupervised loss you simply leave out the
+ training_label_type typedef and the truth iterator argument to
+ compute_loss_value_and_gradient().
+ !*/
+
+ public:
+
+ // In most cases training_label_type and output_label_type will be the same type.
+ typedef whatever_type_you_use_for_training_labels training_label_type;
+ typedef whatever_type_you_use_for_output_labels output_label_type;
+
+ EXAMPLE_LOSS_LAYER_ (
+ );
+ /*!
+ ensures
+ - EXAMPLE_LOSS_LAYER_ objects are default constructible.
+ !*/
+
+ EXAMPLE_LOSS_LAYER_ (
+ const EXAMPLE_LOSS_LAYER_& item
+ );
+ /*!
+ ensures
+ - EXAMPLE_LOSS_LAYER_ objects are copy constructible.
+ !*/
+
+ // Implementing to_label() is optional.
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const;
+ /*!
+ requires
+ - SUBNET implements the SUBNET interface defined at the top of
+ layers_abstract.h.
+ - input_tensor was given as input to the network sub and the outputs are
+ now visible in layer<i>(sub).get_output(), for all valid i.
+ - input_tensor.num_samples() > 0
+ - input_tensor.num_samples()%sub.sample_expansion_factor() == 0.
+ - iter == an iterator pointing to the beginning of a range of
+ input_tensor.num_samples()/sub.sample_expansion_factor() elements. Moreover,
+ they must be output_label_type elements.
+ ensures
+ - Converts the output of the provided network to output_label_type objects and
+ stores the results into the range indicated by iter. In particular, for
+ all valid i, it will be the case that:
+ *(iter+i/sub.sample_expansion_factor()) is populated based on the output of
+ sub and corresponds to the ith sample in input_tensor.
+ !*/
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const;
+ /*!
+ requires
+ - SUBNET implements the SUBNET interface defined at the top of
+ layers_abstract.h.
+ - input_tensor was given as input to the network sub and the outputs are
+ now visible in layer<i>(sub).get_output(), for all valid i.
+ - input_tensor.num_samples() > 0
+ - input_tensor.num_samples()%sub.sample_expansion_factor() == 0.
+ - for all valid i:
+ - layer<i>(sub).get_gradient_input() has the same dimensions as
+ layer<i>(sub).get_output().
+ - layer<i>(sub).get_gradient_input() contains all zeros (i.e.
+ initially, all input gradients are 0).
+ - truth == an iterator pointing to the beginning of a range of
+ input_tensor.num_samples()/sub.sample_expansion_factor() elements. Moreover,
+ they must be training_label_type elements.
+ - for all valid i:
+ - *(truth+i/sub.sample_expansion_factor()) is the label of the ith sample in
+ input_tensor.
+ ensures
+ - This function computes a loss function that describes how well the output
+ of sub matches the expected labels given by truth. Let's write the loss
+ function as L(input_tensor, truth, sub).
+ - Then compute_loss_value_and_gradient() computes the gradient of L() with
+ respect to the outputs in sub.
Specifically, compute_loss_value_and_gradient() + assigns the gradients into sub by performing the following tensor + assignments, for all valid i: + - layer<i>(sub).get_gradient_input() = the gradient of + L(input_tensor,truth,sub) with respect to layer<i>(sub).get_output(). + Note that, since get_gradient_input() is zero initialized, you don't + have to write gradient information to layers that have a zero + loss gradient. + - returns L(input_tensor,truth,sub) + !*/ + }; + + std::ostream& operator<<(std::ostream& out, const EXAMPLE_LOSS_LAYER_& item); + /*! + print a string describing this layer. + !*/ + + void to_xml(const EXAMPLE_LOSS_LAYER_& item, std::ostream& out); + /*! + This function is optional, but required if you want to print your networks with + net_to_xml(). Therefore, to_xml() prints a layer as XML. + !*/ + + void serialize(const EXAMPLE_LOSS_LAYER_& item, std::ostream& out); + void deserialize(EXAMPLE_LOSS_LAYER_& item, std::istream& in); + /*! + provides serialization support + !*/ + + // For each loss layer you define, always define an add_loss_layer template so that + // layers can be easily composed. Moreover, the convention is that the layer class + // ends with an _ while the add_loss_layer template has the same name but without the + // trailing _. + template <typename SUBNET> + using EXAMPLE_LOSS_LAYER = add_loss_layer<EXAMPLE_LOSS_LAYER_, SUBNET>; + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + class loss_binary_hinge_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements the hinge loss, which is + appropriate for binary classification problems. Therefore, the possible + labels when using this loss are +1 and -1. Moreover, it will cause the + network to produce outputs > 0 when predicting a member of the +1 class and + values < 0 otherwise. + !*/ + public: + + typedef float training_label_type; + typedef float output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except + it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().k() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + and the output label is the raw score for each classified object. If the score + is > 0 then the classifier is predicting the +1 class, otherwise it is + predicting the -1 class. + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().k() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + - all values pointed to by truth are +1 or -1. 
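+ Note that, as is standard for a hinge loss, each sample then contributes
+ max(0, 1 - y*score) to the (averaged) loss, where score is the network
+ output for that sample.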
+ !*/ + + }; + + template <typename SUBNET> + using loss_binary_hinge = add_loss_layer<loss_binary_hinge_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_binary_log_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements the log loss, which is + appropriate for binary classification problems. Therefore, the possible + labels when using this loss are +1 and -1. Moreover, it will cause the + network to produce outputs > 0 when predicting a member of the +1 class and + values < 0 otherwise. + + To be more specific, this object contains a sigmoid layer followed by a + cross-entropy layer. + !*/ + public: + + typedef float training_label_type; + typedef float output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except + it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().k() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + and the output label is the raw score for each classified object. If the score + is > 0 then the classifier is predicting the +1 class, otherwise it is + predicting the -1 class. + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().k() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + - all values pointed to by truth are +1 or -1. + !*/ + + }; + + template <typename SUBNET> + using loss_binary_log = add_loss_layer<loss_binary_log_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_multiclass_log_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements the multiclass logistic + regression loss (e.g. negative log-likelihood loss), which is appropriate + for multiclass classification problems. This means that the possible + labels when using this loss are integers >= 0. + + Moreover, if after training you were to replace the loss layer of the + network with a softmax layer, the network outputs would give the + probabilities of each class assignment. That is, if you have K classes + then the network should output tensors with the tensor::k()'th dimension + equal to K. Applying softmax to these K values gives the probabilities of + each class. The index into that K dimensional vector with the highest + probability is the predicted class label. + !*/ + + public: + + typedef unsigned long training_label_type; + typedef unsigned long output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! 
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except + it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + and the output label is the predicted class for each classified object. The number + of possible output classes is sub.get_output().k(). + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + - all values pointed to by truth are < sub.get_output().k() + !*/ + + }; + + template <typename SUBNET> + using loss_multiclass_log = add_loss_layer<loss_multiclass_log_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_multimulticlass_log_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements a collection of + multiclass classifiers. An example will make its use clear. So suppose, + for example, that you want to make something that takes a picture of a + vehicle and answers the following questions: + - What type of vehicle is it? A sedan or a truck? + - What color is it? red, green, blue, gray, or black? + You need two separate multi-class classifiers to do this. One to decide + the type of vehicle, and another to decide the color. The + loss_multimulticlass_log_ allows you to pack these two classifiers into one + neural network. This means that when you use the network to process an + image it will output 2 labels for each image, the type label and the color + label. + + To create a loss_multimulticlass_log_ for the above case you would + construct it as follows: + std::map<std::string,std::vector<std::string>> labels; + labels["type"] = {"sedan", "truck"}; + labels["color"] = {"red", "green", "blue", "gray", "black"}; + loss_multimulticlass_log_ myloss(labels); + Then you could use myloss with a network object and train it to do this + task. More generally, you can use any number of classifiers and labels + when using this object. Finally, each of the classifiers uses a standard + multi-class logistic regression loss. + !*/ + + public: + + loss_multimulticlass_log_( + ); + /*! + ensures + - #number_of_labels() == 0 + - #get_labels().size() == 0 + !*/ + + loss_multimulticlass_log_ ( + const std::map<std::string,std::vector<std::string>>& labels + ); + /*! + requires + - Each vector in labels must contain at least 2 strings. I.e. each + classifier must have at least two possible labels. + ensures + - #number_of_labels() == the total number of strings in all the + std::vectors in labels. + - #number_of_classifiers() == labels.size() + - #get_labels() == labels + !*/ + + unsigned long number_of_labels( + ) const; + /*! + ensures + - returns the total number of labels known to this loss. This is the count of + all the labels in each classifier. + !*/ + + unsigned long number_of_classifiers( + ) const; + /*! 
+ ensures + - returns the number of classifiers defined by this loss. + !*/ + + std::map<std::string,std::vector<std::string>> get_labels ( + ) const; + /*! + ensures + - returns the names of the classifiers and labels used by this loss. In + particular, if the returned object is L then: + - L[CLASS] == the set of labels used by the classifier CLASS. + - L.size() == number_of_classifiers() + - The count of strings in the vectors in L == number_of_labels() + !*/ + + class classifier_output + { + /*! + WHAT THIS OBJECT REPRESENTS + This object stores the predictions from one of the classifiers in + loss_multimulticlass_log_. It allows you to find out the most likely + string label predicted by that classifier, as well as get the class + conditional probability of any of the classes in the classifier. + !*/ + + public: + + classifier_output( + ); + /*! + ensures + - #num_classes() == 0 + !*/ + + size_t num_classes( + ) const; + /*! + ensures + - returns the number of possible classes output by this classifier. + !*/ + + double probability_of_class ( + size_t i + ) const; + /*! + requires + - i < num_classes() + ensures + - returns the probability that the true class has a label of label(i). + - The sum of probability_of_class(j) for j in the range [0, num_classes()) is always 1. + !*/ + + const std::string& label( + size_t i + ) const; + /*! + requires + - i < num_classes() + ensures + - returns the string label for the ith class. + !*/ + + operator std::string( + ) const; + /*! + requires + - num_classes() != 0 + ensures + - returns the string label for the most probable class. + !*/ + + friend std::ostream& operator<< (std::ostream& out, const classifier_output& item); + /*! + requires + - num_classes() != 0 + ensures + - prints the most probable class label to out. + !*/ + + }; + + // Both training_label_type and output_label_type should always have sizes equal to + // number_of_classifiers(). That is, the std::map should have an entry for every + // classifier known to this loss. + typedef std::map<std::string,std::string> training_label_type; + typedef std::map<std::string,classifier_output> output_label_type; + + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except + it has the additional calling requirements that: + - number_of_labels() != 0 + - sub.get_output().k() == number_of_labels() + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - number_of_labels() != 0 + - sub.get_output().k() == number_of_labels() + It should be noted that the last layer in your network should usually + be an fc layer. 
If so, you can satisfy this requirement of k() being + number_of_labels() by calling set_num_outputs() prior to training your + network like so: + your_network.subnet().layer_details().set_num_outputs(your_network.loss_details().number_of_labels()); + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + - All the std::maps pointed to by truth contain entries for all the + classifiers known to this loss. That is, it must be valid to call + truth[i][classifier] for any of the classifiers known to this loss. To + say this another way, all the training samples must contain labels for + each of the classifiers defined by this loss. + + To really belabor this, this also means that truth[i].size() == + get_labels().size() and that both truth[i] and get_labels() have the same + set of key strings. It also means that the value strings in truth[i] + must be strings known to the loss, i.e. they are valid labels according + to get_labels(). + !*/ + }; + + template <typename SUBNET> + using loss_multimulticlass_log = add_loss_layer<loss_multimulticlass_log_, SUBNET>; + + // Allow comparison between classifier_outputs and std::string to check if the + // predicted class is a particular string. + inline bool operator== (const std::string& lhs, const loss_multimulticlass_log_::classifier_output& rhs) + { return lhs == static_cast<const std::string&>(rhs); } + inline bool operator== (const loss_multimulticlass_log_::classifier_output& lhs, const std::string& rhs) + { return rhs == static_cast<const std::string&>(lhs); } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + enum class use_image_pyramid : uint8_t + { + no, + yes + }; + + struct mmod_options + { + /*! + WHAT THIS OBJECT REPRESENTS + This object contains all the parameters that control the behavior of loss_mmod_. + !*/ + + public: + + struct detector_window_details + { + detector_window_details() = default; + detector_window_details(unsigned long w, unsigned long h) : width(w), height(h) {} + detector_window_details(unsigned long w, unsigned long h, const std::string& l) : width(w), height(h), label(l) {} + + unsigned long width = 0; + unsigned long height = 0; + std::string label; + + friend inline void serialize(const detector_window_details& item, std::ostream& out); + friend inline void deserialize(detector_window_details& item, std::istream& in); + }; + + mmod_options() = default; + + // This kind of object detector is a sliding window detector. The detector_windows + // field determines how many sliding windows we will use and what the shape of each + // window is. It also determines the output label applied to each detection + // identified by each window. Since you will usually use the MMOD loss with an + // image pyramid, the detector sizes also determine the size of the smallest object + // you can detect. + std::vector<detector_window_details> detector_windows; + + // These parameters control how we penalize different kinds of mistakes. See + // Max-Margin Object Detection by Davis E. King (http://arxiv.org/abs/1502.00046) + // for further details. + double loss_per_false_alarm = 1; + double loss_per_missed_target = 1; + + // A detection must have an intersection-over-union value greater than this for us + // to consider it a match against a ground truth box. 
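+        // (As a reminder, the intersection-over-union of two boxes A and B is
+        // area(A intersect B)/area(A union B), a value in [0,1].  So the default of
+        // 0.5 below means a detection must cover at least half of the combined area
+        // of itself and a truth box before it can count as a match.)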
+        double truth_match_iou_threshold = 0.5;
+
+        // When doing non-max suppression, we use overlaps_nms to decide if a box overlaps
+        // an already output detection and should therefore be thrown out.
+        test_box_overlap overlaps_nms = test_box_overlap(0.4);
+
+        // Any mmod_rect in the training data that has its ignore field set to true defines
+        // an "ignore zone" in an image.  Any detection from that area is totally ignored
+        // by the optimizer.  Therefore, this overlaps_ignore field defines how we decide
+        // if a box falls into an ignore zone.  You use these ignore zones if there are
+        // objects in your dataset that you are unsure if you want to detect or otherwise
+        // don't care if the detector gets them or not.
+        test_box_overlap overlaps_ignore;
+
+        // Usually the detector would be scale-invariant, and used with an image pyramid.
+        // However, sometimes scale-invariance may not be desired.
+        use_image_pyramid assume_image_pyramid = use_image_pyramid::yes;
+
+        mmod_options (
+            const std::vector<std::vector<mmod_rect>>& boxes,
+            const unsigned long target_size,
+            const unsigned long min_target_size,
+            const double min_detector_window_overlap_iou = 0.75
+        );
+        /*!
+            requires
+                - 0 < min_target_size <= target_size
+                - 0.5 < min_detector_window_overlap_iou < 1
+            ensures
+                - #assume_image_pyramid == use_image_pyramid::yes
+                - This function should be used when scale-invariance is desired, and
+                  input_rgb_image_pyramid is therefore used as the input layer.
+                - This function tries to automatically set the MMOD options to reasonable
+                  values, assuming you have a training dataset of boxes.size() images, where
+                  the ith image contains objects boxes[i] you want to detect.
+                - The most important thing this function does is decide what detector
+                  windows should be used.  This is done by finding a set of detector
+                  windows that are sized such that:
+                    - When slid over an image pyramid, each box in boxes will have an
+                      intersection-over-union with one of the detector windows of at least
+                      min_detector_window_overlap_iou.  That is, we will make sure that
+                      each box in boxes could potentially be detected by one of the
+                      detector windows.  This essentially comes down to picking detector
+                      windows with aspect ratios similar to the aspect ratios in boxes.
+                      Note that we also make sure that each box can be detected by a window
+                      with the same label.  For example, if all the boxes had the same
+                      aspect ratio but there were 4 different labels used in boxes then
+                      there would be 4 resulting detector windows, one for each label.
+                    - The longest edge of each detector window is target_size pixels in
+                      length, unless the window's shortest side would be less than
+                      min_target_size pixels in length.  In this case the shortest side
+                      will be set to min_target_size length, and the other side sized to
+                      preserve the aspect ratio of the window.
+                  This means that target_size and min_target_size control the size of the
+                  detector windows, while the aspect ratios of the detector windows are
+                  automatically determined by the contents of boxes.  It should also be
+                  emphasized that the detector isn't going to be able to detect objects
+                  smaller than any of the detector windows.  So consider that when setting
+                  these sizes.
+                - This function will also set the overlaps_nms tester to the most
+                  restrictive tester that doesn't reject anything in boxes.
+        !*/
+
+        mmod_options (
+            use_image_pyramid assume_image_pyramid,
+            const std::vector<std::vector<mmod_rect>>& boxes,
+            const double min_detector_window_overlap_iou = 0.75
+        );
+        /*!
+            requires
+                - assume_image_pyramid == use_image_pyramid::no
+                - 0.5 < min_detector_window_overlap_iou < 1
+            ensures
+                - This function should be used when scale-invariance is not desired, and
+                  there is no intention to apply an image pyramid.
+                - This function tries to automatically set the MMOD options to reasonable
+                  values, assuming you have a training dataset of boxes.size() images, where
+                  the ith image contains objects boxes[i] you want to detect.
+                - The most important thing this function does is decide what detector
+                  windows should be used.  This is done by finding a set of detector
+                  windows that are sized such that:
+                    - When slid over an image, each box in boxes will have an
+                      intersection-over-union with one of the detector windows of at least
+                      min_detector_window_overlap_iou.  That is, we will make sure that
+                      each box in boxes could potentially be detected by one of the
+                      detector windows.
+                - This function will also set the overlaps_nms tester to the most
+                  restrictive tester that doesn't reject anything in boxes.
+        !*/
+    };
+
+    void serialize(const mmod_options& item, std::ostream& out);
+    void deserialize(mmod_options& item, std::istream& in);
+
+// ----------------------------------------------------------------------------------------
+
+    class loss_mmod_
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object implements the loss layer interface defined above by
+                EXAMPLE_LOSS_LAYER_.  In particular, it implements the Max Margin Object
+                Detection loss defined in the paper:
+                    Max-Margin Object Detection by Davis E. King (http://arxiv.org/abs/1502.00046).
+
+                This means you use this loss if you want to detect the locations of objects
+                in images.
+
+                It should also be noted that this loss layer requires an input layer that
+                defines the following functions:
+                    - image_contained_point()
+                    - tensor_space_to_image_space()
+                    - image_space_to_tensor_space()
+                A reference implementation of them and their definitions can be found in
+                the input_rgb_image_pyramid object, which is the recommended input layer to
+                be used with loss_mmod_.
+        !*/
+
+    public:
+
+        typedef std::vector<mmod_rect> training_label_type;
+        typedef std::vector<mmod_rect> output_label_type;
+
+        loss_mmod_(
+        );
+        /*!
+            ensures
+                - #get_options() == mmod_options()
+        !*/
+
+        loss_mmod_(
+            mmod_options options_
+        );
+        /*!
+            ensures
+                - #get_options() == options_
+        !*/
+
+        const mmod_options& get_options (
+        ) const;
+        /*!
+            ensures
+                - returns the options object that defines the general behavior of this loss layer.
+        !*/
+
+        template <
+            typename SUB_TYPE,
+            typename label_iterator
+            >
+        void to_label (
+            const tensor& input_tensor,
+            const SUB_TYPE& sub,
+            label_iterator iter,
+            double adjust_threshold = 0
+        ) const;
+        /*!
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+            it has the additional calling requirements that:
+                - sub.get_output().k() == 1
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+            Also, the output labels are std::vectors of mmod_rects where, for each mmod_rect R,
+            we have the following interpretations:
+                - R.rect == the location of an object in the image.
+                - R.detection_confidence == the score for the object, the bigger the score
+                  the more confident the detector is that an object is really there.  Only
+                  objects with a detection_confidence > adjust_threshold are output.  So if
+                  you want to output more objects (that are also of less confidence) you
+                  can call to_label() with a smaller value of adjust_threshold.
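+                  For instance, calling to_label() with adjust_threshold == -0.5
+                  would additionally return detections scoring in (-0.5, 0], which
+                  the default threshold of 0 discards.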
+ - R.ignore == false (this value is unused by to_label()). + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().k() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + Also, the loss value returned is roughly equal to the average number of + mistakes made per image. This is the sum of false alarms and missed + detections, weighted by the loss weights for these types of mistakes specified + in the mmod_options. + !*/ + }; + + template <typename SUBNET> + using loss_mmod = add_loss_layer<loss_mmod_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_metric_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it allows you to learn to map objects + into a vector space where objects sharing the same class label are close to + each other, while objects with different labels are far apart. + + To be specific, it optimizes the following loss function which considers + all pairs of objects in a mini-batch and computes a different loss depending + on their respective class labels. So if objects A1 and A2 in a mini-batch + share the same class label then their contribution to the loss is: + max(0, length(A1-A2)-get_distance_threshold() + get_margin()) + + While if A1 and B1 have different class labels then their contribution to + the loss function is: + max(0, get_distance_threshold()-length(A1-B1) + get_margin()) + + Therefore, this loss layer optimizes a version of the hinge loss. + Moreover, the loss is trying to make sure that all objects with the same + label are within get_distance_threshold() distance of each other. + Conversely, if two objects have different labels then they should be more + than get_distance_threshold() distance from each other in the learned + embedding. So this loss function gives you a natural decision boundary for + deciding if two objects are from the same class. + + Finally, the loss balances the number of negative pairs relative to the + number of positive pairs. Therefore, if there are N pairs that share the + same identity in a mini-batch then the algorithm will only include the N + worst non-matching pairs in the loss. That is, the algorithm performs hard + negative mining on the non-matching pairs. This is important since there + are in general way more non-matching pairs than matching pairs. So to + avoid imbalance in the loss this kind of hard negative mining is useful. + !*/ + public: + + typedef unsigned long training_label_type; + typedef matrix<float,0,1> output_label_type; + + loss_metric_( + ); + /*! + ensures + - #get_margin() == 0.04 + - #get_distance_threshold() == 0.6 + !*/ + + loss_metric_( + float margin, + float dist_thresh + ); + /*! + requires + - margin > 0 + - dist_thresh > 0 + ensures + - #get_margin() == margin + - #get_distance_threshold() == dist_thresh + !*/ + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! 
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+            it has the additional calling requirements that:
+                - sub.get_output().nr() == 1
+                - sub.get_output().nc() == 1
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+            This loss expects the network to produce a single vector (per sample) as
+            output.  This vector is the learned embedding.  Therefore, to_label() just
+            copies these output vectors from the network into the output label_iterators
+            given to this function, one for each sample in the input_tensor.
+        !*/
+
+        float get_margin() const;
+        /*!
+            ensures
+                - returns the margin value used by the loss function.  See the discussion
+                  in WHAT THIS OBJECT REPRESENTS for details.
+        !*/
+
+        float get_distance_threshold() const;
+        /*!
+            ensures
+                - returns the distance threshold value used by the loss function.  See the
+                  discussion in WHAT THIS OBJECT REPRESENTS for details.
+        !*/
+
+        template <
+            typename const_label_iterator,
+            typename SUBNET
+            >
+        double compute_loss_value_and_gradient (
+            const tensor& input_tensor,
+            const_label_iterator truth,
+            SUBNET& sub
+        ) const;
+        /*!
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+            except it has the additional calling requirements that:
+                - sub.get_output().nr() == 1
+                - sub.get_output().nc() == 1
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+        !*/
+
+    };
+
+    template <typename SUBNET>
+    using loss_metric = add_loss_layer<loss_metric_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+    class loss_ranking_
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object implements the loss layer interface defined above by
+                EXAMPLE_LOSS_LAYER_.  In particular, it implements the pairwise ranking
+                loss described in the paper:
+                    Optimizing Search Engines using Clickthrough Data by Thorsten Joachims
+
+                This is the same loss function used by the dlib::svm_rank_trainer object.
+                Therefore, it is generally appropriate when you have a two class problem
+                and you want to learn a function that ranks one class before the other.
+
+                So for example, suppose you have two classes of data.  Objects of type A
+                and objects of type B.  Moreover, suppose that you want to sort the objects
+                so that A objects always come before B objects.  This loss will help you
+                learn a function that assigns a real number to each object such that A
+                objects get a larger number assigned to them than B objects.  This lets you
+                then sort the objects according to the output of the neural network and
+                obtain the desired result of having A objects come before B objects.
+
+                The training labels should be positive values for objects you want to get
+                high scores and negative for objects that should get small scores.  So
+                relative to our A/B example, you would give A objects labels of +1 and B
+                objects labels of -1.  This should cause the learned network to give A
+                objects large positive values and B objects negative values.
+
+                Finally, the specific loss function is:
+                    For all pairs of positive vs negative training examples A_i and B_j respectively:
+                        sum_ij: max(0, B_j - A_i + margin_ij)
+                    where margin_ij = the label for A_i minus the label for B_j.  If you
+                always use +1 and -1 labels then the margin is always 2.  However, this
+                formulation allows you to give certain training samples different weight by
+                adjusting the training labels appropriately.
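+
+                For example, with the conventional +1/-1 labels a positive example
+                A_i scoring 0.3 and a negative example B_j scoring -0.1 contribute
+                max(0, -0.1 - 0.3 + 2) = 1.6 to the loss: the pair is already ranked
+                in the correct order, but it remains inside the margin and therefore
+                keeps generating gradient until the score gap exceeds 2.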
+        !*/
+
+    public:
+
+        typedef float training_label_type;
+        typedef float output_label_type;
+
+        template <
+            typename SUB_TYPE,
+            typename label_iterator
+            >
+        void to_label (
+            const tensor& input_tensor,
+            const SUB_TYPE& sub,
+            label_iterator iter
+        ) const;
+        /*!
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+            it has the additional calling requirements that:
+                - sub.get_output().nr() == 1
+                - sub.get_output().nc() == 1
+                - sub.get_output().k() == 1
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+            and the output label is the predicted ranking score.
+        !*/
+
+        template <
+            typename const_label_iterator,
+            typename SUBNET
+            >
+        double compute_loss_value_and_gradient (
+            const tensor& input_tensor,
+            const_label_iterator truth,
+            SUBNET& sub
+        ) const;
+        /*!
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+            except it has the additional calling requirements that:
+                - sub.get_output().nr() == 1
+                - sub.get_output().nc() == 1
+                - sub.get_output().k() == 1
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+        !*/
+
+    };
+
+    template <typename SUBNET>
+    using loss_ranking = add_loss_layer<loss_ranking_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+    class loss_epsilon_insensitive_
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object implements the loss layer interface defined above by
+                EXAMPLE_LOSS_LAYER_.  In particular, it implements the epsilon insensitive
+                loss, which is appropriate for regression problems.  Specifically, this
+                loss function is:
+                    loss(y1,y2) = abs(y1-y2)<epsilon ? 0 : abs(y1-y2)-epsilon
+
+                Therefore, the loss is basically just the abs() loss except there is a dead
+                zone around zero, causing the loss to not care about mistakes of magnitude
+                smaller than epsilon.
+        !*/
+    public:
+
+        typedef float training_label_type;
+        typedef float output_label_type;
+
+        loss_epsilon_insensitive_(
+        ) = default;
+        /*!
+            ensures
+                - #get_epsilon() == 1
+        !*/
+
+        loss_epsilon_insensitive_(
+            double eps
+        );
+        /*!
+            requires
+                - eps >= 0
+            ensures
+                - #get_epsilon() == eps
+        !*/
+
+        double get_epsilon (
+        ) const;
+        /*!
+            ensures
+                - returns the epsilon value used in the loss function.  Mistakes in the
+                  regressor smaller than get_epsilon() are ignored by the loss function.
+        !*/
+
+        void set_epsilon(
+            double eps
+        );
+        /*!
+            requires
+                - eps >= 0
+            ensures
+                - #get_epsilon() == eps
+        !*/
+
+        template <
+            typename SUB_TYPE,
+            typename label_iterator
+            >
+        void to_label (
+            const tensor& input_tensor,
+            const SUB_TYPE& sub,
+            label_iterator iter
+        ) const;
+        /*!
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+            it has the additional calling requirements that:
+                - sub.get_output().nr() == 1
+                - sub.get_output().nc() == 1
+                - sub.get_output().k() == 1
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+            and the output label is the predicted continuous variable.
+        !*/
+
+        template <
+            typename const_label_iterator,
+            typename SUBNET
+            >
+        double compute_loss_value_and_gradient (
+            const tensor& input_tensor,
+            const_label_iterator truth,
+            SUBNET& sub
+        ) const;
+        /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().k() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + !*/ + + }; + + template <typename SUBNET> + using loss_epsilon_insensitive = add_loss_layer<loss_epsilon_insensitive_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_mean_squared_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements the mean squared loss, which is + appropriate for regression problems. + !*/ + public: + + typedef float training_label_type; + typedef float output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except + it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().k() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + and the output label is the predicted continuous variable. + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().k() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + !*/ + + }; + + template <typename SUBNET> + using loss_mean_squared = add_loss_layer<loss_mean_squared_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_mean_squared_multioutput_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements the mean squared loss, + which is appropriate for regression problems. It is basically just like + loss_mean_squared_ except that it lets you define multiple outputs instead + of just 1. + !*/ + public: + + typedef matrix<float> training_label_type; + typedef matrix<float> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except + it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + and the output label is the predicted continuous variable. + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! 
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + - (*(truth + idx)).nc() == 1 for all idx such that 0 <= idx < sub.get_output().num_samples() + - (*(truth + idx)).nr() == sub.get_output().k() for all idx such that 0 <= idx < sub.get_output().num_samples() + !*/ + + }; + + template <typename SUBNET> + using loss_mean_squared_multioutput = add_loss_layer<loss_mean_squared_multioutput_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_multiclass_log_per_pixel_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements the multiclass logistic + regression loss (e.g. negative log-likelihood loss), which is appropriate + for multiclass classification problems. It is basically just like + loss_multiclass_log_ except that it lets you define matrix outputs instead + of scalar outputs. It should be useful, for example, in semantic + segmentation where we want to classify each pixel of an image. + !*/ + public: + + // In semantic segmentation, if you don't know the ground-truth of some pixel, + // set the label of that pixel to this value. When you do so, the pixel will be + // ignored when computing gradients. + static const uint16_t label_to_ignore = std::numeric_limits<uint16_t>::max(); + + // In semantic segmentation, 65535 classes ought to be enough for anybody. + typedef matrix<uint16_t> training_label_type; + typedef matrix<uint16_t> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except + it has the additional calling requirements that: + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + and the output label is the predicted class for each classified element. The number + of possible output classes is sub.get_output().k(). + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + - all values pointed to by truth are < sub.get_output().k() or are equal to label_to_ignore. + !*/ + + }; + + template <typename SUBNET> + using loss_multiclass_log_per_pixel = add_loss_layer<loss_multiclass_log_per_pixel_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_multiclass_log_per_pixel_weighted_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements the multiclass logistic + regression loss (e.g. negative log-likelihood loss), which is appropriate + for multiclass classification problems. 
It is basically just like + loss_multiclass_log_per_pixel_ except that it lets you define per-pixel + weights, which may be useful e.g. if you want to emphasize rare classes + while training. (If the classification problem is difficult, a flat weight + structure may lead the network to always predict the most common label, in + particular if the degree of imbalance is high. To emphasize a certain + class or classes, simply increase the weights of the corresponding pixels, + relative to the weights of the other pixels.) + + Note that if you set the weight to 0 whenever a pixel's label is equal to + loss_multiclass_log_per_pixel_::label_to_ignore, and to 1 otherwise, then + you essentially get loss_multiclass_log_per_pixel_ as a special case. + !*/ + public: + + struct weighted_label + { + /*! + WHAT THIS OBJECT REPRESENTS + This object represents the truth label of a single pixel, together with + an associated weight (the higher the weight, the more emphasis the + corresponding pixel is given during the training). + !*/ + + weighted_label(); + weighted_label(uint16_t label, float weight = 1.f); + + // The ground-truth label. In semantic segmentation, 65536 classes ought to be + // enough for anybody. + uint16_t label = 0; + + // The weight of the corresponding pixel. + float weight = 1.f; + }; + + typedef matrix<weighted_label> training_label_type; + typedef matrix<uint16_t> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except + it has the additional calling requirements that: + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + and the output label is the predicted class for each classified element. The number + of possible output classes is sub.get_output().k(). + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + - all labels pointed to by truth are < sub.get_output().k(), or the corresponding weight + is zero. + !*/ + + }; + + template <typename SUBNET> + using loss_multiclass_log_per_pixel_weighted = add_loss_layer<loss_multiclass_log_per_pixel_weighted_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_mean_squared_per_pixel_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements the mean squared loss, + which is appropriate for regression problems. It is basically just like + loss_mean_squared_multioutput_ except that it lets you define matrix or + image outputs, instead of vector. + !*/ + public: + + typedef matrix<float> training_label_type; + typedef matrix<float> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! 
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+            it has the additional calling requirements that:
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+            and the output labels are the predicted continuous variables.
+        !*/
+
+        template <
+            typename const_label_iterator,
+            typename SUBNET
+            >
+        double compute_loss_value_and_gradient (
+            const tensor& input_tensor,
+            const_label_iterator truth,
+            SUBNET& sub
+        ) const;
+        /*!
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+            except it has the additional calling requirements that:
+                - sub.get_output().k() == 1
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+                - for all idx such that 0 <= idx < sub.get_output().num_samples():
+                    - sub.get_output().nr() == (*(truth + idx)).nr()
+                    - sub.get_output().nc() == (*(truth + idx)).nc()
+        !*/
+    };
+
+    template <typename SUBNET>
+    using loss_mean_squared_per_pixel = add_loss_layer<loss_mean_squared_per_pixel_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+    class loss_dot_
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object implements the loss layer interface defined above by
+                EXAMPLE_LOSS_LAYER_.  In particular, selecting this loss means you want to
+                maximize the dot product between the output of a network and a set of
+                training vectors.  The loss is therefore the negative dot product.  To be
+                very specific, if X is the output vector of a network and Y is a training
+                label (also a vector), then the loss for this training sample is: -dot(X,Y)
+        !*/
+
+    public:
+
+        typedef matrix<float,0,1> training_label_type;
+        typedef matrix<float,0,1> output_label_type;
+
+        template <
+            typename SUB_TYPE,
+            typename label_iterator
+            >
+        void to_label (
+            const tensor& input_tensor,
+            const SUB_TYPE& sub,
+            label_iterator iter
+        ) const;
+        /*!
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+            it has the additional calling requirements that:
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+            and the output labels are simply the final network outputs stuffed into a
+            vector.  To be very specific, the output is the following for all valid i:
+                *(iter+i) == trans(rowm(mat(sub.get_output()),i))
+        !*/
+
+
+        template <
+            typename const_label_iterator,
+            typename SUBNET
+            >
+        double compute_loss_value_and_gradient (
+            const tensor& input_tensor,
+            const_label_iterator truth,
+            SUBNET& sub
+        ) const;
+        /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + - Let NETWORK_OUTPUT_DIMS == sub.get_output().size()/sub.get_output().num_samples() + - for all idx such that 0 <= idx < sub.get_output().num_samples(): + - NETWORK_OUTPUT_DIMS == (*(truth + idx)).size() + !*/ + }; + + template <typename SUBNET> + using loss_dot = add_loss_layer<loss_dot_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_LOSS_ABSTRACT_H_ + diff --git a/ml/dlib/dlib/dnn/solvers.h b/ml/dlib/dlib/dnn/solvers.h new file mode 100644 index 000000000..204541a7e --- /dev/null +++ b/ml/dlib/dlib/dnn/solvers.h @@ -0,0 +1,405 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNn_SOLVERS_H_ +#define DLIB_DNn_SOLVERS_H_ + +#include "solvers_abstract.h" +#include "tensor.h" +#include <iostream> +#include "layers.h" + +namespace dlib +{ + class sgd + { + public: + + explicit sgd( + float weight_decay_, + float momentum_ = 0.9 + ) + { + weight_decay = weight_decay_; + momentum = momentum_; + } + + sgd( + ) : sgd(0.0005, 0.9) + { + } + + float get_momentum ( + ) const { return momentum; } + + float get_weight_decay ( + ) const { return weight_decay; } + + template <typename layer_type> + const tensor& operator() ( + const float learning_rate, + const layer_type& l, + const tensor& params_grad + ) + { + const tensor& params = l.get_layer_params(); + + DLIB_CASSERT(params.size() != 0); + if (v.size() == 0) + { + v.copy_size(params_grad); + v = 0; + } + + const double lr = learning_rate*get_learning_rate_multiplier(l); + const double wd = weight_decay*get_weight_decay_multiplier(l); + + //perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad); + tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr); + + return v; + } + + template <unsigned long N> + const tensor& operator() ( + const float learning_rate, + const fc_<N,FC_HAS_BIAS>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.get_num_outputs()); + return v; + } + + template < + long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y, + int _padding_x + > + const tensor& operator() ( + const float learning_rate, + const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters()); + return v; + } + + template < + long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y, + int _padding_x + > + const tensor& operator() ( + const float learning_rate, + const cont_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters()); + return v; + } + + template < layer_mode mode > + const tensor& operator() ( + const float learning_rate, + const bn_<mode>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size()/2); + return v; + } + + friend void serialize(const sgd& item, std::ostream& out) + { + serialize("sgd2", 
out); + serialize(item.v, out); + serialize(item.weight_decay, out); + serialize(item.momentum, out); + } + + friend void deserialize(sgd& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "sgd2") + throw serialization_error("Unexpected version found while deserializing dlib::sgd."); + deserialize(item.v, in); + deserialize(item.weight_decay, in); + deserialize(item.momentum, in); + } + + friend std::ostream& operator<< (std::ostream& out, const sgd& item) + { + out << "sgd: weight_decay="<<item.get_weight_decay() << ", momentum="<<item.get_momentum(); + return out; + } + + private: + + template <typename layer_type> + void update_considering_bias( + const float learning_rate, + const layer_type& l, + const tensor& params_grad, + unsigned long bias_offset + ) + { + const tensor& params = l.get_layer_params(); + + DLIB_CASSERT(params.size() != 0); + if (v.size() == 0) + { + v.copy_size(params_grad); + v = 0; + } + + double lr = learning_rate*get_learning_rate_multiplier(l); + double wd = weight_decay*get_weight_decay_multiplier(l); + + //perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad); + + if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1) + { + tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr); + } + else + { + + tt::affine_transform_range(0, bias_offset, v, v, params, params_grad, momentum, -wd*lr, -lr); + + // now update the biases but apply their multipliers + lr *= l.get_bias_learning_rate_multiplier(); + wd *= l.get_bias_weight_decay_multiplier(); + tt::affine_transform_range(bias_offset, v.size(), v, v, params, params_grad, momentum, -wd*lr, -lr); + } + } + + resizable_tensor v; + float weight_decay; + float momentum; + + }; + +// ---------------------------------------------------------------------------------------- + + class adam + { + public: + + adam( + float weight_decay_, + float momentum1_, + float momentum2_ + ) + { + weight_decay = weight_decay_; + momentum1 = momentum1_; + momentum2 = momentum2_; + t = 0; + } + + adam( + ) : adam(0.0005, 0.9, 0.999) + {} + + float get_momentum1 ( + ) const { return momentum1; } + + float get_momentum2 ( + ) const { return momentum2; } + + float get_weight_decay ( + ) const { return weight_decay; } + + template <typename layer_type> + const tensor& operator() ( + const float learning_rate, + const layer_type& l, + const tensor& params_grad + ) + { + const tensor& params = l.get_layer_params(); + DLIB_CASSERT(params.size() != 0); + if (v.size() == 0) + { + m.copy_size(params_grad); + m = 0; + v.copy_size(params_grad); + v = 0; + s.copy_size(params_grad); + } + + ++t; + + + tt::compute_adam_update(0, params.size(), s, m, v, t, + learning_rate*get_learning_rate_multiplier(l), + weight_decay*get_weight_decay_multiplier(l), + momentum1, momentum2, params, params_grad); + + return s; + } + + template <unsigned long N> + const tensor& operator() ( + const float learning_rate, + const fc_<N,FC_HAS_BIAS>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.get_num_outputs()); + return s; + } + + template < + long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y, + int _padding_x + > + const tensor& operator() ( + const float learning_rate, + const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, 
params_grad.size()-l.num_filters()); + return s; + } + + template < + long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y, + int _padding_x + > + const tensor& operator() ( + const float learning_rate, + const cont_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters()); + return s; + } + + template < layer_mode mode > + const tensor& operator() ( + const float learning_rate, + const bn_<mode>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size()/2); + return s; + } + + + friend void serialize(const adam& item, std::ostream& out) + { + serialize("adam2", out); + serialize(item.m, out); + serialize(item.v, out); + serialize(item.s, out); + serialize(item.weight_decay, out); + serialize(item.momentum1, out); + serialize(item.momentum2, out); + serialize(item.t, out); + } + + friend void deserialize(adam& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "adam2") + throw serialization_error("Unexpected version found while deserializing dlib::adam."); + deserialize(item.m, in); + deserialize(item.v, in); + deserialize(item.s, in); + deserialize(item.weight_decay, in); + deserialize(item.momentum1, in); + deserialize(item.momentum2, in); + deserialize(item.t, in); + } + + friend std::ostream& operator<< (std::ostream& out, const adam& item) + { + out << "adam: weight_decay="<<item.get_weight_decay() << ", momentum1="<<item.get_momentum1() << ", momentum2="<<item.get_momentum2(); + return out; + } + + private: + + template <typename layer_type> + void update_considering_bias( + const float learning_rate, + const layer_type& l, + const tensor& params_grad, + unsigned long bias_offset + ) + { + const tensor& params = l.get_layer_params(); + DLIB_CASSERT(params.size() != 0); + if (v.size() == 0) + { + m.copy_size(params_grad); + m = 0; + v.copy_size(params_grad); + v = 0; + s.copy_size(params_grad); + } + + + ++t; + + if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1) + { + tt::compute_adam_update(0, params.size(), s, m, v, t, + learning_rate*get_learning_rate_multiplier(l), + weight_decay*get_weight_decay_multiplier(l), + momentum1, momentum2, params, params_grad); + } + else + { + tt::compute_adam_update(0, bias_offset, s, m, v, t, + learning_rate*get_learning_rate_multiplier(l), + weight_decay*get_weight_decay_multiplier(l), + momentum1, momentum2, params, params_grad); + + tt::compute_adam_update(bias_offset, params.size(), s, m, v, t, + learning_rate*get_learning_rate_multiplier(l)*l.get_bias_learning_rate_multiplier(), + weight_decay*get_weight_decay_multiplier(l)*l.get_bias_weight_decay_multiplier(), + momentum1, momentum2, params, params_grad); + } + } + resizable_tensor m; + resizable_tensor v; + resizable_tensor s; + float weight_decay; + float momentum1; + float momentum2; + float t; + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_SOLVERS_H_ + diff --git a/ml/dlib/dlib/dnn/solvers_abstract.h b/ml/dlib/dlib/dnn/solvers_abstract.h new file mode 100644 index 000000000..d10ef163a --- /dev/null +++ b/ml/dlib/dlib/dnn/solvers_abstract.h @@ -0,0 +1,204 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
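+
+// A quick orientation (a sketch, not a declaration from this header): solvers are
+// rarely invoked directly.  They are normally handed to a dnn_trainer (see
+// trainer_abstract.h), which keeps one solver instance per layer and calls its
+// operator() after each mini-batch.  For example:
+//
+//     net_type net;
+//     dnn_trainer<net_type, sgd> trainer(net, sgd(0.0005, 0.9));
+//     trainer.set_learning_rate(0.1);
+//     trainer.train(samples, labels);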
+#undef DLIB_DNn_SOLVERS_ABSTRACT_H_ +#ifdef DLIB_DNn_SOLVERS_ABSTRACT_H_ + +#include "tensor_abstract.h" +#include <iostream> + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + class EXAMPLE_SOLVER + { + /*! + WHAT THIS OBJECT REPRESENTS + A solver defines the parameter update rule for a single layer in a deep + neural network. It takes a parameter gradient vector and the layer's + parameters and tells you how the parameters should be updated. + Importantly, each solver instance is used with only one layer in a network. + This allows us to define solvers that have per layer state, for example, a + solver may keep a momentum term and apply it to its update rule. + + Note that there is no dlib::EXAMPLE_SOLVER type. It is shown here purely + to document the interface a solver object must implement. + !*/ + + public: + + EXAMPLE_SOLVER( + ); + + template <typename layer_type> + const tensor& operator() ( + const float learning_rate, + const layer_type& l, + const tensor& params_grad + ) + /*! + requires + - l.get_layer_params().size() != 0 + - have_same_dimensions(l.get_layer_params(), params_grad) == true. + - When this function is invoked on a particular solver instance, it is + always supplied with the same layer instance, l. That is, the solver is + allowed to remember things from one invocation to another and to assume + that it is being serially applied to optimize the same layer's + parameters. + ensures + - Returns a step vector V that is intended to be used to update the + parameters by adding V to l.get_layer_params(). + - This function will use the given "learning rate" to compute V. How the + learning rate is used is solver dependent. But in general the learning + rate should be used to select the step size, i.e. to somehow determine + the magnitude of V. + !*/ + }; + + void serialize(const EXAMPLE_SOLVER& item, std::ostream& out); + void deserialize(EXAMPLE_SOLVER& item, std::istream& in); + /*! + provides serialization support + !*/ + + std::ostream& operator<< (std::ostream& out, const EXAMPLE_SOLVER& item); + /*! + Prints the solver's name and parameters to out. + !*/ + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + class sgd + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the EXAMPLE_SOLVER interface defined above. It is a + basic stochastic gradient descent solver which uses momentum and weight + decay. In particular, it computes the update vector V according to: + V = momentum*V - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad; + Here V is a momentum term that is remembered by the solver from one + invocation of operator() to the next. + + + Note that the actual learning rate and weight decay used by the solver are + multiplied by the per layer multipliers. That is, the solver will call + get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and + multiply these values with the nominal learning rate and weight decay, + respectively, to determine the values it will use during each step. 
It is
+            also overloaded to allow additional learning rate multipliers to be applied
+            to fc_ and con_ bias parameters.
+        !*/
+    public:
+
+        sgd(
+        );
+        /*!
+            ensures
+                - #get_weight_decay() == 0.0005
+                - #get_momentum() == 0.9
+        !*/
+
+        explicit sgd(
+            float weight_decay,
+            float momentum = 0.9
+        );
+        /*!
+            requires
+                - weight_decay >= 0
+                - momentum >= 0
+            ensures
+                - #get_weight_decay() == weight_decay
+                - #get_momentum() == momentum
+        !*/
+
+        float get_weight_decay () const;
+        float get_momentum () const;
+    };
+
+    void serialize(const sgd& item, std::ostream& out);
+    void deserialize(sgd& item, std::istream& in);
+    /*!
+        provides serialization support
+    !*/
+
+    std::ostream& operator<< (std::ostream& out, const sgd& item);
+    /*!
+        Prints the solver's name and parameters to out.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    class adam
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object implements the EXAMPLE_SOLVER interface defined above.  In
+                particular, it implements the ADAM parameter update method described in the
+                paper:
+                    Kingma, Diederik P., and Jimmy Ba.  "Adam: A method for stochastic
+                    optimization."  International Conference on Learning Representations.  2015.
+
+                Note that the actual learning rate and weight decay used by the solver are
+                multiplied by the per layer multipliers.  That is, the solver will call
+                get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
+                multiply these values with the nominal learning rate and weight decay,
+                respectively, to determine the values it will use during each step.  It is
+                also overloaded to allow additional learning rate multipliers to be applied
+                to fc_ and con_ bias parameters.
+        !*/
+
+    public:
+
+        adam(
+        );
+        /*!
+            ensures
+                - #get_weight_decay() == 0.0005
+                - #get_momentum1() == 0.9
+                - #get_momentum2() == 0.999
+        !*/
+
+        adam(
+            float weight_decay,
+            float momentum1,
+            float momentum2
+        );
+        /*!
+            requires
+                - weight_decay >= 0
+                - 0 <= momentum1 < 1
+                - 0 <= momentum2 < 1
+            ensures
+                - #get_weight_decay() == weight_decay
+                - #get_momentum1() == momentum1
+                - #get_momentum2() == momentum2
+        !*/
+
+        float get_weight_decay () const;
+        float get_momentum1 () const;
+        float get_momentum2 () const;
+    };
+
+    void serialize(const adam& item, std::ostream& out);
+    void deserialize(adam& item, std::istream& in);
+    /*!
+        provides serialization support
+    !*/
+
+    std::ostream& operator<< (std::ostream& out, const adam& item);
+    /*!
+        Prints the solver's name and parameters to out.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_SOLVERS_ABSTRACT_H_
+
diff --git a/ml/dlib/dlib/dnn/tensor.h b/ml/dlib/dlib/dnn/tensor.h
new file mode 100644
index 000000000..8039fe666
--- /dev/null
+++ b/ml/dlib/dlib/dnn/tensor.h
@@ -0,0 +1,686 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
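+
+// A rough mental model (a sketch, assuming the resizable_tensor defined later in
+// this file): a dlib tensor is a 4-D array of floats with dimensions
+// (num_samples, k, nr, nc), and host()/device() expose synchronized CPU/GPU views
+// of the same underlying memory.  For example:
+//
+//     resizable_tensor t(2, 3, 4, 4);  // 2 samples, 3 channels, each 4x4
+//     t = 1.0f;                        // fill via tensor::operator=(float)
+//     t *= 0.5f;                       // scale in place
+//     const float* p = t.host();       // CPU-side view of all 2*3*4*4 floats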
+#ifndef DLIB_DNn_TENSOR_H_ +#define DLIB_DNn_TENSOR_H_ + +#include "tensor_abstract.h" +#include <cstring> +#include "../matrix.h" +#include "cudnn_dlibapi.h" +#include "gpu_data.h" +#include "../byte_orderer.h" +#include <memory> +#include "../any.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class tensor; + namespace cuda + { + void set_tensor ( + tensor& t, + float value + ); + + void scale_tensor ( + tensor& t, + float value + ); + } + +// ---------------------------------------------------------------------------------------- + + class tensor + { + public: + + tensor ( + ) : + m_n(0), m_k(0), m_nr(0), m_nc(0), m_size(0) + { + } + + virtual ~tensor() {} + + long long num_samples() const { return m_n; } + long long k() const { return m_k; } + long long nr() const { return m_nr; } + long long nc() const { return m_nc; } + size_t size() const { return m_size; } + + typedef float* iterator; + typedef const float* const_iterator; + iterator begin() { return host(); } + const_iterator begin() const { return host(); } + iterator end() { return host()+size(); } + const_iterator end() const { return host()+size(); } + + void async_copy_to_device() const + { + data().async_copy_to_device(); + } + + virtual const float* host() const = 0; + virtual float* host() = 0; + virtual float* host_write_only() = 0; + virtual const float* device() const = 0; + virtual float* device() = 0; + virtual float* device_write_only() = 0; + + virtual const any& annotation() const = 0; + virtual any& annotation() = 0; + + int device_id() const { return data().device_id(); } + + tensor& operator= (float val) + { +#ifdef DLIB_USE_CUDA + // If you are using CUDA then presumably you will be mostly using tensors on + // the GPU. So unless you seem to be actively working with the host side's + // data then we do this initialization on the device side since this avoids a + // host to device transfer that would likely immediately follow. 
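+ // (Editor's note: device_ready() reports, in effect, whether the device
+ // already holds the current copy of this tensor's data; when it does, the
+ // assignment below runs as a CUDA kernel and the host buffer is never
+ // touched.)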
+ if (data().device_ready()) + { + cuda::set_tensor(*this, val); + return *this; + } +#endif + auto d = host_write_only(); + for (size_t i = 0; i < size(); ++i) + d[i] = val; + + return *this; + } + + tensor& operator*= (float val) + { +#ifdef DLIB_USE_CUDA + cuda::scale_tensor(*this, val); + return *this; +#else + for (auto& d : *this) + d *= val; + + return *this; +#endif + } + + tensor& operator/= (float val) + { + *this *= 1.0/val; + return *this; + } + + template <typename EXP> + tensor& operator= (const matrix_exp<EXP>& item) + { + DLIB_CASSERT(num_samples() == item.nr() && + nr()*nc()*k() == item.nc()); + static_assert((is_same_type<float, typename EXP::type>::value == true), + "To assign a matrix to a tensor the matrix must contain float values"); + + set_ptrm(host_write_only(), m_n, m_nr*m_nc*m_k) = item; + return *this; + } + + template <typename EXP> + tensor& operator+= (const matrix_exp<EXP>& item) + { + DLIB_CASSERT(num_samples() == item.nr() && + nr()*nc()*k() == item.nc()); + static_assert((is_same_type<float, typename EXP::type>::value == true), + "To assign a matrix to a tensor the matrix must contain float values"); + set_ptrm(host(), m_n, m_nr*m_nc*m_k) += item; + return *this; + } + + template <typename EXP> + tensor& operator-= (const matrix_exp<EXP>& item) + { + DLIB_CASSERT(num_samples() == item.nr() && + nr()*nc()*k() == item.nc()); + static_assert((is_same_type<float, typename EXP::type>::value == true), + "To assign a matrix to a tensor the matrix must contain float values"); + set_ptrm(host(), m_n, m_nr*m_nc*m_k) -= item; + return *this; + } + + template <typename EXP> + void set_sample ( + unsigned long long idx, + const matrix_exp<EXP>& item + ) + { + DLIB_CASSERT(idx < (unsigned long long)num_samples()); + DLIB_CASSERT(item.size() == nr()*nc()*k()); + static_assert((is_same_type<float, typename EXP::type>::value == true), + "To assign a matrix to a tensor the matrix must contain float values"); + set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) = item; + } + + + template <typename EXP> + void add_to_sample ( + unsigned long long idx, + const matrix_exp<EXP>& item + ) + { + DLIB_CASSERT(idx < (unsigned long long)num_samples()); + DLIB_CASSERT(item.size() == nr()*nc()*k()); + static_assert((is_same_type<float, typename EXP::type>::value == true), + "To assign a matrix to a tensor the matrix must contain float values"); + set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) += item; + } + + +#ifdef DLIB_USE_CUDA + virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor ( + ) const = 0; +#endif + + friend void memcpy ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(dest.size() == src.size()); + memcpy(dest.data(), dest.get_alias_offset(), + src.data(), src.get_alias_offset(), + src.size()); + } + + + protected: + + friend class alias_tensor; + + virtual gpu_data& data() = 0; + virtual const gpu_data& data() const = 0; + virtual size_t get_alias_offset() const { return 0; } // needed by alias_tensor. 
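+ // (Editor's note: the members below follow the num_samples() x k() x nr() x nc()
+ // convention used throughout dlib's DNN code; assuming the usual row-major
+ // layout, the element at (sample, channel, r, c) lives at
+ // host()[((sample*k() + channel)*nr() + r)*nc() + c].)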
+ + long long m_n; + long long m_k; + long long m_nr; + long long m_nc; + long long m_size; // always equal to m_n*m_k*m_nr*m_nc + }; + +// ---------------------------------------------------------------------------------------- + + inline bool is_vector ( + const tensor& t + ) + { + return t.size() == (size_t)t.num_samples() || + t.size() == (size_t)t.k() || + t.size() == (size_t)t.nr() || + t.size() == (size_t)t.nc(); + } + +// ---------------------------------------------------------------------------------------- + + inline const matrix_op<op_pointer_to_mat<float> > mat ( + const tensor& t, + long long nr, + long long nc + ) + { + DLIB_ASSERT(nr >= 0 && nc >= 0 , + "\tconst matrix_exp mat(tensor, nr, nc)" + << "\n\t nr and nc must be >= 0" + << "\n\t nr: " << nr + << "\n\t nc: " << nc + ); + DLIB_ASSERT(nr*nc == (long long)t.size() , + "\tconst matrix_exp mat(tensor, nr, nc)" + << "\n\t The sizes don't match up." + << "\n\t nr*nc: " << nr*nc + << "\n\t t.size(): " << t.size() + ); + typedef op_pointer_to_mat<float> op; + return matrix_op<op>(op(t.host(),nr,nc)); + } + + inline const matrix_op<op_pointer_to_mat<float> > mat ( + const tensor& t + ) + { + if (t.size() != 0) + return mat(t, t.num_samples(), t.size()/t.num_samples()); + else + return mat((float*)0,0,0); + } + + inline const matrix_op<op_pointer_to_mat<float> > image_plane ( + const tensor& t, + long long sample = 0, + long long k = 0 + ) + { + DLIB_ASSERT(0 <= sample && sample < t.num_samples() && + 0 <= k && k < t.k() && + t.size() != 0, + "\tconst matrix_exp image_plane(tensor,sample,k)" + << "\n\t Invalid arguments were given to this function." + << "\n\t sample: " << sample + << "\n\t k: " << k + << "\n\t t.num_samples(): " << t.num_samples() + << "\n\t t.k(): " << t.k() + << "\n\t t.size(): " << t.size() + ); + + + typedef op_pointer_to_mat<float> op; + return matrix_op<op>(op(t.host() + ((sample*t.k() + k)*t.nr())*t.nc(), + t.nr(), + t.nc())); + } + +// ---------------------------------------------------------------------------------------- + + inline bool have_same_dimensions ( + const tensor& a, + const tensor& b + ) + { + return a.num_samples() == b.num_samples() && + a.k() == b.k() && + a.nr() == b.nr() && + a.nc() == b.nc(); + } + +// ---------------------------------------------------------------------------------------- + + class resizable_tensor : public tensor + { + public: + resizable_tensor( + ) + {} + + template <typename EXP> + resizable_tensor( + const matrix_exp<EXP>& item + ) + { + set_size(item.nr(), item.nc()); + *this = item; + } + + explicit resizable_tensor( + long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1 + ) + { + DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0); + + set_size(n_,k_,nr_,nc_); + } + + resizable_tensor(const resizable_tensor& item) : _annotation(item.annotation()) + { + copy_size(item); + memcpy(*this, item); + } + resizable_tensor(const tensor& item) : _annotation(item.annotation()) + { + copy_size(item); + memcpy(*this, item); + } + + resizable_tensor(resizable_tensor&& item) { swap(item); } + resizable_tensor& operator=(resizable_tensor&& item) { swap(item); return *this; } + + virtual const float* host() const { return data_instance.host(); } + virtual float* host() { return data_instance.host(); } + virtual float* host_write_only() { return data_instance.host_write_only(); } + virtual const float* device() const { return data_instance.device(); } + virtual float* device() { return data_instance.device(); } + virtual float* device_write_only() { 
return data_instance.device_write_only(); } + + virtual const any& annotation() const { return _annotation; } + virtual any& annotation() { return _annotation; } + + void clear( + ) + { + set_size(0,0,0,0); + _annotation.clear(); + // free underlying memory + data_instance.set_size(0); + } + + void copy_size ( + const tensor& item + ) + { + set_size(item.num_samples(), item.k(), item.nr(), item.nc()); + } + + resizable_tensor& operator= (float val) + { + tensor::operator=(val); + return *this; + } + + template <typename EXP> + resizable_tensor& operator= ( + const matrix_exp<EXP>& item + ) + { + if (!(num_samples() == item.nr() && k()*nr()*nc() == item.nc())) + set_size(item.nr(), item.nc()); + tensor::operator=(item); + return *this; + } + + void set_size( + long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1 + ) + { + DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0); + + m_n = n_; + m_k = k_; + m_nr = nr_; + m_nc = nc_; + m_size = n_*k_*nr_*nc_; + if ((long long)data_instance.size() < m_size) + data_instance.set_size(m_size); +#ifdef DLIB_USE_CUDA + cudnn_descriptor.set_size(m_n,m_k,m_nr,m_nc); +#endif + } + + + resizable_tensor& operator= (const resizable_tensor& item) + { + resizable_tensor temp(item); + temp.swap(*this); + return *this; + } + + resizable_tensor& operator= (const tensor& item) + { + resizable_tensor temp(item); + temp.swap(*this); + return *this; + } + + + void swap(resizable_tensor& item) + { + std::swap(m_n, item.m_n); + std::swap(m_k, item.m_k); + std::swap(m_nr, item.m_nr); + std::swap(m_nc, item.m_nc); + std::swap(m_size, item.m_size); + std::swap(data_instance, item.data_instance); + std::swap(_annotation, item._annotation); +#ifdef DLIB_USE_CUDA + std::swap(cudnn_descriptor, item.cudnn_descriptor); +#endif + } + +#ifdef DLIB_USE_CUDA + virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor ( + ) const { return cudnn_descriptor; } +#endif + + private: + +#ifdef DLIB_USE_CUDA + cuda::tensor_descriptor cudnn_descriptor; +#endif + + gpu_data data_instance; + any _annotation; + virtual gpu_data& data() { return data_instance; } + virtual const gpu_data& data() const { return data_instance; } + }; + + inline void serialize(const tensor& item, std::ostream& out) + { + int version = 2; + serialize(version, out); + serialize(item.num_samples(), out); + serialize(item.k(), out); + serialize(item.nr(), out); + serialize(item.nc(), out); + byte_orderer bo; + auto sbuf = out.rdbuf(); + for (auto d : item) + { + // Write out our data as 4byte little endian IEEE floats rather than using + // dlib's default float serialization. We do this because it will result in + // more compact outputs. It's slightly less portable but it seems doubtful + // that any CUDA enabled platform isn't going to use IEEE floats. But if one + // does we can just update the serialization code here to handle it if such a + // platform is encountered. 
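+ // (Editor's note: host_to_little() is a no-op on little endian machines and
+ // byte-swaps d on big endian ones, so the stream format is identical either
+ // way.)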
+ bo.host_to_little(d); + static_assert(sizeof(d)==4, "This serialization code assumes we are writing 4 byte floats"); + sbuf->sputn((char*)&d, sizeof(d)); + } + } + + inline void deserialize(resizable_tensor& item, std::istream& in) + { + int version; + deserialize(version, in); + if (version != 2) + throw serialization_error("Unexpected version found while deserializing dlib::resizable_tensor."); + + long long num_samples=0, k=0, nr=0, nc=0; + deserialize(num_samples, in); + deserialize(k, in); + deserialize(nr, in); + deserialize(nc, in); + item.set_size(num_samples, k, nr, nc); + byte_orderer bo; + auto sbuf = in.rdbuf(); + for (auto& d : item) + { + static_assert(sizeof(d)==4, "This serialization code assumes we are writing 4 byte floats"); + if (sbuf->sgetn((char*)&d,sizeof(d)) != sizeof(d)) + { + in.setstate(std::ios::badbit); + throw serialization_error("Error reading data while deserializing dlib::resizable_tensor."); + } + bo.little_to_host(d); + } + } + +// ---------------------------------------------------------------------------------------- + + inline double dot( + const tensor& a, + const tensor& b + ) + { + DLIB_CASSERT(a.size() == b.size()); + const float* da = a.host(); + const float* db = b.host(); + double sum = 0; + for (size_t i = 0; i < a.size(); ++i) + sum += da[i]*db[i]; + return sum; + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + class alias_tensor_instance : public tensor + { + alias_tensor_instance( + ) : data_instance(0), _annotation(0), data_offset(0) {} + + public: + friend class alias_tensor; + friend class alias_tensor_const_instance; + + alias_tensor_instance& operator= (float val) + { + tensor::operator=(val); + return *this; + } + + template <typename EXP> + alias_tensor_instance& operator= (const matrix_exp<EXP>& item) + { + tensor::operator=(item); + return *this; + } + + virtual const float* host() const { return data_instance->host()+data_offset; } + virtual float* host() { return data_instance->host()+data_offset; } + virtual float* host_write_only() { return data_instance->host()+data_offset; } + virtual const float* device() const { return data_instance->device()+data_offset; } + virtual float* device() { return data_instance->device()+data_offset; } + virtual float* device_write_only() { return data_instance->device()+data_offset; } + + virtual const any& annotation() const { return *_annotation; } + virtual any& annotation() { return *_annotation; } + +#ifdef DLIB_USE_CUDA + virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor ( + ) const { return *cudnn_descriptor; } +#endif + private: + + virtual size_t get_alias_offset() const { return data_offset; } + +#ifdef DLIB_USE_CUDA + std::shared_ptr<cuda::tensor_descriptor> cudnn_descriptor; +#endif + gpu_data* data_instance; + any* _annotation; + size_t data_offset; + virtual gpu_data& data() { return *data_instance; } + virtual const gpu_data& data() const { return *data_instance; } + }; + +// ---------------------------------------------------------------------------------------- + + class alias_tensor_const_instance + { + public: + const tensor& get() const { return inst; } + operator const tensor& () { return inst; } + + alias_tensor_const_instance(const alias_tensor_instance& item) : inst(item) {} + + private: + alias_tensor_instance inst; + + friend class alias_tensor; + alias_tensor_const_instance() {} + }; + +// 
---------------------------------------------------------------------------------------- + + class alias_tensor + { + public: + + alias_tensor ( + ) {} + + alias_tensor ( + long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1 + ) + { + DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0); + + inst.m_n = n_; + inst.m_k = k_; + inst.m_nr = nr_; + inst.m_nc = nc_; + inst.m_size = n_*k_*nr_*nc_; + } + + long long num_samples( + ) const { return inst.m_n; } + + long long k( + ) const { return inst.m_k; } + + long long nr( + ) const { return inst.m_nr; } + + long long nc( + ) const { return inst.m_nc; } + + size_t size( + ) const { return inst.m_size; } + + alias_tensor_instance operator() ( + tensor& t, + size_t offset = 0 + ) const + { + DLIB_CASSERT(offset+size() <= t.size(), + "offset: "<<offset <<"\n"<< + "size(): "<<size() <<"\n"<< + "t.size(): "<<t.size() <<"\n"); + +#ifdef DLIB_USE_CUDA + if (!inst.cudnn_descriptor) + { + inst.cudnn_descriptor = std::make_shared<cuda::tensor_descriptor>(); + inst.cudnn_descriptor->set_size(inst.m_n, inst.m_k, inst.m_nr, inst.m_nc); + } +#endif + inst.data_instance = &t.data(); + inst._annotation = &t.annotation(); + // Note that t might already be an aliasing tensor so we need to take that into + // account. + inst.data_offset = t.get_alias_offset()+offset; + return inst; + } + + alias_tensor_const_instance operator() ( + const tensor& t, + size_t offset = 0 + ) const + { + alias_tensor_const_instance temp; + temp.inst = (*this)(const_cast<tensor&>(t),offset); + return temp; + } + + private: + mutable alias_tensor_instance inst; + }; + + inline void serialize(const alias_tensor& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.num_samples(), out); + serialize(item.k(), out); + serialize(item.nr(), out); + serialize(item.nc(), out); + } + + inline void deserialize(alias_tensor& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::alias_tensor."); + long long num_samples, k, nr, nc; + deserialize(num_samples, in); + deserialize(k, in); + deserialize(nr, in); + deserialize(nc, in); + item = alias_tensor(num_samples, k, nr, nc); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_TENSOR_H_ + diff --git a/ml/dlib/dlib/dnn/tensor_abstract.h b/ml/dlib/dlib/dnn/tensor_abstract.h new file mode 100644 index 000000000..73a9fff77 --- /dev/null +++ b/ml/dlib/dlib/dnn/tensor_abstract.h @@ -0,0 +1,727 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_DNn_TENSOR_ABSTRACT_H_ +#ifdef DLIB_DNn_TENSOR_ABSTRACT_H_ + +#include "../matrix.h" +#include "../any/any_abstract.h" + +namespace dlib +{ +// ---------------------------------------------------------------------------------------- + + class tensor + { + /*! + WHAT THIS OBJECT REPRESENTS + This object represents a 4D array of float values, all stored contiguously + in memory. Importantly, it keeps two copies of the floats, one on the host + CPU side and another on the GPU device side. It automatically performs the + necessary host/device transfers to keep these two copies of the data in + sync. + + All transfers to the device happen asynchronously with respect to the + default CUDA stream so that CUDA kernel computations can overlap with data + transfers. 
However, any transfers from the device to the host happen + synchronously in the default CUDA stream. Therefore, you should perform + all your CUDA kernel launches on the default stream so that transfers back + to the host do not happen before the relevant computations have completed. + + If DLIB_USE_CUDA is not #defined then this object will not use CUDA at all. + Instead, it will simply store one host side memory block of floats. + + Finally, the convention in dlib code is to interpret the tensor as a set of + num_samples() 3D arrays, each of dimension k() by nr() by nc(). Also, + while this class does not specify a memory layout, the convention is to + assume that indexing into an element at coordinates (sample,k,r,c) can be + accomplished via: + host()[((sample*t.k() + k)*t.nr() + r)*t.nc() + c] + + THREAD SAFETY + Instances of this object are not thread-safe. So don't touch one from + multiple threads at the same time. + !*/ + + public: + + virtual ~tensor(); + + long long num_samples( + ) const; + /*! + ensures + - returns the number of 3D arrays of dimension k() by nr() by nc() there + are in this object. + !*/ + + long long k( + ) const; + /*! + ensures + - returns the k dimension of this tensor. Generally, we think of a tensor + as containing num_samples() images of nr() by nc() rows and columns, each + with k() channels. + !*/ + + long long nr( + ) const; + /*! + ensures + - returns the number of rows in this tensor. + !*/ + + long long nc( + ) const; + /*! + ensures + - returns the number of columns in this tensor. + !*/ + + size_t size( + ) const; + /*! + ensures + - returns num_samples()*k()*nr()*nc() + (i.e. the total number of floats in this tensor) + !*/ + + void async_copy_to_device( + ) const; + /*! + ensures + - This function does not block. + - if (the host version of the data is newer than the device's copy) then + - Begins asynchronously copying host data to the device. + - A call to device() that happens before the transfer completes will + block until the transfer is complete. That is, it is safe to call + async_copy_to_device() and then immediately call device(). + !*/ + + typedef float* iterator; + typedef const float* const_iterator; + iterator begin() { return host(); } + const_iterator begin() const { return host(); } + iterator end() { return host()+size(); } + const_iterator end() const { return host()+size(); } + /*! + ensures + - makes a tensor iterable just like the STL containers. + !*/ + + virtual const float* host( + ) const = 0; + /*! + ensures + - returns a pointer to the host memory block of size() contiguous float + values or nullptr if size()==0. + - if (the host's copy of the data is out of date) then + - copies the data from the device to the host, while this is happening + the call to host() blocks. + !*/ + + virtual float* host( + ) = 0; + /*! + ensures + - returns a pointer to the host memory block of size() contiguous float + values or nullptr if size()==0. + - if (the host's copy of the data is out of date) then + - copies the data from the device to the host, while this is happening + the call to host() blocks. + - Marks the device side data as out of date so that the next call to + device() will perform a host to device transfer. If you want to begin + the transfer immediately then you can call async_copy_to_device() after + calling host(). + !*/ + + virtual float* host_write_only( + ) = 0; + /*! + ensures + - This function returns the same pointer as host(), except that it never + performs a device to host memory copy. 
Instead, it immediately marks the + device side data as out of date, effectively discarding it. Therefore, + the values in the data pointed to by host_write_only() are undefined and + you should only call host_write_only() if you are going to assign to + every memory location in the returned memory block. + !*/ + + virtual const float* device( + ) const = 0; + /*! + requires + - DLIB_USE_CUDA is #defined + ensures + - returns a pointer to the device memory block of size() contiguous float + values or nullptr if size()==0. + - if (the device's copy of the data is out of date) then + - copies the data from the host to the device, while this is happening + the call to device() blocks. + !*/ + + virtual float* device( + ) = 0; + /*! + requires + - DLIB_USE_CUDA is #defined + ensures + - returns a pointer to the device memory block of size() contiguous float + values or nullptr if size()==0. + - if (the device's copy of the data is out of date) then + - copies the data from the host to the device, while this is happening + the call to device() blocks. + - Marks the host side data as out of date so that the next call to + host() will perform a device to host transfer. + !*/ + + virtual float* device_write_only( + ) = 0; + /*! + requires + - DLIB_USE_CUDA is #defined + ensures + - This function returns the same pointer as device(), except that it never + performs a host to device memory copy. Instead, it immediately marks the + host side data as out of date, effectively discarding it. Therefore, the + values in the data pointed to by device_write_only() are undefined and + you should only call device_write_only() if you are going to assign to + every memory location in the returned memory block. + !*/ + + virtual const any& annotation( + ) const = 0; + /*! + ensures + - returns a const reference to the any object in this tensor. The any + object can be used to store any additional annotation you like in a + tensor. However, it should be noted that the annotation() is ignored by + serialize() and therefore not saved when a tensor is serialized. + !*/ + + virtual any& annotation( + ) = 0; + /*! + ensures + - returns a non-const reference to the any object in this tensor. The any + object can be used to store any additional annotation you like in a + tensor. However, it should be noted that the annotation() is ignored by + serialize() and therefore not saved when a tensor is serialized. + !*/ + + int device_id( + ) const; + /*! + ensures + - returns the ID of the CUDA device that allocated this memory. I.e. the + number returned by cudaGetDevice() when the memory was allocated. + - If CUDA is not being used then this function always returns 0. + !*/ + + tensor& operator= ( + float val + ); + /*! + ensures + - sets all elements of this tensor equal to val. + - returns *this + !*/ + + tensor& operator*= ( + float val + ); + /*! + ensures + - pointwise multiplies all elements of *this tensor with val. + - returns *this + !*/ + + tensor& operator/= ( + float val + ); + /*! + ensures + - pointwise divides all elements of *this tensor with val. + - returns *this + !*/ + + template <typename EXP> + tensor& operator= ( + const matrix_exp<EXP>& item + ); + /*! + requires + - num_samples() == item.nr() + - k()*nr()*nc() == item.nc() + - item contains float values + ensures + - Assigns item to *this tensor by performing: + set_ptrm(host(), num_samples(), k()*nr()*nc()) = item; + !*/ + + template <typename EXP> + tensor& operator+= ( + const matrix_exp<EXP>& item + ); + /*! 
+ requires
+ - num_samples() == item.nr()
+ - k()*nr()*nc() == item.nc()
+ - item contains float values
+ ensures
+ - Adds item to *this tensor by performing:
+ set_ptrm(host(), num_samples(), k()*nr()*nc()) += item;
+ !*/
+
+ template <typename EXP>
+ tensor& operator-= (
+ const matrix_exp<EXP>& item
+ );
+ /*!
+ requires
+ - num_samples() == item.nr()
+ - k()*nr()*nc() == item.nc()
+ - item contains float values
+ ensures
+ - Subtracts item from *this tensor by performing:
+ set_ptrm(host(), num_samples(), k()*nr()*nc()) -= item;
+ !*/
+
+ template <typename EXP>
+ void set_sample (
+ unsigned long long idx,
+ const matrix_exp<EXP>& item
+ );
+ /*!
+ requires
+ - idx < num_samples()
+ - k()*nr()*nc() == item.size()
+ - item contains float values
+ ensures
+ - Assigns item to the idx'th sample in *this by performing:
+ set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) = item;
+ !*/
+
+
+ template <typename EXP>
+ void add_to_sample (
+ unsigned long long idx,
+ const matrix_exp<EXP>& item
+ );
+ /*!
+ requires
+ - idx < num_samples()
+ - k()*nr()*nc() == item.size()
+ - item contains float values
+ ensures
+ - Adds item to the idx'th sample in *this by performing:
+ set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) += item;
+ !*/
+
+ protected:
+
+ // You can't move or copy another tensor into *this since that might modify the
+ // tensor's dimensions. If you want to do that sort of thing then use a
+ // resizable_tensor.
+ tensor(const tensor& item);
+ tensor& operator= (const tensor& item);
+ tensor(tensor&& item);
+ tensor& operator=(tensor&& item);
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ void memcpy (
+ tensor& dest,
+ const tensor& src
+ );
+ /*!
+ requires
+ - dest.size() == src.size()
+ ensures
+ - Copies the data in src to dest. If the device data is current on both src
+ and dest then the copy will happen entirely on the device side.
+ - It doesn't matter what GPU device is selected by cudaSetDevice(). You can
+ always copy tensor objects to and from each other regardless.
+ - This function blocks until the copy has completed.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ bool is_vector (
+ const tensor& t
+ );
+ /*!
+ ensures
+ - returns true if and only if one of the following is true:
+ - t.size() == t.num_samples()
+ - t.size() == t.k()
+ - t.size() == t.nr()
+ - t.size() == t.nc()
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ const matrix_exp mat (
+ const tensor& t,
+ long long nr,
+ long long nc
+ );
+ /*!
+ requires
+ - nr >= 0
+ - nc >= 0
+ - nr*nc == t.size()
+ ensures
+ - returns a matrix M such that:
+ - M.nr() == nr
+ - M.nc() == nc
+ - for all valid r and c:
+ M(r,c) == t.host()[r*nc + c]
+ (i.e. the tensor is interpreted as a matrix laid out in memory
+ in row major order)
+ !*/
+
+ const matrix_exp mat (
+ const tensor& t
+ );
+ /*!
+ ensures
+ - if (t.size() != 0) then
+ - returns mat(t, t.num_samples(), t.size()/t.num_samples())
+ - else
+ - returns an empty matrix.
+ !*/
+
+ const matrix_exp image_plane (
+ const tensor& t,
+ long long sample = 0,
+ long long k = 0
+ );
+ /*!
+ requires
+ - t.size() != 0
+ - 0 <= sample < t.num_samples()
+ - 0 <= k < t.k()
+ ensures
+ - returns the k-th image plane from the sample-th image in t. That is,
+ returns a matrix M such that:
+ - M contains float valued elements.
+ - M.nr() == t.nr()
+ - M.nc() == t.nc()
+ - for all valid r and c:
+ - M(r,c) == t.host()[((sample*t.k() + k)*t.nr() + r)*t.nc() + c]
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ bool have_same_dimensions (
+ const tensor& a,
+ const tensor& b
+ );
+ /*!
+ ensures
+ - returns true if and only if all of the following are satisfied:
+ - a.num_samples() == b.num_samples()
+ - a.k() == b.k()
+ - a.nr() == b.nr()
+ - a.nc() == b.nc()
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ class resizable_tensor : public tensor
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object is just a tensor with the additional ability to be resized.
+ !*/
+
+ public:
+ resizable_tensor(
+ );
+ /*!
+ ensures
+ - #size() == 0
+ - #num_samples() == 0
+ - #k() == 0
+ - #nr() == 0
+ - #nc() == 0
+ - #capacity() == 0
+ !*/
+
+ template <typename EXP>
+ resizable_tensor(
+ const matrix_exp<EXP>& item
+ );
+ /*!
+ requires
+ - item contains float values
+ ensures
+ - #num_samples() == item.nr()
+ - #k() == item.nc()
+ - #nr() == 1
+ - #nc() == 1
+ - Assigns item to *this tensor by performing:
+ set_ptrm(host(), num_samples(), k()*nr()*nc()) = item;
+ - #capacity() == size()
+ !*/
+
+ explicit resizable_tensor(
+ long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+ );
+ /*!
+ requires
+ - n_ >= 0
+ - k_ >= 0
+ - nr_ >= 0
+ - nc_ >= 0
+ ensures
+ - #size() == n_*k_*nr_*nc_
+ - #num_samples() == n_
+ - #k() == k_
+ - #nr() == nr_
+ - #nc() == nc_
+ - #capacity() == size()
+ !*/
+
+ // This object is copyable and movable
+ resizable_tensor(const resizable_tensor&) = default;
+ resizable_tensor(resizable_tensor&&) = default;
+ resizable_tensor& operator= (const resizable_tensor&) = default;
+ resizable_tensor& operator= (resizable_tensor&&) = default;
+
+ size_t capacity (
+ ) const;
+ /*!
+ ensures
+ - returns the total number of floats allocated. This might be different
+ from the size() since calls to set_size() that make a tensor smaller
+ don't trigger reallocations. They simply adjust the nominal dimensions
+ while keeping the same allocated memory block. This makes calls to
+ set_size() very fast. If you need to deallocate a tensor then use
+ clear().
+ !*/
+
+ void clear(
+ );
+ /*!
+ ensures
+ - #size() == 0
+ - #num_samples() == 0
+ - #k() == 0
+ - #nr() == 0
+ - #nc() == 0
+ - #annotation().is_empty() == true
+ - #capacity() == 0
+ !*/
+
+ void copy_size (
+ const tensor& item
+ );
+ /*!
+ ensures
+ - resizes *this so that: have_same_dimensions(#*this, item)==true
+ !*/
+
+ void set_size(
+ long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+ );
+ /*!
+ requires
+ - n_ >= 0
+ - k_ >= 0
+ - nr_ >= 0
+ - nc_ >= 0
+ ensures
+ - #size() == n_*k_*nr_*nc_
+ - #num_samples() == n_
+ - #k() == k_
+ - #nr() == nr_
+ - #nc() == nc_
+ - #capacity() == max(#size(), capacity())
+ (i.e. capacity() never goes down when calling set_size().)
+ !*/
+
+ template <typename EXP>
+ resizable_tensor& operator= (
+ const matrix_exp<EXP>& item
+ );
+ /*!
+ requires
+ - item contains float values
+ ensures
+ - if (num_samples() == item.nr() && k()*nr()*nc() == item.nc()) then
+ - the dimensions of this tensor are not changed
+ - else
+ - #num_samples() == item.nr()
+ - #k() == item.nc()
+ - #nr() == 1
+ - #nc() == 1
+ - Assigns item to *this tensor by performing:
+ set_ptrm(host(), num_samples(), k()*nr()*nc()) = item;
+ !*/
+ };
+
+ void serialize(const tensor& item, std::ostream& out);
+ void deserialize(resizable_tensor& item, std::istream& in);
+ /*!
+ provides serialization support for tensor and resizable_tensor. Note that you can
+ serialize to/from any combination of tensor and resizable_tensor objects.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ double dot(
+ const tensor& a,
+ const tensor& b
+ );
+ /*!
+ requires
+ - a.size() == b.size()
+ ensures
+ - returns the dot product between a and b when they are both treated as
+ a.size() dimensional vectors. That is, this function pointwise multiplies
+ the vectors together, then sums the result and returns it.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ class alias_tensor_instance : public tensor
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object is a tensor that aliases another tensor. That is, it doesn't
+ have its own block of memory but instead simply holds pointers to the
+ memory of another tensor object. It therefore allows you to efficiently
+ break a tensor into pieces and pass those pieces into functions.
+
+ An alias_tensor_instance doesn't own the resources it points to in any sense.
+ So it is important to make sure that the underlying owning tensor doesn't get
+ destructed before any alias tensors which point to it are destructed.
+ !*/
+
+ // You can't default initialize this object. You can only get instances of it from
+ // alias_tensor::operator().
+ alias_tensor_instance(
+ );
+ };
+
+ class alias_tensor_const_instance
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is essentially a const version of alias_tensor_instance and therefore
+ represents a tensor. However, due to the mechanics of C++, this object
+ can't inherit from tensor. So instead it provides a get() and an implicit
+ conversion to const tensor.
+ !*/
+
+ public:
+
+ // non-const alias tensors are convertible to const ones.
+ alias_tensor_const_instance(const alias_tensor_instance& item);
+
+ // Methods that cast the alias to a tensor.
+ const tensor& get() const;
+ operator const tensor& ();
+
+ private:
+ // You can't default initialize this object. You can only get instances of it from
+ // alias_tensor::operator().
+ alias_tensor_const_instance();
+ };
+
+ class alias_tensor
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is a tool for creating tensor objects that alias other tensor objects.
+ That is, it allows you to make a tensor that references the memory space of
+ another tensor object rather than owning its own memory. This allows you
+ to do things like interpret a single tensor in different ways or even as a
+ group of multiple tensors.
+ !*/
+ public:
+
+ alias_tensor (
+ );
+ /*!
+ ensures
+ - #size() == 0
+ - #num_samples() == 0
+ - #k() == 0
+ - #nr() == 0
+ - #nc() == 0
+ !*/
+
+ alias_tensor (
+ long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+ );
+ /*!
+ requires + - n_ >= 0 + - k_ >= 0 + - nr_ >= 0 + - nc_ >= 0 + ensures + - #size() == n_*k_*nr_*nc_ + - #num_samples() == n_ + - #k() == k_ + - #nr() == nr_ + - #nc() == nc_ + !*/ + + long long num_samples() const; + long long k() const; + long long nr() const; + long long nc() const; + size_t size() const; + + alias_tensor_instance operator() ( + tensor& t, + size_t offset = 0 + ) const; + /*! + requires + - offset+size() <= t.size() + ensures + - Returns a tensor that simply aliases the elements of t beginning with t's + offset'th element. Specifically, this function returns an aliasing + tensor T such that: + - T.size() == size() + - T.num_samples() == num_samples() + - T.k() == k() + - T.nr() == nr() + - T.nc() == nc() + - T.host() == t.host()+offset + - T.device() == t.device()+offset + - &T.annotation() == &t.annotation() + !*/ + + alias_tensor_const_instance operator() ( + const tensor& t, + size_t offset = 0 + ) const; + /*! + requires + - offset+size() <= t.size() + ensures + - This function is identical to the above version of operator() except that + it takes and returns const tensors instead of non-const tensors. + !*/ + }; + + void serialize(const alias_tensor& item, std::ostream& out); + void deserialize(alias_tensor& item, std::istream& in); + /*! + provides serialization support for alias_tensor. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_TENSOR_ABSTRACT_H_ + + diff --git a/ml/dlib/dlib/dnn/tensor_tools.cpp b/ml/dlib/dlib/dnn/tensor_tools.cpp new file mode 100644 index 000000000..c0f7fd69d --- /dev/null +++ b/ml/dlib/dlib/dnn/tensor_tools.cpp @@ -0,0 +1,985 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
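Before the implementation file begins, a short sketch of the alias_tensor interface documented above may help. The shapes are arbitrary illustrations, and only calls documented in tensor_abstract.h are used:

    // A minimal sketch: split one 10-sample tensor into two 5-sample views.
    dlib::resizable_tensor t(10, 3, 8, 8); // owning tensor
    dlib::alias_tensor half(5, 3, 8, 8);   // shape of each view

    auto top = half(t, 0);                 // aliases samples 0..4
    auto bottom = half(t, half.size());    // aliases samples 5..9 (offset counts floats)

    top = 0; // an alias shares t's memory, so this zeros t's first half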
+#ifndef DLIB_TeNSOR_TOOLS_CPP_ +#define DLIB_TeNSOR_TOOLS_CPP_ + +#include "tensor_tools.h" +#include "../string.h" +#include <atomic> + +namespace dlib +{ + namespace + { + std::atomic<bool>& dnn_prefer_fastest_algo ( + ) + { + static std::atomic<bool> var(true); + return var; + } + } + + bool dnn_prefer_fastest_algorithms ( + ) + { + return dnn_prefer_fastest_algo(); + } + + void set_dnn_prefer_fastest_algorithms( + ) + { + dnn_prefer_fastest_algo() = true; + } + + void set_dnn_prefer_smallest_algorithms( + ) + { + dnn_prefer_fastest_algo() = false; + } +} + +namespace dlib { namespace tt +{ + +// ---------------------------------------------------------------------------------------- + + void inverse_norms ( + resizable_tensor& invnorms, + const tensor& data, + const double eps + ) + { +#ifdef DLIB_USE_CUDA + cuda::inverse_norms(invnorms, data, eps); +#else + invnorms = reciprocal(sqrt(sum_cols(squared(mat(data))) + eps)); +#endif + } + + void dot_prods ( + resizable_tensor& out, + const tensor& lhs, + const tensor& rhs + ) + { +#ifdef DLIB_USE_CUDA + cuda::dot_prods(out, lhs, rhs); +#else + out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs))); +#endif + } + + void dot_prods ( + bool add_to, + tensor& out, + const tensor& lhs, + const tensor& rhs + ) + { +#ifdef DLIB_USE_CUDA + cuda::dot_prods(add_to, out, lhs, rhs); +#else + if (add_to) + out += sum_cols(pointwise_multiply(mat(lhs), mat(rhs))); + else + out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs))); +#endif + } + + void scale_columns ( + tensor& out, + const tensor& m, + const tensor& v + ) + { + DLIB_CASSERT(have_same_dimensions(out,m)); + DLIB_CASSERT(is_vector(v)); + if (m.size() == 0 && v.size() == 0) + return; + DLIB_CASSERT(m.size() != 0); + DLIB_CASSERT(m.size()/m.num_samples() == v.size()); + +#ifdef DLIB_USE_CUDA + cuda::scale_columns(out, m, v); +#else + DLIB_CASSERT(false, "shouldn't be called right now"); + out = scale_columns(mat(m), mat(v)); +#endif + } + + void scale_rows ( + tensor& out, + const tensor& m, + const tensor& v + ) + { + DLIB_CASSERT(have_same_dimensions(out,m)); + DLIB_CASSERT(is_vector(v)); + if (m.size() == 0 && v.size() == 0) + return; + DLIB_CASSERT(m.size() != 0); + DLIB_CASSERT(m.num_samples() == v.size()); + +#ifdef DLIB_USE_CUDA + cuda::scale_rows(out, m, v); +#else + out = scale_rows(mat(m), mat(v)); +#endif + } + + void scale_rows2 ( + float beta, + tensor& out, + const tensor& m1, + const tensor& m2, + const tensor& v1, + const tensor& v2 + ) + { + DLIB_CASSERT(have_same_dimensions(out,m1)); + DLIB_CASSERT(have_same_dimensions(out,m2)); + DLIB_CASSERT(have_same_dimensions(v1,v2)); + DLIB_CASSERT(is_vector(mat(v1))); + DLIB_CASSERT(v1.size() == m1.num_samples()); + +#ifdef DLIB_USE_CUDA + cuda::scale_rows2(beta, out, m1, m2, v1, v2); +#else + if (beta == 0) + out = scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2)); + else + out = beta*mat(out) + scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2)); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void exp ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(dest.size() == src.size()); + +#ifdef DLIB_USE_CUDA + cuda::exp(dest,src); +#else + dest = exp(mat(src)); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void log ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(dest.size() == src.size()); + +#ifdef DLIB_USE_CUDA + cuda::log(dest,src); +#else + dest = log(mat(src)); +#endif 
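+ // (Editor's note: this cuda-or-else fallback is the pattern for nearly
+ // every routine in this file: each tt:: wrapper forwards to cuda:: when
+ // DLIB_USE_CUDA is defined and otherwise uses an equivalent CPU path,
+ // either a cpu:: routine or a dlib matrix expression as here.)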
+ } + +// ---------------------------------------------------------------------------------------- + + void log10 ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(dest.size() == src.size()); + +#ifdef DLIB_USE_CUDA + cuda::log10(dest,src); +#else + dest = log10(mat(src)); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void gemm ( + float beta, + tensor& dest, + float alpha, + const tensor& lhs, + bool trans_lhs, + const tensor& rhs, + bool trans_rhs + ) + { +#ifdef DLIB_USE_CUDA + cuda::gemm(beta, dest, alpha, lhs, trans_lhs, rhs, trans_rhs); +#else + if (beta != 0) + { + if (trans_lhs && trans_rhs) + dest = alpha*trans(mat(lhs))*trans(mat(rhs)) + beta*mat(dest); + else if (!trans_lhs && trans_rhs) + dest = alpha*mat(lhs)*trans(mat(rhs)) + beta*mat(dest); + else if (trans_lhs && !trans_rhs) + dest = alpha*trans(mat(lhs))*mat(rhs) + beta*mat(dest); + else + dest = alpha*mat(lhs)*mat(rhs) + beta*mat(dest); + } + else + { + if (trans_lhs && trans_rhs) + dest = alpha*trans(mat(lhs))*trans(mat(rhs)); + else if (!trans_lhs && trans_rhs) + dest = alpha*mat(lhs)*trans(mat(rhs)); + else if (trans_lhs && !trans_rhs) + dest = alpha*trans(mat(lhs))*mat(rhs); + else + dest = alpha*mat(lhs)*mat(rhs); + } +#endif + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + tensor_rand:: + tensor_rand( + unsigned long long seed + ) +#ifdef DLIB_USE_CUDA + :rnd(seed){} +#else + {rnd.set_seed(cast_to_string(seed)); } +#endif + + void tensor_rand:: + fill_gaussian ( + tensor& data, + float mean, + float stddev + ) + { + DLIB_CASSERT(data.size()%2 == 0); +#ifdef DLIB_USE_CUDA + rnd.fill_gaussian(data, mean, stddev); +#else + for (auto& x : data) + x = rnd.get_random_gaussian()*stddev + mean; +#endif + } + + void tensor_rand:: + fill_uniform ( + tensor& data + ) + { +#ifdef DLIB_USE_CUDA + rnd.fill_uniform(data); +#else + for (auto& x : data) + x = rnd.get_random_float(); +#endif + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + void multiply ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + DLIB_CASSERT(dest.k() == src1.k() && src1.k() == src2.k() && + dest.nr() == src1.nr() && src1.nr() == src2.nr() && + dest.nc() == src1.nc() && src1.nc() == src2.nc() ); + const long MD = std::max(std::max(dest.num_samples(),src1.num_samples()),src2.num_samples()); + DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) && + (src1.num_samples()==1 || src1.num_samples()==MD) && + (src2.num_samples()==1 || src2.num_samples()==MD) ); +#ifdef DLIB_USE_CUDA + cuda::multiply(add_to, dest, src1, src2); +#else + cpu::multiply(add_to, dest, src1, src2); +#endif + + } + + void scale_channels ( + bool add_to, + tensor& dest, + const tensor& src, + const tensor& scales + ) + { +#ifdef DLIB_USE_CUDA + cuda::scale_channels(add_to, dest, src, scales); +#else + cpu::scale_channels(add_to, dest, src, scales); +#endif + } + + void multiply_conv ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { +#ifdef DLIB_USE_CUDA + cuda::multiply_conv(add_to, dest, src1, src2); +#else + cpu::multiply_conv(add_to, dest, src1, src2); +#endif + } + + void multiply_zero_padded ( + bool add_to, + tensor& dest, + const tensor& src1, + 
const tensor& src2 + ) + { +#ifdef DLIB_USE_CUDA + cuda::multiply_zero_padded(add_to, dest, src1, src2); +#else + cpu::multiply_zero_padded(add_to, dest, src1, src2); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const float A, + const float B + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform(dest,src,A,B); +#else + cpu::affine_transform(dest,src,A,B); +#endif + } + + void affine_transform( + tensor& dest, + const tensor& src, + const float A + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform(dest,src,A); +#else + cpu::affine_transform(dest,src,A,0); +#endif + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B, + const float C + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform(dest,src1,src2,A,B,C); +#else + cpu::affine_transform(dest,src1,src2,A,B,C); +#endif + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform(dest,src1,src2,A,B); +#else + cpu::affine_transform(dest,src1,src2,A,B,0); +#endif + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C, + const float D + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform(dest,src1,src2,src3,A,B,C,D); +#else + cpu::affine_transform(dest,src1,src2,src3,A,B,C,D); +#endif + } + + void affine_transform_range( + size_t begin, + size_t end, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C); +#else + cpu::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C); +#endif + } + + void affine_transform( + const rectangle& rect, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + float A, + float B, + float C + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform(rect, dest,src1,src2,src3,A,B,C); +#else + cpu::affine_transform(rect, dest,src1,src2,src3,A,B,C); +#endif + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C); +#else + cpu::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform(dest,src,A,B); +#else + cpu::affine_transform(dest,src,A,B); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void affine_transform_conv( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform_conv(dest,src,A,B); +#else + cpu::affine_transform_conv(dest,src,A,B); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void compute_adam_update ( + size_t begin, + size_t end, + tensor& s, + tensor& m, + tensor& v, + const float t, + const float 
learning_rate, + const float weight_decay, + const float momentum1, + const float momentum2, + const tensor& params, + const tensor& params_grad + ) + { +#ifdef DLIB_USE_CUDA + cuda::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1, + momentum2, params, params_grad); +#else + cpu::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1, + momentum2, params, params_grad); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void batch_normalize_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ) + { +#ifdef DLIB_USE_CUDA + cuda::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances); +#else + cpu::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances); +#endif + } + + void batch_normalize ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& vars, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ) + { +#ifdef DLIB_USE_CUDA + cuda::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta); +#else + cpu::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta); +#endif + } + + void batch_normalize_gradient ( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ) + { + +#ifdef DLIB_USE_CUDA + cuda::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad); +#else + cpu::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void batch_normalize_conv_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ) + { +#ifdef DLIB_USE_CUDA + cuda::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances); +#else + cpu::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances); +#endif + } + + void batch_normalize_conv ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& vars, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ) + { +#ifdef DLIB_USE_CUDA + cuda::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta); +#else + cpu::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta); +#endif + } + + void batch_normalize_conv_gradient ( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ) + { + +#ifdef DLIB_USE_CUDA + cuda::batch_normalize_conv_gradient(eps,gradient_input, 
means, invstds, src, gamma, src_grad, gamma_grad, beta_grad); +#else + cpu::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void threshold ( + tensor& data, + float thresh + ) + { +#ifdef DLIB_USE_CUDA + cuda::threshold(data,thresh); +#else + cpu::threshold(data,thresh); +#endif + } + + void dot ( + const tensor& a, + const tensor& b, + tensor& result, + size_t idx + ) + { +#ifdef DLIB_USE_CUDA + cuda::dot(a,b,result,idx); +#else + cpu::dot(a,b,result,idx); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void add( + float beta, + tensor& dest, + float alpha, + const tensor& src + ) + { +#ifdef DLIB_USE_CUDA + cuda::add(beta,dest,alpha,src); +#else + cpu::add(beta,dest,alpha,src); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void add ( + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { +#ifdef DLIB_USE_CUDA + cuda::add(dest, src1, src2); +#else + cpu::add(dest, src1, src2); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void assign_conv_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ) + { +#ifdef DLIB_USE_CUDA + cuda::assign_conv_bias_gradient(grad,gradient_input); +#else + cpu::assign_conv_bias_gradient(grad,gradient_input); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void assign_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ) + { +#ifdef DLIB_USE_CUDA + cuda::assign_bias_gradient(grad,gradient_input); +#else + cpu::assign_bias_gradient(grad,gradient_input); +#endif + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + void softmax ( + tensor& dest, + const tensor& src + ) + { +#ifdef DLIB_USE_CUDA + cuda::softmax(dest,src); +#else + cpu::softmax(dest,src); +#endif + } + + void softmax_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { +#ifdef DLIB_USE_CUDA + cuda::softmax_gradient(grad, dest, gradient_input); +#else + cpu::softmax_gradient(grad, dest, gradient_input); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void softmax_all ( + tensor& dest, + const tensor& src + ) + { +#ifdef DLIB_USE_CUDA + cuda::softmax_all(dest,src); +#else + cpu::softmax_all(dest,src); +#endif + } + + void softmax_all_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { +#ifdef DLIB_USE_CUDA + cuda::softmax_all_gradient(grad, dest, gradient_input); +#else + cpu::softmax_all_gradient(grad, dest, gradient_input); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void sigmoid ( + tensor& dest, + const tensor& src + ) + { +#ifdef DLIB_USE_CUDA + cuda::sigmoid(dest,src); +#else + cpu::sigmoid(dest,src); +#endif + } + + void sigmoid_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { +#ifdef DLIB_USE_CUDA + cuda::sigmoid_gradient(grad, dest, gradient_input); +#else + cpu::sigmoid_gradient(grad, dest, gradient_input); +#endif + } + +// 
---------------------------------------------------------------------------------------- + + void relu ( + tensor& dest, + const tensor& src + ) + { +#ifdef DLIB_USE_CUDA + cuda::relu(dest,src); +#else + cpu::relu(dest,src); +#endif + } + + void relu_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { +#ifdef DLIB_USE_CUDA + cuda::relu_gradient(grad, dest, gradient_input); +#else + cpu::relu_gradient(grad, dest, gradient_input); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void prelu ( + tensor& dest, + const tensor& src, + const tensor& param + ) + { +#ifdef DLIB_USE_CUDA + cuda::prelu(dest, src, param); +#else + cpu::prelu(dest, src, param); +#endif + } + + void prelu_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input, + const tensor& param, + tensor& params_grad + ) + { +#ifdef DLIB_USE_CUDA + cuda::prelu_gradient(grad, src, gradient_input, param, params_grad); +#else + cpu::prelu_gradient(grad, src, gradient_input, param, params_grad); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void tanh ( + tensor& dest, + const tensor& src + ) + { +#ifdef DLIB_USE_CUDA + cuda::tanh(dest,src); +#else + cpu::tanh(dest,src); +#endif + } + + void tanh_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { +#ifdef DLIB_USE_CUDA + cuda::tanh_gradient(grad, dest, gradient_input); +#else + cpu::tanh_gradient(grad, dest, gradient_input); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void resize_bilinear ( + tensor& dest, + long dest_row_stride, + long dest_channel_stride, + const tensor& src, + long src_row_stride, + long src_channel_stride + ) + { +#ifdef DLIB_USE_CUDA + cuda::resize_bilinear(dest,dest_row_stride,dest_channel_stride, src,src_row_stride,src_channel_stride); +#else + cpu::resize_bilinear(dest,dest_row_stride,dest_channel_stride, src,src_row_stride,src_channel_stride); +#endif + } + + void resize_bilinear_gradient ( + tensor& grad, + long grad_row_stride, + long grad_channel_stride, + const tensor& gradient_input, + long gradient_input_row_stride, + long gradient_input_channel_stride + ) + { +#ifdef DLIB_USE_CUDA + cuda::resize_bilinear_gradient(grad,grad_row_stride,grad_channel_stride, gradient_input,gradient_input_row_stride,gradient_input_channel_stride); +#else + cpu::resize_bilinear_gradient(grad,grad_row_stride,grad_channel_stride, gradient_input,gradient_input_row_stride,gradient_input_channel_stride); +#endif + } + +// ------------------------------------------------------------------------------------ + + void copy_tensor( + bool add_to, + tensor& dest, + size_t dest_k_offset, + const tensor& src, + size_t src_k_offset, + size_t count_k + ) + { +#ifdef DLIB_USE_CUDA + cuda::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k); +#else + cpu::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void inv:: + operator() ( + const tensor& m, + resizable_tensor& out + ) + { +#ifdef DLIB_USE_CUDA + finv(m,out); +#else + out = dlib::inv(mat(m)); +#endif + } + +// ---------------------------------------------------------------------------------------- + +}} + +#endif // DLIB_TeNSOR_TOOLS_CPP_ + diff --git a/ml/dlib/dlib/dnn/tensor_tools.h 
b/ml/dlib/dlib/dnn/tensor_tools.h new file mode 100644 index 000000000..9ba3154e5 --- /dev/null +++ b/ml/dlib/dlib/dnn/tensor_tools.h @@ -0,0 +1,1711 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_TeNSOR_TOOLS_H_ +#define DLIB_TeNSOR_TOOLS_H_ + +#include "tensor.h" +#include "cudnn_dlibapi.h" +#include "cublas_dlibapi.h" +#include "cusolver_dlibapi.h" +#include "curand_dlibapi.h" +#include "cpu_dlib.h" +#include "cuda_dlib.h" +#include "../rand.h" +#include <memory> +#include "../geometry/rectangle.h" +#include "../test_for_odr_violations.h" + +namespace dlib +{ + bool dnn_prefer_fastest_algorithms(); + void set_dnn_prefer_fastest_algorithms(); + void set_dnn_prefer_smallest_algorithms(); +} + +namespace dlib { namespace tt +{ + +// ---------------------------------------------------------------------------------------- + + void inverse_norms ( + resizable_tensor& invnorms, + const tensor& data, + const double eps + ); + /*! + ensures + - #invnorms == reciprocal(sqrt(sum_cols(squared(mat(data))) + eps)) + !*/ + + void dot_prods ( + resizable_tensor& out, + const tensor& lhs, + const tensor& rhs + ); + /*! + requires + - have_same_dimensions(lhs,rhs) == true + ensures + - #out.num_samples() == lhs.num_samples() + - #out.k() == #out.nr() == #out.nc() == 1 + - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs))); + !*/ + + void dot_prods ( + bool add_to, + tensor& out, + const tensor& lhs, + const tensor& rhs + ); + /*! + requires + - have_same_dimensions(lhs,rhs) == true + - out.size() == lhs.num_samples() + - out.k() == out.nr() == out.nc() == 1 + ensures + - if (add_to) then + - #out == mat(out) + sum_cols(pointwise_multiply(mat(lhs), mat(rhs))); + - else + - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs))); + !*/ + + void scale_columns ( + tensor& out, + const tensor& m, + const tensor& v + ); + /*! + requires + - have_same_dimensions(out,m) == true + - is_vector(v) == true + - v.size() == mat(m).nc() + ensures + - performs: out = scale_columns(mat(m),mat(v)); + !*/ + + void scale_rows ( + tensor& out, + const tensor& m, + const tensor& v + ); + /*! + requires + - have_same_dimensions(out,m) == true + - is_vector(v) == true + - v.size() == m.num_samples() + ensures + - performs: out = scale_rows(mat(m),mat(v)); + !*/ + + void scale_rows2 ( + float beta, + tensor& out, + const tensor& m1, + const tensor& m2, + const tensor& v1, + const tensor& v2 + ); + /*! + requires + - have_same_dimensions(out,m1) == true + - have_same_dimensions(out,m2) == true + - have_same_dimensions(v1,v2) == true + - is_vector(v1) == true + - v1.size() == m1.num_samples() + ensures + - performs: + out = beta*out + scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2)); + !*/ + +// ---------------------------------------------------------------------------------------- + + void exp ( + tensor& dest, + const tensor& src + ); + /*! + requires + - dest.size() == src.size() + ensures + - performs: dest = exp(mat(src)) + !*/ + +// ---------------------------------------------------------------------------------------- + + void log ( + tensor& dest, + const tensor& src + ); + /*! + requires + - dest.size() == src.size() + ensures + - performs: dest = log(mat(src)) + !*/ + +// ---------------------------------------------------------------------------------------- + + void log10 ( + tensor& dest, + const tensor& src + ); + /*! 
+ requires + - dest.size() == src.size() + ensures + - performs: dest = log10(mat(src)) + !*/ + +// ---------------------------------------------------------------------------------------- + + void gemm ( + float beta, + tensor& dest, + float alpha, + const tensor& lhs, + bool trans_lhs, + const tensor& rhs, + bool trans_rhs + ); + /*! + requires + - dest does not alias the memory of lhs or rhs + - The dimensions of lhs and rhs must be compatible for matrix multiplication. + In particular: + - Let L == trans_lhs ? trans(mat(lhs)) : mat(lhs) + - Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs) + - Let D == mat(dest) + - D.nr() == L.nr() && D.nc() == R.nc() + (i.e. dest must be preallocated and have the correct output dimensions) + - L.nc() == R.nr() + ensures + - performs: dest = alpha*L*R + beta*mat(dest) + !*/ + +// ---------------------------------------------------------------------------------------- + + class inv + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a functor for doing matrix inversion on the GPU. The only + reason it's an object is to avoid the reallocation of some GPU memory + blocks if you want to do a bunch of matrix inversions in a row. + !*/ + public: + + void operator() ( + const tensor& m, + resizable_tensor& out + ); + /*! + requires + - m.size() == m.num_samples()*m.num_samples() + (i.e. mat(m) must be a square matrix) + ensures + - out == inv(mat(m)); + !*/ + + private: +#ifdef DLIB_USE_CUDA + cuda::inv finv; +#endif + }; + +// ---------------------------------------------------------------------------------------- + + class tensor_rand + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a tool for filling a tensor with random numbers. + + Note that the sequence of random numbers output by this object is different + when dlib is compiled with DLIB_USE_CUDA. So you should not write code + that depends on any specific sequence of numbers coming out of a + tensor_rand. + + !*/ + + public: + // not copyable + tensor_rand(const tensor_rand&) = delete; + tensor_rand& operator=(const tensor_rand&) = delete; + + tensor_rand() : tensor_rand(0) {} + tensor_rand(unsigned long long seed); + + void fill_gaussian ( + tensor& data, + float mean = 0, + float stddev = 1 + ); + /*! + requires + - data.size()%2 == 0 + ensures + - Fills data with random numbers drawn from a Gaussian distribution + with the given mean and standard deviation. + !*/ + + void fill_uniform ( + tensor& data + ); + /*! + ensures + - Fills data with uniform random numbers in the range (0.0, 1.0]. + !*/ + +#ifdef DLIB_USE_CUDA + cuda::curand_generator rnd; +#else + dlib::rand rnd; +#endif + }; + +// ---------------------------------------------------------------------------------------- + + void multiply ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + /*! + requires + - dest.k() == src1.k() == src2.k() + - dest.nr() == src1.nr() == src2.nr() + - dest.nc() == src1.nc() == src2.nc() + - dest.num_samples(), src1.num_samples(), and src2.num_samples() must each + either be 1 or whichever ones aren't equal to 1 must have the same values. + ensures + - let MD = max(dest.num_samples(), src1.num_samples(), src2.num_samples) + - This function pointwise multiplies src1 with src2 and stores the result into + #dest. However, how the multiplication happens depends on the dimensions of + the tensors. 
First, when src1 and src2 are multiplied together, if either
+              has a num_samples() dimension that is != MD, then it is first replicated to
+              produce a tensor with num_samples()==MD dimensions and then they are
+              pointwise multiplied together.
+
+              Second, if dest.num_samples()==1, then after the pointwise multiplication of
+              src1 with src2, the result has its samples summed to produce an output tensor
+              with num_samples()==1 which is then assigned to #dest.
+            - if (add_to) then
+                - Instead of assigning the result to dest, this function adds the result to dest.
+    !*/

+
+    void scale_channels (
+        bool add_to,
+        tensor& dest,
+        const tensor& src,
+        const tensor& scales
+    );
+    /*!
+        requires
+            - have_same_dimensions(dest, src) == true
+            - scales.num_samples() == src.num_samples()
+            - scales.k()  == src.k()
+            - scales.nr() == 1
+            - scales.nc() == 1
+        ensures
+            - Scales each channel of src by the corresponding value in scales.  To be
+              precise, we will have:
+                - #dest(n,k,r,c) == src(n,k,r,c)*scales(n,k,1,1)
+            - if (add_to) then
+                - Instead of assigning the result to dest, this function adds the result to dest.
+    !*/
+
+    void multiply_conv (
+        bool add_to,
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2
+    );
+    /*!
+        requires
+            - if (have_same_dimensions(dest, src1) == true) then
+                - src2.num_samples() == 1
+                - src2.nr() == 1
+                - src2.nc() == 1
+                - src2.k() == src1.k()
+            - else
+                - have_same_dimensions(src1, src2) == true
+                - dest.num_samples() == 1
+                - dest.nr() == 1
+                - dest.nc() == 1
+                - dest.k() == src1.k()
+        ensures
+            - Performs #dest == src1*src2
+              In particular, if the elements of dest, src1, and src2 were indexed by (n,k,r,c) then
+              we would have:
+                - if (have_same_dimensions(dest,src1)) then
+                    #dest(n,k,r,c) == src1(n,k,r,c)*src2(k)
+                - else
+                    #dest(k) == sum over {n,r,c} of src1(n,k,r,c)*src2(n,k,r,c)
+            - if (add_to) then
+                - Instead of assigning the result to dest, this function adds the result to dest.
+    !*/
+
+    void multiply_zero_padded (
+        bool add_to,
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2
+    );
+    /*!
+        ensures
+            - if (add_to) then
+                - performs: dest += src1 * src2
+            - else
+                - performs: dest = src1 * src2
+            - In either case, the multiplication happens pointwise according to 4D tensor
+              arithmetic.  If the dimensions don't match then missing elements are presumed
+              to be equal to 0.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void affine_transform(
+        tensor& dest,
+        const tensor& src,
+        const float A,
+        const float B
+    );
+    /*!
+        requires
+            - dest.size()==src.size()
+        ensures
+            - #dest == A*src + B
+    !*/
+
+    void affine_transform(
+        tensor& dest,
+        const tensor& src,
+        const float A
+    );
+    /*!
+        requires
+            - dest.size()==src.size()
+        ensures
+            - #dest == A*src
+    !*/
+
+    void affine_transform(
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const float A,
+        const float B,
+        const float C
+    );
+    /*!
+        requires
+            - dest.size()==src1.size()
+            - dest.size()==src2.size()
+        ensures
+            - #dest == A*src1 + B*src2 + C
+    !*/
+
+    void affine_transform(
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const float A,
+        const float B
+    );
+    /*!
+        requires
+            - dest.size()==src1.size()
+            - dest.size()==src2.size()
+        ensures
+            - #dest == A*src1 + B*src2
+    !*/
+
+    void affine_transform(
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const tensor& src3,
+        const float A,
+        const float B,
+        const float C,
+        const float D
+    );
+    /*!
+ requires + - dest.size()==src1.size() + - dest.size()==src2.size() + - dest.size()==src3.size() + ensures + - #dest == A*src1 + B*src2 + C*src3 + D + !*/ + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C + ); + /*! + requires + - dest.size()==src1.size() + - dest.size()==src2.size() + - dest.size()==src3.size() + ensures + - #dest == A*src1 + B*src2 + C*src3 + !*/ + + void affine_transform_range( + size_t begin, + size_t end, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C + ); + /*! + requires + - dest.size()==src1.size() + - dest.size()==src2.size() + - dest.size()==src3.size() + - begin <= end <= dest.size() + ensures + - This function operates much like + affine_transform(dest,src1,src2,src3,A,B,C,0), except that it runs over only + the half open range [begin,end) rather than processing the entire tensor. + Specifically, it does this: + - for i in the range [begin, end): + - #dest.host()[i] == A*src1.host()[i] + B*src2.host()[i] + C*src3.host()[i] + !*/ + + void affine_transform( + const rectangle& rect, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + float A, + float B, + float C + ); + /*! + requires + - dest.size()==src1.size() + - dest.size()==src2.size() + - dest.size()==src3.size() + - dest.num_samples()==src1.num_samples() + - dest.num_samples()==src2.num_samples() + - dest.num_samples()==src3.num_samples() + - get_rect(mat(dest)).contains(rect) == true + (i.e. rect must be entirely contained within dest) + ensures + - This function operates much like + affine_transform(dest,src1,src2,src3,A,B,C,0), except that it runs over only + the sub-rectangle indicated by rect. In particular, this function is equivalent + to: + set_subm(dest,rect) = A*subm(mat(src1),rect) + B*subm(mat(src2),rect) + C*subm(mat(src3),rect) + !*/ + +// ---------------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ); + /*! + requires + - have_same_dimensions(dest,src) == true + - if (A.num_samples() == 1) then + - B.num_samples() == 1 + - else + - A.num_samples() == src.num_samples() + - B.num_samples() == src.num_samples() + - A.nr() == B.nr() == src.nr() + - A.nc() == B.nc() == src.nc() + - A.k() == B.k() == src.k() + ensures + - if (A.num_samples() == 1) then + - #dest == A*src + B + (done for each sample in src) + - else + - for all valid i: + - #dest.host()[i] == A.host()[i]*src.host()[i] + B.host()[i] + !*/ + +// ---------------------------------------------------------------------------------------- + + void affine_transform_conv( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ); + /*! + requires + - have_same_dimensions(dest,src) == true + - have_same_dimensions(A, B) == true + - A.num_samples() == 1 + - A.nr() == 1 + - A.nc() == 1 + - A.k() == src.k() + ensures + - Performs #dest == A*src + B + In particular, if the elements of dest and src were indexed by (n,k,r,c) then + we would have: + #dest(n,k,r,c) == A(k)*src(n,k,r,c) + B(k). 
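+              For example (an illustrative sketch of the shapes involved, not an
+              additional requirement): if src has dimensions (num_samples=2, k=3,
+              nr=4, nc=5) then A and B must both have dimensions (1,3,1,1), and
+              channel j of every sample is scaled by A(j) and shifted by B(j).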
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void compute_adam_update (
+        size_t begin,
+        size_t end,
+        tensor& s,
+        tensor& m,
+        tensor& v,
+        const float t,
+        const float learning_rate,
+        const float weight_decay,
+        const float momentum1,
+        const float momentum2,
+        const tensor& params,
+        const tensor& params_grad
+    );
+    /*!
+        requires
+            - s.size() == m.size() == v.size() == params.size() == params_grad.size()
+            - t > 0
+            - learning_rate > 0
+            - weight_decay >= 0
+            - 0 <= momentum1 < 1
+            - 0 <= momentum2 < 1
+            - begin <= end <= params.size()
+        ensures
+            - This function implements the ADAM parameter update method described in the paper:
+                Kingma, Diederik P., and Jimmy Ba.  "Adam: A method for stochastic
+                optimization."  International Conference on Learning Representations.  2015.
+              Specifically, it implements the method shown as Algorithm 1.
+            - #s is the update vector that should be added to the parameters.
+            - The function only operates in the half open range [begin,end) of the memory
+              blocks of each tensor.  E.g. to make this function run on the entire tensor
+              set begin to 0 and end to params.size().
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void batch_normalize_inference (
+        const double eps,
+        resizable_tensor& dest,
+        const tensor& src,
+        const tensor& gamma,
+        const tensor& beta,
+        const tensor& running_means,
+        const tensor& running_variances
+    );
+    /*!
+        requires
+            - eps > 0
+            - gamma.num_samples() == 1
+            - gamma.nr() == src.nr()
+            - gamma.nc() == src.nc()
+            - gamma.k()  == src.k()
+            - have_same_dimensions(gamma, beta)
+            - have_same_dimensions(gamma, running_means)
+            - have_same_dimensions(gamma, running_variances)
+        ensures
+            - Linearly transforms src as a call to batch_normalize() would if src had means
+              and variances as given by running_means and running_variances.  That is, this
+              function performs:
+                dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
+              Note that it does it in a pointwise fashion over the samples in src.
+    !*/
+
+    void batch_normalize (
+        const double eps,
+        resizable_tensor& dest,
+        resizable_tensor& means,
+        resizable_tensor& invstds,
+        const double averaging_factor,
+        resizable_tensor& running_means,
+        resizable_tensor& running_variances,
+        const tensor& src,
+        const tensor& gamma,
+        const tensor& beta
+    );
+    /*!
+        requires
+            - eps > 0
+            - src.num_samples() > 1
+            - gamma.num_samples() == 1
+            - beta.num_samples() == 1
+            - gamma.nr() == beta.nr() == src.nr()
+            - gamma.nc() == beta.nc() == src.nc()
+            - gamma.k()  == beta.k()  == src.k()
+            - 0 <= averaging_factor <= 1
+            - if (averaging_factor != 1)
+                - have_same_dimensions(running_means, means) == true
+                - have_same_dimensions(running_variances, invstds) == true
+        ensures
+            - have_same_dimensions(#dest, src) == true
+            - #means.num_samples() == 1
+            - #invstds.num_samples() == 1
+            - means.nr() == invstds.nr() == src.nr()
+            - means.nc() == invstds.nc() == src.nc()
+            - means.k()  == invstds.k()  == src.k()
+            - #dest == the batch normalized version of src.
+            - #means == the mean values of the contents of src.
+            - #invstds == 1/(the standard deviation values of the contents of src).
+            - #running_means = (1-averaging_factor)*mat(running_means) + averaging_factor*mat(#means);
+            - #running_variances = (1-averaging_factor)*mat(running_variances) + averaging_factor*(variance of contents of src);
+    !*/
+
+    void batch_normalize_gradient (
+        const double eps,
+        const tensor& gradient_input,
+        const tensor& means,
+        const tensor& invstds,
+        const tensor& src,
+        const tensor& gamma,
+        tensor& src_grad,
+        tensor& gamma_grad,
+        tensor& beta_grad
+    );
+    /*!
+        requires
+            - eps > 0
+            - invstds and means should be the output of a call to
+              batch_normalize(eps,dest,means,invstds,src,gamma,beta)
+            - have_same_dimensions(gradient_input, src) == true
+            - have_same_dimensions(src, src_grad) == true
+            - src.num_samples() > 1
+            - gamma.num_samples() == 1
+            - have_same_dimensions(gamma, gamma_grad) == true
+            - have_same_dimensions(gamma, beta_grad) == true
+            - gamma.nr() == src.nr()
+            - gamma.nc() == src.nc()
+            - gamma.k()  == src.k()
+            - have_same_dimensions(means, gamma) == true
+            - have_same_dimensions(invstds, gamma) == true
+        ensures
+            - Let f(src,gamma,beta) == dot(gradient_input, dest output of
+              batch_normalize(eps,dest,means,invstds,src,gamma,beta))
+            - Adds the gradient of f() with respect to src to #src_grad.
+            - Assigns the gradient of f() with respect to gamma to #gamma_grad.
+            - Assigns the gradient of f() with respect to beta to #beta_grad.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void batch_normalize_conv_inference (
+        const double eps,
+        resizable_tensor& dest,
+        const tensor& src,
+        const tensor& gamma,
+        const tensor& beta,
+        const tensor& running_means,
+        const tensor& running_variances
+    );
+    /*!
+        requires
+            - eps > 0
+            - gamma.num_samples() == 1
+            - gamma.nr() == 1
+            - gamma.nc() == 1
+            - gamma.k()  == src.k()
+            - have_same_dimensions(gamma, beta)
+            - have_same_dimensions(gamma, running_means)
+            - have_same_dimensions(gamma, running_variances)
+        ensures
+            - Linearly transforms src as a call to batch_normalize_conv() would if src had
+              means and variances as given by running_means and running_variances.  That
+              is, this function performs:
+                dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
+              Note that it does this in a pointwise fashion over the samples, rows, and
+              columns in src.
+    !*/
+
+    void batch_normalize_conv (
+        const double eps,
+        resizable_tensor& dest,
+        resizable_tensor& means,
+        resizable_tensor& invstds,
+        const double averaging_factor,
+        resizable_tensor& running_means,
+        resizable_tensor& running_variances,
+        const tensor& src,
+        const tensor& gamma,
+        const tensor& beta
+    );
+    /*!
+        requires
+            - eps > 0
+            - src.num_samples() > 1
+            - gamma.num_samples() == gamma.nr() == gamma.nc() == 1
+            - beta.num_samples() == beta.nr() == beta.nc() == 1
+            - gamma.k()  == beta.k()  == src.k()
+            - 0 <= averaging_factor <= 1
+            - if (averaging_factor != 1)
+                - have_same_dimensions(running_means, means) == true
+                - have_same_dimensions(running_variances, invstds) == true
+        ensures
+            - have_same_dimensions(#dest, src) == true
+            - #means.num_samples() == means.nr() == means.nc() == 1
+            - #invstds.num_samples() == invstds.nr() == invstds.nc() == 1
+            - means.k() == invstds.k() == src.k()
+            - #dest == the batch normalized version of src.
+            - #means == the mean values of the contents of src.
+            - #invstds == 1/(the standard deviation values of the contents of src).
+            - #running_means = (1-averaging_factor)*mat(running_means) + averaging_factor*mat(#means);
+            - #running_variances = (1-averaging_factor)*mat(running_variances) + averaging_factor*(variance of contents of src);
+    !*/
+
+    void batch_normalize_conv_gradient (
+        const double eps,
+        const tensor& gradient_input,
+        const tensor& means,
+        const tensor& invstds,
+        const tensor& src,
+        const tensor& gamma,
+        tensor& src_grad,
+        tensor& gamma_grad,
+        tensor& beta_grad
+    );
+    /*!
+        requires
+            - eps > 0
+            - invstds and means should be the output of a call to
+              batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta)
+            - have_same_dimensions(gradient_input, src) == true
+            - have_same_dimensions(src, src_grad) == true
+            - src.num_samples() > 1
+            - gamma.num_samples() == gamma.nr() == gamma.nc() == 1
+            - have_same_dimensions(gamma, gamma_grad) == true
+            - have_same_dimensions(gamma, beta_grad) == true
+            - gamma.k() == src.k()
+            - have_same_dimensions(means, gamma) == true
+            - have_same_dimensions(invstds, gamma) == true
+        ensures
+            - Let f(src,gamma,beta) == dot(gradient_input, dest output of
+              batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta))
+            - Adds the gradient of f() with respect to src to #src_grad.
+            - Assigns the gradient of f() with respect to gamma to #gamma_grad.
+            - Assigns the gradient of f() with respect to beta to #beta_grad.
+    !*/
+
+// -----------------------------------------------------------------------------------
+
+    void threshold (
+        tensor& data,
+        float thresh
+    );
+    /*!
+        ensures
+            - Sets all elements of data to 1 or 0 depending on whether they are above or
+              below the given threshold.  Specifically, for all valid i:
+                - #data.host()[i] == data.host()[i]>thresh ? 1 : 0
+    !*/
+
+    void dot (
+        const tensor& a,
+        const tensor& b,
+        tensor& result,
+        size_t idx
+    );
+    /*!
+        requires
+            - a.size() == b.size()
+            - idx < result.size()
+        ensures
+            - #result.host()[idx] == result.host()[idx] + dot(a,b);
+              I.e. Adds the dot product between a and b into the idx-th element of result.
+              The reason you might want to use this more complex version of dot() is
+              because, when using CUDA, it runs by generating asynchronous kernel launches
+              whereas the version of dot() that returns the result immediately as a scalar
+              must block the host while we wait for the result to be computed and then
+              transferred from the GPU to the host for return by dot().  So this version of
+              dot() might be much faster in some cases.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void add(
+        float beta,
+        tensor& dest,
+        float alpha,
+        const tensor& src
+    );
+    /*!
+        requires
+            - One of the following is true:
+                - have_same_dimensions(src, dest)
+                - src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1
+                - src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc()
+                - src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc()
+                - src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1
+            - is_same_object(src,dest) == false
+        ensures
+            - performs: dest = beta*dest + alpha*src
+              However, how the addition happens depends on the dimensions of src.  In
+              particular, this function adds the scaled values of one src tensor to dest.
+              Each dimension of the src tensor must match the corresponding dimension of
+              the dest tensor or must be equal to 1.  In the latter case, the same value
+              from the src tensor, for those dimensions, will be used to add into the dest
+              tensor.
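+              For example (an illustrative sketch of the broadcasting rule, not an
+              additional requirement): if dest has dimensions (num_samples=2, k=3,
+              nr=4, nc=5) and src has dimensions (1,3,1,1), then src behaves like a
+              per-channel bias, i.e. each of src's k values is broadcast across the
+              samples, rows, and columns of dest.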
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void add (
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2
+    );
+    /*!
+        ensures
+            - performs: dest = src1 + src2
+              The addition happens pointwise according to 4D tensor arithmetic.  If the
+              dimensions don't match then missing elements are presumed to be equal to 0.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void assign_conv_bias_gradient (
+        tensor& grad,
+        const tensor& gradient_input
+    );
+    /*!
+        requires
+            - grad.num_samples() == 1
+            - grad.k()  >= 1
+            - grad.nr() == 1
+            - grad.nc() == 1
+            - gradient_input.k() == grad.k()
+            - gradient_input.size() > 0
+            - is_same_object(grad,gradient_input) == false
+        ensures
+            - let BIAS be a tensor with the same dimensions as grad.
+            - let OUT be the output of add(1,OUT,1,BIAS)
+            - let f(gradient_input,BIAS) == dot(gradient_input,OUT)
+            - Then this function computes the gradient of f() with respect to BIAS and
+              assigns it to grad.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void assign_bias_gradient (
+        tensor& grad,
+        const tensor& gradient_input
+    );
+    /*!
+        requires
+            - grad.num_samples() == 1
+            - gradient_input.k() == grad.k()
+            - gradient_input.nr() == grad.nr()
+            - gradient_input.nc() == grad.nc()
+            - gradient_input.size() > 0
+            - is_same_object(grad,gradient_input) == false
+        ensures
+            - let BIAS be a tensor with the same dimensions as grad.
+            - let OUT be the output of add(1,OUT,1,BIAS)
+            - let f(gradient_input,BIAS) == dot(gradient_input,OUT)
+            - Then this function computes the gradient of f() with respect to BIAS and
+              assigns it to grad.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    class tensor_conv
+    {
+    public:
+        tensor_conv(const tensor_conv&) = delete;
+        tensor_conv& operator=(const tensor_conv&) = delete;
+
+        tensor_conv() {}
+
+        void clear(
+        ) { impl.clear(); }
+
+        void operator() (
+            const bool add_to_output,
+            tensor& output,
+            const tensor& data,
+            const tensor& filters
+        ) { impl(add_to_output,output,data,filters); }
+        /*!
+            requires
+                - setup() has been called.  Specifically, setup() has been called like this:
+                    this->setup(data, filters, stride_y, stride_x, padding_y, padding_x);
+                - is_same_object(output,data) == false
+                - is_same_object(output,filters) == false
+                - filters.k() == data.k()
+                - filters.nr() <= data.nr() + 2*padding_y
+                - filters.nc() <= data.nc() + 2*padding_x
+                - #output.num_samples() == data.num_samples()
+                - #output.k() == filters.num_samples()
+                - #output.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
+                - #output.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
+            ensures
+                - Convolves filters over data.  If add_to_output==true then we add the
+                  results to output, otherwise we assign to output, overwriting the
+                  previous values in output.
+                - filters contains filters.num_samples() filters.
+        !*/
+
+        void operator() (
+            const bool add_to_output,
+            resizable_tensor& output,
+            const tensor& data,
+            const tensor& filters
+        ) { impl(add_to_output,output,data,filters); }
+        /*!
+            requires
+                - setup() has been called.
Specifically, setup() has been called like this:
+                    this->setup(data, filters, stride_y, stride_x, padding_y, padding_x);
+                - is_same_object(output,data) == false
+                - is_same_object(output,filters) == false
+                - filters.k() == data.k()
+                - filters.nr() <= data.nr() + 2*padding_y
+                - filters.nc() <= data.nc() + 2*padding_x
+            ensures
+                - Convolves filters over data.  If add_to_output==true then we add the
+                  results to output, otherwise we assign to output, overwriting the
+                  previous values in output.
+                - filters contains filters.num_samples() filters.
+                - #output.num_samples() == data.num_samples()
+                - #output.k() == filters.num_samples()
+                - #output.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
+                - #output.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
+        !*/
+
+        void get_gradient_for_data (
+            const bool add_to_output,
+            const tensor& gradient_input,
+            const tensor& filters,
+            tensor& data_gradient
+        ) { impl.get_gradient_for_data(add_to_output,gradient_input,filters,data_gradient); }
+        /*!
+            requires
+                - One of the following must be true:
+                    - filters has the same dimensions as the filters object given to the
+                      last call to operator().  Also, data_gradient has the same dimensions
+                      as the data object given to the last call to operator().
+                    - setup() has been called.  Specifically, setup() has been called like this:
+                        this->setup(data_gradient, filters, stride_y, stride_x, padding_y, padding_x);
+                - gradient_input has the following dimensions:
+                    - gradient_input.num_samples() == data_gradient.num_samples()
+                    - gradient_input.k() == filters.num_samples()
+                    - gradient_input.nr() == 1+(data_gradient.nr() + 2*padding_y - filters.nr())/stride_y
+                    - gradient_input.nc() == 1+(data_gradient.nc() + 2*padding_x - filters.nc())/stride_x
+                    - NOTE, these dimensions are what you would obtain if gradient_input
+                      has the same dimensions as the last output of operator().
+                - is_same_object(data_gradient,filters) == false
+                - is_same_object(data_gradient,gradient_input) == false
+            ensures
+                - let OUT be the output of (*this)(false,OUT,data,filters).
+                - let f(data,filters) == dot(OUT, gradient_input)
+                - if (add_to_output) then
+                    - This function finds the gradient of f() with respect to data and adds
+                      this gradient to data_gradient.
+                - else
+                    - This function finds the gradient of f() with respect to data and
+                      assigns this gradient to data_gradient, overwriting the previous
+                      values in data_gradient.
+        !*/
+
+        void get_gradient_for_filters (
+            const bool add_to_output,
+            const tensor& gradient_input,
+            const tensor& data,
+            tensor& filters_gradient
+        ) { impl.get_gradient_for_filters(add_to_output,gradient_input,data,filters_gradient); }
+        /*!
+            requires
+                - One of the following must be true:
+                    - filters_gradient has the same dimensions as the filters object given
+                      to the last call to operator().  Also, data has the same dimensions
+                      as the data object given to the last call to operator().
+                    - setup() has been called.  Specifically, setup() has been called like this:
+                        this->setup(data, filters_gradient, stride_y, stride_x, padding_y, padding_x);
+                - gradient_input has the following dimensions:
+                    - gradient_input.num_samples() == data.num_samples()
+                    - gradient_input.k() == filters.num_samples()
+                    - gradient_input.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
+                    - gradient_input.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
+                    - NOTE, these dimensions are what you would obtain if gradient_input
+                      has the same dimensions as the last output of operator().
+                - is_same_object(filters_gradient,data) == false
+                - is_same_object(filters_gradient,gradient_input) == false
+            ensures
+                - let OUT be the output of (*this)(false,OUT,data,filters).
+                - let f(data,filters) == dot(OUT, gradient_input)
+                - if (add_to_output) then
+                    - This function finds the gradient of f() with respect to filters and
+                      adds this gradient to filters_gradient.
+                - else
+                    - This function finds the gradient of f() with respect to filters and
+                      assigns this gradient to filters_gradient, overwriting the previous
+                      values in filters_gradient.
+        !*/
+
+
+        void setup(
+            const tensor& data,
+            const tensor& filters,
+            int stride_y,
+            int stride_x,
+            int padding_y,
+            int padding_x
+        ) { impl.setup(data,filters,stride_y,stride_x,padding_y,padding_x); }
+        /*!
+            requires
+                - filters.k() == data.k()
+                - stride_y > 0
+                - stride_x > 0
+                - 0 <= padding_y < filters.nr()
+                - 0 <= padding_x < filters.nc()
+            ensures
+                - When operator() is called, the output tensor will have these dimensions:
+                    - output.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
+                    - output.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
+                    - output.num_samples() == data.num_samples()
+                    - output.k() == filters.num_samples()
+                - The point of setup() is to allow this object to gather information about
+                  all the tensor sizes and filter layouts involved in the computation.  In
+                  particular, the reason the tensors are input into setup() is just to
+                  observe their sizes.  setup() doesn't do anything with the contents of
+                  the tensors, or store any kind of references to the data or filter
+                  tensors.
+        !*/
+
+    private:
+#ifdef DLIB_USE_CUDA
+        cuda::tensor_conv impl;
+#else
+        cpu::tensor_conv impl;
+#endif
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class pooling
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                The pooling object is a tool for performing spatial pooling over a tensor.
+                It can be configured to do either max or average pooling.
+        !*/
+    public:
+
+        pooling(const pooling&) = delete;
+        pooling& operator=(const pooling&) = delete;
+
+        pooling (
+        ) = default;
+
+        void clear(
+        ) { impl.clear(); }
+
+        void setup_max_pooling(
+            int window_height,
+            int window_width,
+            int stride_y,
+            int stride_x,
+            int padding_y,
+            int padding_x
+        ) { impl.setup_max_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x); }
+        /*!
+            requires
+                - window_height > 0
+                - window_width > 0
+                - stride_y > 0
+                - stride_x > 0
+                - 0 <= padding_y < window_height
+                - 0 <= padding_x < window_width
+            ensures
+                - When you call operator() it will do max pooling with the given
+                  parameters.
+        !*/
+
+        void setup_avg_pooling(
+            int window_height,
+            int window_width,
+            int stride_y,
+            int stride_x,
+            int padding_y,
+            int padding_x
+        ) { impl.setup_avg_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x); }
+        /*!
+            requires
+                - window_height > 0
+                - window_width > 0
+                - stride_y > 0
+                - stride_x > 0
+                - 0 <= padding_y < window_height
+                - 0 <= padding_x < window_width
+            ensures
+                - When you call operator() it will do average pooling with the given
+                  parameters.
+        !*/
+
+        bool does_max_pooling(
+        ) const { return impl.does_max_pooling(); }
+
+        void operator() (
+            resizable_tensor& dest,
+            const tensor& src
+        ) { impl(dest, src); }
+        /*!
+            requires
+                - is_same_object(dest,src) == false
+                - either setup_max_pooling() or setup_avg_pooling() has been called.
+ - window_width <= src.nc() + 2*padding_x + - window_height <= src.nr() + 2*padding_y + ensures + - #dest.num_samples() == src.num_samples() + - #dest.k() == src.k() + - #dest.nr() == 1 + (src.nr() + 2*padding_y - window_height)/stride_y + - #dest.nc() == 1 + (src.nc() + 2*padding_x - window_width)/stride_x + - WINDOW == centered_rect(x*stride_x + window_width/2 - padding_x, + y*stride_y + window_height/2 - padding_y, + window_width, + window_height) + - for all valid s, k, r, and c: + - if (does_max_pooling()) then + - image_plane(#dest,s,k)(r,c) == max(subm_clipped(image_plane(src,s,k),WINDOW(c,r))) + - else + - image_plane(#dest,s,k)(r,c) == mean(subm_clipped(image_plane(src,s,k),WINDOW(c,r))) + !*/ + + void get_gradient( + const tensor& gradient_input, + const tensor& dest, + const tensor& src, + tensor& grad + ) { impl.get_gradient(gradient_input, dest, src, grad); } + /*! + requires + - have_same_dimensions(gradient_input,dest) == true + - have_same_dimensions(src,grad) == true + - dest contains the result of calling (*this)(dest,src) + - is_same_object(grad,gradient_input) == false + - is_same_object(grad,dest) == false + - is_same_object(grad,src) == false + ensures + - Recalling that dest is the output of (*this)(dest,src), + let f(src) == dot(gradient_input,dest) + - Then this function computes the gradient of f() with respect to src and + adds it to grad. + !*/ + + private: +#ifdef DLIB_USE_CUDA + cuda::pooling impl; +#else + cpu::pooling impl; +#endif + }; + +// ---------------------------------------------------------------------------------------- + + void softmax ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - Note that the softmax function is a vector valued function: + s(x) == exp(x)/sum(exp(x)) + - Computes the softmax function on src and writes the results to dest. The + softmax is computed per spatial location across the different channels at + each location. That is, softmax() outputs a new tensor, #dest, where each of + the spatial locations in dest (i.e. image idx, row idx, and column idx) + contains the output of s() evaluated over the channel values at each + location. + - This function supports in-place operation, i.e. having + is_same_object(dest, src)==true + !*/ + + void softmax_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! + requires + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + ensures + - We interpret dest as the output of softmax(dest,SRC) for some SRC tensor. + Then let f(SRC) == dot(gradient_input,dest). Then this function computes the + gradient of f() with respect to SRC and stores it to grad. Moreover, if + is_same_object(grad,gradient_input)==true then the output is assigned to + grad, replacing its previous contents. Otherwise the output is added to + grad. + - This function supports in-place operation, i.e. having + is_same_object(grad, gradient_input)==true + !*/ + +// ---------------------------------------------------------------------------------------- + + void softmax_all ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - Note that the softmax function is a vector valued function: + s(x) == exp(x)/sum(exp(x)) + - Computes the softmax function on src and writes the results to dest. The + softmax is computed over the entire tensor with one invocation of s(). 
So + unlike softmax() which computes many s() evaluations, one for each spatial + location, softmax_all() calls s() once for the entire tensor. + - This function supports in-place operation, i.e. having + is_same_object(dest, src)==true + !*/ + + void softmax_all_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! + requires + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + - is_same_object(grad, dest)==false + ensures + - We interpret dest as the output of softmax_all(dest,SRC) for some SRC tensor. + Then let f(SRC) == dot(gradient_input,dest) Then this function computes the + gradient of f() with respect to SRC and assigns it to grad. + - This function supports in-place operation, i.e. having + is_same_object(grad, gradient_input)==true + !*/ + +// ---------------------------------------------------------------------------------------- + + void sigmoid ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - for all valid i: + - #dest.host()[i] == 1/(1+std::exp(-src.host()[i])) + - This function supports in-place operation, i.e. having + is_same_object(dest, src)==true + !*/ + + void sigmoid_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! + requires + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + ensures + - Recalling that dest is the output of sigmoid(dest,SRC) for some SRC tensor, + let f(SRC) == dot(gradient_input,dest). Then this function computes the + gradient of f() with respect to SRC and stores it to grad. Moreover, if + is_same_object(grad,gradient_input)==true then the output is assigned to + grad, replacing its previous contents. Otherwise the output is added to + grad. + - This function supports in-place operation, i.e. having + is_same_object(grad, gradient_input)==true + !*/ + +// ---------------------------------------------------------------------------------------- + + void relu ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - for all valid i: + - #dest.host()[i] == std::max(0,src.host()[i]) + - This function supports in-place operation, i.e. having + is_same_object(dest, src)==true + !*/ + + void relu_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! + requires + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + ensures + - Recalling that dest is the output of relu(dest,SRC) for some SRC tensor, + let f(SRC) == dot(gradient_input,dest). Then this function computes the + gradient of f() with respect to SRC and stores it to grad. Moreover, if + is_same_object(grad,gradient_input)==true then the output is assigned to + grad, replacing its previous contents. Otherwise the output is added to + grad. + - This function supports in-place operation, i.e. having + is_same_object(grad, gradient_input)==true + !*/ + +// ---------------------------------------------------------------------------------------- + + void prelu ( + tensor& dest, + const tensor& src, + const tensor& param + ); + /*! + requires + - have_same_dimensions(dest, src) == true + - param.size() == 1 + ensures + - for all valid i: + - if (src.host()[i] > 0) then + - #dest.host()[i] == src.host()[i] + - else + - #dest.host()[i] == src.host()[i] * param.host()[0] + - This function supports in-place operation, i.e. 
having + is_same_object(dest, src)==true + !*/ + + void prelu_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input, + const tensor& param, + tensor& params_grad + ); + /*! + requires + - have_same_dimensions(grad,src) == true + - have_same_dimensions(grad,gradient_input) == true + - param.size() == 1 + - params_grad.size() == 1 + - is_same_object(grad, gradient_input) == false + ensures + - Recalling that dest is the output of prelu(dest,src,param) let + f(src,param) == dot(gradient_input,dest) + - Then this function computes the gradient of f() with respect to src and + param. It assigns the gradient with respect to param to #params_grad and + adds the gradient with respect to src to #grad. + !*/ + +// ---------------------------------------------------------------------------------------- + + void tanh ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - for all valid i: + - #dest.host()[i] == std::tanh(src.host()[i]) + - This function supports in-place operation, i.e. having + is_same_object(dest, src)==true + !*/ + + void tanh_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! + requires + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + ensures + - Recalling that dest is the output of tanh(dest,SRC) for some SRC tensor, + let f(SRC) == dot(gradient_input,dest). Then this function computes the + gradient of f() with respect to SRC and stores it to grad. Moreover, if + is_same_object(grad,gradient_input)==true then the output is assigned to + grad, replacing its previous contents. Otherwise the output is added to + grad. + - This function supports in-place operation, i.e. having + is_same_object(grad, gradient_input)==true + !*/ + +// ---------------------------------------------------------------------------------------- + + void resize_bilinear ( + tensor& dest, + long dest_row_stride, + long dest_channel_stride, + const tensor& src, + long src_row_stride, + long src_channel_stride + ); + /*! + requires + - is_same_object(dest, src)==false + - dest.num_samples() == src.num_samples() + - dest.k() == src.k() + ensures + - for all valid i,k: image_plane(dest,i,k) is a copy of image_plane(src,i,k) + that has been bilinearly interpolated to fit into the shape of + image_plane(dest,i,k). + - Instead of supposing the row stride and channel stride in the tensors is + given by tensor::nc() and tensor::nr()*tensor::nc() respectively, we use the + provided stride values to transition from one row and channel to the next. + This is useful in combination with alias_tensor objects since it allows you + to operate on subwindows in an image. + !*/ + + void resize_bilinear_gradient ( + tensor& grad, + long grad_row_stride, + long grad_channel_stride, + const tensor& gradient_input, + long gradient_input_row_stride, + long gradient_input_channel_stride + ); + /*! + requires + - is_same_object(grad, gradient_input)==false + - gradient_input.num_samples() == grad.num_samples() + - gradient_input.k() == grad.k() + ensures + - Suppose that DEST is the output of resize_bilinear(DEST,SRC) for some SRC + tensor, let f(SRC) == dot(gradient_input,DEST). Then this function computes + the gradient of f() with respect to SRC and adds it to grad. It should be + noted that we don't need to know the contents of DEST to compute this + gradient. All that matters is that gradient_input have the same dimensions + as DEST. 
+ - Instead of supposing the row stride and channel stride in the tensors is + given by tensor::nc() and tensor::nr()*tensor::nc() respectively, we use the + provided stride values to transition from one row and channel to the next. + This is useful in combination with alias_tensor objects since it allows you + to operate on subwindows in an image. + !*/ + + inline void resize_bilinear ( + tensor& dest, + const tensor& src + ) { resize_bilinear(dest, dest.nc(), dest.nr()*dest.nc(), src, src.nc(), src.nr()*src.nc()); } + /*! + requires + - is_same_object(dest, src)==false + - dest.num_samples() == src.num_samples() + - dest.k() == src.k() + ensures + - for all valid i,k: image_plane(dest,i,k) is a copy of image_plane(src,i,k) + that has been bilinearly interpolated to fit into the shape of + image_plane(dest,i,k). + !*/ + + inline void resize_bilinear_gradient ( + tensor& grad, + const tensor& gradient_input + ) { resize_bilinear_gradient(grad, grad.nc(), grad.nr()*grad.nc(), gradient_input, gradient_input.nc(), gradient_input.nr()*gradient_input.nc()); } + /*! + requires + - is_same_object(grad, gradient_input)==false + - gradient_input.num_samples() == grad.num_samples() + - gradient_input.k() == grad.k() + ensures + - Suppose that DEST is the output of resize_bilinear(DEST,SRC) for some SRC + tensor, let f(SRC) == dot(gradient_input,DEST). Then this function computes + the gradient of f() with respect to SRC and adds it to grad. It should be + noted that we don't need to know the contents of DEST to compute this + gradient. All that matters is that gradient_input have the same dimensions + as DEST. + !*/ + +// ---------------------------------------------------------------------------------------- + + class multi_device_tensor_averager + { + /*! + WHAT THIS OBJECT REPRESENTS + This object is a tool for very quickly averaging a bunch of tensors + together. + !*/ + public: + + multi_device_tensor_averager(const multi_device_tensor_averager&) = delete; + multi_device_tensor_averager& operator=(const multi_device_tensor_averager&) = delete; + + multi_device_tensor_averager() = default; + + void set( + std::vector<tensor*> items + ) + /*! + requires + - All the tensors in items are the same size + ensures + - When you call average() we will average the tensors in items. + - It's important that the tensors already be allocated to their devices + before you call set(). This is because set() will setup the types of + between device transfers now and use them when you call average(). + !*/ + { + using namespace ::dlib::cuda; + accessible_groups.clear(); + epa.clear(); + if (items.size() < 1) + return; + + scale = 1.0/items.size(); + + // split item into groups of accessible devices + std::vector<tensor*> group, unused; + while(items.size() > 0) + { + group.push_back(items[0]); + for(size_t i = 1; i < items.size(); ++i) + { + if (can_access_peer(*items[0], *items[i])) + group.push_back(items[i]); + else + unused.push_back(items[i]); + } + accessible_groups.push_back(group); + unused.swap(items); + unused.clear(); + group.clear(); + } + for (auto&& g : accessible_groups) + { + for (size_t i = 1; i < g.size(); ++i) + { + epa.emplace_back(new enable_peer_access(*g[0], *g[i])); + } + } + } + + size_t num_device_groups( + ) const { return accessible_groups.size(); } + /*! + ensures + - The devices given to set() are grouped together when they can directly + access each other using GPUDirect. This function returns the number of + such groups. 
For example, if all devices can directly access each other + then the number of groups is 1. + !*/ + + void average() + /*! + requires + - All the devices have stopped writing to the tensors given to set(). So + you should probably call cudaDeviceSynchronize() on each of the relevant + devices before calling average(). + ensures + - Computes the average of all the tensors given to set() and then sets them + all equal to the average. + !*/ + { + using namespace ::dlib::cuda; + + + // First we average things within each group + for (auto&& g : accessible_groups) + { + raii_set_device set_dev(*g[0]); + if (g.size() == 1) + tt::affine_transform(*g[0], *g[0], scale); + else + tt::affine_transform(*g[0], *g[0], *g[1], scale, scale); + + for (size_t i = 2; i < g.size(); ++i) + tt::affine_transform(*g[0], *g[0], *g[i], 1, scale); + } + + if (accessible_groups.size() > 1) + { + tensor& total_avg = *accessible_groups[0][0]; + raii_set_device set_dev(total_avg); + accum_buffer.copy_size(total_avg); + // now we need to average things across groups + for (size_t i = 1; i < accessible_groups.size(); ++i) + { + memcpy(accum_buffer, *accessible_groups[i][0]); + tt::add(total_avg, total_avg, accum_buffer); + } + + // Now total_avg has the final average in it. So we need to send + // copies of it back to each of the groups. + for (size_t i = 1; i < accessible_groups.size(); ++i) + { + memcpy(*accessible_groups[i][0], total_avg); + } + } + + + // Now propagate averages back out to each element using point to point + // communication inside a group. + for (auto&& g : accessible_groups) + { + raii_set_device set_dev(*g[0]); + for (size_t i = 1; i < g.size(); ++i) + memcpy(*g[i], *g[0]); + } + } + + private: + std::vector<std::unique_ptr<::dlib::cuda::enable_peer_access>> epa; + std::vector<std::vector<tensor*>> accessible_groups; + float scale; + + resizable_tensor accum_buffer; + }; + +// ---------------------------------------------------------------------------------------- + + void copy_tensor( + bool add_to, + tensor& dest, + size_t dest_k_offset, + const tensor& src, + size_t src_k_offset, + size_t count_k + ); + /*! + requires + - dest.nc() == src.nc() + - dest.nr() == src.nr() + - dest.num_samples() == src.num_samples() + - dest.k() - dest_k_offset >= count_k + - src.k() - src_k_offset >= count_k + - is_same_object(dest,src) == false + - The memory areas of src and dest do not overlap. + ensures + - if (add_to) then + - performs: dest[i, k + dest_k_offset, r, c] += src[i, k + src_k_offset, r, c], where k in [0..count_k] + i.e., adds content of each sample from src in to corresponding place of sample at dest. + - else + - performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k in [0..count_k] + i.e., copies content of each sample from src in to corresponding place of sample at dest. + !*/ + +// ---------------------------------------------------------------------------------------- + +}} + +#ifdef NO_MAKEFILE +#include "tensor_tools.cpp" +#endif + +#endif // DLIB_TeNSOR_TOOLS_H_ + + diff --git a/ml/dlib/dlib/dnn/trainer.h b/ml/dlib/dlib/dnn/trainer.h new file mode 100644 index 000000000..7cb2bf5e5 --- /dev/null +++ b/ml/dlib/dlib/dnn/trainer.h @@ -0,0 +1,1333 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
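+
+// Example (a brief usage sketch; "my_net_type", "training_images", and
+// "training_labels" are hypothetical placeholders for user-defined types and
+// data, not names defined in this file):
+//
+//     my_net_type net;
+//     dnn_trainer<my_net_type> trainer(net);
+//     trainer.set_learning_rate(0.01);
+//     trainer.set_min_learning_rate(1e-5);
+//     trainer.set_mini_batch_size(128);
+//     trainer.be_verbose();
+//     trainer.set_synchronization_file("trainer_state.dat", std::chrono::minutes(10));
+//     trainer.train(training_images, training_labels);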
+#ifndef DLIB_DNn_TRAINER_H_ +#define DLIB_DNn_TRAINER_H_ + +#include "trainer_abstract.h" +#include "core.h" +#include "solvers.h" +#include "../statistics.h" +#include <chrono> +#include <fstream> +#include <sstream> +#include "../serialize.h" + +#include "../pipe.h" +#include "../threads.h" +#include "cuda_dlib.h" +#include "../statistics/running_gradient.h" +#include <atomic> +#include <cstdio> +#include <set> +#include <future> +#include <exception> +#include <mutex> +#include "../dir_nav.h" +#include "../md5.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <typename training_label_type> + struct dnn_job_t + { + dnn_job_t() = default; + dnn_job_t(const dnn_job_t&) = delete; + dnn_job_t& operator=(const dnn_job_t&) = delete; + + std::vector<std::vector<training_label_type>> labels; + std::vector<resizable_tensor> t; + std::vector<int> have_data; // have_data[i] is true if there is data in labels[i] and t[i]. + bool test_only = false; + }; + + template <typename training_label_type> + void swap(dnn_job_t<training_label_type>& a, dnn_job_t<training_label_type>& b) + { + a.labels.swap(b.labels); + a.t.swap(b.t); + a.have_data.swap(b.have_data); + std::swap(a.test_only,b.test_only); + } + } + + enum class force_flush_to_disk { + no = 0, + yes = 1 + }; + + template < + typename net_type, + typename solver_type = sgd + > + class dnn_trainer : private threaded_object + { + public: + + static_assert(is_loss_layer_type<net_type>::value, + "The last layer in a network must be a loss layer."); + + typedef typename net_type::training_label_type training_label_type; + typedef typename net_type::input_type input_type; + const static size_t num_computational_layers = net_type::num_computational_layers; + const static size_t num_layers = net_type::num_layers; + private: + typedef impl::dnn_job_t<training_label_type> job_t; + public: + + dnn_trainer() = delete; + dnn_trainer(const dnn_trainer&) = delete; + dnn_trainer& operator=(const dnn_trainer&) = delete; + + explicit dnn_trainer(net_type& net_) : job_pipe(0), net(net_) + { + solver_type default_solver; + devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, default_solver)); + + init(); + } + + dnn_trainer( + net_type& net_, + const solver_type& solver_ + ) : job_pipe(0), net(net_) + { + devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, solver_)); + + init(); + } + + dnn_trainer( + net_type& net_, + const solver_type& solver_, + const std::vector<int>& cuda_extra_devices + ) : job_pipe(0), net(net_) + { + devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, solver_)); + + const int total_devices = dlib::cuda::get_num_devices(); + + // Make device contexts for the extra device ids but be careful to avoid any + // duplicate ids. + std::set<int> temp(cuda_extra_devices.begin(), cuda_extra_devices.end()); + temp.erase(devices[0]->device_id); + for (auto id : temp) + { + DLIB_CASSERT(0 <= id && id < total_devices, "Invalid CUDA device id given to dnn_trainer."); + // Switch to this device so that any tensor objects that get allocated when + // we create the device context happen on this device. + dlib::cuda::set_device(id); + devices.push_back(std::make_shared<device_data>(id, net, solver_, clone_net())); + } + // Set the current device back to what it was before this constructor was + // called. 
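+            // Otherwise the last extra device would remain current and later tensor
+            // allocations made by the calling code would land on it.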
+ dlib::cuda::set_device(devices[0]->device_id); + + init(); + } + + ~dnn_trainer( + ) + { + job_pipe.disable(); + stop(); + wait(); + } + + net_type& get_net ( + force_flush_to_disk force_flush = force_flush_to_disk::yes + ) + { + wait_for_thread_to_pause(); + sync_to_disk(force_flush == force_flush_to_disk::yes); + propagate_exception(); + return net; + } + + + unsigned long get_mini_batch_size ( + ) const { return mini_batch_size; } + + void set_mini_batch_size ( + unsigned long batch_size + ) + { + DLIB_CASSERT(batch_size > 0); + mini_batch_size = batch_size; + } + + unsigned long get_max_num_epochs ( + ) const { return max_num_epochs; } + + void set_max_num_epochs ( + unsigned long num + ) + { + DLIB_CASSERT(num > 0); + max_num_epochs = num; + } + + void be_verbose ( + ) + { + verbose = true; + } + + void be_quiet ( + ) + { + verbose = false; + } + + + const std::vector<solver_type>& get_solvers ( + ) const + { + wait_for_thread_to_pause(); + propagate_exception(); + return devices[0]->solvers; + } + + void train_one_step ( + const std::vector<input_type>& data, + const std::vector<training_label_type>& labels + ) + { + DLIB_CASSERT(data.size() == labels.size()); + + train_one_step(data.begin(), data.end(), labels.begin()); + } + + template < + typename data_iterator, + typename label_iterator + > + void train_one_step ( + data_iterator dbegin, + data_iterator dend, + label_iterator lbegin + ) + { + DLIB_CASSERT(std::distance(dbegin, dend) > 0); + + print_periodic_verbose_status(); + sync_to_disk(); + send_job(false, dbegin, dend, lbegin); + + ++train_one_step_calls; + } + + void train_one_step ( + const std::vector<input_type>& data + ) + { + train_one_step(data.begin(), data.end()); + } + + template < + typename data_iterator + > + void train_one_step ( + data_iterator dbegin, + data_iterator dend + ) + { + DLIB_CASSERT(std::distance(dbegin, dend) > 0); + print_periodic_verbose_status(); + sync_to_disk(); + send_job(false, dbegin, dend); + ++train_one_step_calls; + } + + void test_one_step ( + const std::vector<input_type>& data, + const std::vector<training_label_type>& labels + ) + { + DLIB_CASSERT(data.size() == labels.size()); + + test_one_step(data.begin(), data.end(), labels.begin()); + } + + template < + typename data_iterator, + typename label_iterator + > + void test_one_step ( + data_iterator dbegin, + data_iterator dend, + label_iterator lbegin + ) + { + DLIB_CASSERT(std::distance(dbegin, dend) > 0); + + print_periodic_verbose_status(); + sync_to_disk(); + send_job(true, dbegin, dend, lbegin); + + ++test_one_step_calls; + } + + void test_one_step ( + const std::vector<input_type>& data + ) + { + test_one_step(data.begin(), data.end()); + } + + template < + typename data_iterator + > + void test_one_step ( + data_iterator dbegin, + data_iterator dend + ) + { + DLIB_CASSERT(std::distance(dbegin, dend) > 0); + print_periodic_verbose_status(); + sync_to_disk(); + send_job(true, dbegin, dend); + ++test_one_step_calls; + } + + void train ( + const std::vector<input_type>& data, + const std::vector<training_label_type>& labels + ) + { + DLIB_CASSERT(data.size() == labels.size() && data.size() > 0); + + // The reason these two loops don't initialize their counter variables but + // instead use class members is so we can include the state of the loops in the + // stuff written by sync_to_disk() + for (; + epoch_iteration < max_num_epochs && learning_rate >= min_learning_rate; + ++epoch_iteration) + { + using namespace std::chrono; + last_time = system_clock::now(); + 
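+                // Start each epoch with a clean running-loss estimate so the averages
+                // printed below cover only this epoch's mini-batches.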
clear_average_loss(); + for (; epoch_pos < data.size() && learning_rate >= min_learning_rate; epoch_pos += mini_batch_size) + { + if (verbose) + { + auto now_time = system_clock::now(); + if (now_time-last_time > seconds(20)) + { + last_time = now_time; + auto iter = epoch_iteration + epoch_pos/(double)data.size(); + std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " " + << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " " + << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; + print_progress(); + } + } + + sync_to_disk(); + send_job(false, data.begin()+epoch_pos, + data.begin()+std::min(epoch_pos+mini_batch_size,data.size()), + labels.begin()+epoch_pos); + } + epoch_pos = 0; + + if (verbose) + { + // Capitalize the E in Epoch so it's easy to grep out the lines that + // are for full epoch status statements. + std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " " + << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " " + << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; + print_progress(); + } + } + wait_for_thread_to_pause(); + // if we modified the network at all then be sure to sync the final result. + sync_to_disk(true); + } + + void train ( + const std::vector<input_type>& data + ) + { + DLIB_CASSERT(data.size() > 0); + + const bool has_unsupervised_loss = std::is_same<no_label_type, training_label_type>::value; + static_assert(has_unsupervised_loss, + "You can only call this version of train() when using an unsupervised loss."); + + // The reason these two loops don't initialize their counter variables but + // instead use class members is so we can include the state of the loops in the + // stuff written by sync_to_disk() + for (; + epoch_iteration < max_num_epochs && learning_rate >= min_learning_rate; + ++epoch_iteration) + { + using namespace std::chrono; + last_time = system_clock::now(); + clear_average_loss(); + for (; epoch_pos < data.size() && learning_rate >= min_learning_rate; epoch_pos += mini_batch_size) + { + if (verbose) + { + auto now_time = system_clock::now(); + if (now_time-last_time > seconds(20)) + { + last_time = now_time; + auto iter = epoch_iteration + epoch_pos/(double)data.size(); + std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " " + << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " " + << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; + print_progress(); + } + } + + sync_to_disk(); + send_job(false, data.begin()+epoch_pos, + data.begin()+std::min(epoch_pos+mini_batch_size,data.size())); + } + epoch_pos = 0; + + if (verbose) + { + // Capitalize the E in Epoch so it's easy to grep out the lines that + // are for full epoch status statements. + std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " " + << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " " + << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; + print_progress(); + } + } + wait_for_thread_to_pause(); + // if we modified the network at all then be sure to sync the final result. 
+ sync_to_disk(true); + } + + void set_synchronization_file ( + const std::string& filename, + std::chrono::seconds time_between_syncs_ = std::chrono::minutes(15) + ) + { + last_sync_time = std::chrono::system_clock::now(); + sync_filename = filename; + time_between_syncs = time_between_syncs_; + + // check if the sync file already exists, if it does we should load it. + std::ifstream fin(newest_syncfile(), std::ios::binary); + if (fin) + deserialize(*this, fin); + } + + const std::string& get_synchronization_file ( + ) + { + return sync_filename; + } + + double get_average_loss ( + ) const + { + wait_for_thread_to_pause(); + return rs.mean(); + } + + double get_average_test_loss ( + ) const + { + wait_for_thread_to_pause(); + return rs_test.mean(); + } + + void clear_average_loss ( + ) + { + wait_for_thread_to_pause(); + rs.clear(); + } + + void set_learning_rate ( + double lr + ) + { + DLIB_CASSERT(lr > 0); + wait_for_thread_to_pause(); + if (learning_rate != lr) + { + steps_without_progress = 0; + test_steps_without_progress = 0; + previous_loss_values.clear(); + test_previous_loss_values.clear(); + } + learning_rate = lr; + lr_schedule.set_size(0); + } + + double get_learning_rate( + ) const + { + return learning_rate; + } + + void set_min_learning_rate ( + double lr + ) + { + DLIB_CASSERT(lr > 0); + wait_for_thread_to_pause(); + lr_schedule.set_size(0); + min_learning_rate = lr; + } + + double get_min_learning_rate ( + ) const + { + return min_learning_rate; + } + + template <typename EXP> + void set_learning_rate_schedule ( + const matrix_exp<EXP>& schedule + ) + { + DLIB_CASSERT(schedule.size() > 0); + DLIB_CASSERT(min(schedule) > 0); + set_learning_rate(schedule(0,0)); + set_min_learning_rate(min(schedule)); + set_learning_rate_shrink_factor(1); + lr_schedule = matrix_cast<double>(reshape_to_column_vector(schedule)); + lr_schedule_pos = 0; + } + + const matrix<double,0,1>& get_learning_rate_schedule ( + ) const + { + return lr_schedule; + } + + void set_iterations_without_progress_threshold ( + unsigned long thresh + ) + { + wait_for_thread_to_pause(); + lr_schedule.set_size(0); + iter_without_progress_thresh = thresh; + } + + unsigned long get_iterations_without_progress_threshold ( + ) const + { + return iter_without_progress_thresh; + } + + unsigned long get_steps_without_progress ( + ) const + { + return steps_without_progress; + } + + void set_test_iterations_without_progress_threshold ( + unsigned long thresh + ) + { + wait_for_thread_to_pause(); + lr_schedule.set_size(0); + test_iter_without_progress_thresh = thresh; + } + + unsigned long get_test_iterations_without_progress_threshold ( + ) const + { + return test_iter_without_progress_thresh; + } + + unsigned long get_test_steps_without_progress ( + ) const + { + return test_steps_without_progress; + } + + void set_learning_rate_shrink_factor ( + double shrink + ) + { + DLIB_CASSERT(0 < shrink && shrink <= 1); + wait_for_thread_to_pause(); + lr_schedule.set_size(0); + learning_rate_shrink = shrink; + steps_without_progress = 0; + test_steps_without_progress = 0; + } + + double get_learning_rate_shrink_factor ( + ) const + { + return learning_rate_shrink; + } + + unsigned long long get_train_one_step_calls ( + ) const + { + return train_one_step_calls; + } + + unsigned long long get_test_one_step_calls ( + ) const + { + return test_one_step_calls; + } + + private: + + void record_test_loss(double loss) + { + test_previous_loss_values.push_back(loss); + if (is_finite(loss)) + rs_test.add(loss); + // discard really old loss 
values. + while (test_previous_loss_values.size() > test_iter_without_progress_thresh) + test_previous_loss_values.pop_front(); + } + + void record_loss(double loss) + { + // This kind of budgeting causes our gradient checking to use a fixed amount of + // computational resources, regardless of the size of iter_without_progress_thresh. + gradient_check_budget += 200; + + rs.add(loss); + previous_loss_values.push_back(loss); + // discard really old loss values. + while (previous_loss_values.size() > iter_without_progress_thresh) + previous_loss_values.pop_front(); + } + + template <typename T> + double compute_parameter_gradients(size_t device, job_t& next_job, const T&) + { + if (next_job.have_data[device]) + { + auto&& dev = *devices[device]; + dlib::cuda::set_device(dev.device_id); + if (next_job.test_only) + return dev.net.compute_loss(next_job.t[device], next_job.labels[device].begin()); + else + return dev.net.compute_parameter_gradients(next_job.t[device], next_job.labels[device].begin()); + } + else + { + return 0; + } + } + + double compute_parameter_gradients(size_t device, job_t& next_job, const no_label_type&) + { + if (next_job.have_data[device]) + { + auto&& dev = *devices[device]; + dlib::cuda::set_device(dev.device_id); + no_label_type pick_which_run_update; + if (next_job.test_only) + return dev.net.compute_loss(next_job.t[device]); + else + return dev.net.compute_parameter_gradients(next_job.t[device]); + } + else + { + return 0; + } + } + + void update_parameters(size_t device) + { + auto&& dev = *devices[device]; + dlib::cuda::set_device(dev.device_id); + dev.net.update_parameters(make_sstack(dev.solvers), learning_rate); + } + + void thread() try + { + training_label_type pick_which_run_update; + job_t next_job; + + std::vector<dlib::future<double>> losses(devices.size()); + + std::vector<tt::multi_device_tensor_averager> averagers; + // An array of all the parameter tensors in the first network. We will + // periodically copy these tensors to all the other devices to make sure the + // different GPUs don't go out of sync. + std::vector<tensor*> reference_params; + visit_layer_parameters(devices[0]->net, [&](size_t, tensor& t) { reference_params.push_back(&t); }); + + // We make separate thread pools with just one thread in them because we want + // to make sure each device is always executed on the same thread. We care + // about this because there are thread_local context variables for some cuda + // components and they get allocated for each combination of thread and device. + // So if we make sure the same device always uses the same thread this will + // reduce the number of contexts we allocate from num_devices*num_devices to + // just num_devices. + std::vector<std::shared_ptr<thread_pool>> tp; + for (size_t i = 0; i < devices.size(); ++i) + tp.push_back(std::make_shared<thread_pool>(1)); + + + main_iteration_counter = 0; + while(job_pipe.dequeue(next_job)) + { + if (next_job.test_only) + { + // compute the testing loss + for (size_t i = 0; i < devices.size(); ++i) + tp[i]->add_task_by_value([&,i](double& loss){ loss = compute_parameter_gradients(i, next_job, pick_which_run_update); }, losses[i]); + // aggregate loss values from all the network computations. + double theloss = 0; + for (auto&& loss : losses) + theloss += loss.get(); + record_test_loss(theloss/losses.size()); + + // Check if we should shrink the learning rate based on how the test + // error has been doing lately. 
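+                    // The check happens in two stages: a plain
+                    // count_steps_without_decrease() first, and only if that looks flat
+                    // do we re-test with count_steps_without_decrease_robust(), which
+                    // ignores the largest 10% of loss values so that one bad mini-batch
+                    // can't trigger a learning rate shrink on its own.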
+ if (learning_rate_shrink != 1) + { + test_steps_without_progress = count_steps_without_decrease(test_previous_loss_values); + if (test_steps_without_progress >= test_iter_without_progress_thresh) + { + test_steps_without_progress = count_steps_without_decrease_robust(test_previous_loss_values); + if (test_steps_without_progress >= test_iter_without_progress_thresh) + { + // optimization has flattened out, so drop the learning rate. + learning_rate = learning_rate_shrink*learning_rate; + test_steps_without_progress = 0; + // Empty out some of the previous loss values so that test_steps_without_progress + // will decrease below test_iter_without_progress_thresh. + for (unsigned long cnt = 0; cnt < test_previous_loss_values_dump_amount+test_iter_without_progress_thresh/10 && test_previous_loss_values.size() > 0; ++cnt) + test_previous_loss_values.pop_front(); + } + } + } + continue; + } + + updated_net_since_last_sync = true; + ++main_iteration_counter; + // Call compute_parameter_gradients() and update_parameters() but pick the + // right version for unsupervised or supervised training based on the type + // of training_label_type. + for (size_t i = 0; i < devices.size(); ++i) + tp[i]->add_task_by_value([&,i](double& loss){ loss = compute_parameter_gradients(i, next_job, pick_which_run_update); }, losses[i]); + // aggregate loss values from all the network computations. + double theloss = 0; + for (auto&& loss : losses) + theloss += loss.get(); + record_loss(theloss/losses.size()); + + // Now, if there is more than one active device we need to synchronize the + // gradient updates between devices. So we do that now. + if (devices.size() > 1) + { + // if this is the first iteration then we need to setup the averagers. + // We can't do this outside the loop because the tensors that get + // averaged need to be allocated to their devices before we call set() + // so that the averagers can determine how best to average them. + if (averagers.size() == 0 || sync_file_reloaded) + { + averagers = std::vector<tt::multi_device_tensor_averager>(net_type::num_computational_layers); + // setup the averagers to point to the tensors in the networks. + std::vector<std::vector<tensor*>> all_tensors(devices.size()); + for (size_t i = 0; i < all_tensors.size(); ++i) + { + all_tensors[i].resize(net_type::num_computational_layers); + visit_layer_parameter_gradients(devices[i]->net, [&](size_t j, tensor& t){ + all_tensors[i][j] = &t; + }); + } + // Now set each averager to average the tensors at the same layer in each + // network. + for (size_t i = 0; i < net_type::num_computational_layers; ++i) + { + std::vector<tensor*> temp(all_tensors.size()); + for (size_t j = 0; j < all_tensors.size(); ++j) + temp[j] = all_tensors[j][i]; + // ignore layers that don't have parameters + if (temp[0]->size() != 0) + averagers[i].set(temp); + } + + sync_file_reloaded = false; + } + + + for (auto&& d : devices) + cuda::device_synchronize(d->device_id); + + for (auto&& avg : averagers) + avg.average(); + } + + + // Now apply all the updates to each device. + for (size_t i = 0; i < devices.size(); ++i) + tp[i]->add_task_by_value([&,i](){ if (next_job.have_data[i]) update_parameters(i); }); + // and wait for the updates to all happen. + for (size_t i = 0; i < devices.size(); ++i) + tp[i]->wait_for_all_tasks(); + + + // Every now and then force all the parameters to be the same just to make + // sure they aren't drifting apart due to any non-deterministic behavior on + // the GPU. 
It's also important to do this on the first iteration because + // the different networks may be initialized differently when tensor data + // is first passed through them. So this code block deals with these + // issues. + if (devices.size() > 1 && main_iteration_counter%2000 == 1) + { + for (size_t i = 1; i < devices.size(); ++i) + { + visit_layer_parameters(devices[i]->net, [&](size_t j, tensor& t) + { + memcpy(t, *reference_params[j]); + }); + } + } + + // If we have been running for a while then check if the loss is still + // dropping. If it isn't then we will reduce the learning rate. Note that we + // have a "budget" that prevents us from calling + // count_steps_without_decrease() every iteration. We do this because + // it can be expensive to compute when previous_loss_values is large. + if (gradient_check_budget > iter_without_progress_thresh && learning_rate_shrink != 1) + { + gradient_check_budget = 0; + steps_without_progress = count_steps_without_decrease(previous_loss_values); + if (steps_without_progress >= iter_without_progress_thresh) + { + // Double check that we aren't seeing decrease. This second check + // discards the top 10% largest values and checks again. We do + // this because sometimes a mini-batch might be bad and cause the + // loss to suddenly jump up, making count_steps_without_decrease() + // return a large number. But if we discard the top 10% of the + // values in previous_loss_values then we are robust to that kind + // of noise. Another way of looking at it, if the reason + // count_steps_without_decrease() returns a large value is only + // because the most recent loss values have suddenly been large, + // then we shouldn't stop or lower the learning rate. We should + // keep going until whatever disturbance we hit is damped down. + steps_without_progress = count_steps_without_decrease_robust(previous_loss_values); + if (steps_without_progress >= iter_without_progress_thresh) + { + // optimization has flattened out, so drop the learning rate. + learning_rate = learning_rate_shrink*learning_rate; + steps_without_progress = 0; + // Empty out some of the previous loss values so that steps_without_progress + // will decrease below iter_without_progress_thresh. + for (unsigned long cnt = 0; cnt < previous_loss_values_dump_amount+iter_without_progress_thresh/10 && previous_loss_values.size() > 0; ++cnt) + previous_loss_values.pop_front(); + } + } + } + else if (lr_schedule.size() != 0) // or use the learning rate schedule if we have one. + { + if (lr_schedule_pos < lr_schedule.size()) + learning_rate = lr_schedule(lr_schedule_pos++); + else + learning_rate = lr_schedule(lr_schedule.size()-1)*0.99; + } + } + } + catch(...) + { + // If an exception happens then permanently disable the trainer object. 
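+            // The stored exception is rethrown to the caller by propagate_exception(),
+            // e.g. the next time get_net(), get_solvers(), or one of the *_one_step()
+            // methods is called.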
+ job_pipe.disable(); + std::lock_guard<std::mutex> lock(eptr_mutex); + eptr = std::current_exception(); + } + + void wait_for_thread_to_pause() const + { + job_pipe.wait_for_num_blocked_dequeues(1); + } + + const static long string_pad = 11; + const static long epoch_string_pad = 4; + const static long lr_string_pad = 4; + + void init() + { + max_num_epochs = 10000; + mini_batch_size = 128; + verbose = false; + learning_rate = 1e-2; + min_learning_rate = 1e-5; + iter_without_progress_thresh = 2000; + steps_without_progress = 0; + test_iter_without_progress_thresh = 500; + test_steps_without_progress = 0; + + learning_rate_shrink = 0.1; + epoch_iteration = 0; + epoch_pos = 0; + train_one_step_calls = 0; + test_one_step_calls = 0; + gradient_check_budget = 0; + lr_schedule_pos = 0; + + main_iteration_counter = 0; + main_iteration_counter_at_last_disk_sync = 0; + prob_loss_increasing_thresh_default_value = 0.99; + prob_loss_increasing_thresh_max_value = 0.99999; + prob_loss_increasing_thresh = prob_loss_increasing_thresh_default_value; + updated_net_since_last_sync = false; + sync_file_reloaded = false; + previous_loss_values_dump_amount = 400; + test_previous_loss_values_dump_amount = 100; + + rs_test = running_stats_decayed<double>(200); + + start(); + } + + // serialize and deserialize are private because we hold net by reference so + // allowing someone to serialize this training object is weird and will likely + // result in user errors. However, we use these functions as part of the automatic + // sync code in this object. + friend void serialize(const dnn_trainer& item, std::ostream& out) + { + item.wait_for_thread_to_pause(); + int version = 12; + serialize(version, out); + + size_t nl = dnn_trainer::num_layers; + serialize(nl, out); + serialize(item.rs, out); + serialize(item.rs_test, out); + serialize(item.previous_loss_values, out); + serialize(item.max_num_epochs, out); + serialize(item.mini_batch_size, out); + serialize(item.verbose, out); + serialize(item.net, out); + serialize(item.devices[0]->solvers, out); + serialize(item.learning_rate.load(), out); + serialize(item.min_learning_rate, out); + serialize(item.iter_without_progress_thresh.load(), out); + serialize(item.steps_without_progress.load(), out); + serialize(item.learning_rate_shrink.load(), out); + serialize(item.epoch_iteration, out); + serialize(item.epoch_pos, out); + serialize(item.train_one_step_calls, out); + serialize(item.test_one_step_calls, out); + serialize(item.lr_schedule, out); + serialize(item.lr_schedule_pos, out); + serialize(item.test_iter_without_progress_thresh.load(), out); + serialize(item.test_steps_without_progress.load(), out); + serialize(item.test_previous_loss_values, out); + serialize(item.previous_loss_values_dump_amount, out); + serialize(item.test_previous_loss_values_dump_amount, out); + + } + friend void deserialize(dnn_trainer& item, std::istream& in) + { + item.wait_for_thread_to_pause(); + int version = 0; + deserialize(version, in); + if (version != 12) + throw serialization_error("Unexpected version found while deserializing dlib::dnn_trainer."); + + size_t num_layers = 0; + deserialize(num_layers, in); + if (num_layers != dnn_trainer::num_layers) + { + std::ostringstream sout; + sout << "Error deserializing dlib::dnn_trainer. The saved sync file is for a network with " << std::endl; + sout << "a different number of layers. 
We expected the number of layers to be " << dnn_trainer::num_layers << " but" << std::endl; + sout << "instead the file contains " << num_layers << " layers." << std::endl; + throw serialization_error(sout.str()); + } + + double dtemp; long ltemp; + deserialize(item.rs, in); + deserialize(item.rs_test, in); + deserialize(item.previous_loss_values, in); + deserialize(item.max_num_epochs, in); + deserialize(item.mini_batch_size, in); + deserialize(item.verbose, in); + deserialize(item.net, in); + deserialize(item.devices[0]->solvers, in); + deserialize(dtemp, in); item.learning_rate = dtemp; + deserialize(item.min_learning_rate, in); + deserialize(ltemp, in); item.iter_without_progress_thresh = ltemp; + deserialize(ltemp, in); item.steps_without_progress = ltemp; + deserialize(dtemp, in); item.learning_rate_shrink = dtemp; + deserialize(item.epoch_iteration, in); + deserialize(item.epoch_pos, in); + deserialize(item.train_one_step_calls, in); + deserialize(item.test_one_step_calls, in); + deserialize(item.lr_schedule, in); + deserialize(item.lr_schedule_pos, in); + deserialize(ltemp, in); item.test_iter_without_progress_thresh = ltemp; + deserialize(ltemp, in); item.test_steps_without_progress = ltemp; + deserialize(item.test_previous_loss_values, in); + deserialize(item.previous_loss_values_dump_amount, in); + deserialize(item.test_previous_loss_values_dump_amount, in); + + if (item.devices.size() > 1) + { + const auto prev_dev = dlib::cuda::get_device(); + // initialize all the other device networks and solver objects + for (size_t i = 1; i < item.devices.size(); ++i) + { + // Switch to this device so that any tensor objects that get allocated when + // we copy this stuff happen on this device. + dlib::cuda::set_device(item.devices[i]->device_id); + item.devices[i]->solvers = item.devices[0]->solvers; + item.devices[i]->net = item.devices[0]->net; + } + dlib::cuda::set_device(prev_dev); + } + } + + void sync_to_disk ( + bool do_it_now = false + ) + { + // don't sync anything if we haven't updated the network since the last sync + if (!updated_net_since_last_sync) + return; + + // If the sync file isn't set then don't do anything. + if (sync_filename.size() == 0) + return; + + // Only sync if it has been long enough since the last sync or we are being + // explicitly forced to do it. + if (std::chrono::system_clock::now() - last_sync_time > time_between_syncs || + do_it_now) + { + wait_for_thread_to_pause(); + + // compact network before saving to disk. + this->net.clean(); + + // if the loss has actually been going up since the last time we saved our + // state to disk then something has probably gone wrong in the + // optimization. So in this case we do the opposite and recall the + // previously saved state in the hopes that the problem won't reoccur. 
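+                // (Recovery is possible because we alternate between two files on disk,
+                // sync_filename and sync_filename + "_", always loading the newest and
+                // overwriting the oldest; see newest_syncfile()/oldest_syncfile() below.)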
+                if (loss_increased_since_last_disk_sync())
+                {
+                    std::ifstream fin(newest_syncfile(), std::ios::binary);
+                    deserialize(*this, fin);
+                    sync_file_reloaded = true;
+                    if (verbose)
+                        std::cout << "Loss has been increasing, reloading saved state from " << newest_syncfile() << std::endl;
+                }
+                else
+                {
+
+                    const std::string filename = oldest_syncfile();
+                    serialize(filename) << *this;
+
+                    if (verbose)
+                        std::cout << "Saved state to " << filename << std::endl;
+                }
+
+                last_sync_time = std::chrono::system_clock::now();
+                main_iteration_counter_at_last_disk_sync = main_iteration_counter;
+                updated_net_since_last_sync = false;
+            }
+        }
+
+        std::string newest_syncfile (
+        )
+        {
+            return select_newest_file(sync_filename, sync_filename + "_");
+        }
+
+        std::string oldest_syncfile (
+        )
+        {
+            return select_oldest_file(sync_filename, sync_filename + "_");
+        }
+
+        bool loss_increased_since_last_disk_sync()
+        {
+            size_t gradient_updates_since_last_sync = main_iteration_counter - main_iteration_counter_at_last_disk_sync;
+
+            // if we haven't synced anything to disk yet then return false.
+            if (!std::ifstream(newest_syncfile(), std::ios::binary))
+                return false;
+
+            for (auto x : previous_loss_values)
+            {
+                // If we get a NaN value of loss assume things have gone horribly wrong and
+                // we should reload the state of the trainer.
+                if (std::isnan(x))
+                    return true;
+            }
+
+            // if we haven't seen much data yet then just say false.  Or, alternatively, if
+            // it's been too long since the last sync then don't reload either.
+            if (gradient_updates_since_last_sync < 30 || previous_loss_values.size() < 2*gradient_updates_since_last_sync)
+                return false;
+
+            // Now look at the data since a little before the last disk sync.  We will
+            // check if the loss is getting better or worse.
+            running_gradient g;
+            for (size_t i = previous_loss_values.size() - 2*gradient_updates_since_last_sync; i < previous_loss_values.size(); ++i)
+                g.add(previous_loss_values[i]);
+
+            // if the loss is very likely to be increasing then return true
+            const double prob = g.probability_gradient_greater_than(0);
+            if (prob > prob_loss_increasing_thresh && prob_loss_increasing_thresh <= prob_loss_increasing_thresh_max_value)
+            {
+                // Exponentially decay the threshold towards 1 so that if we keep finding
+                // the loss to be increasing over and over we will make the test
+                // progressively harder and harder until it fails, therefore ensuring we
+                // can't get stuck reloading from a previous state over and over.
+                prob_loss_increasing_thresh = 0.1*prob_loss_increasing_thresh + 0.9*1;
+                return true;
+            }
+            else
+            {
+                // decay back to the default threshold
+                prob_loss_increasing_thresh = std::pow(prob_loss_increasing_thresh, 10.0);
+                // but don't decay below the default value
+                prob_loss_increasing_thresh = std::max(prob_loss_increasing_thresh, prob_loss_increasing_thresh_default_value);
+
+                return false;
+            }
+        }
+
+
+        struct clone_net{};
+
+        // per device state.  All the containers have the same number of objects in them.
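+        // Note the clone_net tag constructor in device_data below: each extra GPU gets
+        // its own private copy of the network (net_copy), while the first device trains
+        // the user's network in place through the net reference.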
+ struct device_data + { + device_data( + int device_id_, + net_type& net_, + const solver_type& solver_ + ) : device_id(device_id_), net(net_), solvers(num_computational_layers, solver_) {} + + device_data( + int device_id_, + net_type& net_, + const solver_type& solver_, + clone_net + ) : device_id(device_id_), net_copy(std::make_shared<net_type>(net_)), net(*net_copy), solvers(num_computational_layers, solver_) {} + + int device_id; + std::shared_ptr<net_type> net_copy; + net_type& net; + std::vector<solver_type> solvers; + }; + + template < + typename data_iterator, + typename label_iterator + > + void send_job ( + bool test_only, + data_iterator dbegin, + data_iterator dend, + label_iterator lbegin + ) + { + propagate_exception(); + size_t num = std::distance(dbegin, dend); + size_t devs = devices.size(); + job.t.resize(devs); + job.labels.resize(devs); + job.have_data.resize(devs); + job.test_only = test_only; + + // chop the data into devs blocks, each of about block_size elements. + size_t block_size = (num+devs-1)/devs; + + const auto prev_dev = dlib::cuda::get_device(); + for (size_t i = 0; i < devs; ++i) + { + dlib::cuda::set_device(devices[i]->device_id); + + size_t start = i*block_size; + size_t stop = std::min(num, start+block_size); + + if (start < stop) + { + devices[i]->net.to_tensor(dbegin+start, dbegin+stop, job.t[i]); + job.labels[i].assign(lbegin+start, lbegin+stop); + job.have_data[i] = true; + } + else + { + job.have_data[i] = false; + } + } + + dlib::cuda::set_device(prev_dev); + job_pipe.enqueue(job); + } + + template < + typename data_iterator + > + void send_job ( + bool test_only, + data_iterator dbegin, + data_iterator dend + ) + { + typename std::vector<training_label_type>::iterator nothing; + send_job(test_only, dbegin, dend, nothing); + } + + void print_progress() + { + if (lr_schedule.size() == 0) + { + if (test_previous_loss_values.size() == 0) + std::cout << "steps without apparent progress: " << steps_without_progress; + else + std::cout << "steps without apparent progress: train=" << steps_without_progress << ", test=" << test_steps_without_progress; + } + else + { + std::ostringstream sout; + sout << "percent complete: " << std::fixed << std::setprecision(2) << 100.0*lr_schedule_pos/(double)lr_schedule.size() << "%"; + std::cout << sout.str(); + } + std::cout << std::endl; + } + + void print_periodic_verbose_status() + { + if (verbose) + { + using namespace std::chrono; + auto now_time = system_clock::now(); + if (now_time-last_time > seconds(40)) + { + last_time = now_time; + std::cout << "step#: " << rpad(cast_to_string(train_one_step_calls),epoch_string_pad) << " " + << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " "; + if (test_previous_loss_values.size() == 0) + { + std::cout << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; + } + else + { + std::cout << "train loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; + std::cout << "test loss: " << rpad(cast_to_string(get_average_test_loss()),string_pad) << " "; + } + print_progress(); + clear_average_loss(); + } + } + } + + std::vector<std::shared_ptr<device_data>> devices; + dlib::pipe<job_t> job_pipe; + job_t job; + + + running_stats<double> rs; + running_stats_decayed<double> rs_test; + std::deque<double> previous_loss_values; + unsigned long max_num_epochs; + size_t mini_batch_size; + bool verbose; + net_type& net; + std::atomic<double> learning_rate; + double min_learning_rate; + std::atomic<unsigned long> 
iter_without_progress_thresh; + std::atomic<unsigned long> steps_without_progress; + + std::atomic<unsigned long> test_iter_without_progress_thresh; + std::atomic<unsigned long> test_steps_without_progress; + std::deque<double> test_previous_loss_values; + + std::atomic<double> learning_rate_shrink; + std::chrono::time_point<std::chrono::system_clock> last_sync_time; + std::string sync_filename; + std::chrono::seconds time_between_syncs; + unsigned long epoch_iteration; + size_t epoch_pos; + std::chrono::time_point<std::chrono::system_clock> last_time; + unsigned long long train_one_step_calls; + unsigned long long test_one_step_calls; + matrix<double,0,1> lr_schedule; + long lr_schedule_pos; + unsigned long gradient_check_budget; + + std::exception_ptr eptr = nullptr; + mutable std::mutex eptr_mutex; + void propagate_exception() const + { + std::lock_guard<std::mutex> lock(eptr_mutex); + if (eptr) + std::rethrow_exception(eptr); + } + + // These 5 variables are not serialized + size_t main_iteration_counter; + size_t main_iteration_counter_at_last_disk_sync; + double prob_loss_increasing_thresh_default_value; + double prob_loss_increasing_thresh_max_value; + double prob_loss_increasing_thresh; + std::atomic<bool> updated_net_since_last_sync; + + bool sync_file_reloaded; + unsigned long previous_loss_values_dump_amount; + unsigned long test_previous_loss_values_dump_amount; + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename net_type, + typename solver_type + > + std::ostream& operator<< ( + std::ostream& out, + dnn_trainer<net_type,solver_type>& trainer + ) + { + using std::endl; + out << "dnn_trainer details: \n"; + out << " net_type::num_layers: " << net_type::num_layers << endl; + // figure out how big the net is in MB. + std::ostringstream sout; + net_type temp = trainer.get_net(); // make a copy so that we can clean it without mutating the trainer's net. + temp.clean(); + serialize(temp, sout); + out << " net size: " << sout.str().size()/1024.0/1024.0 << "MB" << endl; + // Don't include the loss params in the hash since we print them on the next line. + // They also aren't really part of the "architecture" of the network. 
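+        // The hash below is an md5 of the network's printed description (via
+        // cast_to_string), so it identifies the architecture rather than the current
+        // parameter values.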
+        out << " net architecture hash: " << md5(cast_to_string(trainer.get_net().subnet())) << endl;
+        out << " loss: " << trainer.get_net().loss_details() << endl;
+
+        out << " synchronization file: " << trainer.get_synchronization_file() << endl;
+        out << " trainer.get_solvers()[0]: " << trainer.get_solvers()[0] << endl;
+        auto sched = trainer.get_learning_rate_schedule();
+        if (sched.size() != 0)
+        {
+            out << " using explicit user-supplied learning rate schedule" << endl;
+        }
+        else
+        {
+            out << " learning rate: "<< trainer.get_learning_rate() << endl;
+            out << " learning rate shrink factor: "<< trainer.get_learning_rate_shrink_factor() << endl;
+            out << " min learning rate: "<< trainer.get_min_learning_rate() << endl;
+            out << " iterations without progress threshold: "<< trainer.get_iterations_without_progress_threshold() << endl;
+            out << " test iterations without progress threshold: "<< trainer.get_test_iterations_without_progress_threshold() << endl;
+        }
+        return out;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_TRAINER_H_
+
diff --git a/ml/dlib/dlib/dnn/trainer_abstract.h b/ml/dlib/dlib/dnn/trainer_abstract.h
new file mode 100644
index 000000000..3bfb6dc99
--- /dev/null
+++ b/ml/dlib/dlib/dnn/trainer_abstract.h
@@ -0,0 +1,765 @@
+// Copyright (C) 2015  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_DNn_TRAINER_ABSTRACT_H_
+#ifdef DLIB_DNn_TRAINER_ABSTRACT_H_
+
+#include "core_abstract.h"
+#include "solvers_abstract.h"
+#include <vector>
+#include <chrono>
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    enum class force_flush_to_disk {
+        no = 0,
+        yes = 1
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename net_type,
+        typename solver_type = sgd
+        >
+    class dnn_trainer
+    {
+        /*!
+            REQUIREMENTS ON net_type
+                - net_type is an add_loss_layer object.
+
+            REQUIREMENTS ON solver_type
+                - solver_type is an implementation of the EXAMPLE_SOLVER interface defined
+                  in solvers_abstract.h
+
+            WHAT THIS OBJECT REPRESENTS
+                This object is a tool for training a deep neural network.  To use it you
+                supply a neural network type and a solver, then you call train() with your
+                training data and it will update the network you supplied so that it has
+                hopefully learned something useful from your training data.
+
+                If you are compiling with CUDA then this object will use the GPU that is
+                currently selected (i.e. the one indicated by cudaGetDevice()) when
+                dnn_trainer is constructed.  It will continue to use that device even if
+                you later change it by a call to cudaSetDevice().
+
+            EXCEPTIONS
+                If an exception is thrown by any part of the neural network during training
+                then the exception will be propagated out of the trainer to the user.
+                Moreover, the trainer instance will be unusable and should be destroyed.
+        !*/
+
+    public:
+
+        typedef typename net_type::training_label_type training_label_type;
+        typedef typename net_type::input_type input_type;
+        const static size_t num_computational_layers = net_type::num_computational_layers;
+
+        dnn_trainer() = delete;
+        dnn_trainer(const dnn_trainer&) = delete;
+        dnn_trainer& operator=(const dnn_trainer&) = delete;
+
+        dnn_trainer(
+            net_type& net,
+            const solver_type& solver = solver_type(),
+            const std::vector<int>& cuda_extra_devices = {}
+        );
+        /*!
+ requires + - for all valid i: + - 0 <= cuda_extra_devices[i] < dlib::cuda::get_num_devices() + ensures + - &#get_net() == &net + (i.e. The dnn_trainer holds a reference to net, it does not copy it. + Therefore, you must ensure net has a lifetime at least as long as the + dnn_trainer). + - #get_solvers() == a set of solvers that are all initialized with the + provided solver instance. + - #get_max_num_epochs() == 10000 + - #get_mini_batch_size() == 128 + - #get_learning_rate() == 1e-2 + - #get_min_learning_rate() == 1e-5 + - #get_iterations_without_progress_threshold() == 2000 + - #get_test_iterations_without_progress_threshold() == 500 + - #get_learning_rate_shrink_factor() == 0.1 + - #get_learning_rate_schedule().size() == 0 + - #get_train_one_step_calls() == 0 + - #get_test_one_step_calls() == 0 + - #get_synchronization_file() == "" + - if (cuda_extra_devices.size() > 0) then + - This object will use multiple graphics cards to run the learning + algorithms. In particular, it will always use whatever device is + currently selected on the calling thread (the device indicated by + cudaGetDevice()). In addition, you can ask to use additional + devices, which you do by putting their device numbers into + cuda_extra_devices. + !*/ + + net_type& get_net ( + force_flush_to_disk force_flush = force_flush_to_disk::yes + ); + /*! + ensures + - returns the neural network object used by this trainer. This is the + network that is optimized when you call train() or train_one_step(). + Recall that the dnn_trainer doesn't contain the net_type object but + simply holds a reference to an external network which was provided to the + dnn_trainer's constructor. + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + - If force_flush is yes, then this function will sync the trainer state to + disk if the current state hasn't already been synced to disk since the + last network modification. + !*/ + + const std::vector<solver_type>& get_solvers ( + ) const; + /*! + ensures + - returns the solvers used to optimize each layer of the neural network + get_net(). In particular, the first layer's solver is + get_solvers()[0], the second layer's solver is + get_solvers()[1], and so on. + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + !*/ + + unsigned long get_mini_batch_size ( + ) const; + /*! + ensures + - During training, we call the network's update() routine over and over + with training data. The number of training samples we give to each call + to update is the "mini-batch size", which is defined by + get_mini_batch_size(). + !*/ + + void set_mini_batch_size ( + unsigned long batch_size + ); + /*! + requires + - batch_size > 0 + ensures + - #get_mini_batch_size() == batch_size + !*/ + + unsigned long get_max_num_epochs ( + ) const; + /*! + ensures + - train() will execute at most get_max_num_epochs() iterations over the + training data before returning. + !*/ + + void set_max_num_epochs ( + unsigned long num + ); + /*! + requires + - num > 0 + ensures + - #get_max_num_epochs() == num + !*/ + + void set_learning_rate ( + double lr + ); + /*! + requires + - lr > 0 + ensures + - #get_learning_rate() == lr + - #get_learning_rate_schedule().size() == 0 + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + !*/ + + double get_learning_rate( + ) const; + /*! + ensures + - During each training step, a solver tells us how to modify the parameters + of each layer in the network. 
It does this by outputting a step vector
+                  that, when added to the parameters, will hopefully result in improved
+                  network performance.  The learning rate is one of the inputs to the
+                  solver and influences the size of this step vector.  This function
+                  returns the current learning rate, that is, the learning rate that will
+                  be used during the next training step.
+        !*/
+
+        void set_min_learning_rate (
+            double lr
+        );
+        /*!
+            requires
+                - lr > 0
+            ensures
+                - #get_min_learning_rate() == lr
+                - #get_learning_rate_schedule().size() == 0
+                - This function blocks until all threads inside the dnn_trainer have
+                  stopped touching the net.
+        !*/
+
+        double get_min_learning_rate (
+        ) const;
+        /*!
+            ensures
+                - During training via this->train(), this object will test if progress is
+                  still being made and if it isn't then it will reduce get_learning_rate()
+                  by setting it to get_learning_rate()*get_learning_rate_shrink_factor().
+                  However, it will not reduce it below get_min_learning_rate().  Once this
+                  minimum learning rate is crossed the training will terminate.
+                - get_min_learning_rate() doesn't apply if you are using train_one_step().
+                  You can keep calling train_one_step() as many times as you want and the
+                  learning rate will drop infinitely close to 0 if you run long enough.
+        !*/
+
+        template <typename EXP>
+        void set_learning_rate_schedule (
+            const matrix_exp<EXP>& schedule
+        );
+        /*!
+            requires
+                - schedule.size() > 0
+                - min(schedule) > 0
+            ensures
+                - #get_learning_rate_schedule() == reshape_to_column_vector(schedule)
+                - #get_learning_rate() == schedule(0,0)
+                - #get_min_learning_rate() == min(schedule)
+                - #get_learning_rate_shrink_factor() == 1
+        !*/
+
+        const matrix<double,0,1>& get_learning_rate_schedule (
+        ) const;
+        /*!
+            ensures
+                - if (this function returns a non-empty matrix) then
+                    - This trainer will use an explicit learning rate schedule defined by
+                      the learning rate values in get_learning_rate_schedule().  For
+                      example, if get_learning_rate_schedule() returned {0.1, 0.09, 0.08,
+                      0.07, 0.06} then the first training mini-batch would use a learning
+                      rate of 0.1, then the next training mini-batch uses 0.09, and then
+                      0.08, and so on until the end of the schedule is reached.
+
+                      If you continue to run training after the end of the schedule has
+                      been reached then the learning rate will be fixed to 0.99 times the
+                      final value.  So in our example, eventually the learning rate would
+                      be fixed to 0.99*0.06.  This allows you to test if we have reached the
+                      end of the schedule by checking if get_learning_rate() < 0.06.
+        !*/
+
+        unsigned long get_steps_without_progress (
+        ) const;
+        /*!
+            ensures
+                - if (get_learning_rate_shrink_factor() != 1) then
+                    - returns an estimate of how many mini-batches have executed without us
+                      observing a statistically significant decrease in the training error.
+                - else
+                    - returns 0
+        !*/
+
+        void set_iterations_without_progress_threshold (
+            unsigned long thresh
+        );
+        /*!
+            ensures
+                - #get_iterations_without_progress_threshold() == thresh
+                - #get_learning_rate_schedule().size() == 0
+                - This function blocks until all threads inside the dnn_trainer have
+                  stopped touching the net.
+        !*/
+
+        unsigned long get_iterations_without_progress_threshold (
+        ) const;
+        /*!
+            ensures
+                - This object monitors the progress of training and estimates if the
+                  training error is being reduced.
It does this by looking at the previous
+                  get_iterations_without_progress_threshold() mini-batch results and
+                  applying the statistical test defined by the running_gradient object to
+                  see if the training error is getting smaller.  If it isn't being reduced
+                  then get_learning_rate() is made smaller by a factor of get_learning_rate_shrink_factor().
+
+                  Therefore, get_iterations_without_progress_threshold() should always be
+                  set to something sensibly large so that this test can be done with
+                  reasonably high confidence.  Think of this test as saying "if the loss
+                  hasn't decreased for the previous get_iterations_without_progress_threshold()
+                  mini-batches then shrink the learning rate".
+        !*/
+
+        void set_learning_rate_shrink_factor (
+            double shrink
+        );
+        /*!
+            requires
+                - 0 < shrink && shrink <= 1
+            ensures
+                - #get_learning_rate_shrink_factor() == shrink
+                - #get_learning_rate_schedule().size() == 0
+                - This function blocks until all threads inside the dnn_trainer have
+                  stopped touching the net.
+        !*/
+
+        double get_learning_rate_shrink_factor (
+        ) const;
+        /*!
+            ensures
+                - Whenever the training routine thinks it isn't making progress anymore it
+                  will reduce get_learning_rate() by multiplying it by get_learning_rate_shrink_factor().
+                - You can disable the automatic learning rate reduction by setting
+                  get_learning_rate_shrink_factor() to 1.
+        !*/
+
+        unsigned long long get_train_one_step_calls (
+        ) const;
+        /*!
+            ensures
+                - returns the number of times train_one_step() has been called.
+        !*/
+
+        unsigned long long get_test_one_step_calls (
+        ) const;
+        /*!
+            ensures
+                - returns the number of times test_one_step() has been called.
+        !*/
+
+        void be_verbose (
+        );
+        /*!
+            ensures
+                - This object will print status messages to standard out so that a
+                  user can observe the progress of the algorithm.
+        !*/
+
+        void be_quiet (
+        );
+        /*!
+            ensures
+                - This object will not print anything to standard out.
+        !*/
+
+        void set_synchronization_file (
+            const std::string& filename,
+            std::chrono::seconds time_between_syncs = std::chrono::minutes(15)
+        );
+        /*!
+            ensures
+                - #get_synchronization_file() == filename
+                - While training is running, either via train() or repeated calls to
+                  train_one_step(), this object will save its entire state, including the
+                  state of get_net(), to disk in the file named filename every
+                  time_between_syncs seconds.
+                - If the filename file already exists then the state of this trainer will
+                  be loaded from that file by this call to set_synchronization_file().
+                  This allows you to resume a training session which was previously
+                  interrupted.
+                - It should be noted that when saving, the trainer will alternate between
+                  saving to a file called filename and another file called filename+"_".
+                  We do this because it's possible that your computer might crash (not
+                  because of dlib, just in general) before the data is safely saved to
+                  disk.  This way, you will always have a backup file if the write to disk
+                  gets corrupted or is incomplete.  Moreover, when loading, we will always
+                  load from the newest of the two possible files.
+        !*/
+
+        const std::string& get_synchronization_file (
+        );
+        /*!
+            ensures
+                - Returns the name of the file the dnn_trainer will periodically save its
+                  state to.  If the return value is "" then synchronization is disabled.
+        !*/
+
+        void train (
+            const std::vector<input_type>& data,
+            const std::vector<training_label_type>& labels
+        );
+        /*!
+            requires
+                - data.size() == labels.size()
+                - data.size() > 0
+                - net_type uses a supervised loss.
+                  i.e.
net_type::training_label_type != no_label_type. + ensures + - Trains a supervised neural network based on the given training data. + The goal of training is to find the network parameters that minimize + get_net().compute_loss(data.begin(), data.end(), labels.begin()). + - The optimizer will run until get_learning_rate() < get_min_learning_rate() + or get_max_num_epochs() training epochs have been executed. + - Each layer in the network will be optimized by its corresponding solver + in get_solvers(). + - Each call to train DOES NOT reinitialize the state of get_net() or + get_solvers(). That is, the existing state of the solvers and network is + the starting point for the optimization each time train() is called. In + particular, if you use the set_synchronization_file() method you can + resume an interrupted train() call by simply calling train() again and it + will pick up from the last synchronization point. + - You can obtain the average loss value during the final training epoch by + calling get_average_loss(). + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + !*/ + + void train ( + const std::vector<input_type>& data + ); + /*! + requires + - data.size() > 0 + - net_type uses an unsupervised loss. + i.e. net_type::training_label_type == no_label_type. + ensures + - Trains an unsupervised neural network based on the given training data. + The goal of training is to find the network parameters that minimize + get_net().compute_loss(data.begin(), data.end()). + - The optimizer will run until get_learning_rate() < get_min_learning_rate() + or get_max_num_epochs() training epochs have been executed. + - Each layer in the network will be optimized by its corresponding solver + in get_solvers(). + - Each call to train DOES NOT reinitialize the state of get_net() or + get_solvers(). That is, the existing state of the solvers and network is + the starting point for the optimization each time train() is called. In + particular, if you use the set_synchronization_file() method you can + resume an interrupted train() call by simply calling train() again and it + will pick up from the last synchronization point. + - You can obtain the average loss value during the final training epoch by + calling get_average_loss(). + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + !*/ + + void train_one_step ( + const std::vector<input_type>& data, + const std::vector<training_label_type>& labels + ); + /*! + requires + - data.size() == labels.size() + - data.size() > 0 + - net_type uses a supervised loss. + i.e. net_type::training_label_type != no_label_type. + ensures + - Performs one stochastic gradient update step based on the mini-batch of + data and labels supplied to this function. In particular, calling + train_one_step() in a loop is equivalent to calling the train() method + defined above. However, train_one_step() allows you to stream data from + disk into the training process while train() requires you to first load + all the training data into RAM. Otherwise, these training methods are + equivalent. + - You can observe the current average loss value by calling get_average_loss(). + - The network training will happen in another thread. Therefore, after + calling this function you should call get_net() before you touch the net + object from the calling thread to ensure no other threads are still + accessing the network. + - #get_train_one_step_calls() == get_train_one_step_calls() + 1. 
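+                - For example, a typical streaming loop looks like the following
+                  sketch (get_next_minibatch() here is a hypothetical user function,
+                  not part of dlib):
+                    std::vector<input_type> data;
+                    std::vector<training_label_type> labels;
+                    while (get_next_minibatch(data, labels))
+                        trainer.train_one_step(data, labels);
+                    trainer.get_net(); // block until the training thread is idle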
+ !*/ + + template < + typename data_iterator, + typename label_iterator + > + void train_one_step ( + data_iterator dbegin, + data_iterator dend, + label_iterator lbegin + ); + /*! + requires + - std::advance(lbegin, std::distance(dbegin, dend) - 1) is dereferencable + - std::distance(dbegin, dend) > 0 + - net_type uses a supervised loss. + i.e. net_type::training_label_type != no_label_type. + ensures + - Performs one stochastic gradient update step based on the mini-batch of + data and labels supplied to this function. In particular, calling + train_one_step() in a loop is equivalent to calling the train() method + defined above. However, train_one_step() allows you to stream data from + disk into the training process while train() requires you to first load + all the training data into RAM. Otherwise, these training methods are + equivalent. + - You can observe the current average loss value by calling get_average_loss(). + - The network training will happen in another thread. Therefore, after + calling this function you should call get_net() before you touch the net + object from the calling thread to ensure no other threads are still + accessing the network. + - #get_train_one_step_calls() == get_train_one_step_calls() + 1. + !*/ + + void train_one_step ( + const std::vector<input_type>& data + ); + /*! + requires + - data.size() > 0 + - net_type uses an unsupervised loss. + i.e. net_type::training_label_type == no_label_type. + ensures + - Performs one stochastic gradient update step based on the mini-batch of + data supplied to this function. In particular, calling train_one_step() + in a loop is equivalent to calling the train() method defined above. + However, train_one_step() allows you to stream data from disk into the + training process while train() requires you to first load all the + training data into RAM. Otherwise, these training methods are + equivalent. + - You can observe the current average loss value by calling get_average_loss(). + - The network training will happen in another thread. Therefore, after + calling this function you should call get_net() before you touch the net + object from the calling thread to ensure no other threads are still + accessing the network. + - #get_train_one_step_calls() == get_train_one_step_calls() + 1. + !*/ + + template < + typename data_iterator + > + void train_one_step ( + data_iterator dbegin, + data_iterator dend + ); + /*! + requires + - std::distance(dbegin, dend) > 0 + - net_type uses an unsupervised loss. + i.e. net_type::training_label_type == no_label_type. + ensures + - Performs one stochastic gradient update step based on the mini-batch of + data supplied to this function. In particular, calling train_one_step() + in a loop is equivalent to calling the train() method defined above. + However, train_one_step() allows you to stream data from disk into the + training process while train() requires you to first load all the + training data into RAM. Otherwise, these training methods are + equivalent. + - You can observe the current average loss value by calling get_average_loss(). + - The network training will happen in another thread. Therefore, after + calling this function you should call get_net() before you touch the net + object from the calling thread to ensure no other threads are still + accessing the network. + - #get_train_one_step_calls() == get_train_one_step_calls() + 1. + !*/ + + double get_average_loss ( + ) const; + /*! 
+ ensures + - returns the average loss value observed during previous calls to + train_one_step() or train(). That is, the average output of + net_type::update() during the previous mini-batch updates. + - Note that, if be_verbose() has been called, then this object will + automatically call clear_average_loss() periodically when it logs the + loss to the console. + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + !*/ + + void clear_average_loss ( + ); + /*! + ensures + - #get_average_loss() == 0 + - get_average_loss() uses a dlib::running_stats object to keep a running + average of the loss values seen during the previous mini-batch updates + applied during training. Calling clear_average_loss() resets the + running_stats object so it forgets about all previous loss values + observed. + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + !*/ + + // ---------------------- + + double get_average_test_loss ( + ) const; + /*! + ensures + - returns the average loss value observed during previous calls to + test_one_step(). + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + !*/ + + void test_one_step ( + const std::vector<input_type>& data, + const std::vector<training_label_type>& labels + ); + /*! + requires + - data.size() == labels.size() + - data.size() > 0 + - net_type uses a supervised loss. + i.e. net_type::training_label_type != no_label_type. + ensures + - Runs the given data through the network and computes and records the loss. + - This call does not modify network parameters. The point of + test_one_step() is two fold, to allow you to observe the accuracy of the + network on hold out data during training, and to allow the trainer to + automatically adjust the learning rate when the test loss stops + improving. It should be noted that you are not required to use + test_one_step() at all, but if you want to do this kind of thing it is + available. + - You can observe the current average loss value by calling get_average_test_loss(). + - The computation will happen in another thread. Therefore, after calling + this function you should call get_net() before you touch the net object + from the calling thread to ensure no other threads are still accessing + the network. + - #get_test_one_step_calls() == get_test_one_step_calls() + 1. + !*/ + + template < + typename data_iterator, + typename label_iterator + > + void test_one_step ( + data_iterator dbegin, + data_iterator dend, + label_iterator lbegin + ); + /*! + requires + - std::advance(lbegin, std::distance(dbegin, dend) - 1) is dereferencable + - std::distance(dbegin, dend) > 0 + - net_type uses a supervised loss. + i.e. net_type::training_label_type != no_label_type. + ensures + - Runs the given data through the network and computes and records the loss. + - This call does not modify network parameters. The point of + test_one_step() is two fold, to allow you to observe the accuracy of the + network on hold out data during training, and to allow the trainer to + automatically adjust the learning rate when the test loss stops + improving. It should be noted that you are not required to use + test_one_step() at all, but if you want to do this kind of thing it is + available. + - You can observe the current average loss value by calling get_average_test_loss(). + - The computation will happen in another thread. 
Therefore, after calling + this function you should call get_net() before you touch the net object + from the calling thread to ensure no other threads are still accessing + the network. + - #get_test_one_step_calls() == get_test_one_step_calls() + 1. + !*/ + + void test_one_step ( + const std::vector<input_type>& data + ); + /*! + requires + - data.size() > 0 + - net_type uses an unsupervised loss. + i.e. net_type::training_label_type == no_label_type. + ensures + - Runs the given data through the network and computes and records the loss. + - This call does not modify network parameters. The point of + test_one_step() is two fold, to allow you to observe the accuracy of the + network on hold out data during training, and to allow the trainer to + automatically adjust the learning rate when the test loss stops + improving. It should be noted that you are not required to use + test_one_step() at all, but if you want to do this kind of thing it is + available. + - You can observe the current average loss value by calling get_average_test_loss(). + - The computation will happen in another thread. Therefore, after calling + this function you should call get_net() before you touch the net object + from the calling thread to ensure no other threads are still accessing + the network. + - #get_test_one_step_calls() == get_test_one_step_calls() + 1. + !*/ + + template < + typename data_iterator + > + void test_one_step ( + data_iterator dbegin, + data_iterator dend + ); + /*! + requires + - std::distance(dbegin, dend) > 0 + - net_type uses an unsupervised loss. + i.e. net_type::training_label_type == no_label_type. + ensures + - Runs the given data through the network and computes and records the loss. + - This call does not modify network parameters. The point of + test_one_step() is two fold, to allow you to observe the accuracy of the + network on hold out data during training, and to allow the trainer to + automatically adjust the learning rate when the test loss stops + improving. It should be noted that you are not required to use + test_one_step() at all, but if you want to do this kind of thing it is + available. + - You can observe the current average loss value by calling get_average_test_loss(). + - The computation will happen in another thread. Therefore, after calling + this function you should call get_net() before you touch the net object + from the calling thread to ensure no other threads are still accessing + the network. + - #get_test_one_step_calls() == get_test_one_step_calls() + 1. + !*/ + + void set_test_iterations_without_progress_threshold ( + unsigned long thresh + ); + /*! + ensures + - #get_test_iterations_without_progress_threshold() == thresh + - #get_learning_rate_schedule().size() == 0 + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + !*/ + + unsigned long get_test_iterations_without_progress_threshold ( + ) const; + /*! + ensures + - This object monitors the progress of training and estimates if the + testing error is being reduced. It does this by looking at the previous + get_test_iterations_without_progress_threshold() mini-batch results from + test_one_step() and applying the statistical test defined by the + running_gradient object to see if the testing error is getting smaller. + If it isn't being reduced then get_learning_rate() is made smaller by a + factor of get_learning_rate_shrink_factor(). 
+
+                  Therefore, get_test_iterations_without_progress_threshold() should
+                  always be set to something sensibly large so that this test can be done
+                  with reasonably high confidence.  Think of this test as saying "if the
+                  testing loss hasn't decreased for the previous
+                  get_test_iterations_without_progress_threshold() calls to
+                  test_one_step() then shrink the learning rate".
+        !*/
+
+        unsigned long get_test_steps_without_progress (
+        ) const;
+        /*!
+            ensures
+                - if (get_learning_rate_shrink_factor() != 1) then
+                    - returns an estimate of how many mini-batches have executed without
+                      us observing a statistically significant decrease in the testing
+                      error (i.e. the error on the data given to the trainer via
+                      test_one_step() calls).
+                - else
+                    - returns 0
+        !*/
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename net_type,
+        typename solver_type
+    >
+    std::ostream& operator<< (
+        std::ostream& out,
+        dnn_trainer<net_type,solver_type>& trainer
+    );
+    /*!
+        ensures
+            - Prints a log of the current parameters of trainer to out.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_TRAINER_ABSTRACT_H_
+
+
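The train/test interplay documented above is easiest to see at a call site. Here is a minimal sketch of the intended usage; the network type, data containers, and threshold values (net_type, train_images, test_labels, 500, 30, and so on) are illustrative assumptions, not part of this diff:

    // Interleave train_one_step() with periodic test_one_step() so the trainer
    // can shrink the learning rate once the held-out loss stops improving.
    dlib::dnn_trainer<net_type> trainer(net);
    trainer.set_learning_rate(0.1);
    trainer.set_test_iterations_without_progress_threshold(500);
    while (trainer.get_learning_rate() >= 1e-6)
    {
        trainer.train_one_step(train_images, train_labels);
        // Record the held-out loss every 30 steps; the ratio is arbitrary.
        if (trainer.get_train_one_step_calls() % 30 == 0)
            trainer.test_one_step(test_images, test_labels);
    }
    net = trainer.get_net();  // blocks until all trainer threads stop touching net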
diff --git a/ml/dlib/dlib/dnn/utilities.h b/ml/dlib/dlib/dnn/utilities.h
new file mode 100644
index 000000000..976128c81
--- /dev/null
+++ b/ml/dlib/dlib/dnn/utilities.h
@@ -0,0 +1,281 @@
+// Copyright (C) 2016  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_UTILITIES_H_
+#define DLIB_DNn_UTILITIES_H_
+
+#include "core.h"
+#include "utilities_abstract.h"
+#include "../geometry.h"
+#include <fstream>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    inline double log1pexp(double x)
+    {
+        using std::exp;
+        using namespace std; // Do this instead of using std::log1p because some
+                             // compilers error out otherwise (e.g. gcc 4.9 in cygwin).
+        if (x <= -37)
+            return exp(x);
+        else if (-37 < x && x <= 18)
+            return log1p(exp(x));
+        else if (18 < x && x <= 33.3)
+            return x + exp(-x);
+        else
+            return x;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline void randomize_parameters (
+        tensor& params,
+        unsigned long num_inputs_and_outputs,
+        dlib::rand& rnd
+    )
+    {
+        for (auto& val : params)
+        {
+            // Draw a random number to initialize the layer according to formula (16)
+            // from "Understanding the difficulty of training deep feedforward neural
+            // networks" by Xavier Glorot and Yoshua Bengio.
+            val = 2*rnd.get_random_float()-1;
+            val *= std::sqrt(6.0/(num_inputs_and_outputs));
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    namespace impl
+    {
+        class visitor_net_to_xml
+        {
+        public:
+
+            visitor_net_to_xml(std::ostream& out_) : out(out_) {}
+
+            template<typename input_layer_type>
+            void operator()(size_t idx, const input_layer_type& l)
+            {
+                out << "<layer idx='"<<idx<<"' type='input'>\n";
+                to_xml(l,out);
+                out << "</layer>\n";
+            }
+
+            template <typename T, typename U>
+            void operator()(size_t idx, const add_loss_layer<T,U>& l)
+            {
+                out << "<layer idx='"<<idx<<"' type='loss'>\n";
+                to_xml(l.loss_details(),out);
+                out << "</layer>\n";
+            }
+
+            template <typename T, typename U, typename E>
+            void operator()(size_t idx, const add_layer<T,U,E>& l)
+            {
+                out << "<layer idx='"<<idx<<"' type='comp'>\n";
+                to_xml(l.layer_details(),out);
+                out << "</layer>\n";
+            }
+
+            template <unsigned long ID, typename U, typename E>
+            void operator()(size_t idx, const add_tag_layer<ID,U,E>& l)
+            {
+                out << "<layer idx='"<<idx<<"' type='tag' id='"<<ID<<"'/>\n";
+            }
+
+            template <template<typename> class T, typename U>
+            void operator()(size_t idx, const add_skip_layer<T,U>& l)
+            {
+                out << "<layer idx='"<<idx<<"' type='skip' id='"<<(tag_id<T>::id)<<"'/>\n";
+            }
+
+        private:
+
+            std::ostream& out;
+        };
+    }
+
+    template <typename net_type>
+    void net_to_xml (
+        const net_type& net,
+        std::ostream& out
+    )
+    {
+        auto old_precision = out.precision(9);
+        out << "<net>\n";
+        visit_layers(net, impl::visitor_net_to_xml(out));
+        out << "</net>\n";
+        // restore the original stream precision.
+        out.precision(old_precision);
+    }
+
+    template <typename net_type>
+    void net_to_xml (
+        const net_type& net,
+        const std::string& filename
+    )
+    {
+        std::ofstream fout(filename);
+        net_to_xml(net, fout);
+    }
+
+// ----------------------------------------------------------------------------------------
+
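net_to_xml() is simplest to understand from the call site. A minimal sketch, assuming a previously defined and trained network object net whose layers all provide to_xml(); the file name is illustrative and <sstream> is assumed to be included:

    // Dump a human-readable XML description of every layer to a file...
    dlib::net_to_xml(net, "lenet.xml");

    // ...or to any std::ostream, e.g. an in-memory buffer:
    std::ostringstream ss;
    dlib::net_to_xml(net, ss);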
+    namespace impl
+    {
+
+        class visitor_net_map_input_to_output
+        {
+        public:
+
+            visitor_net_map_input_to_output(dpoint& p_) : p(p_) {}
+
+            dpoint& p;
+
+            template<typename input_layer_type>
+            void operator()(const input_layer_type& net)
+            {
+            }
+
+            template <typename T, typename U>
+            void operator()(const add_loss_layer<T,U>& net)
+            {
+                (*this)(net.subnet());
+            }
+
+            template <typename T, typename U, typename E>
+            void operator()(const add_layer<T,U,E>& net)
+            {
+                (*this)(net.subnet());
+                p = net.layer_details().map_input_to_output(p);
+            }
+            template <bool B, typename T, typename U, typename E>
+            void operator()(const dimpl::subnet_wrapper<add_layer<T,U,E>,B>& net)
+            {
+                (*this)(net.subnet());
+                p = net.layer_details().map_input_to_output(p);
+            }
+
+            template <unsigned long ID, typename U, typename E>
+            void operator()(const add_tag_layer<ID,U,E>& net)
+            {
+                // tag layers are an identity transform, so do nothing
+                (*this)(net.subnet());
+            }
+            template <bool is_first, unsigned long ID, typename U, typename E>
+            void operator()(const dimpl::subnet_wrapper<add_tag_layer<ID,U,E>,is_first>& net)
+            {
+                // tag layers are an identity transform, so do nothing
+                (*this)(net.subnet());
+            }
+
+            template <template<typename> class TAG_TYPE, typename U>
+            void operator()(const add_skip_layer<TAG_TYPE,U>& net)
+            {
+                (*this)(layer<TAG_TYPE>(net));
+            }
+            template <bool is_first, template<typename> class TAG_TYPE, typename SUBNET>
+            void operator()(const dimpl::subnet_wrapper<add_skip_layer<TAG_TYPE,SUBNET>,is_first>& net)
+            {
+                // skip layers are an identity transform, so do nothing
+                (*this)(layer<TAG_TYPE>(net));
+            }
+
+        };
+
+        class visitor_net_map_output_to_input
+        {
+        public:
+            visitor_net_map_output_to_input(dpoint& p_) : p(p_) {}
+
+            dpoint& p;
+
+            template<typename input_layer_type>
+            void operator()(const input_layer_type& net)
+            {
+            }
+
+            template <typename T, typename U>
+            void operator()(const add_loss_layer<T,U>& net)
+            {
+                (*this)(net.subnet());
+            }
+
+            template <typename T, typename U, typename E>
+            void operator()(const add_layer<T,U,E>& net)
+            {
+                p = net.layer_details().map_output_to_input(p);
+                (*this)(net.subnet());
+            }
+            template <bool B, typename T, typename U, typename E>
+            void operator()(const dimpl::subnet_wrapper<add_layer<T,U,E>,B>& net)
+            {
+                p = net.layer_details().map_output_to_input(p);
+                (*this)(net.subnet());
+            }
+
+            template <unsigned long ID, typename U, typename E>
+            void operator()(const add_tag_layer<ID,U,E>& net)
+            {
+                // tag layers are an identity transform, so do nothing
+                (*this)(net.subnet());
+            }
+            template <bool is_first, unsigned long ID, typename U, typename E>
+            void operator()(const dimpl::subnet_wrapper<add_tag_layer<ID,U,E>,is_first>& net)
+            {
+                // tag layers are an identity transform, so do nothing
+                (*this)(net.subnet());
+            }
+
+            template <template<typename> class TAG_TYPE, typename U>
+            void operator()(const add_skip_layer<TAG_TYPE,U>& net)
+            {
+                (*this)(layer<TAG_TYPE>(net));
+            }
+            template <bool is_first, template<typename> class TAG_TYPE, typename SUBNET>
+            void operator()(const dimpl::subnet_wrapper<add_skip_layer<TAG_TYPE,SUBNET>,is_first>& net)
+            {
+                // skip layers are an identity transform, so do nothing
+                (*this)(layer<TAG_TYPE>(net));
+            }
+
+        };
+    }
+
+    template <typename net_type>
+    inline dpoint input_tensor_to_output_tensor(
+        const net_type& net,
+        dpoint p
+    )
+    {
+        impl::visitor_net_map_input_to_output temp(p);
+        temp(net);
+        return p;
+    }
+
+    template <typename net_type>
+    inline dpoint output_tensor_to_input_tensor(
+        const net_type& net,
+        dpoint p
+    )
+    {
+        impl::visitor_net_map_output_to_input temp(p);
+        temp(net);
+        return p;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_UTILITIES_H_
+
+
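The two visitor classes above exist only to drive the free functions at the end of the file. A round-trip sketch, assuming a fully convolutional network object net whose layers all implement the map_input_to_output()/map_output_to_input() pair:

    // Where does input-image coordinate (32,32) land in net.get_output()?
    dlib::dpoint p(32, 32);
    dlib::dpoint q = dlib::input_tensor_to_output_tensor(net, p);
    // And back: which input coordinate does that output cell correspond to?
    dlib::dpoint r = dlib::output_tensor_to_input_tensor(net, q);
    // With striding or pooling the mapping is many-to-one, so expect r to
    // land near p rather than exactly on it.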
diff --git a/ml/dlib/dlib/dnn/utilities_abstract.h b/ml/dlib/dlib/dnn/utilities_abstract.h
new file mode 100644
index 000000000..2a9a3d3fc
--- /dev/null
+++ b/ml/dlib/dlib/dnn/utilities_abstract.h
@@ -0,0 +1,127 @@
+// Copyright (C) 2016  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_DNn_UTILITIES_ABSTRACT_H_
+#ifdef DLIB_DNn_UTILITIES_ABSTRACT_H_
+
+#include "core_abstract.h"
+#include "../geometry/vector_abstract.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    double log1pexp(
+        double x
+    );
+    /*!
+        ensures
+            - returns log(1+exp(x))
+              (except computes it using a numerically accurate method)
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void randomize_parameters (
+        tensor& params,
+        unsigned long num_inputs_and_outputs,
+        dlib::rand& rnd
+    );
+    /*!
+        ensures
+            - This function assigns random values into params based on the given random
+              number generator.  In particular, it uses the parameter initialization
+              method of formula 16 from the paper "Understanding the difficulty of
+              training deep feedforward neural networks" by Xavier Glorot and Yoshua
+              Bengio.
+            - It is assumed that the total number of inputs and outputs from the layer
+              is num_inputs_and_outputs.  That is, you should set num_inputs_and_outputs
+              to the sum of the dimensionalities of the vectors going into and out of
+              the layer that uses params as its parameters.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename net_type>
+    void net_to_xml (
+        const net_type& net,
+        std::ostream& out
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+            - All layers in the net must provide to_xml() functions.
+        ensures
+            - Prints the given neural network object as an XML document to the given
+              output stream.
+    !*/
+
+    template <typename net_type>
+    void net_to_xml (
+        const net_type& net,
+        const std::string& filename
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+            - All layers in the net must provide to_xml() functions.
+        ensures
+            - This function is just like the above net_to_xml(), except it writes to a
+              file rather than an ostream.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename net_type>
+    dpoint input_tensor_to_output_tensor(
+        const net_type& net,
+        dpoint p
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_skip_layer, or add_tag_layer.
+            - All layers in the net must provide map_input_to_output() functions.
+        ensures
+            - Given a dpoint (i.e. a row,column coordinate) in the input tensor given to
+              net, this function returns the corresponding dpoint in the output tensor
+              net.get_output().  This kind of mapping is useful when working with fully
+              convolutional networks as you will often want to know what parts of the
+              output feature maps correspond to what parts of the input.
+            - If the network contains skip layers then any layers skipped over by the
+              skip layer are ignored for the purpose of computing this coordinate
+              mapping.  That is, if you walk the network from the output layer to the
+              input layer, where each time you encounter a skip layer you jump to the
+              layer indicated by the skip layer, you will visit exactly the layers in the
+              network involved in the input_tensor_to_output_tensor() calculation.  This
+              behavior is useful since it allows you to run some auxiliary DNN as a
+              separate branch of computation, independent of the main network's job of
+              running some kind of fully convolutional network over an image.  For
+              instance, you might want to have a branch in your network that computes
+              some global image level summarization/feature.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename net_type>
+    dpoint output_tensor_to_input_tensor(
+        const net_type& net,
+        dpoint p
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_skip_layer, or add_tag_layer.
+            - All layers in the net must provide map_output_to_input() functions.
+        ensures
+            - This function provides the reverse mapping of input_tensor_to_output_tensor().
+              That is, given a dpoint in net.get_output(), what is the corresponding
+              dpoint in the input tensor?
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_UTILITIES_ABSTRACT_H_
+
+
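The num_inputs_and_outputs convention of randomize_parameters() documented above is a frequent stumbling block, so a worked sketch may help; the tensor shape and fan-in/fan-out values are illustrative assumptions:

    // Glorot/Xavier-style init for a layer with fan-in 128 and fan-out 64,
    // i.e. num_inputs_and_outputs = 128 + 64 = 192.  Every parameter ends up
    // uniform in [-sqrt(6/192), +sqrt(6/192)], roughly [-0.177, +0.177].
    dlib::resizable_tensor params(1, 1, 128, 64);  // shape is illustrative
    dlib::rand rnd;
    dlib::randomize_parameters(params, 128 + 64, rnd);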
diff --git a/ml/dlib/dlib/dnn/validation.h b/ml/dlib/dlib/dnn/validation.h
new file mode 100644
index 000000000..c65cb4526
--- /dev/null
+++ b/ml/dlib/dlib/dnn/validation.h
@@ -0,0 +1,122 @@
+// Copyright (C) 2016  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_VALIDATION_H_
+#define DLIB_DNn_VALIDATION_H_
+
+#include "../svm/cross_validate_object_detection_trainer_abstract.h"
+#include "../svm/cross_validate_object_detection_trainer.h"
+#include "layers.h"
+#include <set>
+
+namespace dlib
+{
+    namespace impl
+    {
+        inline std::set<std::string> get_labels (
+            const std::vector<mmod_rect>& rects1,
+            const std::vector<mmod_rect>& rects2
+        )
+        {
+            std::set<std::string> labels;
+            for (auto& rr : rects1)
+                labels.insert(rr.label);
+            for (auto& rr : rects2)
+                labels.insert(rr.label);
+            return labels;
+        }
+    }
+
+    template <
+        typename SUBNET,
+        typename image_array_type
+    >
+    const matrix<double,1,3> test_object_detection_function (
+        loss_mmod<SUBNET>& detector,
+        const image_array_type& images,
+        const std::vector<std::vector<mmod_rect>>& truth_dets,
+        const test_box_overlap& overlap_tester = test_box_overlap(),
+        const double adjust_threshold = 0,
+        const test_box_overlap& overlaps_ignore_tester = test_box_overlap()
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_CASSERT( is_learning_problem(images,truth_dets) == true ,
+                    "\t matrix test_object_detection_function()"
+                    << "\n\t invalid inputs were given to this function"
+                    << "\n\t is_learning_problem(images,truth_dets): " << is_learning_problem(images,truth_dets)
+                    << "\n\t images.size(): " << images.size()
+        );
+
+        double correct_hits = 0;
+        double total_true_targets = 0;
+
+        std::vector<std::pair<double,bool>> all_dets;
+        unsigned long missing_detections = 0;
+
+        resizable_tensor temp;
+
+        for (unsigned long i = 0; i < images.size(); ++i)
+        {
+            std::vector<mmod_rect> hits;
+            detector.to_tensor(&images[i], &images[i]+1, temp);
+            detector.subnet().forward(temp);
+            detector.loss_details().to_label(temp, detector.subnet(), &hits, adjust_threshold);
+
+            for (auto& label : impl::get_labels(truth_dets[i], hits))
+            {
+                std::vector<full_object_detection> truth_boxes;
+                std::vector<rectangle> ignore;
+                std::vector<std::pair<double,rectangle>> boxes;
+                // copy hits and truth_dets into the above three objects
+                for (auto&& b : truth_dets[i])
+                {
+                    if (b.ignore)
+                    {
+                        ignore.push_back(b);
+                    }
+                    else if (b.label == label)
+                    {
+                        truth_boxes.push_back(full_object_detection(b.rect));
+                        ++total_true_targets;
+                    }
+                }
+                for (auto&& b : hits)
+                {
+                    if (b.label == label)
+                        boxes.push_back(std::make_pair(b.detection_confidence, b.rect));
+                }
+
+                correct_hits += impl::number_of_truth_hits(truth_boxes, ignore, boxes, overlap_tester, all_dets, missing_detections, overlaps_ignore_tester);
+            }
+        }
+
+        std::sort(all_dets.rbegin(), all_dets.rend());
+
+        double precision, recall;
+
+        double total_hits = all_dets.size();
+
+        if (total_hits == 0)
+            precision = 1;
+        else
+            precision = correct_hits / total_hits;
+
+        if (total_true_targets == 0)
+            recall = 1;
+        else
+            recall = correct_hits / total_true_targets;
+
+        matrix<double, 1, 3> res;
+        res = precision, recall, average_precision(all_dets, missing_detections);
+        return res;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_VALIDATION_H_
+
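test_object_detection_function() above packs its three scores into a 1x3 matrix. A usage sketch, assuming a trained loss_mmod network net and a matching held-out dataset testing_images/testing_boxes (the variable names are illustrative):

    // res(0): precision = correct detections / all reported detections
    // res(1): recall    = correct detections / all non-ignored truth boxes
    // res(2): average precision computed from the confidence-ranked detections
    dlib::matrix<double,1,3> res =
        dlib::test_object_detection_function(net, testing_images, testing_boxes);
    std::cout << "precision: " << res(0)
              << "  recall: "  << res(1)
              << "  AP: "      << res(2) << std::endl;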