Diffstat (limited to 'ml/dlib/dlib/dnn')
-rw-r--r--  ml/dlib/dlib/dnn/core.h                | 3599
-rw-r--r--  ml/dlib/dlib/dnn/core_abstract.h       | 1700
-rw-r--r--  ml/dlib/dlib/dnn/cpu_dlib.cpp          | 2170
-rw-r--r--  ml/dlib/dlib/dnn/cpu_dlib.h            | 505
-rw-r--r--  ml/dlib/dlib/dnn/cublas_dlibapi.cpp    | 165
-rw-r--r--  ml/dlib/dlib/dnn/cublas_dlibapi.h      | 50
-rw-r--r--  ml/dlib/dlib/dnn/cuda_data_ptr.cpp     | 71
-rw-r--r--  ml/dlib/dlib/dnn/cuda_data_ptr.h       | 184
-rw-r--r--  ml/dlib/dlib/dnn/cuda_dlib.cu          | 1630
-rw-r--r--  ml/dlib/dlib/dnn/cuda_dlib.h           | 469
-rw-r--r--  ml/dlib/dlib/dnn/cuda_errors.h         | 70
-rw-r--r--  ml/dlib/dlib/dnn/cuda_utils.h          | 413
-rw-r--r--  ml/dlib/dlib/dnn/cudnn_dlibapi.cpp     | 1604
-rw-r--r--  ml/dlib/dlib/dnn/cudnn_dlibapi.h       | 518
-rw-r--r--  ml/dlib/dlib/dnn/curand_dlibapi.cpp    | 113
-rw-r--r--  ml/dlib/dlib/dnn/curand_dlibapi.h      | 75
-rw-r--r--  ml/dlib/dlib/dnn/cusolver_dlibapi.cu   | 204
-rw-r--r--  ml/dlib/dlib/dnn/cusolver_dlibapi.h    | 75
-rw-r--r--  ml/dlib/dlib/dnn/gpu_data.cpp          | 228
-rw-r--r--  ml/dlib/dlib/dnn/gpu_data.h            | 266
-rw-r--r--  ml/dlib/dlib/dnn/gpu_data_abstract.h   | 266
-rw-r--r--  ml/dlib/dlib/dnn/input.h               | 808
-rw-r--r--  ml/dlib/dlib/dnn/input_abstract.h      | 467
-rw-r--r--  ml/dlib/dlib/dnn/layers.h              | 3244
-rw-r--r--  ml/dlib/dlib/dnn/layers_abstract.h     | 2631
-rw-r--r--  ml/dlib/dlib/dnn/loss.h                | 2870
-rw-r--r--  ml/dlib/dlib/dnn/loss_abstract.h       | 1542
-rw-r--r--  ml/dlib/dlib/dnn/solvers.h             | 405
-rw-r--r--  ml/dlib/dlib/dnn/solvers_abstract.h    | 204
-rw-r--r--  ml/dlib/dlib/dnn/tensor.h              | 686
-rw-r--r--  ml/dlib/dlib/dnn/tensor_abstract.h     | 727
-rw-r--r--  ml/dlib/dlib/dnn/tensor_tools.cpp      | 985
-rw-r--r--  ml/dlib/dlib/dnn/tensor_tools.h        | 1711
-rw-r--r--  ml/dlib/dlib/dnn/trainer.h             | 1333
-rw-r--r--  ml/dlib/dlib/dnn/trainer_abstract.h    | 765
-rw-r--r--  ml/dlib/dlib/dnn/utilities.h           | 281
-rw-r--r--  ml/dlib/dlib/dnn/utilities_abstract.h  | 127
-rw-r--r--  ml/dlib/dlib/dnn/validation.h          | 122
38 files changed, 33283 insertions, 0 deletions
diff --git a/ml/dlib/dlib/dnn/core.h b/ml/dlib/dlib/dnn/core.h
new file mode 100644
index 000000000..5f1d05498
--- /dev/null
+++ b/ml/dlib/dlib/dnn/core.h
@@ -0,0 +1,3599 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_CORE_H_
+#define DLIB_DNn_CORE_H_
+
+#include "core_abstract.h"
+#include "tensor.h"
+#include <iterator>
+#include <memory>
+#include <sstream>
+#include <type_traits>
+#include "../statistics.h"
+#include "../rand.h"
+#include "../algs.h"
+#include <utility>
+#include <tuple>
+#include <cmath>
+#include <vector>
+#include "tensor_tools.h"
+#include <type_traits>
+#include "../metaprogramming.h"
+
+#ifdef _MSC_VER
+// Tell Visual Studio not to recursively inline functions very much because otherwise it
+// takes hours to compile the DNN code sometimes. It's crazy. Hopefully we can remove
+// this some day when the visual studio compiler is more efficient.
+#pragma inline_depth(2)
+#endif
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ template <typename T, typename int_<decltype(&T::get_learning_rate_multiplier)>::type = 0>
+ double get_learning_rate_multiplier (
+ const T& obj,
+ special_
+ ) { return obj.get_learning_rate_multiplier(); }
+
+ template <typename T>
+ double get_learning_rate_multiplier ( const T& , general_) { return 1; }
+ }
+ template <typename T>
+ double get_learning_rate_multiplier(const T& obj) { return impl::get_learning_rate_multiplier(obj, special_()); }
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ template <typename T, typename int_<decltype(&T::get_weight_decay_multiplier)>::type = 0>
+ double get_weight_decay_multiplier (
+ const T& obj,
+ special_
+ ) { return obj.get_weight_decay_multiplier(); }
+
+ template <typename T>
+ double get_weight_decay_multiplier ( const T& , general_) { return 1; }
+ }
+ template <typename T>
+ double get_weight_decay_multiplier(const T& obj) { return impl::get_weight_decay_multiplier(obj, special_()); }
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+        // The reason we return an int for this version rather than doing the more straightforward thing (like we do above) is to avoid a bug in Visual Studio 2015.
+ template <typename T>
+ auto call_clean_method_if_exists (
+ T& obj,
+ special_
+ ) -> typename int_<decltype(&T::clean)>::type { obj.clean(); return 0; }
+
+ template <typename T>
+ void call_clean_method_if_exists (T& , general_) {}
+ }
+ template <typename T>
+ void call_clean_method_if_exists(T& obj) { impl::call_clean_method_if_exists(obj, special_()); }
+ /*!
+ ensures
+ - calls obj.clean() if obj has a .clean() method.
+ !*/
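The three helpers above (get_learning_rate_multiplier, get_weight_decay_multiplier, and call_clean_method_if_exists) all use the same tag-dispatch idiom: the overload taking a special_ argument is only viable when the member in question exists, and the general_ overload is the do-nothing fallback that wins otherwise. Below is a minimal, self-contained sketch of that idiom; it is illustrative only and defines its own stand-ins for the special_, general_, and int_ helpers that dlib provides elsewhere.

#include <iostream>

namespace sketch
{
    struct general_ {};                       // fallback tag
    struct special_ : general_ {};            // preferred tag, converts to general_
    template <typename> struct int_ { typedef int type; };

    struct with_clean    { void clean() { std::cout << "cleaned\n"; } };
    struct without_clean {};

    // Viable only when &T::clean is well formed; otherwise SFINAE removes it.
    template <typename T>
    auto maybe_clean(T& obj, special_) -> typename int_<decltype(&T::clean)>::type
    { obj.clean(); return 0; }

    template <typename T>
    void maybe_clean(T&, general_) {}         // fallback: do nothing
}

int main()
{
    sketch::with_clean a;
    sketch::without_clean b;
    sketch::maybe_clean(a, sketch::special_());   // prints "cleaned"
    sketch::maybe_clean(b, sketch::special_());   // quietly does nothing
}

Because special_ derives from general_, the detecting overload is preferred whenever SFINAE keeps it in the overload set.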
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ class repeat_input_layer
+ {
+ /*!
+ None of the declarations in this object are really used. The only reason it
+ exists is to allow the repeat object to use a special input layer in its
+ internal networks which will cause add_tag_layer objects that happen to be
+ right at the input to not create copies of their input tensors. So
+ introducing the repeat_input_layer object allows us to optimize the
+ implementation of add_tag_layer for a special case that arises when it's
+ used in the context of the repeat layer.
+ !*/
+ public:
+ typedef int input_type;
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ,
+ forward_iterator ,
+ resizable_tensor&
+ ) const
+ {
+ }
+
+ friend void serialize(const repeat_input_layer&, std::ostream&){}
+ friend void deserialize(repeat_input_layer&, std::istream&){}
+ friend std::ostream& operator<<(std::ostream& out, const repeat_input_layer&) { return out; }
+ };
+
+ inline std::string tensor_to_str (
+ const tensor& t,
+ int& min_length
+ )
+ {
+ if (t.size() == 0)
+ return "";
+
+ std::ostringstream sout;
+ sout << "output size=(num:"<< t.num_samples() << ", ";
+ sout << "k:" << t.k() << ",";
+ while (sout.tellp() < 28) sout << " ";
+ sout << "nr:" << t.nr() << ",";
+ while (sout.tellp() < 28+8) sout << " ";
+ sout << "nc:" << t.nc() << ")";
+ while (sout.tellp() < min_length) sout << " ";
+ min_length = sout.tellp();
+ sout << "\t";
+ return sout.str();
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ // Tell us if T is one of the special layer types (i.e. add_layer, repeat, add_tag_layer, or
+ // add_skip_layer).
+ template <typename T> struct is_nonloss_layer_type : std::false_type {};
+ // Tell us if T is an instance of add_loss_layer.
+ template <typename T> struct is_loss_layer_type : std::false_type {};
+ // Tell us if T is an instance of add_layer
+ template <typename T> struct is_add_layer : std::false_type {};
+
+ namespace impl
+ {
+ template <size_t... indices, typename Tuple>
+ auto tuple_subset(
+ const Tuple& item,
+ compile_time_integer_list<indices...>
+ ) -> decltype(std::make_tuple(std::get<indices>(item)...))
+ {
+ return std::make_tuple(std::get<indices>(item)...);
+ }
+
+ template <typename Head, typename... Tail>
+ std::tuple<Tail...> basic_tuple_tail(
+ const std::tuple<Head, Tail...>& item
+ )
+ {
+ return tuple_subset(item, typename make_compile_time_integer_range<sizeof...(Tail)>::type());
+ }
+
+ template <typename T>
+ std::tuple<T> tuple_flatten(const T& t)
+ {
+ return std::make_tuple(t);
+ }
+
+ template <typename... T>
+ auto tuple_flatten(
+ const std::tuple<T...>& item
+ ) -> decltype(tuple_flatten(item, typename make_compile_time_integer_range<sizeof...(T)>::type()))
+ {
+ return tuple_flatten(item, typename make_compile_time_integer_range<sizeof...(T)>::type());
+ }
+
+ template <size_t... indices, typename... T>
+ auto tuple_flatten(
+ const std::tuple<T...>& item,
+ compile_time_integer_list<indices...>
+ ) -> decltype(std::tuple_cat(tuple_flatten(std::get<indices-1>(item))...))
+ {
+ return std::tuple_cat(tuple_flatten(std::get<indices-1>(item))...);
+ }
+
+ template <typename T>
+ struct tuple_head_helper
+ {
+ typedef T type;
+ static const type& get(const T& item)
+ {
+ return item;
+ }
+ };
+
+ template <typename T, typename... U>
+ struct tuple_head_helper<std::tuple<T, U...>>
+ {
+ typedef typename tuple_head_helper<T>::type type;
+ static const type& get(const std::tuple<T,U...>& item)
+ {
+ return tuple_head_helper<T>::get(std::get<0>(item));
+ }
+ };
+
+ template <typename T> struct alwaysbool { typedef bool type; };
+ // one more structure for VS 2015 UP3 support workaround
+ template <typename T> struct alwaysbool2 { typedef bool type; };
+
+ resizable_tensor& rt();
+
+ // The significance of a layer's backward method requiring forward's outputs is
+ // that such as layer can't have an in-place layer stacked on top of it because
+ // in-place layers overwrite the output of the layer they sit on top of.
+ template <typename layer_type, typename SUBNET>
+ constexpr auto backward_requires_forward_output(
+ layer_type& layer,
+ SUBNET& sub
+ ) -> typename alwaysbool<decltype(layer.backward(rt(),rt(),sub,rt()))>::type
+ {
+ return true;
+ }
+
+ template <typename layer_type, typename SUBNET>
+ constexpr auto backward_requires_forward_output(
+ layer_type& layer,
+ SUBNET& sub
+ ) -> typename alwaysbool<decltype(layer.backward(rt(),sub,rt()))>::type
+ {
+ return false;
+ }
+
+ template <typename layer_type, typename SUBNET>
+ constexpr auto backward_requires_forward_output(
+ layer_type& layer,
+ SUBNET& sub
+ ) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),rt(),sub.get_gradient_input(),rt()))>::type
+ {
+ return true;
+ }
+
+ template <typename layer_type, typename SUBNET>
+ constexpr auto backward_requires_forward_output(
+ layer_type& layer,
+ SUBNET& sub
+ ) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),sub.get_gradient_input(),rt()))>::type
+ {
+ return false;
+ }
+
+ template <typename layer_type, typename SUBNET>
+ constexpr auto has_inplace_backward(
+ layer_type& layer,
+ SUBNET& sub
+ ) -> typename alwaysbool2<decltype(layer.backward(rt(),rt(),sub,rt()))>::type
+ {
+ return false;
+ }
+
+ template <typename layer_type, typename SUBNET>
+ constexpr auto has_inplace_backward(
+ layer_type& layer,
+ SUBNET& sub
+ ) -> typename alwaysbool2<decltype(layer.backward(rt(),sub,rt()))>::type
+ {
+ return false;
+ }
+
+ template <typename layer_type, typename SUBNET>
+ constexpr auto has_inplace_backward(
+ layer_type& layer,
+ SUBNET& sub
+ ) -> typename alwaysbool2<decltype(layer.backward_inplace(rt(),rt(),sub.get_gradient_input(),rt()))>::type
+ {
+ return true;
+ }
+
+ template <typename layer_type, typename SUBNET>
+ constexpr auto has_inplace_backward(
+ layer_type& layer,
+ SUBNET& sub
+ ) -> typename alwaysbool2<decltype(layer.backward_inplace(rt(),sub.get_gradient_input(),rt()))>::type
+ {
+ return true;
+ }
+
+ template <typename layer_type, typename SUBNET>
+ constexpr auto is_inplace_layer(
+ layer_type& layer,
+ const SUBNET& sub
+ ) -> typename alwaysbool2<decltype(layer.forward(sub,rt()))>::type
+ {
+ return false;
+ }
+
+ template <typename layer_type, typename SUBNET>
+ constexpr auto is_inplace_layer(
+ layer_type& layer,
+ const SUBNET& sub
+ ) -> typename alwaysbool<decltype(layer.forward_inplace(sub.get_output(),rt()))>::type
+ {
+ return true;
+ }
+
+ template <typename layer_type, typename SUBNET>
+ auto call_layer_backward(
+ layer_type& layer,
+ const tensor& computed_output,
+ const tensor& gradient_input,
+ SUBNET& sub,
+ tensor& params_grad
+ ) -> decltype(layer.backward(computed_output,gradient_input,sub,params_grad))
+ {
+ layer.backward(computed_output,gradient_input,sub,params_grad);
+ }
+
+ template <typename layer_type, typename SUBNET>
+ auto call_layer_backward(
+ layer_type& layer,
+ const tensor& ,
+ const tensor& gradient_input,
+ SUBNET& sub,
+ tensor& params_grad
+ ) -> decltype(layer.backward(gradient_input,sub,params_grad))
+ {
+ layer.backward(gradient_input,sub,params_grad);
+ }
+
+ template <typename layer_type, typename SUBNET>
+ auto call_layer_backward(
+ layer_type& layer,
+ const tensor& computed_output,
+ const tensor& gradient_input,
+ SUBNET& sub,
+ tensor& params_grad
+ ) -> decltype(layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad))
+ {
+ layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad);
+ }
+
+ template <typename layer_type, typename SUBNET>
+ auto call_layer_backward(
+ layer_type& layer,
+ const tensor& ,
+ const tensor& gradient_input,
+ SUBNET& sub,
+ tensor& params_grad
+ ) -> decltype(layer.backward_inplace(gradient_input,sub.get_gradient_input(),params_grad))
+ {
+ layer.backward_inplace(gradient_input,sub.get_gradient_input(),params_grad);
+ }
+
+
+ template <typename layer_type, typename SUBNET>
+ auto call_layer_forward(
+ layer_type& layer,
+ const SUBNET& sub,
+ tensor& /*data_output*/
+ ) -> decltype(layer.forward(sub,rt()))
+ {
+ // This overload of call_layer_forward() is here because this template
+ // naturally gets instantiated but only on code paths that never get executed.
+ // So rather than writing a bunch of hard to read template magic around call
+ // sites we just have this overload that doesn't do anything (and an assert to
+ // make sure that's the case).
+ DLIB_CASSERT(false, "This should never happen");
+ }
+
+ template <typename layer_type, typename SUBNET>
+ auto call_layer_forward(
+ layer_type& layer,
+ const SUBNET& sub,
+ resizable_tensor& data_output
+ ) -> decltype(layer.forward(sub,data_output))
+ {
+ layer.forward(sub,data_output);
+ }
+
+ template <typename layer_type, typename SUBNET>
+ auto call_layer_forward(
+ layer_type& layer,
+ const SUBNET& sub,
+ tensor& data_output
+ ) -> decltype(layer.forward_inplace(sub.get_output(),data_output))
+ {
+ layer.forward_inplace(sub.get_output(),data_output);
+ }
+
+ template <typename layer_type, typename SUBNET>
+ auto call_layer_forward(
+ layer_type& layer,
+ const SUBNET& sub,
+ resizable_tensor& data_output
+ ) -> decltype(layer.forward_inplace(sub.get_output(),data_output))
+ {
+ if (!have_same_dimensions(data_output, sub.get_output()))
+ data_output.copy_size(sub.get_output());
+ layer.forward_inplace(sub.get_output(),static_cast<tensor&>(data_output));
+ }
+
+
+ } // end namespace impl
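The call_layer_forward()/call_layer_backward() helpers above use expression SFINAE in their trailing return types: whichever member call compiles selects the overload, so a layer only has to provide either forward()/backward() or the *_inplace() variants. Here is a compact, self-contained sketch of the same dispatch with plain floats standing in for dlib tensors (illustrative only, not dlib code):

#include <iostream>

struct out_of_place_layer {
    template <typename SUB> void forward(const SUB&, float& out) { out = 1; }
};
struct in_place_layer {
    void forward_inplace(const float& in, float& out) { out = in * 2; }
};
struct subnet { float get_output() const { return 21; } };

// Selected when layer.forward(sub, out) is a valid expression.
template <typename L>
auto call_forward(L& layer, const subnet& sub, float& out)
    -> decltype(layer.forward(sub, out))
{ layer.forward(sub, out); }

// Selected when layer.forward_inplace(sub.get_output(), out) is valid.
template <typename L>
auto call_forward(L& layer, const subnet& sub, float& out)
    -> decltype(layer.forward_inplace(sub.get_output(), out))
{ layer.forward_inplace(sub.get_output(), out); }

int main()
{
    out_of_place_layer a;  in_place_layer b;  subnet s;  float out = 0;
    call_forward(a, s, out);  std::cout << out << "\n";   // 1
    call_forward(b, s, out);  std::cout << out << "\n";   // 42
}

For each layer type SFINAE removes the non-matching candidate, so exactly one overload remains and the call is unambiguous.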
+
+ template <typename... T>
+ typename impl::tuple_head_helper<std::tuple<T...>>::type tuple_head (
+ const std::tuple<T...>& item
+ )
+ {
+ return impl::tuple_head_helper<std::tuple<T...>>::get(item);
+ }
+
+ template <typename... T>
+ auto tuple_tail(
+ const std::tuple<T...>& item
+ ) -> decltype(impl::basic_tuple_tail(impl::tuple_flatten(item)))
+ {
+ return impl::basic_tuple_tail(impl::tuple_flatten(item));
+ }
+
+ inline std::tuple<> tuple_tail(
+ const std::tuple<>& item
+ )
+ {
+ return item;
+ }
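tuple_head() and tuple_tail() treat an arbitrarily nested tuple as one flat list: the head is the first non-tuple element and the tail is everything after it, flattened. The add_layer constructors below use this to peel one layer's constructor arguments off the front of a tuple and forward the rest to the subnetwork. A small usage sketch (assumes this header is included; illustrative only):

#include <cassert>
#include <tuple>

void tuple_head_tail_sketch()
{
    auto nested = std::make_tuple(std::make_tuple(1, 2), 3);

    // tuple_head() recurses into the nested tuple and returns the first scalar.
    assert(dlib::tuple_head(nested) == 1);

    // tuple_tail() flattens the tuple and drops that first element.
    auto tail = dlib::tuple_tail(nested);    // std::tuple<int,int>{2, 3}
    assert(std::get<0>(tail) == 2);
    assert(std::get<1>(tail) == 3);
}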
+// ----------------------------------------------------------------------------------------
+
+ template <typename T>
+ class sstack
+ {
+ public:
+ typedef T value_type;
+
+ sstack() = delete;
+
+ sstack (
+ T* data_,
+ size_t s
+ ) : data(data_), mysize(s) {}
+
+ const T& top() const
+ {
+ DLIB_CASSERT(size() != 0, "You can't call top() on an empty stack");
+ return *data;
+ }
+ T& top()
+ {
+ DLIB_CASSERT(size() != 0, "You can't call top() on an empty stack");
+ return *data;
+ }
+
+ size_t size() const { return mysize; }
+
+ sstack pop(size_t num=1)
+ {
+ DLIB_CASSERT(num <= size(), "You can't pop more things from the stack than it has in it.");
+ return sstack(data+num, mysize-num);
+ }
+
+ private:
+
+ T* data;
+ size_t mysize;
+ };
+
+ template <typename T>
+ sstack<T> make_sstack(std::vector<T>& item)
+ {
+ return sstack<T>(item.data(), item.size());
+ }
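sstack<> is a non-owning stack view over contiguous storage; the update_parameters() methods below use it to hand the current layer the top solver and pass the remainder down the network with pop(). A short usage sketch (assumes this header is included; illustrative only):

#include <cassert>
#include <vector>

void sstack_sketch()
{
    std::vector<int> solvers = {10, 20, 30};

    auto s = dlib::make_sstack(solvers);     // a view of the vector, no copy made
    assert(s.size() == 3);
    assert(s.top() == 10);                   // top() is the first element

    auto rest = s.pop();                     // pop() returns a smaller view
    assert(rest.size() == 2);
    assert(rest.top() == 20);
    assert(solvers.size() == 3);             // the underlying vector is untouched
}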
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ namespace dimpl
+ {
+ template <typename T, bool is_first = true, typename enabled=void>
+ class subnet_wrapper
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is a tool that makes an add_layer or add_loss_layer object
+ expose only the part of its interface defined by the SUBNET
+ type in layers_abstract.h. This way, when we pass subnetwork
+ objects to the layer callbacks those callbacks won't be able to
+ interact with the subnetworks in a way other than specified
+ by the SUBNET interface spec.
+
+ We also allow the top layer of a subnet_wrapper stack to call the
+ private_get_output() and private_get_gradient_input() functions. This
+ way, layers that have had their output/gradient overwritten by in-place
+ layers can only be accessed from the in-place layers that sit directly
+ on top of them since those in-place layers are the only layers that
+ know how to interact with them properly.
+ !*/
+
+ public:
+ subnet_wrapper(const subnet_wrapper&) = delete;
+ subnet_wrapper& operator=(const subnet_wrapper&) = delete;
+
+ subnet_wrapper(T& l_, unsigned int sef) : l(l_),_sample_expansion_factor(sef) {}
+ // Not much here because in this case T is one of the input layer types
+ // that doesn't have anything in it.
+ typedef T layer_details_type;
+ const layer_details_type& layer_details() const { return l; }
+ unsigned int sample_expansion_factor() const { return _sample_expansion_factor; }
+ private:
+ T& l;
+ unsigned int _sample_expansion_factor;
+ };
+
+ template <typename T>
+ class subnet_wrapper<T,true, typename std::enable_if<is_nonloss_layer_type<T>::value>::type>
+ {
+
+ public:
+ subnet_wrapper(const subnet_wrapper&) = delete;
+ subnet_wrapper& operator=(const subnet_wrapper&) = delete;
+
+ typedef T wrapped_type;
+ const static size_t num_computational_layers = T::num_computational_layers;
+ const static size_t num_layers = T::num_layers;
+ typedef typename T::layer_details_type layer_details_type;
+
+ subnet_wrapper(T& l_, unsigned int = 0) : l(l_),subnetwork(l.subnet(), l.sample_expansion_factor()) {}
+
+ const tensor& get_output() const { return l.private_get_output(); }
+ tensor& get_gradient_input() { return l.private_get_gradient_input(); }
+
+ const layer_details_type& layer_details() const { return l.layer_details(); }
+
+ const subnet_wrapper<typename T::subnet_type,false>& subnet() const { return subnetwork; }
+ subnet_wrapper<typename T::subnet_type,false>& subnet() { return subnetwork; }
+ unsigned int sample_expansion_factor() const { return l.sample_expansion_factor(); }
+
+ private:
+ T& l;
+ subnet_wrapper<typename T::subnet_type,false> subnetwork;
+ };
+
+ template <typename T>
+ class subnet_wrapper<T,false, typename std::enable_if<is_nonloss_layer_type<T>::value>::type>
+ {
+
+ public:
+ subnet_wrapper(const subnet_wrapper&) = delete;
+ subnet_wrapper& operator=(const subnet_wrapper&) = delete;
+
+ typedef T wrapped_type;
+ const static size_t num_computational_layers = T::num_computational_layers;
+ const static size_t num_layers = T::num_layers;
+ typedef typename T::layer_details_type layer_details_type;
+
+ subnet_wrapper(T& l_, unsigned int = 0) : l(l_),subnetwork(l.subnet(), l.sample_expansion_factor()) {}
+
+ const tensor& get_output() const { return l.get_output(); }
+ tensor& get_gradient_input() { return l.get_gradient_input(); }
+
+ const layer_details_type& layer_details() const { return l.layer_details(); }
+
+ const subnet_wrapper<typename T::subnet_type,false>& subnet() const { return subnetwork; }
+ subnet_wrapper<typename T::subnet_type,false>& subnet() { return subnetwork; }
+ unsigned int sample_expansion_factor() const { return l.sample_expansion_factor(); }
+
+ private:
+ T& l;
+ subnet_wrapper<typename T::subnet_type,false> subnetwork;
+ };
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename LAYER_DETAILS, typename SUBNET, typename enabled = void>
+ class add_layer;
+
+ template <typename LAYER_DETAILS, typename SUBNET, typename enabled>
+ void serialize(const add_layer<LAYER_DETAILS,SUBNET,enabled>& item, std::ostream& out);
+ template <typename LAYER_DETAILS, typename SUBNET, typename enabled>
+ void deserialize(add_layer<LAYER_DETAILS,SUBNET,enabled>& item, std::istream& in);
+
+ template <typename T, typename U>
+ struct is_nonloss_layer_type<add_layer<T,U>> : std::true_type {};
+
+ template <typename LAYER_DETAILS, typename SUBNET>
+ class add_layer<LAYER_DETAILS,SUBNET,
+ typename std::enable_if<is_nonloss_layer_type<SUBNET>::value>::type>
+ {
+ public:
+ typedef LAYER_DETAILS layer_details_type;
+ typedef SUBNET subnet_type;
+ typedef typename subnet_type::input_type input_type;
+ const static size_t num_layers = subnet_type::num_layers + 1;
+ const static size_t num_computational_layers = subnet_type::num_computational_layers + 1;
+
+ add_layer(
+ ):
+ subnetwork(new subnet_type()),
+ this_layer_setup_called(false),
+ gradient_input_is_stale(true),
+ get_output_and_gradient_input_disabled(false)
+ {
+ if (this_layer_operates_inplace())
+ subnetwork->disable_output_and_gradient_getters();
+ }
+
+ add_layer(const add_layer& item)
+ {
+ details = item.details;
+ subnetwork.reset(new subnet_type(*item.subnetwork));
+ this_layer_setup_called = item.this_layer_setup_called;
+ gradient_input_is_stale = item.gradient_input_is_stale;
+ get_output_and_gradient_input_disabled = item.get_output_and_gradient_input_disabled;
+ x_grad = item.x_grad;
+ cached_output = item.cached_output;
+ params_grad = item.params_grad;
+ temp_tensor = item.temp_tensor;
+ }
+ add_layer& operator=(const add_layer& item) { add_layer(item).swap(*this); return *this;}
+ add_layer(add_layer&& item) : add_layer() { swap(item); }
+ add_layer& operator=(add_layer&& item) { swap(item); return *this; }
+
+ template <typename T, typename U, typename E>
+ friend class add_layer;
+ template <typename T, bool is_first, typename E>
+ friend class dimpl::subnet_wrapper;
+ template <unsigned long T, typename U, typename E>
+ friend class add_tag_layer;
+ template <template<typename> class T, typename U>
+ friend class add_skip_layer;
+ template <size_t N, template<typename> class L, typename S>
+ friend class repeat;
+
+ // Allow copying networks from one to another as long as their corresponding
+ // layers can be constructed from each other.
+ template <typename T, typename U, typename E>
+ add_layer(
+ const add_layer<T,U,E>& item
+ ) :
+ details(item.layer_details()),
+ subnetwork(new subnet_type(item.subnet())),
+ this_layer_setup_called(item.this_layer_setup_called),
+ gradient_input_is_stale(item.gradient_input_is_stale),
+ get_output_and_gradient_input_disabled(item.get_output_and_gradient_input_disabled),
+ x_grad(item.x_grad),
+ cached_output(item.cached_output)
+ {
+ if (this_layer_operates_inplace())
+ subnetwork->disable_output_and_gradient_getters();
+ }
+
+ template <typename ...T>
+ add_layer(
+ const LAYER_DETAILS& layer_det,
+ T&& ...args
+ ) :
+ details(layer_det),
+ subnetwork(new subnet_type(std::forward<T>(args)...)),
+ this_layer_setup_called(false),
+ gradient_input_is_stale(true),
+ get_output_and_gradient_input_disabled(false)
+ {
+ if (this_layer_operates_inplace())
+ subnetwork->disable_output_and_gradient_getters();
+ }
+
+ template <typename T, typename ...U>
+ struct disable_forwarding_constr
+ {
+ const static bool value = std::is_constructible<LAYER_DETAILS,T>::value;
+ };
+ template <typename ...T, typename ...U>
+ struct disable_forwarding_constr<std::tuple<T...>,U...>
+ {
+ const static bool value = disable_forwarding_constr<typename std::remove_reference<T>::type...>::value;
+ };
+ template <typename T, typename ...U>
+ struct disable_forwarding_constr<std::tuple<T>,U...>
+ {
+ const static bool value = disable_forwarding_constr<typename std::remove_reference<T>::type>::value;
+ };
+ template <typename ...U>
+ struct disable_forwarding_constr<std::tuple<>,U...>
+ {
+ const static bool value = true;
+ };
+ template <typename ...T>
+ struct disable_forwarding_constr<add_layer<T...>>
+ {
+ const static bool value = true;
+ };
+
+ template <
+ typename ...T,
+ typename = typename std::enable_if<!disable_forwarding_constr<typename std::remove_reference<T>::type...>::value>::type
+ >
+ add_layer(
+ T&& ...args
+ ) :
+ subnetwork(new subnet_type(std::forward<T>(args)...)),
+ this_layer_setup_called(false),
+ gradient_input_is_stale(true),
+ get_output_and_gradient_input_disabled(false)
+ {
+ if (this_layer_operates_inplace())
+ subnetwork->disable_output_and_gradient_getters();
+ }
+
+ template <typename ...T>
+ add_layer(
+ LAYER_DETAILS&& layer_det,
+ T&& ...args
+ ) :
+ details(std::move(layer_det)),
+ subnetwork(new subnet_type(std::forward<T>(args)...)),
+ this_layer_setup_called(false),
+ gradient_input_is_stale(true),
+ get_output_and_gradient_input_disabled(false)
+ {
+ if (this_layer_operates_inplace())
+ subnetwork->disable_output_and_gradient_getters();
+ }
+
+ template <typename ...T, typename LD, typename ...U>
+ add_layer(
+ const std::tuple<LD,U...>& layer_det,
+ T&& ...args
+ ) :
+ details(tuple_head(layer_det)),
+ subnetwork(new subnet_type(tuple_tail(layer_det),std::forward<T>(args)...)),
+ this_layer_setup_called(false),
+ gradient_input_is_stale(true),
+ get_output_and_gradient_input_disabled(false)
+ {
+ if (this_layer_operates_inplace())
+ subnetwork->disable_output_and_gradient_getters();
+ }
+
+ template <typename ...T, typename LD, typename ...U>
+ add_layer(
+ std::tuple<>,
+ const std::tuple<LD,U...>& layer_det,
+ T&& ...args
+ ) : add_layer(layer_det,args...) { }
+
+ add_layer (
+ std::tuple<>
+ ) : add_layer() {}
+
+ template <typename ...T>
+ add_layer(
+ std::tuple<>,
+ LAYER_DETAILS&& layer_det,
+ T&& ...args
+ ) : add_layer(layer_det, args...) { }
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const
+ {
+ subnetwork->to_tensor(ibegin,iend,data);
+ }
+
+ template <typename forward_iterator>
+ const tensor& operator() (
+ forward_iterator ibegin,
+ forward_iterator iend
+ )
+ {
+ to_tensor(ibegin,iend,temp_tensor);
+ return forward(temp_tensor);
+ }
+
+
+ const tensor& operator() (const input_type& x)
+ {
+ return (*this)(&x, &x+1);
+ }
+
+ const tensor& forward(const tensor& x)
+ {
+ subnetwork->forward(x);
+ const dimpl::subnet_wrapper<subnet_type> wsub(*subnetwork);
+ if (!this_layer_setup_called)
+ {
+ details.setup(wsub);
+ this_layer_setup_called = true;
+ }
+ if (this_layer_operates_inplace())
+ impl::call_layer_forward(details, wsub, private_get_output());
+ else
+ impl::call_layer_forward(details, wsub, cached_output);
+
+ gradient_input_is_stale = true;
+ return private_get_output();
+ }
+
+ private:
+ tensor& private_get_output() const
+ {
+ if (const_cast<add_layer&>(*this).this_layer_operates_inplace())
+ return subnetwork->private_get_output();
+ else
+ return const_cast<resizable_tensor&>(cached_output);
+ }
+ tensor& private_get_gradient_input()
+ {
+ if (this_layer_operates_inplace())
+ {
+ return subnetwork->private_get_gradient_input();
+ }
+ else
+ {
+ if (gradient_input_is_stale)
+ {
+ gradient_input_is_stale = false;
+ x_grad.copy_size(private_get_output());
+ x_grad = 0;
+ }
+ return x_grad;
+ }
+ }
+ void disable_output_and_gradient_getters (
+ ) { get_output_and_gradient_input_disabled = true; }
+ public:
+ const tensor& get_output() const
+ {
+ if (get_output_and_gradient_input_disabled)
+ throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it.");
+ return private_get_output();
+ }
+ tensor& get_gradient_input()
+ {
+ if (get_output_and_gradient_input_disabled)
+ throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it.");
+ return private_get_gradient_input();
+ }
+
+ const tensor& get_final_data_gradient(
+ ) const { return subnetwork->get_final_data_gradient(); }
+
+ void back_propagate_error(const tensor& x)
+ {
+ back_propagate_error(x, private_get_gradient_input());
+ }
+ void back_propagate_error(const tensor& x, const tensor& gradient_input)
+ {
+ dimpl::subnet_wrapper<subnet_type> wsub(*subnetwork);
+ params_grad.copy_size(details.get_layer_params());
+ impl::call_layer_backward(details, private_get_output(),
+ gradient_input, wsub, static_cast<tensor&>(params_grad));
+
+ subnetwork->back_propagate_error(x);
+
+ // zero out get_gradient_input()
+ gradient_input_is_stale = true;
+ }
+
+ template <typename solver_type>
+ void update_parameters(sstack<solver_type> solvers, double learning_rate)
+ {
+ DLIB_CASSERT(solvers.size()>=num_computational_layers);
+ // Don't try to adjust the parameters if this layer doesn't have any or the
+ // learning rate is disabled for this layer.
+ if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0)
+ {
+ const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad));
+ tt::add(details.get_layer_params(), details.get_layer_params(), step);
+ }
+ subnetwork->update_parameters(solvers.pop(), learning_rate);
+ }
+
+ const tensor& get_parameter_gradient(
+ ) const { return params_grad; }
+
+ tensor& get_parameter_gradient (
+ ) { return params_grad; }
+
+ const subnet_type& subnet() const { return *subnetwork; }
+ subnet_type& subnet() { return *subnetwork; }
+
+ const layer_details_type& layer_details() const { return details; }
+ layer_details_type& layer_details() { return details; }
+
+ unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); }
+
+ void clean()
+ {
+ x_grad.clear();
+ cached_output.clear();
+ params_grad.clear();
+ temp_tensor.clear();
+ gradient_input_is_stale = true;
+ subnetwork->clean();
+ call_clean_method_if_exists(details);
+ }
+
+ friend void serialize(const add_layer& item, std::ostream& out)
+ {
+ int version = 2;
+ serialize(version, out);
+ serialize(*item.subnetwork, out);
+ serialize(item.details, out);
+ serialize(item.this_layer_setup_called, out);
+ serialize(item.gradient_input_is_stale, out);
+ serialize(item.get_output_and_gradient_input_disabled, out);
+ serialize(item.x_grad, out);
+ serialize(item.cached_output, out);
+ serialize(item.params_grad, out);
+ }
+
+ friend void deserialize(add_layer& item, std::istream& in)
+ {
+ int version = 0;
+ deserialize(version, in);
+ if (!(1 <= version && version <= 2))
+ throw serialization_error("Unexpected version found while deserializing dlib::add_layer.");
+ deserialize(*item.subnetwork, in);
+ deserialize(item.details, in);
+ deserialize(item.this_layer_setup_called, in);
+ deserialize(item.gradient_input_is_stale, in);
+ deserialize(item.get_output_and_gradient_input_disabled, in);
+ deserialize(item.x_grad, in);
+ deserialize(item.cached_output, in);
+ if (version == 2)
+ deserialize(item.params_grad, in);
+ }
+
+ friend std::ostream& operator<< (std::ostream& out, const add_layer& item)
+ {
+ int min_length = 0;
+ item.print(out, 0, min_length);
+ return out;
+ }
+
+ void print (std::ostream& out, unsigned long idx, int& min_length) const
+ {
+ out << "layer<" << idx << ">\t" << impl::tensor_to_str(private_get_output(), min_length) << layer_details() << "\n";
+ subnet().print(out, idx+1, min_length);
+ }
+
+ private:
+
+ bool this_layer_operates_inplace(
+ )
+ {
+ // This layer can run in-place if it's an in-place capable layer and also if
+ // the layer it's on top of doesn't need its own output tensor (since in-place
+ // layers overwrite that tensor)
+ return impl::is_inplace_layer(details, *subnetwork) && !subnetwork->this_layer_requires_forward_output();
+ }
+ bool this_layer_requires_forward_output(
+ )
+ {
+ return impl::backward_requires_forward_output(details, *subnetwork);
+ }
+
+ void swap(add_layer& item)
+ {
+ std::swap(subnetwork,item.subnetwork);
+ std::swap(details, item.details);
+ std::swap(this_layer_setup_called, item.this_layer_setup_called);
+ std::swap(gradient_input_is_stale, item.gradient_input_is_stale);
+ std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled);
+ std::swap(x_grad, item.x_grad);
+ std::swap(cached_output, item.cached_output);
+ std::swap(params_grad, item.params_grad);
+ }
+
+
+ LAYER_DETAILS details;
+ std::unique_ptr<subnet_type> subnetwork;
+ bool this_layer_setup_called;
+ bool gradient_input_is_stale;
+ bool get_output_and_gradient_input_disabled;
+ // Note that if this_layer_operates_inplace()==true then x_grad and cached_output
+ // are not used at all. Instead, this layer uses these variables from the lower
+ // layer.
+ resizable_tensor x_grad;
+ resizable_tensor cached_output;
+
+ resizable_tensor params_grad;
+
+ // temp_tensor doesn't logically contribute to the state of this object.
+ // It is here only to prevent it from being reallocated over and over.
+ resizable_tensor temp_tensor;
+
+ };
+
+ template <typename T, typename U, typename E>
+ struct is_add_layer<add_layer<T,U,E>> : std::true_type {};
+ template <typename T, typename U, typename E>
+ struct is_add_layer<const add_layer<T,U,E>> : std::true_type {};
+ template <typename T, typename U, typename E>
+ struct is_add_layer<add_layer<T,U,E>&> : std::true_type {};
+ template <typename T, typename U, typename E>
+ struct is_add_layer<const add_layer<T,U,E>&> : std::true_type {};
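Taken together, the add_layer interface above is what dnn_trainer drives during training: to_tensor(), forward(), back_propagate_error(), then update_parameters() with one solver per computational layer. The following is a hedged sketch of that cycle on a tiny hand-built network; it assumes upstream dlib's <dlib/dnn.h> (the concrete layers, input layers, and solvers live in the other files of this directory) and is a simplification, not the trainer's actual code.

#include <dlib/dnn.h>
#include <vector>

void training_step_sketch()
{
    // A toy 2-input network: fc(5) -> relu -> fc(1).
    using net_type = dlib::fc<1, dlib::relu<dlib::fc<5,
                     dlib::input<dlib::matrix<float,2,1>>>>>;
    net_type net;

    std::vector<dlib::matrix<float,2,1>> samples(8);
    for (auto& s : samples)
        s = 0;                                           // all-zero toy inputs

    dlib::resizable_tensor x;
    net.to_tensor(samples.begin(), samples.end(), x);    // input layer -> tensor
    net.forward(x);                                      // runs every layer

    // Pretend loss gradient; normally a loss layer writes get_gradient_input().
    net.get_gradient_input() = 1;
    net.back_propagate_error(x);                         // fills parameter gradients

    std::vector<dlib::sgd> solvers(net_type::num_computational_layers);
    net.update_parameters(dlib::make_sstack(solvers), 0.01);
}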
+
+// ----------------------------------------------------------------------------------------
+
+// This version of add_layer handles the special case where the subnetwork being given is
+// just an input layer object.
+ template <typename LAYER_DETAILS, typename INPUT_LAYER, typename enabled>
+ class add_layer
+ {
+ public:
+ typedef LAYER_DETAILS layer_details_type;
+ typedef INPUT_LAYER subnet_type;
+ typedef typename INPUT_LAYER::input_type input_type;
+ const static size_t num_layers = 2;
+ const static size_t num_computational_layers = 1;
+
+ add_layer(
+ ):
+ this_layer_setup_called(false),
+ gradient_input_is_stale(true),
+ get_output_and_gradient_input_disabled(false),
+ _sample_expansion_factor(0)
+ {}
+
+ add_layer(const add_layer&) = default;
+ add_layer(add_layer&& item) : add_layer() { swap(item); }
+ add_layer& operator=(const add_layer&) = default;
+ add_layer& operator=(add_layer&& item) { swap(item); return *this; }
+
+ template <typename T, typename U, typename E>
+ friend class add_layer;
+ template <typename T, bool is_first, typename E>
+ friend class dimpl::subnet_wrapper;
+ template <unsigned long T, typename U, typename E>
+ friend class add_tag_layer;
+ template <template<typename> class T, typename U>
+ friend class add_skip_layer;
+ template <size_t N, template<typename> class L, typename S>
+ friend class repeat;
+
+ // Allow copying networks from one to another as long as their corresponding
+ // layers can be constructed from each other.
+ template <typename T, typename U, typename E>
+ add_layer(
+ const add_layer<T,U,E>& item
+ ):
+ input_layer(item.subnet()),
+ details(item.layer_details()),
+ this_layer_setup_called(item.this_layer_setup_called),
+ gradient_input_is_stale(item.gradient_input_is_stale),
+ get_output_and_gradient_input_disabled(false),
+ _sample_expansion_factor(item._sample_expansion_factor),
+ x_grad(item.x_grad),
+ cached_output(item.cached_output),
+ grad_final(item.grad_final)
+ {
+ }
+
+ add_layer(
+ const LAYER_DETAILS& layer_det
+ ) :
+ details(layer_det),
+ this_layer_setup_called(false),
+ gradient_input_is_stale(true),
+ get_output_and_gradient_input_disabled(false),
+ _sample_expansion_factor(0)
+ {}
+
+ add_layer(
+ const INPUT_LAYER& il
+ ) :
+ input_layer(il),
+ this_layer_setup_called(false),
+ gradient_input_is_stale(true),
+ get_output_and_gradient_input_disabled(false),
+ _sample_expansion_factor(0)
+ {}
+
+ add_layer(
+ LAYER_DETAILS&& layer_det
+ ) :
+ details(std::move(layer_det)),
+ this_layer_setup_called(false),
+ gradient_input_is_stale(true),
+ get_output_and_gradient_input_disabled(false),
+ _sample_expansion_factor(0)
+ {}
+
+ add_layer(
+ LAYER_DETAILS layer_det,
+ INPUT_LAYER il
+ ) :
+ details(std::move(layer_det)),
+ input_layer(std::move(il)),
+ this_layer_setup_called(false),
+ gradient_input_is_stale(true),
+ get_output_and_gradient_input_disabled(false),
+ _sample_expansion_factor(0)
+ {}
+
+ add_layer(
+ std::tuple<>,
+ const LAYER_DETAILS& layer_det
+ ) : add_layer(layer_det) {}
+
+ add_layer(
+ std::tuple<>,
+ LAYER_DETAILS&& layer_det
+ ) : add_layer(layer_det) {}
+
+ add_layer(
+ std::tuple<>,
+ LAYER_DETAILS layer_det,
+ INPUT_LAYER il
+ ) : add_layer(layer_det,il) {}
+
+ add_layer(
+ const std::tuple<LAYER_DETAILS>& layer_det
+ ) : add_layer(tuple_head(layer_det)) {}
+
+ add_layer(
+ const std::tuple<LAYER_DETAILS>& layer_det,
+ INPUT_LAYER il
+ ) : add_layer(tuple_head(layer_det),il) {}
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const
+ {
+ input_layer.to_tensor(ibegin, iend, data);
+ // make sure the input layer's to_tensor() function is implemented properly.
+ DLIB_CASSERT(data.num_samples() >= std::distance(ibegin,iend),
+ "The input layer can't produce fewer output tensors than there are inputs.");
+ DLIB_CASSERT(data.num_samples()%std::distance(ibegin,iend) == 0,
+ "The number of tensors produced by the input layer must be an integer multiple of the number of input objects.");
+
+ _sample_expansion_factor = data.num_samples()/std::distance(ibegin,iend);
+ data.async_copy_to_device();
+ }
+
+
+ template <typename forward_iterator>
+ const tensor& operator() (
+ forward_iterator ibegin,
+ forward_iterator iend
+ )
+ {
+ to_tensor(ibegin,iend,temp_tensor);
+ return forward(temp_tensor);
+ }
+
+
+ const tensor& operator() (const input_type& x)
+ {
+ return (*this)(&x, &x+1);
+ }
+
+ const tensor& forward (const tensor& x)
+ {
+ DLIB_CASSERT(sample_expansion_factor() != 0, "You must call to_tensor() before this function can be used.");
+ DLIB_CASSERT(x.num_samples()%sample_expansion_factor() == 0);
+ subnet_wrapper wsub(x, grad_final, _sample_expansion_factor);
+ if (!this_layer_setup_called)
+ {
+ details.setup(wsub);
+ this_layer_setup_called = true;
+ }
+ impl::call_layer_forward(details, wsub, cached_output);
+ gradient_input_is_stale = true;
+ return private_get_output();
+ }
+
+ private:
+ tensor& private_get_output() const { return const_cast<resizable_tensor&>(cached_output); }
+ tensor& private_get_gradient_input()
+ {
+ if (gradient_input_is_stale)
+ {
+ gradient_input_is_stale = false;
+ x_grad.copy_size(private_get_output());
+ x_grad = 0;
+ }
+ return x_grad;
+ }
+ void disable_output_and_gradient_getters (
+ ) { get_output_and_gradient_input_disabled = true; }
+ public:
+ const tensor& get_output() const
+ {
+ if (get_output_and_gradient_input_disabled)
+ throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it.");
+ return private_get_output();
+ }
+ tensor& get_gradient_input()
+ {
+ if (get_output_and_gradient_input_disabled)
+ throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it.");
+ return private_get_gradient_input();
+ }
+
+ const tensor& get_final_data_gradient(
+ ) const { return grad_final; }
+
+ void back_propagate_error(const tensor& x)
+ {
+ back_propagate_error(x, private_get_gradient_input());
+ }
+ void back_propagate_error(const tensor& x, const tensor& gradient_input)
+ {
+ // make sure grad_final is initialized to 0
+ if (!have_same_dimensions(x, grad_final))
+ grad_final.copy_size(x);
+ grad_final = 0;
+
+ subnet_wrapper wsub(x, grad_final, _sample_expansion_factor);
+ params_grad.copy_size(details.get_layer_params());
+ impl::call_layer_backward(details, private_get_output(),
+ gradient_input, wsub, static_cast<tensor&>(params_grad));
+
+ // zero out get_gradient_input()
+ gradient_input_is_stale = true;
+ }
+
+ template <typename solver_type>
+ void update_parameters(sstack<solver_type> solvers, double learning_rate)
+ {
+ DLIB_CASSERT(solvers.size()>=num_computational_layers);
+ // Don't try to adjust the parameters if this layer doesn't have any or the
+ // learning rate is disabled for this layer.
+ if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0)
+ {
+ const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad));
+ tt::add(details.get_layer_params(), details.get_layer_params(), step);
+ }
+ }
+
+ const tensor& get_parameter_gradient(
+ ) const { return params_grad; }
+
+ tensor& get_parameter_gradient (
+ ) { return params_grad; }
+
+ const subnet_type& subnet() const { return input_layer; }
+ subnet_type& subnet() { return input_layer; }
+
+ const layer_details_type& layer_details() const { return details; }
+ layer_details_type& layer_details() { return details; }
+
+ unsigned int sample_expansion_factor() const { return _sample_expansion_factor; }
+
+ void clean()
+ {
+ x_grad.clear();
+ grad_final.clear();
+ cached_output.clear();
+ params_grad.clear();
+ temp_tensor.clear();
+ gradient_input_is_stale = true;
+ call_clean_method_if_exists(details);
+ }
+
+ friend void serialize(const add_layer& item, std::ostream& out)
+ {
+ int version = 3;
+ serialize(version, out);
+ serialize(item.input_layer, out);
+ serialize(item.details, out);
+ serialize(item.this_layer_setup_called, out);
+ serialize(item.gradient_input_is_stale, out);
+ serialize(item.get_output_and_gradient_input_disabled, out);
+ serialize(item.x_grad, out);
+ serialize(item.cached_output, out);
+ serialize(item.grad_final, out);
+ serialize(item._sample_expansion_factor, out);
+ }
+
+ friend void deserialize(add_layer& item, std::istream& in)
+ {
+ int version = 0;
+ deserialize(version, in);
+ if (!(2 <= version && version <= 3))
+ throw serialization_error("Unexpected version found while deserializing dlib::add_layer.");
+ deserialize(item.input_layer, in);
+ deserialize(item.details, in);
+ deserialize(item.this_layer_setup_called, in);
+ deserialize(item.gradient_input_is_stale, in);
+ deserialize(item.get_output_and_gradient_input_disabled, in);
+ deserialize(item.x_grad, in);
+ deserialize(item.cached_output, in);
+ deserialize(item.grad_final, in);
+ if (version >= 3)
+ deserialize(item._sample_expansion_factor, in);
+ else
+ item._sample_expansion_factor = 1; // all layer types set this to 1 in older dlib versions, so that's what we put here.
+ }
+
+ friend std::ostream& operator<< (std::ostream& out, const add_layer& item)
+ {
+ int min_length = 0;
+ item.print(out, 0, min_length);
+ return out;
+ }
+
+ void print (std::ostream& out, unsigned long idx, int& min_length) const
+ {
+ out << "layer<" << idx << ">\t" << impl::tensor_to_str(private_get_output(), min_length) << layer_details() << "\n";
+
+ // Don't print the repeat_input_layer since it doesn't exist from the user's
+ // point of view. It's just an artifact of how repeat<> works.
+ if (!std::is_same<subnet_type, impl::repeat_input_layer>::value)
+ out << "layer<" << idx+1 << ">\t" << subnet() << "\n";
+ }
+
+ private:
+
+ bool this_layer_requires_forward_output(
+ )
+ {
+ subnet_wrapper wsub(grad_final, grad_final, _sample_expansion_factor);
+ return impl::backward_requires_forward_output(details, wsub);
+ }
+
+ class subnet_wrapper
+ {
+ public:
+ subnet_wrapper(const tensor& x_, resizable_tensor& grad_final_, unsigned int sef) :
+ x(x_), grad_final(grad_final_), _sample_expansion_factor(sef) {}
+
+ subnet_wrapper(const subnet_wrapper&) = delete;
+ subnet_wrapper& operator=(const subnet_wrapper&) = delete;
+
+ unsigned int sample_expansion_factor() const { return _sample_expansion_factor;}
+ const tensor& get_output() const { return x; }
+ tensor& get_gradient_input()
+ {
+ if (!have_same_dimensions(x, grad_final))
+ {
+ grad_final.copy_size(x);
+ grad_final = 0;
+ }
+ return grad_final;
+ }
+
+ private:
+ const tensor& x;
+ resizable_tensor& grad_final;
+ unsigned int _sample_expansion_factor;
+ };
+
+ void swap(add_layer& item)
+ {
+ std::swap(input_layer, item.input_layer);
+ std::swap(details, item.details);
+ std::swap(this_layer_setup_called, item.this_layer_setup_called);
+ std::swap(gradient_input_is_stale, item.gradient_input_is_stale);
+ std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled);
+ std::swap(x_grad, item.x_grad);
+ std::swap(cached_output, item.cached_output);
+ std::swap(grad_final, item.grad_final);
+ std::swap(_sample_expansion_factor, item._sample_expansion_factor);
+ }
+
+ subnet_type input_layer;
+ LAYER_DETAILS details;
+ bool this_layer_setup_called;
+ bool gradient_input_is_stale;
+ bool get_output_and_gradient_input_disabled;
+ mutable unsigned int _sample_expansion_factor;
+ resizable_tensor x_grad;
+ resizable_tensor cached_output;
+ resizable_tensor grad_final;
+
+ // The following 2 objects don't logically contribute to the state of this class.
+ // They are only here to prevent them from being reallocated over and over in
+ // member functions.
+ resizable_tensor params_grad;
+ resizable_tensor temp_tensor;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <unsigned long ID, typename SUBNET, typename enabled=void>
+ class add_tag_layer;
+
+ template <template<typename SUBNET> class tag>
+ struct tag_id
+ {
+ const static unsigned long id = tag<impl::repeat_input_layer>::id;
+ };
+
+ template <unsigned long ID, typename SUBNET>
+ class add_tag_layer<ID,SUBNET,
+ typename std::enable_if<is_nonloss_layer_type<SUBNET>::value>::type>
+ {
+ public:
+ typedef SUBNET subnet_type;
+ typedef typename subnet_type::input_type input_type;
+ typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper.
+ const static size_t num_layers = subnet_type::num_layers + 1;
+ const static size_t num_computational_layers = subnet_type::num_computational_layers;
+ const static unsigned long id = ID;
+
+ add_tag_layer() {};
+ add_tag_layer(const add_tag_layer&) = default;
+ add_tag_layer(add_tag_layer&&) = default;
+ add_tag_layer& operator=(add_tag_layer&&) = default;
+ add_tag_layer& operator=(const add_tag_layer&) = default;
+
+ template <typename T>
+ add_tag_layer(
+ const add_tag_layer<ID,T>& item
+ ) : subnetwork(item.subnet())
+ {}
+
+ template <typename ...T>
+ add_tag_layer(
+ T ...args
+ ) :
+ subnetwork(std::move(args)...)
+ {
+ }
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const
+ {
+ subnetwork.to_tensor(ibegin,iend,data);
+ }
+
+ template <typename forward_iterator>
+ const tensor& operator() (
+ forward_iterator ibegin,
+ forward_iterator iend
+ )
+ {
+ return subnetwork(ibegin,iend);
+ }
+
+ const tensor& operator() (const input_type& x)
+ {
+ return subnetwork(x);
+ }
+
+ const tensor& forward(const tensor& x)
+ {
+ return subnetwork.forward(x);
+ }
+
+ const tensor& get_output() const { return subnetwork.get_output(); }
+
+ tensor& get_gradient_input()
+ {
+ return subnetwork.get_gradient_input();
+ }
+
+ const tensor& get_final_data_gradient(
+ ) const { return subnetwork.get_final_data_gradient(); }
+
+ void back_propagate_error(const tensor& x)
+ {
+ subnetwork.back_propagate_error(x);
+ }
+ void back_propagate_error(const tensor& x, const tensor& gradient_input)
+ {
+ subnetwork.back_propagate_error(x,gradient_input);
+ }
+
+ template <typename solver_type>
+ void update_parameters(sstack<solver_type> solvers, double learning_rate)
+ {
+ subnetwork.update_parameters(solvers, learning_rate);
+ }
+
+ const tensor& get_parameter_gradient(
+ ) const { return params_grad; }
+
+ tensor& get_parameter_gradient (
+ ) { return params_grad; }
+
+ const subnet_type& subnet() const { return subnetwork; }
+ subnet_type& subnet() { return subnetwork; }
+
+ unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); }
+
+ void clean()
+ {
+ subnetwork.clean();
+ }
+
+ friend void serialize(const add_tag_layer& item, std::ostream& out)
+ {
+ int version = 1;
+ serialize(version, out);
+ serialize(item.subnetwork, out);
+ }
+
+ friend void deserialize(add_tag_layer& item, std::istream& in)
+ {
+ int version = 0;
+ deserialize(version, in);
+ if (version != 1)
+ throw serialization_error("Unexpected version found while deserializing dlib::add_tag_layer.");
+ deserialize(item.subnetwork, in);
+ }
+
+ friend std::ostream& operator<< (std::ostream& out, const add_tag_layer& item)
+ {
+ int min_length = 0;
+ item.print(out, 0, min_length);
+ return out;
+ }
+
+ void print (std::ostream& out, unsigned long idx, int& min_length) const
+ {
+ out << "layer<" << idx << ">\t" << impl::tensor_to_str(private_get_output(), min_length) << "tag" << ID << "\n";
+ subnet().print(out, idx+1, min_length);
+ }
+
+ private:
+
+ template <typename T, typename U, typename E>
+ friend class add_layer;
+ template <typename T, bool is_first, typename E>
+ friend class dimpl::subnet_wrapper;
+ template <unsigned long T, typename U, typename E>
+ friend class add_tag_layer;
+ template <template<typename> class T, typename U>
+ friend class add_skip_layer;
+ template <size_t N, template<typename> class L, typename S>
+ friend class repeat;
+
+ // You wouldn't put a tag on a layer if you didn't want to access its forward
+ // outputs. So this is always true.
+ bool this_layer_requires_forward_output(
+ ) { return true; }
+
+ void disable_output_and_gradient_getters (
+ )
+ {
+ // This should never happen because only inplace layers call
+            // disable_output_and_gradient_getters().  However, putting a tag layer right
+ // before an inplace layer basically means you don't want the following layer
+ // to operate in place. So the inplace layer should turn itself into an
+ // out-of-place layer and not call disable_output_and_gradient_getters().
+ DLIB_CASSERT(false,"This should never happen");
+ }
+
+ tensor& private_get_output() const
+ { return subnetwork.private_get_output(); }
+ tensor& private_get_gradient_input()
+ { return subnetwork.private_get_gradient_input(); }
+
+ subnet_type subnetwork;
+
+ // This member doesn't logically contribute to the state of the object since it is
+ // always empty. It's just here so we can have the get_parameter_gradient() methods
+ // which have to return something. So they return this empty tensor.
+ resizable_tensor params_grad;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename ...T>
+ struct decorator_repeat_group
+ {
+ decorator_repeat_group(
+ T&& ...args
+ ) : data(std::forward<T>(args)...) {}
+
+ std::tuple<T...> data;
+ };
+ template <typename ...T>
+ decorator_repeat_group<T...> repeat_group (
+ T&& ...args
+ )
+ {
+ return decorator_repeat_group<T...>(std::forward<T>(args)...);
+ }
+
+ template <
+ size_t num,
+ template<typename> class REPEATED_LAYER,
+ typename SUBNET
+ >
+ class repeat
+ {
+ static_assert(num > 0, "You can't have a layer repeated 0 times.");
+ public:
+ typedef SUBNET subnet_type;
+ typedef typename SUBNET::input_type input_type;
+ typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper.
+ const static size_t comp_layers_in_each_group = (REPEATED_LAYER<SUBNET>::num_computational_layers-SUBNET::num_computational_layers);
+ const static size_t comp_layers_in_repeated_group = comp_layers_in_each_group*num;
+ const static size_t num_computational_layers = comp_layers_in_repeated_group + SUBNET::num_computational_layers;
+
+ const static size_t layers_in_each_group = (REPEATED_LAYER<SUBNET>::num_layers-SUBNET::num_layers);
+ const static size_t layers_in_repeated_group = layers_in_each_group*num;
+ const static size_t num_layers = subnet_type::num_layers + layers_in_repeated_group;
+
+
+ typedef REPEATED_LAYER<impl::repeat_input_layer> repeated_layer_type;
+
+ repeat(
+ ) :
+ details(num)
+ {
+ }
+
+ size_t num_repetitions (
+ ) const { return num; }
+
+ const repeated_layer_type& get_repeated_layer (
+ size_t i
+ ) const
+ {
+ DLIB_CASSERT(i < num_repetitions());
+ return details[i];
+ }
+
+ repeated_layer_type& get_repeated_layer (
+ size_t i
+ )
+ {
+ DLIB_CASSERT(i < num_repetitions());
+ return details[i];
+ }
+
+ repeat(const repeat&) = default;
+ repeat(repeat&&) = default;
+ repeat& operator=(repeat&&) = default;
+ repeat& operator=(const repeat&) = default;
+
+ template <template<typename> class T, typename U>
+ repeat(
+ const repeat<num,T,U>& item
+ ) :
+ subnetwork(item.subnetwork)
+ {
+ for (auto&& d : item.details)
+ details.emplace_back(d);
+ }
+
+ template <typename T, typename ...U>
+ repeat(
+ T arg1,
+ U ...args2
+ ):
+ details(num, std::move(arg1)),
+ subnetwork(std::move(args2)...)
+ {
+ }
+
+ template <typename ...T, typename ...U>
+ repeat(
+ decorator_repeat_group<T...>&& arg1,
+ U ...args2
+ ):
+ details(num, arg1.data),
+ subnetwork(std::move(args2)...)
+ {
+ }
+
+ template <typename T, typename ...U>
+ repeat(
+ std::tuple<>,
+ T arg1,
+ U ...args2
+ ):
+ details(num, std::move(arg1)),
+ subnetwork(std::move(args2)...)
+ {
+ }
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const
+ {
+ subnetwork.to_tensor(ibegin,iend,data);
+ // call to_tensor on the networks in details just to populate the
+ // _sample_expansion_factor values in those networks. Other than that this
+ // call is a noop.
+ for (auto& d : details)
+ d.to_tensor(ibegin, iend, data);
+ }
+
+ template <typename forward_iterator>
+ const tensor& operator() (
+ forward_iterator ibegin,
+ forward_iterator iend
+ )
+ {
+ to_tensor(ibegin,iend,temp_tensor);
+ return forward(temp_tensor);
+ }
+
+ const tensor& operator() (const input_type& x)
+ {
+ return (*this)(&x, &x+1);
+ }
+
+ const tensor& forward(const tensor& x)
+ {
+ subnetwork.forward(x);
+ details[details.size()-1].forward(subnetwork.get_output());
+ for (long i = details.size()-2; i >= 0; --i)
+ details[i].forward(details[i+1].get_output());
+ return private_get_output();
+ }
+
+ private:
+ tensor& private_get_output() const
+ {
+ return details[0].private_get_output();
+ }
+ tensor& private_get_gradient_input()
+ {
+ return details[0].private_get_gradient_input();
+ }
+ public:
+ const tensor& get_output() const
+ {
+ return details[0].get_output();
+ }
+ tensor& get_gradient_input()
+ {
+ return details[0].get_gradient_input();
+ }
+
+ const tensor& get_parameter_gradient(
+ ) const { return details[0].get_parameter_gradient(); }
+
+ tensor& get_parameter_gradient (
+ ) { return details[0].get_parameter_gradient(); }
+
+ void back_propagate_error(const tensor& x)
+ {
+ back_propagate_error(x, private_get_gradient_input());
+ }
+ void back_propagate_error(const tensor& x, const tensor& gradient_input)
+ {
+ if (details.size() > 1)
+ {
+ details[0].back_propagate_error(details[1].get_output(), gradient_input);
+ for (size_t i = 1; i < details.size(); ++i)
+ {
+ if (i+1 < details.size())
+ details[i].back_propagate_error(details[i+1].get_output(), details[i-1].get_final_data_gradient());
+ else
+ details[i].back_propagate_error(subnetwork.get_output(), details[i-1].get_final_data_gradient());
+ }
+ }
+ else
+ {
+ details[0].back_propagate_error(subnetwork.get_output(), gradient_input);
+ }
+ subnetwork.back_propagate_error(x, details.back().get_final_data_gradient());
+ }
+
+ template <typename solver_type>
+ void update_parameters(sstack<solver_type> solvers, double learning_rate)
+ {
+ for (size_t i = 0; i < details.size(); ++i)
+ details[i].update_parameters(solvers.pop(comp_layers_in_each_group*i),learning_rate);
+ subnetwork.update_parameters(solvers.pop(comp_layers_in_each_group*details.size()),learning_rate);
+ }
+
+ const subnet_type& subnet() const { return subnetwork; }
+ subnet_type& subnet() { return subnetwork; }
+
+ unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); }
+
+ void clean()
+ {
+ temp_tensor.clear();
+ subnetwork.clean();
+ for (auto&& d : details)
+ d.clean();
+ }
+
+ friend void serialize(const repeat& item, std::ostream& out)
+ {
+ int version = 1;
+ serialize(version, out);
+ serialize(item.details, out);
+ serialize(item.subnetwork, out);
+ }
+
+ friend void deserialize(repeat& item, std::istream& in)
+ {
+ int version = 0;
+ deserialize(version, in);
+ if (version != 1)
+ throw serialization_error("Unexpected version found while deserializing dlib::repeat.");
+ deserialize(item.details, in);
+ deserialize(item.subnetwork, in);
+ }
+
+ friend std::ostream& operator<< (std::ostream& out, const repeat& item)
+ {
+ int min_length = 0;
+ item.print(out, 0, min_length);
+ return out;
+ }
+
+ void print (std::ostream& out, unsigned long idx, int& min_length) const
+ {
+ for (size_t i = 0; i < num_repetitions(); ++i)
+ {
+ get_repeated_layer(i).print(out, idx, min_length);
+ idx += layers_in_each_group;
+ }
+ subnet().print(out, idx, min_length);
+ }
+ private:
+
+
+ template <typename T, typename U, typename E>
+ friend class add_layer;
+ template <typename T, bool is_first, typename E>
+ friend class dimpl::subnet_wrapper;
+ template <unsigned long T, typename U, typename E>
+ friend class add_tag_layer;
+ template <template<typename> class T, typename U>
+ friend class add_skip_layer;
+ template <size_t N, template<typename> class L, typename S>
+ friend class repeat;
+
+ bool this_layer_requires_forward_output(
+ )
+ {
+ return details[0].this_layer_requires_forward_output();
+ }
+
+ void disable_output_and_gradient_getters (
+ )
+ {
+ details[0].disable_output_and_gradient_getters();
+ }
+
+
+ std::vector<repeated_layer_type> details;
+ subnet_type subnetwork;
+
+ // temp_tensor doesn't logically contribute to the state of this class.
+        // It is here only to avoid needing to reallocate it over and over.
+ resizable_tensor temp_tensor;
+ };
+
+ template <
+ size_t num,
+ template<typename> class REPEATED_LAYER,
+ typename SUBNET
+ >
+ struct is_nonloss_layer_type<repeat<num,REPEATED_LAYER,SUBNET>> : std::true_type {};
+
+// ----------------------------------------------------------------------------------------
+
+// This version of add_tag_layer handles the special case where the subnetwork being given
+// is just an input layer object.
+ template <unsigned long ID, typename INPUT_LAYER, typename enabled>
+ class add_tag_layer
+ {
+ public:
+ typedef INPUT_LAYER subnet_type;
+ typedef typename subnet_type::input_type input_type;
+ typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper.
+ const static size_t num_computational_layers = 0;
+ const static size_t num_layers = 2;
+ const static unsigned long id = ID;
+
+ add_tag_layer():cached_output_ptr(nullptr),gradient_input_is_stale(true),_sample_expansion_factor(0) {}
+
+ add_tag_layer(const add_tag_layer&) = default;
+ add_tag_layer& operator=(const add_tag_layer&) = default;
+ add_tag_layer(add_tag_layer&& item) : add_tag_layer() { swap(item); }
+ add_tag_layer& operator=(add_tag_layer&& item) { swap(item); return *this; }
+
+ template <typename T, typename E>
+ add_tag_layer(
+ const add_tag_layer<ID,T,E>& item
+ ) : input_layer(item.subnet()),
+ cached_output(item.cached_output),
+ cached_output_ptr(nullptr),
+ grad_final(item.grad_final),
+ gradient_input_is_stale(item.gradient_input_is_stale),
+ _sample_expansion_factor(0)
+ {}
+
+ template <typename ...T>
+ add_tag_layer(
+ T ...args
+ ) :
+ input_layer(std::move(args)...),
+ cached_output_ptr(nullptr),
+ gradient_input_is_stale(true),
+ _sample_expansion_factor(0)
+ {
+ }
+
+ add_tag_layer (
+ std::tuple<>
+ ) :
+ cached_output_ptr(nullptr),
+ gradient_input_is_stale(true),
+ _sample_expansion_factor(0)
+ {}
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const
+ {
+ input_layer.to_tensor(ibegin,iend,data);
+
+ // make sure the input layer's to_tensor() function is implemented properly.
+ DLIB_CASSERT(data.num_samples() >= std::distance(ibegin,iend),
+ "The input layer can't produce fewer output tensors than there are inputs.");
+ DLIB_CASSERT(data.num_samples()%std::distance(ibegin,iend) == 0,
+ "The number of tensors produced by the input layer must be an integer multiple of the number of input objects.");
+
+ _sample_expansion_factor = data.num_samples()/std::distance(ibegin,iend);
+ data.async_copy_to_device();
+ }
+
+ unsigned int sample_expansion_factor() const { return _sample_expansion_factor; }
+
+ template <typename forward_iterator>
+ const tensor& operator() (
+ forward_iterator ibegin,
+ forward_iterator iend
+ )
+ {
+ input_layer.to_tensor(ibegin,iend,cached_output);
+ cached_output_ptr = nullptr;
+ return get_output();
+ }
+
+ const tensor& operator() (const input_type& x)
+ {
+ return (*this)(&x, &x+1);
+ }
+
+ const tensor& forward(const tensor& x)
+ {
+            // If this tag is the first layer in one of the subnetworks inside a repeat
+            // layer then we don't want it to be creating copies of x.  Instead, we can
+            // just hold a pointer to x since the way repeat is constructed guarantees
+            // that x will outlive this pointer.
+ if (is_same_type<INPUT_LAYER, impl::repeat_input_layer>::value)
+ cached_output_ptr = const_cast<tensor*>(&x);
+ else
+ cached_output = x;
+ gradient_input_is_stale = true;
+ return get_output();
+ }
+
+ const tensor& get_output() const
+ {
+ if (cached_output_ptr)
+ return *cached_output_ptr;
+ else
+ return cached_output;
+ }
+
+ const tensor& get_final_data_gradient(
+ ) const { return grad_final; }
+
+ tensor& get_gradient_input()
+ {
+ if (!have_same_dimensions(get_output(), grad_final) ||
+ gradient_input_is_stale)
+ {
+ grad_final.copy_size(get_output());
+ grad_final = 0;
+ gradient_input_is_stale = false;
+ }
+ return grad_final;
+ }
+
+ void back_propagate_error(const tensor& /*x*/)
+ {
+ // nothing to do
+ }
+ void back_propagate_error(const tensor& /*x*/, const tensor& /*gradient_input*/)
+ {
+ // nothing to do
+ }
+
+ template <typename solver_type>
+ void update_parameters(sstack<solver_type> /*solvers*/, double /*learning_rate*/)
+ {
+ // nothing to do
+ }
+
+ const subnet_type& subnet() const { return input_layer; }
+ subnet_type& subnet() { return input_layer; }
+
+ void clean()
+ {
+ grad_final.clear();
+ cached_output.clear();
+ cached_output_ptr = 0;
+ }
+
+ friend void serialize(const add_tag_layer& item, std::ostream& out)
+ {
+ int version = 2;
+ serialize(version, out);
+ serialize(item.input_layer, out);
+ serialize(item.cached_output, out);
+ serialize(item.grad_final, out);
+ serialize(item.gradient_input_is_stale, out);
+ serialize(item._sample_expansion_factor, out);
+ }
+
+ friend void deserialize(add_tag_layer& item, std::istream& in)
+ {
+ int version = 0;
+ deserialize(version, in);
+ if (!(1 <= version && version <= 2))
+ throw serialization_error("Unexpected version found while deserializing dlib::add_tag_layer.");
+ deserialize(item.input_layer, in);
+ deserialize(item.cached_output, in);
+ deserialize(item.grad_final, in);
+ deserialize(item.gradient_input_is_stale, in);
+ item.cached_output_ptr = nullptr;
+ if (version >= 2)
+ deserialize(item._sample_expansion_factor, in);
+ else
+ item._sample_expansion_factor = 1; // all layer types set this to 1 in older dlib versions, so that's what we put here.
+
+ }
+
+ friend std::ostream& operator<< (std::ostream& out, const add_tag_layer& item)
+ {
+ int min_length = 0;
+ item.print(out, 0, min_length);
+ return out;
+ }
+
+ void print (std::ostream& out, unsigned long idx, int& min_length) const
+ {
+ out << "layer<"<<idx << ">\t"<<impl::tensor_to_str(private_get_output(), min_length)<< "tag" << ID << "\n";
+ // Don't print the repeat_input_layer since it doesn't exist from the user's
+ // point of view. It's just an artifact of how repeat<> works.
+ if (!std::is_same<subnet_type, impl::repeat_input_layer>::value)
+ out << "layer<"<< idx+1 << ">\t" << subnet() << "\n";
+ }
+
+ private:
+
+ template <typename T, typename U, typename E>
+ friend class add_layer;
+ template <typename T, bool is_first, typename E>
+ friend class dimpl::subnet_wrapper;
+ template <unsigned long T, typename U, typename E>
+ friend class add_tag_layer;
+ template <template<typename> class T, typename U>
+ friend class add_skip_layer;
+ template <size_t N, template<typename> class L, typename S>
+ friend class repeat;
+
+        // You wouldn't put a tag on a layer if you didn't want to access its forward
+ // outputs. So this is always true.
+ bool this_layer_requires_forward_output(
+ ) { return true; }
+
+ void disable_output_and_gradient_getters (
+ )
+ {
+            // This should never happen because only in-place layers call
+            // disable_output_and_gradient_getters().  However, putting a tag layer right
+            // before an in-place layer means you don't want the following layer to
+            // operate in place.  So that layer should turn itself into an out-of-place
+            // layer and not call disable_output_and_gradient_getters().
+ DLIB_CASSERT(false,"This should never happen");
+ }
+
+ tensor& private_get_output() const
+ { return const_cast<tensor&>(get_output()); }
+ tensor& private_get_gradient_input()
+ { return get_gradient_input(); }
+
+ void swap(add_tag_layer& item)
+ {
+ std::swap(input_layer, item.input_layer);
+ std::swap(cached_output, item.cached_output);
+ std::swap(cached_output_ptr, item.cached_output_ptr);
+ std::swap(grad_final, item.grad_final);
+ std::swap(gradient_input_is_stale, item.gradient_input_is_stale);
+ std::swap(_sample_expansion_factor, item._sample_expansion_factor);
+ }
+
+ subnet_type input_layer;
+ resizable_tensor cached_output;
+ tensor* cached_output_ptr;
+ resizable_tensor grad_final;
+ bool gradient_input_is_stale;
+ mutable unsigned int _sample_expansion_factor;
+ };
+
+ template <unsigned long ID, typename U, typename E>
+ struct is_nonloss_layer_type<add_tag_layer<ID,U,E>> : std::true_type {};
+
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ template <typename LOSS_DETAILS, typename SUBNET>
+ class add_loss_layer;
+
+ class no_label_type
+ {
+ private:
+        // We don't want anyone making these no_label_type objects.  They are here only to
+        // allow add_loss_layer::training_label_type and dnn_trainer::training_label_type
+        // to exist, which avoids needing to overload add_loss_layer and dnn_trainer for
+        // supervised and unsupervised losses.  It can also be used as a type in template
+        // metaprogramming to indicate "no label".  So here we make the constructor private,
+        // with the exception that add_loss_layer objects can create them (again, just to
+        // simplify add_loss_layer's implementation).
+ no_label_type(){};
+ template <typename LOSS_DETAILS, typename SUBNET> friend class add_loss_layer;
+ template < typename net_type, typename solver_type > friend class dnn_trainer;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename LOSS_DETAILS, typename SUBNET>
+ class add_loss_layer
+ {
+ template <typename T, typename enabled=void>
+ struct get_loss_layer_training_label_type
+ {
+ typedef no_label_type type;
+ };
+ template <typename T>
+ struct get_loss_layer_training_label_type<T,typename std::enable_if<sizeof(typename T::training_label_type)!=0>::type>
+ {
+ typedef typename T::training_label_type type;
+ };
+
+ template <typename T, typename enabled=void>
+ struct get_loss_layer_output_label_type
+ {
+ typedef no_label_type type;
+ };
+ template <typename T>
+ struct get_loss_layer_output_label_type<T,typename std::enable_if<sizeof(typename T::output_label_type)!=0>::type>
+ {
+ typedef typename T::output_label_type type;
+ };
+
+ public:
+ typedef LOSS_DETAILS loss_details_type;
+ typedef SUBNET subnet_type;
+ typedef typename subnet_type::input_type input_type;
+ const static size_t num_layers = subnet_type::num_layers + 1;
+ // Note that the loss layer doesn't count as an additional computational layer.
+ const static size_t num_computational_layers = subnet_type::num_computational_layers;
+ typedef typename get_loss_layer_training_label_type<LOSS_DETAILS>::type training_label_type;
+ typedef typename get_loss_layer_output_label_type<LOSS_DETAILS>::type output_label_type;
+
+ static_assert(is_nonloss_layer_type<SUBNET>::value,
+ "SUBNET must be of type add_layer, add_skip_layer, or add_tag_layer.");
+
+
+ add_loss_layer() {};
+ add_loss_layer(const add_loss_layer&) = default;
+ add_loss_layer& operator=(const add_loss_layer&) = default;
+ add_loss_layer(add_loss_layer&& item) : add_loss_layer() { swap(item); }
+ add_loss_layer& operator=(add_loss_layer&& item) { swap(item); return *this; }
+
+ template <typename T, typename U>
+ add_loss_layer(
+ const add_loss_layer<T,U>& item
+ ) :
+ loss(item.loss_details()),
+ subnetwork(item.subnet())
+ {}
+
+ template <typename ...T>
+ add_loss_layer(
+ const LOSS_DETAILS& layer_det,
+ T&& ...args
+ ) :
+ loss(layer_det),
+ subnetwork(std::forward<T>(args)...)
+ {
+ }
+
+ template <typename ...T>
+ add_loss_layer(
+ LOSS_DETAILS&& layer_det,
+ T&& ...args
+ ) :
+ loss(std::move(layer_det)),
+ subnetwork(std::forward<T>(args)...)
+ {
+ }
+
+ template <typename T, typename ...U>
+ struct disable_forwarding_constr
+ {
+ const static bool value = std::is_constructible<LOSS_DETAILS,T>::value;
+ };
+ template <typename ...T>
+ struct disable_forwarding_constr<add_loss_layer<T...>>
+ {
+ const static bool value = true;
+ };
+
+ template <
+ typename ...T,
+ typename = typename std::enable_if<!disable_forwarding_constr<typename std::remove_reference<T>::type...>::value>::type
+ >
+ add_loss_layer(
+ T&& ...args
+ ) :
+ subnetwork(std::forward<T>(args)...)
+ {
+ }
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const
+ {
+ subnetwork.to_tensor(ibegin,iend,data);
+ }
+
+ unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); }
+
+ template <typename output_iterator>
+ void operator() (
+ const tensor& x,
+ output_iterator obegin
+ )
+ {
+ subnetwork.forward(x);
+ const dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
+ loss.to_label(x, wsub, obegin);
+ }
+
+ template <typename forward_iterator, typename output_iterator>
+ void operator() (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ output_iterator obegin
+ )
+ {
+ to_tensor(ibegin,iend,temp_tensor);
+ (*this)(temp_tensor, obegin);
+ }
+
+ const output_label_type& operator() (const input_type& x)
+ {
+ (*this)(&x, &x+1, &temp_label);
+ return temp_label;
+ }
+
+ template <typename ...T>
+ const output_label_type& process (const input_type& x, T&& ...args)
+ {
+ to_tensor(&x,&x+1,temp_tensor);
+ subnetwork.forward(temp_tensor);
+ const dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
+ loss.to_label(temp_tensor, wsub, &temp_label, std::forward<T>(args)...);
+ return temp_label;
+ }
+
+ template <typename iterable_type, typename ...T>
+ std::vector<output_label_type> process_batch (const iterable_type& data, size_t batch_size, T&& ...args)
+ {
+ std::vector<output_label_type> results(std::distance(data.begin(), data.end()));
+ auto o = results.begin();
+ auto i = data.begin();
+ auto num_remaining = results.size();
+ while(num_remaining != 0)
+ {
+ auto inc = std::min(batch_size, num_remaining);
+ to_tensor(i,i+inc,temp_tensor);
+ subnetwork.forward(temp_tensor);
+ const dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
+ loss.to_label(temp_tensor, wsub, o, std::forward<T>(args)...);
+
+ i += inc;
+ o += inc;
+ num_remaining -= inc;
+ }
+ return results;
+ }
+
+ template <typename iterable_type>
+ std::vector<output_label_type> operator() (
+ const iterable_type& data,
+ size_t batch_size = 128
+ )
+ {
+ std::vector<output_label_type> results(std::distance(data.begin(), data.end()));
+ auto o = results.begin();
+ auto i = data.begin();
+ auto num_remaining = results.size();
+ while(num_remaining != 0)
+ {
+ auto inc = std::min(batch_size, num_remaining);
+ (*this)(i, i+inc, o);
+ i += inc;
+ o += inc;
+ num_remaining -= inc;
+ }
+ return results;
+ }
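+
+        // A minimal usage sketch for the batched operator() above.  Here net_type is
+        // assumed to be some add_loss_layer network composed from dlib's loss, layer, and
+        // input templates, and samples is a vector of its input_type:
+        //
+        //     net_type net;
+        //     std::vector<net_type::input_type> samples;   // filled in elsewhere
+        //     // Run the samples through the network in mini-batches of 64 and get one
+        //     // predicted label per sample.
+        //     std::vector<net_type::output_label_type> predictions = net(samples, 64);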
+
+ template <typename label_iterator>
+ double compute_loss (
+ const tensor& x,
+ label_iterator lbegin
+ )
+ {
+ subnetwork.forward(x);
+ dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
+ return loss.compute_loss_value_and_gradient(x, lbegin, wsub);
+ }
+
+ template <typename forward_iterator, typename label_iterator>
+ double compute_loss (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ label_iterator lbegin
+ )
+ {
+ to_tensor(ibegin,iend,temp_tensor);
+ return compute_loss(temp_tensor, lbegin);
+ }
+
+ double compute_loss (
+ const tensor& x
+ )
+ {
+ subnetwork.forward(x);
+ dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
+ return loss.compute_loss_value_and_gradient(x, wsub);
+ }
+
+ template <typename forward_iterator>
+ double compute_loss (
+ forward_iterator ibegin,
+ forward_iterator iend
+ )
+ {
+ to_tensor(ibegin,iend,temp_tensor);
+ return compute_loss(temp_tensor);
+ }
+
+ template <typename label_iterator>
+ double compute_parameter_gradients (
+ const tensor& x,
+ label_iterator lbegin
+ )
+ {
+ subnetwork.forward(x);
+ dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
+ double l = loss.compute_loss_value_and_gradient(x, lbegin, wsub);
+ subnetwork.back_propagate_error(x);
+ return l;
+ }
+ template <typename forward_iterator, typename label_iterator>
+ double compute_parameter_gradients (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ label_iterator lbegin
+ )
+ {
+ to_tensor(ibegin,iend,temp_tensor);
+ return compute_parameter_gradients(temp_tensor, lbegin);
+ }
+ double compute_parameter_gradients (
+ const tensor& x
+ )
+ {
+ subnetwork.forward(x);
+ dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
+ double l = loss.compute_loss_value_and_gradient(x, wsub);
+ subnetwork.back_propagate_error(x);
+ return l;
+ }
+ template <typename forward_iterator>
+ double compute_parameter_gradients (
+ forward_iterator ibegin,
+ forward_iterator iend
+ )
+ {
+ to_tensor(ibegin,iend,temp_tensor);
+ return compute_parameter_gradients(temp_tensor);
+ }
+
+ template <typename solver_type>
+ void update_parameters (
+ sstack<solver_type> solvers,
+ double learning_rate
+ )
+ {
+ subnetwork.update_parameters(solvers, learning_rate);
+ }
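+
+        // A rough sketch of one manual training step built from the functions above
+        // (normally dnn_trainer drives this loop for you).  Here net, samples, and labels
+        // are assumed to already exist, sgd comes from solvers.h, and make_sstack() is the
+        // helper declared alongside sstack:
+        //
+        //     std::vector<sgd> solvers(net_type::num_computational_layers);
+        //     double loss = net.compute_parameter_gradients(samples.begin(), samples.end(),
+        //                                                   labels.begin());
+        //     net.update_parameters(make_sstack(solvers), 0.001);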
+
+ const subnet_type& subnet() const { return subnetwork; }
+ subnet_type& subnet() { return subnetwork; }
+ const loss_details_type& loss_details() const { return loss; }
+ loss_details_type& loss_details() { return loss; }
+
+ void clean (
+ )
+ {
+ temp_tensor.clear();
+ subnetwork.clean();
+ }
+
+ template <typename T, typename U>
+ friend void serialize(const add_loss_layer<T,U>& item, std::ostream& out);
+ template <typename T, typename U>
+ friend void deserialize(add_loss_layer<T,U>& item, std::istream& in);
+
+ friend std::ostream& operator<< (std::ostream& out, const add_loss_layer& item)
+ {
+ int min_length = 0;
+ item.print(out, 0, min_length);
+ return out;
+ }
+
+ void print (std::ostream& out, unsigned long idx, int& min_length) const
+ {
+ out << "layer<" << idx << ">\t" << loss_details() << "\n";
+ subnet().print(out, idx+1, min_length);
+ }
+
+ private:
+
+
+ void swap(add_loss_layer& item)
+ {
+ std::swap(loss, item.loss);
+ std::swap(subnetwork, item.subnetwork);
+ }
+
+ loss_details_type loss;
+ subnet_type subnetwork;
+
+ // These two objects don't logically contribute to the state of this object. They
+ // are here to prevent them from being reallocated over and over.
+ output_label_type temp_label;
+ resizable_tensor temp_tensor;
+ };
+
+ template <typename LOSS_DETAILS, typename SUBNET>
+ void serialize(const add_loss_layer<LOSS_DETAILS,SUBNET>& item, std::ostream& out)
+ {
+ int version = 1;
+ serialize(version, out);
+ serialize(item.loss, out);
+ serialize(item.subnetwork, out);
+ }
+
+ template <typename LOSS_DETAILS, typename SUBNET>
+ void deserialize(add_loss_layer<LOSS_DETAILS,SUBNET>& item, std::istream& in)
+ {
+ int version = 0;
+ deserialize(version, in);
+ if (version != 1)
+ throw serialization_error("Unexpected version found while deserializing dlib::add_loss_layer.");
+ deserialize(item.loss, in);
+ deserialize(item.subnetwork, in);
+ }
+
+
+ template <typename T, typename U>
+ struct is_loss_layer_type<add_loss_layer<T,U>> : std::true_type {};
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ template <unsigned int i, typename T, typename enabled = void>
+ struct layer_helper
+ {
+ static_assert(i < T::num_layers, "Call to layer() attempted to access non-existing layer in neural network.");
+ static T& makeT();
+ using next_type = typename std::remove_reference<decltype(makeT().subnet())>::type;
+ using type = typename layer_helper<i-1,next_type>::type;
+ static type& layer(T& n)
+ {
+ return layer_helper<i-1,next_type>::layer(n.subnet());
+ }
+ };
+ template <
+ unsigned int i,
+ size_t N, template<typename> class L, typename S
+ >
+ struct layer_helper<i,repeat<N,L,S>, typename std::enable_if<(i!=0&&i>=repeat<N,L,S>::layers_in_repeated_group)>::type>
+ {
+ const static size_t layers_in_repeated_group = repeat<N,L,S>::layers_in_repeated_group;
+
+ static repeat<N,L,S>& makeT();
+ using next_type = typename std::remove_reference<decltype(makeT().subnet())>::type;
+ using type = typename layer_helper<i-layers_in_repeated_group,next_type>::type;
+ static type& layer(repeat<N,L,S>& n)
+ {
+ return layer_helper<i-layers_in_repeated_group,next_type>::layer(n.subnet());
+ }
+ };
+ template <
+ unsigned int i,
+ size_t N, template<typename> class L, typename S
+ >
+ struct layer_helper<i,repeat<N,L,S>, typename std::enable_if<(i!=0&&i<repeat<N,L,S>::layers_in_repeated_group)>::type>
+ {
+ const static size_t layers_in_each_group = repeat<N,L,S>::layers_in_each_group;
+ typedef typename repeat<N,L,S>::repeated_layer_type repeated_layer_type;
+ using next_type = repeated_layer_type;
+ using type = typename layer_helper<i%layers_in_each_group,next_type>::type;
+ static type& layer(repeat<N,L,S>& n)
+ {
+ return layer_helper<i%layers_in_each_group,next_type>::layer(n.get_repeated_layer(i/layers_in_each_group));
+ }
+ };
+ template <
+ size_t N, template<typename> class L, typename S
+ >
+ struct layer_helper<0,repeat<N,L,S>, void>
+ {
+ typedef typename repeat<N,L,S>::repeated_layer_type repeated_layer_type;
+ using type = repeated_layer_type;
+ static type& layer(repeat<N,L,S>& n)
+ {
+ return n.get_repeated_layer(0);
+ }
+ };
+
+
+
+ template <
+ unsigned int i,
+ size_t N, template<typename> class L, typename S
+ >
+ struct layer_helper<i,const repeat<N,L,S>, typename std::enable_if<(i!=0&&i>=repeat<N,L,S>::layers_in_repeated_group)>::type>
+ {
+ const static size_t layers_in_repeated_group = repeat<N,L,S>::layers_in_repeated_group;
+
+ static const repeat<N,L,S>& makeT();
+ using next_type = const typename std::remove_reference<decltype(makeT().subnet())>::type;
+ using type = const typename layer_helper<i-layers_in_repeated_group,next_type>::type;
+ static type& layer(const repeat<N,L,S>& n)
+ {
+ return layer_helper<i-layers_in_repeated_group,next_type>::layer(n.subnet());
+ }
+ };
+ template <
+ unsigned int i,
+ size_t N, template<typename> class L, typename S
+ >
+ struct layer_helper<i,const repeat<N,L,S>, typename std::enable_if<(i!=0&&i<repeat<N,L,S>::layers_in_repeated_group)>::type>
+ {
+ const static size_t layers_in_each_group = repeat<N,L,S>::layers_in_each_group;
+ typedef typename repeat<N,L,S>::repeated_layer_type repeated_layer_type;
+ using next_type = const repeated_layer_type;
+ using type = const typename layer_helper<i%layers_in_each_group,next_type>::type;
+ static type& layer(const repeat<N,L,S>& n)
+ {
+ return layer_helper<i%layers_in_each_group,next_type>::layer(n.get_repeated_layer(i/layers_in_each_group));
+ }
+ };
+ template <
+ size_t N, template<typename> class L, typename S
+ >
+ struct layer_helper<0,const repeat<N,L,S>, void>
+ {
+ typedef typename repeat<N,L,S>::repeated_layer_type repeated_layer_type;
+ using type = const repeated_layer_type;
+ static type& layer(const repeat<N,L,S>& n)
+ {
+ return n.get_repeated_layer(0);
+ }
+ };
+
+
+
+ template <typename T>
+ struct layer_helper<0,T,void>
+ {
+ using type = T;
+ static type& layer(T& n)
+ {
+ return n;
+ }
+ };
+
+ template <template<typename> class Match, typename T, unsigned int i, typename enabled = void>
+ struct layer_helper_match
+ {
+ static T& makeT();
+ using next_type = typename std::remove_reference<decltype(makeT().subnet())>::type;
+ using type = typename layer_helper_match<Match,next_type,i>::type;
+ static type& layer(T& n)
+ {
+ return layer_helper_match<Match,next_type,i>::layer(n.subnet());
+ }
+ };
+ // This overload catches add_layer and add_loss_layer templates.
+ template <template<typename> class Match, typename T, unsigned int i>
+ struct layer_helper_match<Match,T,i,
+ typename std::enable_if<std::is_same<const T,const Match<typename T::subnet_type>>::value>::type>
+ {
+ using type = typename layer_helper<i,T>::type;
+ static type& layer(T& n)
+ {
+ return layer_helper<i,T>::layer(n);
+ }
+ };
+ // This overload catches input templates.
+ template <template<typename> class Match, typename T, unsigned int i>
+ struct layer_helper_match<Match,T,i,
+ typename std::enable_if<std::is_same<const T,const Match<typename T::input_type>>::value>::type>
+ {
+ using type = typename layer_helper<i,T>::type;
+ static type& layer(T& n)
+ {
+ return layer_helper<i,T>::layer(n);
+ }
+ };
+ // This overload catches subnet_wrapper templates.
+ template <template<typename> class Match, typename T, unsigned int i>
+ struct layer_helper_match<Match,T,i,
+ typename std::enable_if<std::is_same<const typename T::wrapped_type,
+ const Match<typename T::wrapped_type::subnet_type>>::value>::type>
+ {
+ using type = typename layer_helper<i,T>::type;
+ static type& layer(T& n)
+ {
+ return layer_helper<i,T>::layer(n);
+ }
+ };
+ }
+
+ template <unsigned int i, typename T>
+ typename impl::layer_helper<i,T>::type& layer (T& n)
+ {
+ return impl::layer_helper<i,T>::layer(n);
+ }
+
+ template <template<typename> class Match, typename T>
+ typename impl::layer_helper_match<Match,T,0>::type& layer (T& n)
+ {
+ return impl::layer_helper_match<Match,T,0>::layer(n);
+ }
+
+ template <template<typename> class Match, unsigned int i, typename T>
+ typename impl::layer_helper_match<Match,T,i>::type& layer (T& n)
+ {
+ return impl::layer_helper_match<Match,T,i>::layer(n);
+ }
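+
+    // A small usage sketch for the layer() accessors above.  The fc, relu, input, matrix,
+    // and loss_multiclass_log templates are assumed to come from dlib's loss, layers, and
+    // input headers:
+    //
+    //     using net_type = loss_multiclass_log<fc<10,relu<fc<32,input<matrix<float>>>>>>;
+    //     net_type net;
+    //     layer<1>(net);       // the fc<10,...> layer (layer<0> is the whole network)
+    //     layer<relu>(net);    // the first layer whose type matches relu<...>
+    //     layer<relu,1>(net);  // the layer just below that relu, i.e. the fc<32> layer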
+
+// ----------------------------------------------------------------------------------------
+
+
+ namespace dimpl
+ {
+ template <typename T>
+ T& get_input_details (
+ T& net
+ )
+ {
+ return net;
+ }
+
+ template <typename T, bool is_first, typename enabled>
+ auto get_input_details (
+ dimpl::subnet_wrapper<T,is_first,enabled>& net
+ ) -> decltype(net.layer_details())&
+ {
+ return net.layer_details();
+ }
+
+ template <typename T, bool is_first, typename enabled>
+ auto get_input_details (
+ const dimpl::subnet_wrapper<T,is_first,enabled>& net
+ ) -> decltype(net.layer_details())&
+ {
+ return net.layer_details();
+ }
+ }
+
+ template <typename net_type>
+ auto input_layer (
+ net_type& net
+ ) -> decltype(dimpl::get_input_details(layer<net_type::num_layers-1>(net)))&
+ {
+        // Calling input_layer() on a subnet_wrapper is a little funny since its .subnet()
+        // returns another subnet_wrapper rather than an input details object, as it does
+        // in add_layer.
+ return dimpl::get_input_details(layer<net_type::num_layers-1>(net));
+ }
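+
+    // For example, if net is a network whose input layer is input<matrix<float>> (from
+    // input.h), then:
+    //
+    //     auto& in = input_layer(net);   // a reference to the input<matrix<float>> object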
+
+// ----------------------------------------------------------------------------------------
+
+ template <template<typename> class TAG_TYPE, typename SUBNET>
+ class add_skip_layer
+ {
+ public:
+ typedef SUBNET subnet_type;
+ typedef typename subnet_type::input_type input_type;
+ typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper.
+ const static size_t num_layers = subnet_type::num_layers + 1;
+ const static size_t num_computational_layers = subnet_type::num_computational_layers;
+ const static unsigned long id = tag_id<TAG_TYPE>::id;
+
+ add_skip_layer() {};
+ add_skip_layer(const add_skip_layer&) = default;
+ add_skip_layer(add_skip_layer&&) = default;
+ add_skip_layer& operator=(add_skip_layer&&) = default;
+ add_skip_layer& operator=(const add_skip_layer&) = default;
+
+ template <typename T>
+ add_skip_layer(
+ const add_skip_layer<TAG_TYPE,T>& item
+ ) : subnetwork(item.subnet())
+ {}
+
+ template <typename ...T>
+ add_skip_layer(
+ T ...args
+ ) :
+ subnetwork(std::move(args)...)
+ {
+ }
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const
+ {
+ subnetwork.to_tensor(ibegin,iend,data);
+ }
+
+ template <typename forward_iterator>
+ const tensor& operator() (
+ forward_iterator ibegin,
+ forward_iterator iend
+ )
+ {
+ subnetwork(ibegin,iend);
+ return layer<TAG_TYPE>(subnetwork).get_output();
+ }
+
+ const tensor& operator() (const input_type& x)
+ {
+ subnetwork(x);
+ return layer<TAG_TYPE>(subnetwork).get_output();
+ }
+
+ const tensor& forward(const tensor& x)
+ {
+ subnetwork.forward(x);
+ return layer<TAG_TYPE>(subnetwork).get_output();
+ }
+
+ const tensor& get_output() const
+ {
+ return layer<TAG_TYPE>(subnetwork).get_output();
+ }
+
+ tensor& get_gradient_input()
+ {
+ return layer<TAG_TYPE>(subnetwork).get_gradient_input();
+ }
+
+ const tensor& get_final_data_gradient(
+ ) const
+ {
+ return subnetwork.get_final_data_gradient();
+ }
+
+ void back_propagate_error(const tensor& x)
+ {
+ subnetwork.back_propagate_error(x);
+ }
+
+ template <typename solver_type>
+ void update_parameters(sstack<solver_type> solvers, double learning_rate)
+ {
+ subnetwork.update_parameters(solvers, learning_rate);
+ }
+
+ const tensor& get_parameter_gradient(
+ ) const { return params_grad; }
+
+ tensor& get_parameter_gradient (
+ ) { return params_grad; }
+
+
+ const subnet_type& subnet() const
+ {
+ return subnetwork;
+ }
+
+ subnet_type& subnet()
+ {
+ return subnetwork;
+ }
+
+ unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); }
+
+ void clean()
+ {
+ subnetwork.clean();
+ }
+
+ friend void serialize(const add_skip_layer& item, std::ostream& out)
+ {
+ int version = 1;
+ serialize(version, out);
+ serialize(item.subnetwork, out);
+ }
+
+ friend void deserialize(add_skip_layer& item, std::istream& in)
+ {
+ int version = 0;
+ deserialize(version, in);
+ if (version != 1)
+ throw serialization_error("Unexpected version found while deserializing dlib::add_skip_layer.");
+ deserialize(item.subnetwork, in);
+ }
+
+ friend std::ostream& operator<< (std::ostream& out, const add_skip_layer& item)
+ {
+ int min_length = 0;
+ item.print(out, 0, min_length);
+ return out;
+ }
+
+ void print (std::ostream& out, unsigned long idx, int& min_length) const
+ {
+ out << "layer<" << idx << ">\t"<<impl::tensor_to_str(private_get_output(), min_length) <<"skip"<<id<<"\n";
+ subnet().print(out, idx+1, min_length);
+ }
+
+ private:
+
+
+ template <typename T, typename U, typename E>
+ friend class add_layer;
+ template <typename T, bool is_first, typename E>
+ friend class dimpl::subnet_wrapper;
+ template <unsigned long T, typename U, typename E>
+ friend class add_tag_layer;
+ template <template<typename> class T, typename U>
+ friend class add_skip_layer;
+ template <size_t N, template<typename> class L, typename S>
+ friend class repeat;
+
+ bool this_layer_requires_forward_output(
+ ) { return layer<TAG_TYPE>(subnetwork).this_layer_requires_forward_output(); }
+
+ void disable_output_and_gradient_getters (
+ ) { layer<TAG_TYPE>(subnetwork).disable_output_and_gradient_getters(); }
+
+ tensor& private_get_output() const
+ { return layer<TAG_TYPE>(subnetwork).private_get_output(); }
+ tensor& private_get_gradient_input()
+ { return layer<TAG_TYPE>(subnetwork).private_get_gradient_input(); }
+
+ subnet_type subnetwork;
+
+        // This member doesn't logically contribute to the state of the object since it is
+        // always empty.  It's just here so we can have the get_parameter_gradient()
+        // methods, which have to return something; they return this empty tensor.
+ resizable_tensor params_grad;
+ };
+ template <template<typename> class T, typename U>
+ struct is_nonloss_layer_type<add_skip_layer<T,U>> : std::true_type {};
+
+ template <typename SUBNET> using tag1 = add_tag_layer< 1, SUBNET>;
+ template <typename SUBNET> using tag2 = add_tag_layer< 2, SUBNET>;
+ template <typename SUBNET> using tag3 = add_tag_layer< 3, SUBNET>;
+ template <typename SUBNET> using tag4 = add_tag_layer< 4, SUBNET>;
+ template <typename SUBNET> using tag5 = add_tag_layer< 5, SUBNET>;
+ template <typename SUBNET> using tag6 = add_tag_layer< 6, SUBNET>;
+ template <typename SUBNET> using tag7 = add_tag_layer< 7, SUBNET>;
+ template <typename SUBNET> using tag8 = add_tag_layer< 8, SUBNET>;
+ template <typename SUBNET> using tag9 = add_tag_layer< 9, SUBNET>;
+ template <typename SUBNET> using tag10 = add_tag_layer<10, SUBNET>;
+
+ template <typename SUBNET> using skip1 = add_skip_layer< tag1, SUBNET>;
+ template <typename SUBNET> using skip2 = add_skip_layer< tag2, SUBNET>;
+ template <typename SUBNET> using skip3 = add_skip_layer< tag3, SUBNET>;
+ template <typename SUBNET> using skip4 = add_skip_layer< tag4, SUBNET>;
+ template <typename SUBNET> using skip5 = add_skip_layer< tag5, SUBNET>;
+ template <typename SUBNET> using skip6 = add_skip_layer< tag6, SUBNET>;
+ template <typename SUBNET> using skip7 = add_skip_layer< tag7, SUBNET>;
+ template <typename SUBNET> using skip8 = add_skip_layer< tag8, SUBNET>;
+ template <typename SUBNET> using skip9 = add_skip_layer< tag9, SUBNET>;
+ template <typename SUBNET> using skip10 = add_skip_layer<tag10, SUBNET>;
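+
+    // A sketch of how the tag/skip aliases are typically used.  The fc, input, matrix, and
+    // loss_multiclass_log templates are assumed from dlib's other dnn headers.  tag1
+    // remembers the output of the network below it and skip1 later refers back to that
+    // output, so here the final fc<10> layer sees the fc<32> output rather than the
+    // fc<20> output:
+    //
+    //     using net_type = loss_multiclass_log<fc<10,
+    //                          skip1<fc<20,tag1<fc<32,input<matrix<float>>>>>>>>;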
+
+// ----------------------------------------------------------------------------------------
+
+ namespace timpl
+ {
+ inline void fill_with_gassuan_random_numbers (
+ tensor& t,
+ dlib::rand& rnd,
+ double sigma = 1
+ )
+ {
+ float* data = t.host();
+ for (size_t i = 0; i < t.size(); ++i)
+ data[i] = rnd.get_random_gaussian()*sigma;
+ }
+
+ class test_layer_subnet
+ {
+ public:
+ test_layer_subnet (
+ dlib::rand& rnd_
+ ) : rnd(rnd_)
+ {
+ // Output and gradient_input have to have the same dimensions in each
+ // layer.
+ const long num_samples = rnd.get_random_32bit_number()%4+3;
+ const long k = rnd.get_random_32bit_number()%4+2;
+ const long nr = rnd.get_random_32bit_number()%4+2;
+ const long nc = rnd.get_random_32bit_number()%4+2;
+
+ output.set_size(num_samples, k, nr, nc);
+ gradient_input.set_size(num_samples, k, nr, nc);
+
+ // Use a non-zero initial gradient to make sure the layers add to it
+ // rather than assign and blow away the initial value.
+ fill_with_gassuan_random_numbers(gradient_input, rnd, 0.01);
+
+ fill_with_gassuan_random_numbers(output, rnd);
+ }
+
+
+ tensor& get_mutable_output() { return output; }
+ const tensor& get_output() const { return output; }
+ const tensor& private_get_output() const { return get_output(); }
+ const test_layer_subnet& subnet() const { init_sub(); return *subnetwork; }
+
+ tensor& get_gradient_input() { return gradient_input; }
+ tensor& private_get_gradient_input() { return get_gradient_input(); }
+ test_layer_subnet& subnet() { init_sub(); return *subnetwork; }
+
+
+
+ unsigned long count_outputs() const
+ {
+ if (subnetwork)
+ return subnetwork->count_outputs() + output.size();
+ else
+ return output.size();
+ }
+
+ float& get_output_element(unsigned long i)
+ {
+ if (i < output.size())
+ return output.host()[i];
+ else
+ return subnet().get_output_element(i-output.size());
+ }
+
+ float get_gradient_input_element(unsigned long i) const
+ {
+ if (i < gradient_input.size())
+ return gradient_input.host()[i];
+ else
+ return subnet().get_gradient_input_element(i-gradient_input.size());
+ }
+
+
+ private:
+ // We lazily initialize sub-layers as needed when someone tries to call
+ // subnet()
+ void init_sub() const
+ {
+ if (!subnetwork)
+ subnetwork.reset(new test_layer_subnet(rnd));
+ }
+
+ dlib::rand& rnd;
+ mutable std::unique_ptr<test_layer_subnet> subnetwork;
+ resizable_tensor output;
+ resizable_tensor gradient_input;
+ };
+
+ }
+
+ struct layer_test_results
+ {
+ layer_test_results() : was_good(true) {}
+ explicit layer_test_results(const std::string& l) : log(l),was_good(false) {}
+
+ std::string log;
+ bool was_good;
+
+ operator bool() const { return was_good; }
+ };
+
+ inline std::ostream& operator<< (std::ostream& out, const layer_test_results& item)
+ {
+ out << item.log;
+ return out;
+ }
+
+ template <
+ typename layer_details_type
+ >
+ layer_test_results impl_test_layer (
+ layer_details_type l,
+ const float base_eps
+ )
+ {
+ using namespace timpl;
+ // Do some setup
+ running_stats<double> rs_data, rs_params;
+ dlib::rand rnd;
+ std::ostringstream sout;
+ for (int iter = 0; iter < 10; ++iter)
+ {
+ test_layer_subnet subnetwork(rnd);
+ resizable_tensor output, out2, out3;
+ // Run setup() and forward() as well to make sure any calls to subnet() have
+ // happened before we start assuming we know how many data elements there are
+ // (since we do a lazy layer creation thing based on calls to subnet() inside
+ // test_layer_subnet).
+ l.setup(subnetwork);
+ impl::call_layer_forward(l, subnetwork, output);
+
+ resizable_tensor input_grad;
+ input_grad.copy_size(output);
+ fill_with_gassuan_random_numbers(input_grad, rnd);
+
+
+            // The f() we are computing gradients of is dot(output, input_grad).  Its value
+            // at the current parameter and data values is:
+ //sout << "f(data,params): " << dot(output, input_grad) << std::endl;
+
+ // We are going to save a copy of the subnetwork.get_gradient_input() data before we do
+ // backpropagation since the backward() function is supposed to *add* to the
+ // gradients rather than overwrite them. We will use this saved data to check if
+ // that is the case.
+ const unsigned long num_data_inputs = subnetwork.count_outputs();
+ std::vector<float> initial_gradient_input(num_data_inputs);
+ for (unsigned long i = 0; i < num_data_inputs; ++i)
+ initial_gradient_input[i] = subnetwork.get_gradient_input_element(i);
+
+
+ // Now tell the layer to compute all the gradients. In the rest of this function
+ // we will just be checking that these gradients were computed correctly by
+ // comparing them to a central differences approximation.
+ resizable_tensor params_grad;
+ params_grad.copy_size(l.get_layer_params());
+ // But first, set the params grad to something crazy so that it's very obvious if
+ // it doesn't get fully assigned.
+ params_grad = std::numeric_limits<float>::infinity();
+ impl::call_layer_backward(l, output, input_grad, subnetwork, params_grad);
+
+ static_assert(impl::is_inplace_layer(l, subnetwork) == impl::has_inplace_backward(l, subnetwork),
+ "Layer not defined correctly. forward and backward methods must either both be in-place or both out-of-place. ");
+
+ // Make sure the outputs of forward() and backward() are the same when they are run
+ // in in-place mode.
+ if (impl::is_inplace_layer(l, subnetwork))
+ {
+ test_layer_subnet subnetwork2(rnd);
+ layer_details_type ll(l);
+ ll.setup(subnetwork2);
+ resizable_tensor ip_out;
+ impl::call_layer_forward(ll, subnetwork2, ip_out);
+ impl::call_layer_forward(ll, subnetwork2, subnetwork2.get_mutable_output());
+ const auto forward_error = max(abs(mat(ip_out) - mat(subnetwork2.get_output())));
+ if (forward_error > 0.00001)
+ {
+ using namespace std;
+ sout << "This layer is supposed to support in-place computations but the output of forward_inplace()\n";
+ sout << "changes when invoked in-place vs. out-of-place. The error was: " << forward_error << endl;
+ return layer_test_results(sout.str());
+ }
+
+ resizable_tensor params_grad;
+ params_grad.copy_size(ll.get_layer_params());
+ params_grad = std::numeric_limits<float>::infinity();
+
+ resizable_tensor input_grad;
+ input_grad.copy_size(ip_out);
+ fill_with_gassuan_random_numbers(input_grad, rnd);
+ resizable_tensor params_grad1, params_grad2, data_grad1, data_grad2;
+ params_grad1 = params_grad;
+ params_grad2 = params_grad;
+ // Now call backward() and make sure it works as well. Recall that when an
+                // in-place layer works in-place it assigns to its outputs but when it's
+ // not running in-place it adds. So we initialize to a non-zero value to
+ // check that this is the behavior that really executes.
+ subnetwork2.get_gradient_input() = 9;
+ impl::call_layer_backward(ll, ip_out, input_grad, subnetwork2, params_grad1);
+ data_grad1 = subnetwork2.get_gradient_input();
+
+ subnetwork2.get_gradient_input() = mat(input_grad);
+ impl::call_layer_backward(ll, ip_out, subnetwork2.get_gradient_input(), subnetwork2, params_grad2);
+ data_grad2 = subnetwork2.get_gradient_input();
+ if (params_grad.size() != 0)
+ {
+ const auto backward_param_error = max(abs(mat(params_grad1) - mat(params_grad2)));
+ if (backward_param_error > 0.00001)
+ {
+ using namespace std;
+ sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n";
+ sout << "changes when invoked in-place vs. out-of-place. The error was: " << backward_param_error << endl;
+ return layer_test_results(sout.str());
+ }
+ }
+ const auto backward_data_error = max(abs(mat(data_grad1)-9 - mat(data_grad2)));
+ if (backward_data_error > 0.00001)
+ {
+ using namespace std;
+ sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n";
+ sout << "changes when invoked in-place vs. out-of-place. The error was: " << backward_data_error << endl;
+ return layer_test_results(sout.str());
+ }
+ }
+
+ // ==================================================================
+ // first validate the way the parameter gradients are computed
+ for (unsigned long i = 0; i < params_grad.size(); ++i)
+ {
+ layer_details_type l1(l);
+
+ float eps = l1.get_layer_params().host()[i]*base_eps;
+ if (eps == 0)
+ eps = base_eps;
+ const float oldval = l1.get_layer_params().host()[i];
+ l1.get_layer_params().host()[i] = oldval+eps;
+ impl::call_layer_forward(l1, subnetwork, out2);
+ l1.get_layer_params().host()[i] = oldval-eps;
+ impl::call_layer_forward(l1, subnetwork, out3);
+ l1.get_layer_params().host()[i] = oldval;
+
+ // Compute a reference derivative via a central differences approximation and
+ // compare it to the one output by the layer and make sure they match.
+ double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps);
+ double output_derivative = params_grad.host()[i];
+ double relative_error;
+ if (reference_derivative*output_derivative != 0)
+ relative_error = (reference_derivative - output_derivative)/(reference_derivative);
+ else
+ relative_error = (reference_derivative - output_derivative);
+ double absolute_error = (reference_derivative - output_derivative);
+ rs_params.add(std::abs(relative_error));
+ if (std::abs(relative_error) > 0.05 && std::abs(absolute_error) > 0.006)
+ {
+ using namespace std;
+ sout << "Gradient error in parameter #" << i <<". Relative error: "<< relative_error << endl;
+ sout << "expected derivative: " << reference_derivative << endl;
+ sout << "output derivative: " << output_derivative << endl;
+ sout << "iteration: " << iter << endl;
+ return layer_test_results(sout.str());
+ }
+ }
+
+ // ==================================================================
+ // now validate the data gradients
+ for (unsigned long i = 0; i < num_data_inputs; ++i)
+ {
+ const float oldval = subnetwork.get_output_element(i);
+ float eps = oldval*base_eps;
+ if (eps == 0)
+ eps = base_eps;
+ subnetwork.get_output_element(i) = oldval+eps;
+ impl::call_layer_forward(l, subnetwork, out2);
+ subnetwork.get_output_element(i) = oldval-eps;
+ impl::call_layer_forward(l, subnetwork, out3);
+ subnetwork.get_output_element(i) = oldval;
+
+ // Compute a reference derivative via a central differences approximation and
+ // compare it to the one output by the layer and make sure they match.
+ double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps);
+ double output_derivative = subnetwork.get_gradient_input_element(i);
+ output_derivative -= initial_gradient_input[i];
+ double relative_error;
+ if (reference_derivative*output_derivative != 0)
+ relative_error = (reference_derivative - output_derivative)/(reference_derivative);
+ else
+ relative_error = (reference_derivative - output_derivative);
+ double absolute_error = (reference_derivative - output_derivative);
+ rs_data.add(std::abs(relative_error));
+ if (std::abs(relative_error) > 0.05 && std::abs(absolute_error) > 0.006)
+ {
+ using namespace std;
+ sout << "Gradient error in data variable #" << i <<". Relative error: "<< relative_error << endl;
+ sout << "expected derivative: " << reference_derivative << endl;
+ sout << "output derivative: " << output_derivative << endl;
+ sout << "iteration: " << iter << endl;
+ return layer_test_results(sout.str());
+ }
+ }
+
+ } // end for (int iter = 0; iter < 10; ++iter)
+
+ if (rs_params.mean() > 0.003)
+ {
+ using namespace std;
+ sout << "Average parameter gradient error is somewhat large at: "<< rs_params.mean() << endl;
+ return layer_test_results(sout.str());
+ }
+ if (rs_data.mean() > 0.003)
+ {
+ using namespace std;
+ sout << "Average data gradient error is somewhat large at: "<< rs_data.mean() << endl;
+ return layer_test_results(sout.str());
+ }
+
+ return layer_test_results();
+ }
+
+ template <
+ typename layer_details_type
+ >
+ layer_test_results test_layer (
+ layer_details_type l
+ )
+ {
+ // Try a few different derivative step sizes to see if any work.
+ for (float base_eps = 0.0001; base_eps < 0.1; base_eps *= 2)
+ {
+ auto result = impl_test_layer(l, base_eps);
+ if (result)
+ return result;
+ }
+        // However, if none of the step sizes worked then try this one, which will most
+        // likely just result in returning an error.
+ return impl_test_layer(l, 0.01);
+ }
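+
+    // For example, a unit test for a layer would typically call test_layer() like this
+    // (relu_ is a layer details object from layers.h):
+    //
+    //     auto res = test_layer(relu_());
+    //     if (!res)
+    //         std::cout << "layer test failed:\n" << res << std::endl;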
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ template <size_t i, size_t num>
+ struct vlp_loop
+ {
+ template <typename T, typename U>
+ static typename std::enable_if<!is_add_layer<U>::value>::type invoke_functor(T&& , size_t& , U&& )
+ {
+ // intentionally left empty
+ }
+
+ template <typename T, typename U>
+ static typename std::enable_if<is_add_layer<U>::value>::type invoke_functor(T&& v , size_t& comp_i, U&& l )
+ {
+ v(comp_i, l.layer_details().get_layer_params());
+ ++comp_i;
+ }
+
+ template <
+ typename net_type,
+ typename visitor
+ >
+ static void visit(
+ size_t comp_i,
+ net_type& net,
+ visitor&& v
+ )
+ {
+ invoke_functor(v, comp_i, layer<i>(net));
+ vlp_loop<i+1, num>::visit(comp_i, net,v);
+ }
+ };
+
+ template <size_t num>
+ struct vlp_loop<num,num>
+ {
+ template <
+ typename net_type,
+ typename visitor
+ >
+ static void visit(
+ size_t,
+ net_type&,
+ visitor&&
+ )
+ {
+ // Base case of recursion. Don't do anything.
+ }
+ };
+
+ }
+
+ template <
+ typename net_type,
+ typename visitor
+ >
+ void visit_layer_parameters(
+ net_type& net,
+ visitor v
+ )
+ {
+ size_t comp_i = 0;
+ impl::vlp_loop<0, net_type::num_layers>::visit(comp_i, net, v);
+ }
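+
+    // A sketch of a visitor for visit_layer_parameters().  The visitor just needs to be
+    // callable as v(size_t, tensor&); this hypothetical one totals the number of
+    // parameters in the network:
+    //
+    //     struct param_counter
+    //     {
+    //         size_t* total;
+    //         void operator()(size_t, tensor& params) const { *total += params.size(); }
+    //     };
+    //     size_t total = 0;
+    //     visit_layer_parameters(net, param_counter{&total});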
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ template <size_t i, size_t num>
+ struct vlpg_loop
+ {
+ template <typename T, typename U>
+ static typename std::enable_if<!is_add_layer<U>::value>::type invoke_functor(T&& , size_t& , U&& )
+ {
+ // intentionally left empty
+ }
+
+ template <typename T, typename U>
+ static typename std::enable_if<is_add_layer<U>::value>::type invoke_functor(T&& v , size_t& comp_i, U&& l )
+ {
+ v(comp_i, l.get_parameter_gradient());
+ ++comp_i;
+ }
+
+ template <
+ typename net_type,
+ typename visitor
+ >
+ static void visit(
+ size_t comp_i,
+ net_type& net,
+ visitor&& v
+ )
+ {
+ invoke_functor(v, comp_i, layer<i>(net));
+ vlpg_loop<i+1, num>::visit(comp_i, net,v);
+ }
+ };
+
+ template <size_t num>
+ struct vlpg_loop<num,num>
+ {
+ template <
+ typename net_type,
+ typename visitor
+ >
+ static void visit(
+ size_t,
+ net_type&,
+ visitor&&
+ )
+ {
+ // Base case of recursion. Don't do anything.
+ }
+ };
+
+ }
+
+ template <
+ typename net_type,
+ typename visitor
+ >
+ void visit_layer_parameter_gradients(
+ net_type& net,
+ visitor v
+ )
+ {
+ size_t comp_i = 0;
+ impl::vlpg_loop<0, net_type::num_layers>::visit(comp_i, net, v);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ template <size_t i, size_t num>
+ struct vl_loop
+ {
+ template <
+ typename net_type,
+ typename visitor
+ >
+ static void visit(
+ net_type& net,
+ visitor&& v
+ )
+ {
+ v(i, layer<i>(net));
+ vl_loop<i+1, num>::visit(net,v);
+ }
+ };
+
+ template <size_t num>
+ struct vl_loop<num,num>
+ {
+ template <
+ typename net_type,
+ typename visitor
+ >
+ static void visit(
+ net_type&,
+ visitor&&
+ )
+ {
+ // Base case of recursion. Don't do anything.
+ }
+ };
+
+ template <size_t i, size_t num>
+ struct vl_loop_backwards
+ {
+ template <
+ typename net_type,
+ typename visitor
+ >
+ static void visit(
+ net_type& net,
+ visitor&& v
+ )
+ {
+ vl_loop_backwards<i+1, num>::visit(net,v);
+ v(i, layer<i>(net));
+ }
+ };
+
+ template <size_t num>
+ struct vl_loop_backwards<num,num>
+ {
+ template <
+ typename net_type,
+ typename visitor
+ >
+ static void visit(
+ net_type&,
+ visitor&&
+ )
+ {
+ // Base case of recursion. Don't do anything.
+ }
+ };
+
+ }
+
+ template <
+ typename net_type,
+ typename visitor
+ >
+ void visit_layers(
+ net_type& net,
+ visitor v
+ )
+ {
+ impl::vl_loop<0, net_type::num_layers>::visit(net, v);
+ }
+
+ template <
+ typename net_type,
+ typename visitor
+ >
+ void visit_layers_backwards(
+ net_type& net,
+ visitor v
+ )
+ {
+ impl::vl_loop_backwards<0, net_type::num_layers>::visit(net, v);
+ }
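+
+    // A sketch of a visitor for visit_layers()/visit_layers_backwards().  Unlike the
+    // parameter visitors above, this one receives the layer objects themselves, so it must
+    // be callable with any layer type (hence the template):
+    //
+    //     struct layer_lister
+    //     {
+    //         template <typename T>
+    //         void operator()(size_t idx, T&) const { std::cout << "visiting layer " << idx << "\n"; }
+    //     };
+    //     visit_layers(net, layer_lister());           // index 0 (output side) first
+    //     visit_layers_backwards(net, layer_lister()); // input side first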
+
+ template <
+ size_t begin,
+ size_t end,
+ typename net_type,
+ typename visitor
+ >
+ void visit_layers_range(
+ net_type& net,
+ visitor v
+ )
+ {
+ static_assert(begin <= end, "Invalid range");
+ static_assert(end <= net_type::num_layers, "Invalid range");
+ impl::vl_loop<begin,end>::visit(net, v);
+ }
+
+ template <
+ size_t begin,
+ size_t end,
+ typename net_type,
+ typename visitor
+ >
+ void visit_layers_backwards_range(
+ net_type& net,
+ visitor v
+ )
+ {
+ static_assert(begin <= end, "Invalid range");
+ static_assert(end <= net_type::num_layers, "Invalid range");
+ impl::vl_loop_backwards<begin,end>::visit(net, v);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ template <size_t i, unsigned long tag_id>
+ struct vl_until_tag
+ {
+ template <
+ typename net_type,
+ typename next_net_type,
+ typename visitor
+ >
+ static void visit(
+ net_type& net,
+ next_net_type& next_net,
+ visitor&& v
+ )
+ {
+ v(next_net);
+ vl_until_tag<i+1,tag_id>::visit(net,layer<i+1>(net),v);
+ }
+
+ template <
+ typename net_type,
+ typename SUBNET,
+ typename visitor
+ >
+ static void visit(
+ net_type& net,
+ const add_tag_layer<tag_id,SUBNET>& next_net,
+ visitor&& v
+ )
+ {
+ v(next_net);
+ }
+
+ template <
+ typename net_type,
+ typename SUBNET,
+ typename visitor
+ >
+ static void visit(
+ net_type& net,
+ add_tag_layer<tag_id,SUBNET>& next_net,
+ visitor&& v
+ )
+ {
+ v(next_net);
+ }
+ };
+ }
+
+ template <
+ unsigned long tag_id,
+ typename net_type,
+ typename visitor
+ >
+ void visit_layers_until_tag(
+ net_type& net,
+ visitor v
+ )
+ {
+ impl::vl_until_tag<0,tag_id>::visit(net, net, v);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_CORE_H_
+
+
diff --git a/ml/dlib/dlib/dnn/core_abstract.h b/ml/dlib/dlib/dnn/core_abstract.h
new file mode 100644
index 000000000..db168a88b
--- /dev/null
+++ b/ml/dlib/dlib/dnn/core_abstract.h
@@ -0,0 +1,1700 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_DNn_CORE_ABSTRACT_H_
+#ifdef DLIB_DNn_CORE_ABSTRACT_H_
+
+#include "tensor_abstract.h"
+#include <memory>
+#include <type_traits>
+#include <tuple>
+#include <vector>
+#include "../rand.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename... T
+ >
+ auto tuple_tail(
+ const std::tuple<T...>& item
+ );
+ /*!
+ ensures
+ - returns a tuple that contains everything in item except for tuple_head(item).
+ The items will be in the same order as they are in item, just without
+ tuple_head(item).
+ - This function will correctly handle nested tuples.
+ !*/
+
+ template <typename... T>
+ auto tuple_head (
+ const std::tuple<T...>& item
+ );
+ /*!
+ ensures
+ - returns a copy of the first thing in the tuple that isn't a std::tuple.
+ Essentially, this function calls std::get<0>() recursively on item until
+ a non-std::tuple object is found.
+ !*/
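+
+    // For instance, with a flat tuple (a small illustration only):
+    //
+    //     auto t = std::make_tuple(1, 2.5, 'c');
+    //     tuple_head(t);   // == 1
+    //     tuple_tail(t);   // a tuple holding 2.5 and 'c'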
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename T>
+ double get_learning_rate_multiplier(
+ const T& obj
+ );
+ /*!
+ ensures
+ - if (obj has a get_learning_rate_multiplier() member function) then
+ - returns obj.get_learning_rate_multiplier()
+ - else
+ - returns 1
+ !*/
+
+ template <typename T>
+ double get_weight_decay_multiplier(
+ const T& obj
+ );
+ /*!
+ ensures
+ - if (obj has a get_weight_decay_multiplier() member function) then
+ - returns obj.get_weight_decay_multiplier()
+ - else
+ - returns 1
+ !*/
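+
+    // For instance (my_layer_details is a hypothetical type used only for illustration):
+    //
+    //     struct my_layer_details
+    //     {
+    //         double get_learning_rate_multiplier() const { return 0.1; }
+    //     };
+    //     get_learning_rate_multiplier(my_layer_details());  // == 0.1
+    //     get_learning_rate_multiplier(42);                  // == 1, int has no such member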
+
+// ----------------------------------------------------------------------------------------
+
+ bool dnn_prefer_fastest_algorithms(
+ );
+ /*!
+ ensures
+            - returns true if dlib should prefer fast algorithms over algorithms that use
+              less RAM, and false otherwise.
+ - On program startup this function will default to true.
+ !*/
+
+ void set_dnn_prefer_fastest_algorithms(
+ );
+ /*!
+ ensures
+ - #dnn_prefer_fastest_algorithms() == true
+ !*/
+
+ void set_dnn_prefer_smallest_algorithms(
+ );
+ /*!
+ ensures
+ - #dnn_prefer_fastest_algorithms() == false
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename T
+ >
+ class sstack
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is a basic stack of T objects. It contains no data itself but simply
+            points to a memory range of T objects and allows you to access that block of
+ T objects as a stack.
+ !*/
+
+ public:
+ typedef T value_type;
+
+ sstack() = delete;
+
+ sstack (
+ T* data,
+ size_t s
+ );
+ /*!
+ ensures
+ - #size() == s
+ - #top() == *data
+ - #pop(i).top() == data[i]
+ !*/
+
+ const T& top(
+ ) const;
+ /*!
+ requires
+ - size() != 0
+ ensures
+ - returns the top element of the stack.
+ !*/
+
+ T& top(
+ );
+ /*!
+ requires
+ - size() != 0
+ ensures
+ - returns the top element of the stack.
+ !*/
+
+ size_t size(
+ ) const;
+ /*!
+ ensures
+ - returns the number of elements in this stack.
+ !*/
+
+ sstack pop(
+ size_t num = 1
+ );
+ /*!
+ requires
+ - num <= size()
+ ensures
+                - returns a sub-stack S such that:
+ - S.size() == size()-num.
+ - S.top() is num elements down the stack.
+ !*/
+ };
+
+ template <
+ typename T
+ >
+ sstack<T> make_sstack(
+ std::vector<T>& item
+ ) { return sstack<T>(item.data(), item.size()); }
+ /*!
+ ensures
+ - returns a sstack that sits on top of the given std::vector.
+ !*/
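+
+    // For instance (a small illustration of sstack via make_sstack):
+    //
+    //     std::vector<int> v = {7, 8, 9};
+    //     auto s = make_sstack(v);
+    //     s.size();        // == 3
+    //     s.top();         // == 7
+    //     s.pop().top();   // == 8, i.e. the element one slot down the stack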
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename LAYER_DETAILS,
+ typename SUBNET
+ >
+ class add_layer
+ {
+ /*!
+ REQUIREMENTS ON LAYER_DETAILS
+ - Must be a type that implements the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined in layers_abstract.h
+
+ REQUIREMENTS ON SUBNET
+ - One of the following must be true:
+ - SUBNET implements the EXAMPLE_INPUT_LAYER interface defined in
+ input_abstract.h.
+ - SUBNET is an add_layer object.
+ - SUBNET is an add_tag_layer object.
+ - SUBNET is an add_skip_layer object.
+ - SUBNET is a repeat object.
+
+ WHAT THIS OBJECT REPRESENTS
+ This object represents a deep neural network. In particular, it is a tool
+ for adding another layer on top of the neural network of type SUBNET, which
+ is specified as a template argument. The specific layer added is defined
+ by the LAYER_DETAILS details template argument.
+ !*/
+
+ public:
+ typedef LAYER_DETAILS layer_details_type;
+ typedef SUBNET subnet_type;
+ typedef typename subnet_type::input_type input_type;
+ // num_computational_layers will always give the number of layers in the network
+ // that transform tensors (i.e. layers defined by something that implements the
+ // EXAMPLE_COMPUTATIONAL_LAYER_ interface). This is all the layers except for
+ // loss, tag, and skip layers.
+ const static size_t num_computational_layers = subnet_type::num_computational_layers + 1;
+ // num_layers counts all the layers in the network regardless of their type.
+ const static size_t num_layers = subnet_type::num_layers + 1;
+
+ add_layer(
+ );
+ /*!
+ ensures
+ - default constructs all the layers in this network.
+ - #sample_expansion_factor() == 0
+ !*/
+
+ add_layer(const add_layer&) = default;
+ add_layer(add_layer&&) = default;
+ add_layer& operator=(add_layer&&) = default;
+ add_layer& operator=(const add_layer&) = default;
+ /*!
+ ensures
+ - this object is copyable and movable.
+ !*/
+
+ template <typename T, typename U>
+ add_layer(
+ const add_layer<T,U>& item
+ );
+ /*!
+ ensures
+ - This constructor allows you to copy neural network objects from one to
+ another as long as their corresponding layers can be constructed from
+ each other.
+ - #layer_details() == layer_details_type(item.layer_details())
+ - #subnet() == subnet_type(item.subnet())
+ - #sample_expansion_factor() == item.sample_expansion_factor()
+ !*/
+
+ template <typename ...T, typename LD, typename ...U>
+ add_layer(
+ const std::tuple<LD,U...>& layer_det,
+ T&& ...args
+ );
+ /*!
+ ensures
+ - #layer_details() == layer_details_type(tuple_head(layer_det))
+ - #subnet() == subnet_type(tuple_tail(layer_det),args)
+ - #sample_expansion_factor() == 0
+ !*/
+
+ template <typename ...T>
+ add_layer(
+ const layer_details_type& layer_det,
+ T&& ...args
+ );
+ /*!
+ ensures
+ - #layer_details() == layer_details_type(layer_det)
+ - #subnet() == subnet_type(args)
+ - #sample_expansion_factor() == 0
+ !*/
+
+ template <typename ...T>
+ add_layer(
+ T&& ...args
+ );
+ /*!
+ ensures
+ - This version of the constructor is only called if layer_details_type
+ can't be constructed from the first thing in args. In this case, the
+ args are simply passed on to the sub layers in their entirety.
+ - #layer_details() == layer_details_type()
+ - #subnet() == subnet_type(args)
+ - #sample_expansion_factor() == 0
+ !*/
+
+ template <typename ...T>
+ add_layer(
+ layer_details_type&& layer_det,
+ T&& ...args
+ );
+ /*!
+ ensures
+ - #layer_details() == layer_det
+ - #subnet() == subnet_type(args)
+ - #sample_expansion_factor() == 0
+ !*/
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const;
+ /*!
+ requires
+ - [ibegin, iend) is an iterator range over input_type objects.
+ - std::distance(ibegin,iend) > 0
+ ensures
+ - Converts the iterator range into a tensor and stores it into #data.
+ - #data.num_samples()%distance(ibegin,iend) == 0.
+ - #sample_expansion_factor() == #data.num_samples()/distance(ibegin,iend).
+ - #sample_expansion_factor() > 0
+ - The data in the ith sample of #data corresponds to the input_type object
+ *(ibegin+i/#sample_expansion_factor()).
+ - Invokes data.async_copy_to_device() so that the data begins transferring
+ to the GPU device, if present.
+ - This function is implemented by calling the to_tensor() routine defined
+ at the input layer of this network.
+ !*/
+
+ unsigned int sample_expansion_factor (
+ ) const;
+ /*!
+ ensures
+ - When to_tensor() is invoked on this network's input layer it converts N
+ input objects into M samples, all stored inside a resizable_tensor. It
+ is always the case that M is some integer multiple of N.
+ sample_expansion_factor() returns the value of this multiplier. To be
+ very specific, it is always true that M==I*N where I is some integer.
+ This integer I is what is returned by sample_expansion_factor().
+ !*/
+
+ const subnet_type& subnet(
+ ) const;
+ /*!
+ ensures
+ - returns the immediate subnetwork of *this network.
+ !*/
+
+ subnet_type& subnet(
+ );
+ /*!
+ ensures
+ - returns the immediate subnetwork of *this network.
+ !*/
+
+ const layer_details_type& layer_details(
+ ) const;
+ /*!
+ ensures
+ - returns the layer_details_type instance that defines the behavior of the
+ layer at the top of this network. I.e. returns the layer details that
+ defines the behavior of the layer nearest to the network output rather
+ than the input layer.
+ !*/
+
+ layer_details_type& layer_details(
+ );
+ /*!
+ ensures
+ - returns the layer_details_type instance that defines the behavior of the
+ layer at the top of this network. I.e. returns the layer details that
+ defines the behavior of the layer nearest to the network output rather
+ than the input layer.
+ !*/
+
+ template <typename forward_iterator>
+ const tensor& operator() (
+ forward_iterator ibegin,
+ forward_iterator iend
+ );
+ /*!
+ requires
+ - [ibegin, iend) is an iterator range over input_type objects.
+ - std::distance(ibegin,iend) > 0
+ ensures
+ - runs [ibegin,iend) through the network and returns the results.
+ In particular, this function performs:
+ to_tensor(ibegin,iend,temp_tensor);
+ return forward(temp_tensor);
+ - The return value from this function is also available in #get_output().
+ i.e. this function returns #get_output().
+ - have_same_dimensions(#get_gradient_input(), #get_output()) == true.
+ - All elements of #get_gradient_input() are set to 0.
+ i.e. calling this function clears out #get_gradient_input() and ensures
+ it has the same dimensions as the most recent output.
+ !*/
+
+ const tensor& operator() (
+ const input_type& x
+ );
+ /*!
+ ensures
+ - runs a single x through the network and returns the output.
+ I.e. returns (*this)(&x, &x+1);
+ !*/
+
+ const tensor& forward(
+ const tensor& x
+ );
+ /*!
+ requires
+ - sample_expansion_factor() != 0
+ (i.e. to_tensor() must have been called to set sample_expansion_factor()
+ to something non-zero.)
+ - x.num_samples()%sample_expansion_factor() == 0
+ - x.num_samples() > 0
+ ensures
+ - Runs x through the network and returns the results. In particular, this
+ function performs the equivalent of:
+ subnet().forward(x);
+ if (this is the first time forward() has been called) then
+ layer_details().setup(subnet());
+ layer_details().forward(subnet(), get_output());
+ - The return value from this function is also available in #get_output().
+ i.e. this function returns #get_output().
+ - have_same_dimensions(#get_gradient_input(), #get_output()) == true
+ - All elements of #get_gradient_input() are set to 0.
+ i.e. calling this function clears out #get_gradient_input() and ensures
+ it has the same dimensions as the most recent output.
+ !*/
+
+ const tensor& get_output(
+ ) const;
+ /*!
+ ensures
+ - returns the output for the last tensor that was run through the network.
+ If nothing has been run through the network yet then returns an empty
+ tensor.
+ !*/
+
+ tensor& get_gradient_input(
+ );
+ /*!
+ ensures
+ - returns the error gradient for this network. That is, this is the error
+ gradient that this network will use to compute parameter gradients when
+ back_propagate_error() is called. Therefore, when performing back
+ propagation, layers that sit on top of this network layer write their
+ back-propagated error gradients into get_gradient_input(). Or to put it
+ another way, during back-propagation, layers take the contents of their
+ get_gradient_input() and back-propagate it through themselves and store
+ the result into their subnetwork's get_gradient_input().
+
+ This means you should consider get_gradient_input() as an input to the
+ back_propagate_error() method.
+ !*/
+
+ const tensor& get_final_data_gradient(
+ ) const;
+ /*!
+ ensures
+ - if back_propagate_error() has been called to back-propagate a gradient
+ through this network then you can call get_final_data_gradient() to
+ obtain the last data gradient computed. That is, this function returns
+ the gradient of the network with respect to its inputs.
+ - Note that there is only one "final data gradient" for an entire network,
+ not one per layer, since there is only one input to the entire network.
+ !*/
+
+ const tensor& get_parameter_gradient(
+ ) const;
+ /*!
+ ensures
+ - if back_propagate_error() has been called then you can call
+ get_parameter_gradient() to find the gradient of this layer's parameters.
+ When we update the parameters by calling update_parameters(), it will use
+ the gradient in get_parameter_gradient() to perform the update.
+ Therefore, you should consider get_parameter_gradient() as an input to
+ update_parameters().
+ !*/
+
+ tensor& get_parameter_gradient (
+ );
+ /*!
+ ensures
+ - returns a non-const reference to the tensor returned by the above
+ get_parameter_gradient() method. You could use this method to modify the
+ parameter gradient in some way before invoking update_parameters().
+ !*/
+
+ void back_propagate_error(
+ const tensor& x
+ );
+ /*!
+ requires
+                - forward(x) was called to forward propagate x through the network.
+ Moreover, this was the most recent call to forward() and x has not been
+ subsequently modified in any way.
+ - get_gradient_input() has been set equal to the gradient of this network's
+ output with respect to some loss function.
+ ensures
+ - Back propagates the error gradient, get_gradient_input(), through this
+ network and computes parameter and data gradients, via backpropagation.
+ Specifically, this function populates get_final_data_gradient() and also,
+ for each layer, the tensor returned by get_parameter_gradient().
+ - All elements of #get_gradient_input() are set to 0.
+ - have_same_dimensions(#get_final_data_gradient(), x) == true.
+ - have_same_dimensions(#get_parameter_gradient(), layer_details().get_layer_params()) == true.
+ - #get_final_data_gradient() contains the gradient of the network with
+ respect to x.
+ !*/
+
+ void back_propagate_error(
+ const tensor& x,
+ const tensor& gradient_input
+ );
+ /*!
+ requires
+                - forward(x) was called to forward propagate x through the network.
+ Moreover, this was the most recent call to forward() and x has not been
+ subsequently modified in any way.
+ - have_same_dimensions(gradient_input, get_output()) == true
+ ensures
+ - This function is identical to the version of back_propagate_error()
+ defined immediately above except that it back-propagates gradient_input
+ through the network instead of get_gradient_input(). Therefore, this
+ version of back_propagate_error() is equivalent to performing:
+ get_gradient_input() = gradient_input;
+ back_propagate_error(x);
+ Except that calling back_propagate_error(x,gradient_input) avoids the
+ copy and is therefore slightly more efficient.
+ - All elements of #get_gradient_input() are set to 0.
+ - have_same_dimensions(#get_final_data_gradient(), x) == true.
+ - have_same_dimensions(#get_parameter_gradient(), layer_details().get_layer_params()) == true.
+ - #get_final_data_gradient() contains the gradient of the network with
+ respect to x.
+ !*/
+
+ template <typename solver_type>
+ void update_parameters(
+ sstack<solver_type> solvers,
+ double learning_rate
+ );
+ /*!
+ requires
+ - solver_type is an implementation of the EXAMPLE_SOLVER interface defined
+ in solvers_abstract.h
+ - back_propagate_error() has been called.
+ - The given solvers have only ever been used with this network. That is,
+ if you want to call update_parameters() on some other neural network
+ object then you must NOT reuse the same solvers object.
+ - solvers.size() >= num_computational_layers
+ - 0 < learning_rate <= 1
+ ensures
+ - Updates all the parameters in the network. In particular, we pass each
+ layer's parameter gradient (i.e. the tensor returned by the layer's
+ get_parameter_gradient() member) through that layer's corresponding
+ solver object. This produces a parameter delta vector which we add to
+ the layer's parameters.
+ - The solvers use the given learning rate.
+ !*/
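+
+        // A minimal sketch of how the members documented above fit together when driving
+        // a network by hand (net, samples, solvers, and learning_rate are hypothetical
+        // names; normally a loss layer and dnn_trainer perform these steps for you):
+        //
+        //     resizable_tensor x;
+        //     net.to_tensor(samples.begin(), samples.end(), x);
+        //     net.forward(x);
+        //     // ... write the gradient of your loss into net.get_gradient_input() ...
+        //     net.back_propagate_error(x);
+        //     net.update_parameters(make_sstack(solvers), learning_rate);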
+
+ void clean(
+ );
+ /*!
+ ensures
+ - Causes the network to forget about everything but its parameters.
+ That is, for each layer we will have:
+ - get_output().num_samples() == 0
+ - get_gradient_input().num_samples() == 0
+ However, running new input data though this network will still produce
+ the same output it would have produced regardless of any calls to
+ clean(). The purpose of clean() is to compact the network object prior
+ to saving it to disk so that it takes up less space and the IO is
+ quicker.
+ - This also calls the .clean() method on any layer details objects that
+ define a .clean() method.
+ !*/
+
+ };
+
+ template <typename T, typename U>
+ std::ostream& operator<<(std::ostream& out, const add_layer<T,U>& item);
+ /*!
+ prints the network architecture to the given output stream.
+ !*/
+
+ template <typename T, typename U>
+ void serialize(const add_layer<T,U>& item, std::ostream& out);
+ template <typename T, typename U>
+ void deserialize(add_layer<T,U>& item, std::istream& in);
+ /*!
+ provides serialization support
+ !*/
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ class no_label_type;
+
+ template <
+ typename LOSS_DETAILS,
+ typename SUBNET
+ >
+ class add_loss_layer
+ {
+ /*!
+ REQUIREMENTS ON LOSS_DETAILS
+ - Must be a type that implements the EXAMPLE_LOSS_LAYER_ interface defined
+ in loss_abstract.h
+
+ REQUIREMENTS ON SUBNET
+ - One of the following must be true:
+ - SUBNET is an add_layer object.
+ - SUBNET is an add_tag_layer object.
+ - SUBNET is an add_skip_layer object.
+ - SUBNET is a repeat object.
+
+ WHAT THIS OBJECT REPRESENTS
+ This object represents a deep neural network. In particular, it is a tool
+ for adding a loss layer on top of the neural network of type SUBNET, which
+ is specified as a template argument. The specific layer added is defined
+            by the LOSS_DETAILS template argument. Importantly, a loss layer
+ is the last layer in a deep neural network. So once it is added you can't
+ add any other layers of any type.
+ !*/
+
+ public:
+ typedef LOSS_DETAILS loss_details_type;
+ typedef SUBNET subnet_type;
+ typedef typename subnet_type::input_type input_type;
+ const static size_t num_computational_layers = subnet_type::num_computational_layers;
+ const static size_t num_layers = subnet_type::num_layers + 1;
+ // If LOSS_DETAILS is an unsupervised loss then training_label_type==no_label_type.
+ // Otherwise it is defined as follows:
+ typedef typename LOSS_DETAILS::training_label_type training_label_type;
+ // Similarly, if LOSS_DETAILS doesn't provide any output conversion then
+ // output_label_type==no_label_type.
+ typedef typename LOSS_DETAILS::output_label_type output_label_type;
+
+
+
+ add_loss_layer() = default;
+ /*!
+ ensures
+ - default constructs all the layers in this network.
+ !*/
+
+ add_loss_layer(const add_loss_layer&) = default;
+ add_loss_layer(add_loss_layer&&) = default;
+ add_loss_layer& operator=(add_loss_layer&&) = default;
+ add_loss_layer& operator=(const add_loss_layer&) = default;
+ /*!
+ ensures
+ - this object is copyable and movable.
+ !*/
+
+ template <typename T, typename U>
+ add_loss_layer(
+ const add_loss_layer<T,U>& item
+ );
+ /*!
+ ensures
+ - This constructor allows you to copy neural network objects from one to
+ another as long as their corresponding layers can be constructed from
+ each other.
+ - #loss_details() == loss_details_type(item.loss_details())
+ - #subnet() == subnet_type(item.subnet())
+ !*/
+
+ template <typename ...T>
+ add_loss_layer(
+ const LOSS_DETAILS& layer_det,
+ T&& ...args
+ );
+ /*!
+ ensures
+ - #loss_details() == loss_details_type(layer_det)
+ - #subnet() == subnet_type(args)
+ !*/
+
+ template <typename ...T>
+ add_loss_layer(
+ LOSS_DETAILS&& layer_det,
+ T&& ...args
+ );
+ /*!
+ ensures
+ - #loss_details() == loss_details_type(layer_det)
+ - #subnet() == subnet_type(args)
+ !*/
+
+ template <typename ...T>
+ add_loss_layer(
+ T&& ...args
+ );
+ /*!
+ ensures
+ - This version of the constructor is only called if loss_details_type can't
+ be constructed from the first thing in args. In this case, the args are
+ simply passed on to the sub layers in their entirety.
+ - #loss_details() == loss_details_type()
+ - #subnet() == subnet_type(args)
+ !*/
+
+ const subnet_type& subnet(
+ ) const;
+ /*!
+ ensures
+ - returns the immediate subnetwork of *this network.
+ !*/
+
+ subnet_type& subnet(
+ );
+ /*!
+ ensures
+ - returns the immediate subnetwork of *this network.
+ !*/
+
+ const loss_details_type& loss_details(
+ ) const;
+ /*!
+ ensures
+ - returns the loss_details_type instance that defines the behavior of the
+ loss layer used by this network.
+ !*/
+
+ loss_details_type& loss_details(
+ );
+ /*!
+ ensures
+ - returns the loss_details_type instance that defines the behavior of the
+ loss layer used by this network.
+ !*/
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const;
+ /*!
+ requires
+ - [ibegin, iend) is an iterator range over input_type objects.
+ - std::distance(ibegin,iend) > 0
+ ensures
+ - Converts the iterator range into a tensor and stores it into #data.
+ - #data.num_samples()%distance(ibegin,iend) == 0.
+ - #sample_expansion_factor() == #data.num_samples()/distance(ibegin,iend).
+ - #sample_expansion_factor() > 0
+ - The data in the ith sample of #data corresponds to the input_type object
+ *(ibegin+i/sample_expansion_factor()).
+ - Invokes data.async_copy_to_device() so that the data begins transferring
+ to the GPU device, if present.
+ - This function is implemented by calling the to_tensor() routine defined
+ at the input layer of this network.
+ !*/
+
+ unsigned int sample_expansion_factor (
+ ) const;
+ /*!
+ ensures
+ - When to_tensor() is invoked on this network's input layer it converts N
+ input objects into M samples, all stored inside a resizable_tensor. It
+ is always the case that M is some integer multiple of N.
+ sample_expansion_factor() returns the value of this multiplier. To be
+ very specific, it is always true that M==I*N where I is some integer.
+ This integer I is what is returned by sample_expansion_factor().
+ !*/
+
+ // -------------
+
+ template <typename output_iterator>
+ void operator() (
+ const tensor& x,
+ output_iterator obegin
+ );
+ /*!
+ requires
+ - sample_expansion_factor() != 0
+ (i.e. to_tensor() must have been called to set sample_expansion_factor()
+ to something non-zero.)
+ - x.num_samples()%sample_expansion_factor() == 0
+ - x.num_samples() > 0
+ - obegin == iterator pointing to the start of a range of
+ x.num_samples()/sample_expansion_factor() output_label_type elements.
+ ensures
+ - runs x through the network and writes the output to the range at obegin.
+ - loss_details().to_label() is used to write the network output into
+ obegin.
+ !*/
+
+ template <typename forward_iterator, typename label_iterator>
+ void operator() (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ label_iterator obegin
+ );
+ /*!
+ requires
+ - [ibegin, iend) is an iterator range over input_type objects.
+ - std::distance(ibegin,iend) > 0
+ - obegin == iterator pointing to the start of a range of
+ std::distance(ibegin,iend) output_label_type elements.
+ ensures
+ - runs [ibegin,iend) through the network and writes the output to the range
+ at obegin.
+ - loss_details().to_label() is used to write the network output into
+ obegin.
+ !*/
+
+ // -------------
+
+ const output_label_type& operator() (
+ const input_type& x
+ );
+ /*!
+ ensures
+ - runs a single object, x, through the network and returns the output.
+                - loss_details().to_label() is used to convert the network output into an
+ output_label_type.
+ !*/
+
+ template <typename iterable_type>
+ std::vector<output_label_type> operator() (
+ const iterable_type& data,
+ size_t batch_size = 128
+ );
+ /*!
+ requires
+ - batch_size > 0
+ - data must have a .begin() and .end() that supply iterators over a
+ sequence of input_type elements. E.g. data could have a type of
+ std::vector<input_type>
+ ensures
+ - runs all the objects in data through the network and returns their
+ predicted labels. This means this function returns a vector V such that:
+ - V.size() == data.size()
+ - for all valid i: V[i] == the predicted label of data[i].
+ - Elements of data are run through the network in batches of batch_size
+ items. Using a batch_size > 1 can be faster because it better exploits
+ the available hardware parallelism.
+                - loss_details().to_label() is used to convert the network output into an
+ output_label_type.
+ !*/
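+
+        // Illustrative sketch (net and images are hypothetical names): this is the
+        // overload most callers use to run inference on a whole dataset at once.
+        //
+        //     std::vector<output_label_type> predictions = net(images, 64);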
+
+ template <typename ...T>
+ const output_label_type& process (
+ const input_type& x,
+ T&& ...args
+ );
+ /*!
+ ensures
+ - This function is just like (*this)(x), i.e. it runs a single object, x,
+ through the network and returns the output. But we additionally pass the
+ given args to loss_details().to_label() as the 4th argument (or more,
+ depending on how many things are in args) when converting the network
+ output to an output_label_type. This is useful, for instance, with loss
+ layers like loss_mmod_ which has an optional adjust_threshold argument to
+ to_label() that adjusts the detection threshold. Therefore, for such
+ networks you could call them like: net.process(some_image, -0.5), and -0.5
+                  would be passed as the adjust_threshold argument of to_label().
+ !*/
+
+ template <typename iterable_type, typename ...T>
+ std::vector<output_label_type> process_batch (
+ const iterable_type& data,
+ size_t batch_size,
+ T&& ...args
+ );
+ /*!
+ requires
+ - batch_size > 0
+ - data must have a .begin() and .end() that supply iterators over a
+ sequence of input_type elements. E.g. data could have a type of
+ std::vector<input_type>
+ ensures
+ - This function is just like (*this)(data,batch_size), i.e. it runs a
+ bunch of objects through the network and returns the outputs. But we
+ additionally pass the given args to loss_details().to_label() as the 4th
+ argument (or more, depending on how many things are in args) when
+ converting the network output to output_label_types. This is useful,
+ for instance, with loss layers like loss_mmod_ which has an optional
+ adjust_threshold argument to to_label() that adjusts the detection
+ threshold. Therefore, for such networks you could call them like:
+ net.process_batch(std::vector<image_type>({some_image, another_image}), 128, -0.5),
+                and -0.5 would be passed as the adjust_threshold argument of to_label().
+ !*/
+
+ // -------------
+
+ template <typename label_iterator>
+ double compute_loss (
+ const tensor& x,
+ label_iterator lbegin
+ );
+ /*!
+ requires
+ - sample_expansion_factor() != 0
+ (i.e. to_tensor() must have been called to set sample_expansion_factor()
+ to something non-zero.)
+ - x.num_samples()%sample_expansion_factor() == 0
+ - x.num_samples() > 0
+ - lbegin == iterator pointing to the start of a range of
+ x.num_samples()/sample_expansion_factor() training_label_type elements.
+ ensures
+ - runs x through the network, compares the output to the expected output
+ pointed to by lbegin, and returns the resulting loss.
+ - for all valid k:
+ - the expected label of the kth sample in x is *(lbegin+k/sample_expansion_factor()).
+ - This function does not update the network parameters.
+ !*/
+
+ template <typename forward_iterator, typename label_iterator>
+ double compute_loss (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ label_iterator lbegin
+ );
+ /*!
+ requires
+ - [ibegin, iend) is an iterator range over input_type objects.
+ - std::distance(ibegin,iend) > 0
+ - lbegin == iterator pointing to the start of a range of
+ std::distance(ibegin,iend) training_label_type elements.
+ ensures
+ - runs [ibegin,iend) through the network, compares the output to the
+ expected output pointed to by lbegin, and returns the resulting loss.
+ - for all valid k:
+ - the expected label of *(ibegin+k) is *(lbegin+k).
+ - This function does not update the network parameters.
+ !*/
+
+ // -------------
+
+ double compute_loss (
+ const tensor& x
+ );
+ /*!
+ requires
+ - LOSS_DETAILS is an unsupervised loss. i.e. training_label_type==no_label_type.
+ - sample_expansion_factor() != 0
+ (i.e. to_tensor() must have been called to set sample_expansion_factor()
+ to something non-zero.)
+ - x.num_samples()%sample_expansion_factor() == 0
+ - x.num_samples() > 0
+ ensures
+ - runs x through the network and returns the resulting loss.
+ - This function does not update the network parameters.
+ !*/
+
+ template <typename forward_iterator>
+ double compute_loss (
+ forward_iterator ibegin,
+            forward_iterator iend
+ );
+ /*!
+ requires
+ - LOSS_DETAILS is an unsupervised loss. i.e. training_label_type==no_label_type.
+ - [ibegin, iend) is an iterator range over input_type objects.
+ - std::distance(ibegin,iend) > 0
+ ensures
+ - runs [ibegin,iend) through the network and returns the resulting loss.
+ - This function does not update the network parameters.
+ !*/
+
+ // -------------
+
+ template <typename label_iterator>
+ double compute_parameter_gradients (
+ const tensor& x,
+ label_iterator lbegin
+ );
+ /*!
+ requires
+ - sample_expansion_factor() != 0
+ (i.e. to_tensor() must have been called to set sample_expansion_factor()
+ to something non-zero.)
+ - x.num_samples()%sample_expansion_factor() == 0
+ - x.num_samples() > 0
+ - lbegin == iterator pointing to the start of a range of
+ x.num_samples()/sample_expansion_factor() training_label_type elements.
+ ensures
+ - runs x through the network, compares the output to the expected output
+ pointed to by lbegin, and computes parameter and data gradients with
+ respect to the loss, via backpropagation. Specifically, this function
+ updates get_final_data_gradient() and also, for each layer, the tensor
+ returned by get_parameter_gradient().
+ - for all valid k:
+ - the expected label of the kth sample in x is *(lbegin+k/sample_expansion_factor()).
+ - returns compute_loss(x,lbegin)
+ !*/
+
+ template <typename forward_iterator, typename label_iterator>
+ double compute_parameter_gradients (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ label_iterator lbegin
+ );
+ /*!
+ requires
+ - [ibegin, iend) is an iterator range over input_type objects.
+ - std::distance(ibegin,iend) > 0
+ - lbegin == iterator pointing to the start of a range of
+ std::distance(ibegin,iend) training_label_type elements.
+ ensures
+ - runs [ibegin,iend) through the network, compares the output to the
+ expected output pointed to by lbegin, and computes parameter and data
+ gradients with respect to the loss, via backpropagation. Specifically,
+ this function updates get_final_data_gradient() and also, for each layer,
+ the tensor returned by get_parameter_gradient().
+ - for all valid k:
+ - the expected label of *(ibegin+k) is *(lbegin+k).
+ - returns compute_loss(ibegin,iend,lbegin)
+ !*/
+
+ double compute_parameter_gradients (
+ const tensor& x
+ );
+ /*!
+ requires
+ - LOSS_DETAILS is an unsupervised loss. i.e. training_label_type==no_label_type.
+ - sample_expansion_factor() != 0
+ (i.e. to_tensor() must have been called to set sample_expansion_factor()
+ to something non-zero.)
+ - x.num_samples()%sample_expansion_factor() == 0
+ - x.num_samples() > 0
+ ensures
+ - runs x through the network and computes parameter and data gradients with
+ respect to the loss, via backpropagation. Specifically, this function
+ updates get_final_data_gradient() and also, for each layer, the tensor
+ returned by get_parameter_gradient().
+ - returns compute_loss(x)
+ !*/
+
+ template <typename forward_iterator>
+ double compute_parameter_gradients (
+ forward_iterator ibegin,
+ forward_iterator iend
+ );
+ /*!
+ requires
+ - LOSS_DETAILS is an unsupervised loss. i.e. training_label_type==no_label_type.
+ - [ibegin, iend) is an iterator range over input_type objects.
+ - std::distance(ibegin,iend) > 0
+ ensures
+ - runs [ibegin,iend) through the network and computes parameter and data
+ gradients with respect to the loss, via backpropagation. Specifically,
+ this function updates get_final_data_gradient() and also, for each layer,
+ the tensor returned by get_parameter_gradient().
+ - returns compute_loss(ibegin,iend)
+ !*/
+
+ template <typename solver_type>
+ void update_parameters (
+ sstack<solver_type> solvers,
+ double learning_rate
+ );
+ /*!
+ requires
+ - solver_type is an implementation of the EXAMPLE_SOLVER interface defined
+ in solvers_abstract.h
+ - compute_parameter_gradients() has been called.
+ - The given solvers have only ever been used with this network. That
+ is, if you want to call update_parameters() on some other neural network
+ object then you must NOT reuse the same solvers object.
+ - solvers.size() >= num_computational_layers
+ - 0 < learning_rate <= 1
+ ensures
+ - Updates all the parameters in the network. In particular, we pass each
+ layer's parameter gradient (i.e. the tensor returned by the layer's
+ get_parameter_gradient() member) through that layer's corresponding
+ solver object. This produces a parameter delta vector which we add to
+ the layer's parameters.
+ - The solvers use the given learning rate.
+ !*/
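+
+        // A minimal sketch of one hand-rolled training step using the members documented
+        // above (net, net_type, samples, and labels are hypothetical names; sgd is the
+        // solver declared in solvers_abstract.h, and dnn_trainer normally automates this):
+        //
+        //     std::vector<sgd> solvers(net_type::num_computational_layers);
+        //     double loss = net.compute_parameter_gradients(samples.begin(), samples.end(),
+        //                                                   labels.begin());
+        //     net.update_parameters(make_sstack(solvers), 0.01);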
+
+ // -------------
+
+ void clean (
+ );
+ /*!
+ ensures
+ - Causes the network to forget about everything but its parameters.
+ - invokes subnet().clean()
+ !*/
+ };
+
+ template <typename T, typename U>
+ std::ostream& operator<<(std::ostream& out, const add_loss_layer<T,U>& item);
+ /*!
+ prints the network architecture to the given output stream.
+ !*/
+
+ template <typename T, typename U>
+ void serialize(const add_loss_layer<T,U>& item, std::ostream& out);
+ template <typename T, typename U>
+ void deserialize(add_loss_layer<T,U>& item, std::istream& in);
+ /*!
+ provides serialization support
+ !*/
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ template <typename ...T>
+ decorator_repeat_group<T...> repeat_group (
+ T&& ...args
+ );
+ /*!
+ ensures
+ - Decorates a group of variables. This is essentially like std::make_tuple()
+              except its only purpose is to group variables together so they can be passed
+ to the repeat object's constructor.
+ !*/
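+
+    // Illustrative sketch of repeat_group() together with the repeat object documented
+    // below (my_block and the layer sizes are made up for this example; relu, fc, and
+    // input come from layers_abstract.h and input_abstract.h):
+    //
+    //     template <typename SUBNET> using my_block = relu<fc<42, SUBNET>>;
+    //     using net_type = repeat<5, my_block, tag1<input<matrix<float>>>>;
+    //
+    // which is equivalent to stacking my_block five times on top of the tagged input.
+    // Passing repeat_group(...) as the first constructor argument of a repeat object
+    // initializes every one of those five copies identically.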
+
+ template <
+ size_t num,
+ template<typename> class REPEATED_LAYER,
+ typename SUBNET
+ >
+ class repeat
+ {
+ /*!
+ REQUIREMENTS ON num
+ - num > 0
+
+ REQUIREMENTS ON REPEATED_LAYER
+ - REPEATED_LAYER must be a template that stacks more layers onto a deep neural
+ network. For example, if net_type were a network without a loss layer,
+ then it should be legal to create a deeper network with a type of
+ REPEATED_LAYER<net_type>.
+
+ REQUIREMENTS ON SUBNET
+ - One of the following must be true:
+ - SUBNET is an add_layer object.
+ - SUBNET is an add_tag_layer object.
+ - SUBNET is an add_skip_layer object.
+ - SUBNET is a repeat object.
+
+ WHAT THIS OBJECT REPRESENTS
+ This object adds more layers to a deep neural network. In particular, it
+ adds REPEATED_LAYER on top of SUBNET num times. So for example, if num were 2 then
+ repeat<2,REPEATED_LAYER,SUBNET> would create a network equivalent to REPEATED_LAYER<REPEATED_LAYER<SUBNET>>.
+
+ Also, this object provides an interface identical to the one defined by the
+ add_layer object except that we add the num_repetitions() and
+ get_repeated_layer() methods. These additions are shown below along with
+ some additional explanatory comments.
+ !*/
+
+ public:
+
+ typedef SUBNET subnet_type;
+ typedef typename SUBNET::input_type input_type;
+ const static size_t num_computational_layers = (REPEATED_LAYER<SUBNET>::num_computational_layers-SUBNET::num_computational_layers)*num + SUBNET::num_computational_layers;
+ const static size_t num_layers = (REPEATED_LAYER<SUBNET>::num_layers-SUBNET::num_layers)*num + SUBNET::num_layers;
+ typedef REPEATED_LAYER<an_unspecified_input_type> repeated_layer_type;
+
+ template <typename T, typename ...U>
+ repeat(
+ T arg1,
+ U ...args2
+ );
+ /*!
+ ensures
+ - arg1 is used to initialize the num_repetitions() copies of REPEATED_LAYER inside
+ this object. That is, all the REPEATED_LAYER elements are initialized identically
+ by being given copies of arg1.
+ - The rest of the arguments to the constructor, i.e. args2, are passed to
+ SUBNET's constructor.
+ !*/
+
+ template <typename ...T, typename ...U>
+ repeat(
+ decorator_repeat_group<T...>&& arg1,
+ U ...args2
+ );
+ /*!
+ ensures
+ - arg1 is used to initialize the num_repetitions() copies of REPEATED_LAYER inside
+ this object. That is, all the REPEATED_LAYER elements are initialized identically
+ by being given copies of an undecorated arg1.
+ - The rest of the arguments to the constructor, i.e. args2, are passed to
+ SUBNET's constructor.
+ !*/
+
+ size_t num_repetitions (
+ ) const;
+ /*!
+ ensures
+ - returns num (i.e. the number of times REPEATED_LAYER was stacked on top of SUBNET)
+ !*/
+
+ const repeated_layer_type& get_repeated_layer (
+ size_t i
+ ) const;
+ /*!
+ requires
+ - i < num_repetitions()
+ ensures
+ - returns a reference to the i-th instance of REPEATED_LAYER. For example,
+ get_repeated_layer(0) returns the instance of REPEATED_LAYER that is on the top of
+ the network while get_repeated_layer(num_repetitions()-1) returns the
+ instance of REPEATED_LAYER that is stacked immediately on top of SUBNET.
+ !*/
+
+ repeated_layer_type& get_repeated_layer (
+ size_t i
+ );
+ /*!
+ requires
+ - i < num_repetitions()
+ ensures
+ - returns a reference to the i-th instance of REPEATED_LAYER. For example,
+ get_repeated_layer(0) returns the instance of REPEATED_LAYER that is on the top of
+ the network while get_repeated_layer(num_repetitions()-1) returns the
+ instance of REPEATED_LAYER that is stacked immediately on top of SUBNET.
+ !*/
+
+ const subnet_type& subnet(
+ ) const;
+ /*!
+ ensures
+ - returns the SUBNET base network that repeat sits on top of. If you want
+ to access the REPEATED_LAYER components then you must use get_repeated_layer().
+ !*/
+
+ subnet_type& subnet(
+ );
+ /*!
+ ensures
+ - returns the SUBNET base network that repeat sits on top of. If you want
+ to access the REPEATED_LAYER components then you must use get_repeated_layer().
+ !*/
+ };
+
+ template < size_t num, template<typename> class T, typename U >
+ std::ostream& operator<<(std::ostream& out, const repeat<num,T,U>& item);
+ /*!
+ prints the network architecture to the given output stream.
+ !*/
+
+ template < size_t num, template<typename> class T, typename U >
+ void serialize(const repeat<num,T,U>& item, std::ostream& out);
+ template < size_t num, template<typename> class T, typename U >
+ void deserialize(repeat<num,T,U>& item, std::istream& in);
+ /*!
+ provides serialization support
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ unsigned long ID,
+ typename SUBNET
+ >
+ class add_tag_layer
+ {
+ /*!
+ REQUIREMENTS ON SUBNET
+ - One of the following must be true:
+ - SUBNET implements the EXAMPLE_INPUT_LAYER interface defined in
+ input_abstract.h.
+ - SUBNET is an add_layer object.
+ - SUBNET is an add_tag_layer object.
+ - SUBNET is an add_skip_layer object.
+ - SUBNET is a repeat object.
+
+ WHAT THIS OBJECT REPRESENTS
+ This object adds a new layer to a deep neural network. However, this layer
+ simply performs the identity transform. This means it is a no-op and its
+ presence does not change the behavior of the network. It exists solely to
+ be used by add_skip_layer to reference a particular part of a network.
+
+ Also, this object provides an interface identical to the one defined by the
+ add_layer object.
+ !*/
+ };
+
+ template <unsigned long ID, typename U>
+ std::ostream& operator<<(std::ostream& out, const add_tag_layer<ID,U>& item);
+ /*!
+ prints the network architecture to the given output stream.
+ !*/
+
+ template <unsigned long ID, typename U>
+ void serialize(const add_tag_layer<ID,U>& item, std::ostream& out);
+ template <unsigned long ID, typename U>
+ void deserialize(add_tag_layer<ID,U>& item, std::istream& in);
+ /*!
+ provides serialization support
+ !*/
+
+ template <typename SUBNET> using tag1 = add_tag_layer< 1, SUBNET>;
+ template <typename SUBNET> using tag2 = add_tag_layer< 2, SUBNET>;
+ template <typename SUBNET> using tag3 = add_tag_layer< 3, SUBNET>;
+ template <typename SUBNET> using tag4 = add_tag_layer< 4, SUBNET>;
+ template <typename SUBNET> using tag5 = add_tag_layer< 5, SUBNET>;
+ template <typename SUBNET> using tag6 = add_tag_layer< 6, SUBNET>;
+ template <typename SUBNET> using tag7 = add_tag_layer< 7, SUBNET>;
+ template <typename SUBNET> using tag8 = add_tag_layer< 8, SUBNET>;
+ template <typename SUBNET> using tag9 = add_tag_layer< 9, SUBNET>;
+ template <typename SUBNET> using tag10 = add_tag_layer<10, SUBNET>;
+
+ template <template<typename SUBNET> class tag>
+ struct tag_id
+ {
+ /*!
+ REQUIREMENTS ON tag
+ Tag should be an add_tag_layer template such as tag1, tag2, etc.
+
+ WHAT THIS OBJECT REPRESENTS
+ This is a tool for finding the numeric ID of a tag layer. For example,
+ tag_id<tag3>::id == 3.
+ !*/
+
+ const static unsigned long id;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ template<typename> class TAG_TYPE,
+ typename SUBNET
+ >
+ class add_skip_layer
+ {
+ /*!
+ REQUIREMENTS ON SUBNET
+ - One of the following must be true:
+ - SUBNET is an add_layer object.
+ - SUBNET is an add_tag_layer object.
+ - SUBNET is an add_skip_layer object.
+ - SUBNET is a repeat object.
+
+ WHAT THIS OBJECT REPRESENTS
+ This object adds a new layer to a deep neural network which draws its
+ inputs from layer<TAG_TYPE>(subnet()) and performs the identity transform.
+
+ Also, this object provides an interface identical to the one defined by the
+ add_layer object.
+ !*/
+ };
+
+ template <template<typename> class T, typename U>
+ std::ostream& operator<<(std::ostream& out, const add_skip_layer<T,U>& item);
+ /*!
+ prints the network architecture to the given output stream.
+ !*/
+
+ template <template<typename> class T, typename U>
+ void serialize(const add_skip_layer<T,U>& item, std::ostream& out);
+ template <template<typename> class T, typename U>
+ void deserialize(add_skip_layer<T,U>& item, std::istream& in);
+ /*!
+ provides serialization support
+ !*/
+
+ template <typename SUBNET> using skip1 = add_skip_layer< tag1, SUBNET>;
+ template <typename SUBNET> using skip2 = add_skip_layer< tag2, SUBNET>;
+ template <typename SUBNET> using skip3 = add_skip_layer< tag3, SUBNET>;
+ template <typename SUBNET> using skip4 = add_skip_layer< tag4, SUBNET>;
+ template <typename SUBNET> using skip5 = add_skip_layer< tag5, SUBNET>;
+ template <typename SUBNET> using skip6 = add_skip_layer< tag6, SUBNET>;
+ template <typename SUBNET> using skip7 = add_skip_layer< tag7, SUBNET>;
+ template <typename SUBNET> using skip8 = add_skip_layer< tag8, SUBNET>;
+ template <typename SUBNET> using skip9 = add_skip_layer< tag9, SUBNET>;
+ template <typename SUBNET> using skip10 = add_skip_layer<tag10, SUBNET>;
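+
+    // Illustrative sketch of how tag and skip layers are combined (the two-branch block
+    // below is made up; con, relu, and concat2 are declared in layers_abstract.h):
+    //
+    //     template <typename SUBNET> using branch_a = relu<con<8,3,3,1,1,SUBNET>>;
+    //     template <typename SUBNET> using branch_b = relu<con<8,5,5,1,1,SUBNET>>;
+    //     template <typename SUBNET> using two_branches =
+    //         concat2<tag2, tag3, tag3<branch_b<skip1<tag2<branch_a<tag1<SUBNET>>>>>>>;
+    //
+    // tag1 marks the block's input, skip1 starts a second branch whose input is that
+    // tagged tensor, and concat2 joins the two tagged branch outputs back together.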
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ unsigned int i,
+ typename net_type
+ >
+ auto& layer (
+ net_type& n
+ );
+ /*!
+ requires
+ - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+ add_tag_layer.
+ - i < net_type::num_layers
+ ensures
+ - This function allows you to access any layer in a network by its layer index
+ i. Therefore, it will walk i steps down the network and return the layer
+ object there. Since networks can be big, the best way to find layer index
+ numbers is to print a network to the screen since the print out will include
+ indexes for each layer.
+ - In general, this function chains together i calls to n.subnet() and returns
+ the result. So for example:
+ - if (i == 0)
+ - returns n
+ - else if (i == 1)
+ - returns n.subnet()
+ - else if (i == 2)
+ - returns n.subnet().subnet()
+ - else if (i == 3)
+ - returns n.subnet().subnet().subnet()
+ - else
+ - etc.
+ Except that when it hits a repeat layer it recurses into the repeated layers
+ contained inside. That is, if the layer index indicates a layer in a repeat
+ object this function will make the appropriate call to get_repeated_layer()
+ and do the right thing.
+ !*/
+
+ template <
+ template<typename> class Match,
+ typename net_type
+ >
+ auto& layer (
+ net_type& n
+ );
+ /*!
+ requires
+ - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+ add_tag_layer.
+ ensures
+ - returns the first layer in n that is of type Match. E.g. if net_type is
+ fc<relu<fc<input<sample_type>>>> then calling layer<relu>(n) would return
+ layer<1>(n), that is, a reference to the relu layer.
+ !*/
+
+ template <
+ template<typename> class Match,
+ unsigned int i,
+ typename net_type
+ >
+ auto& layer (
+ net_type& n
+ );
+ /*!
+ requires
+ - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+ add_tag_layer.
+ ensures
+ - returns layer<i>(layer<Match>(n))
+ !*/
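+
+    // Illustrative sketches of the three forms above (net is a hypothetical network):
+    //
+    //     layer<1>(net);        // the network one step below the top of net
+    //     layer<tag1>(net);     // the first tag1 layer found while walking down net
+    //     layer<relu,2>(net);   // two layers below the first relu, i.e. layer<2>(layer<relu>(net))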
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename net_type>
+ auto& input_layer (
+ net_type& net
+ );
+ /*!
+ requires
+ - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+ add_tag_layer.
+ ensures
+            - returns the input layer of the given network object. Specifically, this
+ function is equivalent to calling:
+ layer<net_type::num_layers-1>(net);
+ That is, you get the input layer details object for the network.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename net_type,
+ typename visitor
+ >
+ void visit_layer_parameters(
+ net_type& net,
+ visitor v
+ );
+ /*!
+ requires
+ - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+ add_tag_layer.
+ - v is a function object with a signature equivalent to:
+ v(size_t idx, tensor& t)
+ ensures
+ - Loops over all the computational layers (i.e. layers with parameters, as
+ opposed to loss, tag, or input layers) in net and passes their parameters to
+ v(). To be specific, this function essentially performs the following:
+
+ size_t computational_layer_idx = 0;
+ for (size_t i = 0; i < net_type::num_layers; ++i)
+ {
+ if (layer<i>(net) is a computational layer)
+ {
+ v(computational_layer_idx, layer<i>(net).layer_details().get_layer_params());
+ ++computational_layer_idx;
+ }
+ }
+ - When v() is called, the first argument is always < net_type::num_computational_layers.
+ !*/
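+
+    // Illustrative sketch: counting the total number of parameters in a network with a
+    // lambda visitor (net is a hypothetical network object):
+    //
+    //     size_t count = 0;
+    //     visit_layer_parameters(net, [&count](size_t, tensor& t) { count += t.size(); });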
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename net_type,
+ typename visitor
+ >
+ void visit_layer_parameter_gradients(
+ net_type& net,
+ visitor v
+ );
+ /*!
+ requires
+ - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+ add_tag_layer.
+ - v is a function object with a signature equivalent to:
+ v(size_t idx, tensor& t)
+ ensures
+ - Loops over all the computational layers (i.e. layers with parameters, as
+ opposed to loss, tag, or input layers) in net and passes their parameter
+ gradients to v(). To be specific, this function essentially performs the
+ following:
+
+ size_t computational_layer_idx = 0;
+ for (size_t i = 0; i < net_type::num_layers; ++i)
+ {
+ if (layer<i>(net) is a computational layer)
+ {
+ v(computational_layer_idx, layer<i>(net).get_parameter_gradient());
+ ++computational_layer_idx;
+ }
+ }
+ - When v() is called, the first argument is always < net_type::num_computational_layers.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename net_type,
+ typename visitor
+ >
+ void visit_layers(
+ net_type& net,
+ visitor v
+ );
+ /*!
+ requires
+ - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+ add_tag_layer.
+ - v is a function object with a signature equivalent to:
+ v(size_t idx, any_net_type& t)
+ That is, it must take a size_t and then any of the network types such as
+ add_layer, add_loss_layer, etc.
+ ensures
+ - Loops over all the layers in net and calls v() on them. To be specific, this
+ function essentially performs the following:
+
+ for (size_t i = 0; i < net_type::num_layers; ++i)
+ v(i, layer<i>(net));
+ !*/
+
+ template <
+ typename net_type,
+ typename visitor
+ >
+ void visit_layers_backwards(
+ net_type& net,
+ visitor v
+ );
+ /*!
+ requires
+ - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+ add_tag_layer.
+ - v is a function object with a signature equivalent to:
+ v(size_t idx, any_net_type& t)
+ That is, it must take a size_t and then any of the network types such as
+ add_layer, add_loss_layer, etc.
+ ensures
+ - Loops over all the layers in net and calls v() on them. The loop happens in
+ the reverse order of visit_layers(). To be specific, this function
+ essentially performs the following:
+
+ for (size_t i = net_type::num_layers; i != 0; --i)
+ v(i-1, layer<i-1>(net));
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ size_t begin,
+ size_t end,
+ typename net_type,
+ typename visitor
+ >
+ void visit_layers_range(
+ net_type& net,
+ visitor v
+ );
+ /*!
+ requires
+ - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+ add_tag_layer.
+ - v is a function object with a signature equivalent to:
+ v(size_t idx, any_net_type& t)
+ That is, it must take a size_t and then any of the network types such as
+ add_layer, add_loss_layer, etc.
+ - begin <= end <= net_type::num_layers
+ ensures
+ - Loops over the layers in the range [begin,end) in net and calls v() on them.
+              The loop happens in the same order as visit_layers(). To be specific,
+ this function essentially performs the following:
+
+ for (size_t i = begin; i < end; ++i)
+ v(i, layer<i>(net));
+ !*/
+
+ template <
+ size_t begin,
+ size_t end,
+ typename net_type,
+ typename visitor
+ >
+ void visit_layers_backwards_range(
+ net_type& net,
+ visitor v
+ );
+ /*!
+ requires
+ - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+ add_tag_layer.
+ - v is a function object with a signature equivalent to:
+ v(size_t idx, any_net_type& t)
+ That is, it must take a size_t and then any of the network types such as
+ add_layer, add_loss_layer, etc.
+ - begin <= end <= net_type::num_layers
+ ensures
+ - Loops over the layers in the range [begin,end) in net and calls v() on them.
+ The loop happens in the reverse order of visit_layers_range(). To be specific,
+ this function essentially performs the following:
+
+ for (size_t i = end; i != begin; --i)
+ v(i-1, layer<i-1>(net));
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ unsigned long tag_id,
+ typename net_type,
+ typename visitor
+ >
+ void visit_layers_until_tag(
+ net_type& net,
+ visitor v
+ );
+ /*!
+ requires
+ - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+ add_tag_layer.
+ - v is a function object with a signature equivalent to:
+ v(any_net_type& t)
+ That is, it must take any of the network types such as add_layer,
+ add_loss_layer, etc.
+ ensures
+ - Loops over all the layers in net beginning with layer<0>(net) and going until
+ a tag layer with an ID of tag_id is encountered. To be specific, this
+ function essentially performs the following:
+
+ size_t i = 0;
+ while(layer<i>(net) isn't an add_tag_layer with ID == tag_id) {
+ v(layer<i>(net));
+ ++i;
+ }
+ v(layer<i>(net)); // also visits the tag layer itself at the very end.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ struct layer_test_results
+ {
+ std::string log;
+ bool was_good;
+
+ operator bool() const { return was_good; }
+ };
+
+ inline std::ostream& operator<< (std::ostream& out, const layer_test_results& item)
+ {
+ out << item.log;
+ return out;
+ }
+
+ template <
+ typename layer_details_type
+ >
+ layer_test_results test_layer (
+ layer_details_type l
+ );
+ /*!
+ ensures
+ - Checks if l correctly implements the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined in layers_abstract.h. Importantly, it computes numerical approximations
+ to the gradients and compares them to the outputs of the layer.
+ - The results of the testing are returned. In particular, if the returned object
+ is RESULT then we will have:
+ - RESULT.was_good == false if and only if the layer failed the testing.
+ - RESULT.log == a string describing why the testing failed if was_good==false.
+ - Note that this function is only capable of checking layers that take
+ arbitrary subnetworks as input. So if you have designed a layer that expects
+ only a certain restricted type of subnetwork then you might get a compile or
+ runtime error when you call this function.
+ !*/
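+
+    // Illustrative sketch of checking a custom layer (my_layer_ is a hypothetical type
+    // implementing the EXAMPLE_COMPUTATIONAL_LAYER_ interface):
+    //
+    //     layer_test_results res = test_layer(my_layer_());
+    //     if (!res)
+    //         std::cout << "layer test failed:\n" << res << std::endl;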
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_CORE_ABSTRACT_H_
+
diff --git a/ml/dlib/dlib/dnn/cpu_dlib.cpp b/ml/dlib/dlib/dnn/cpu_dlib.cpp
new file mode 100644
index 000000000..ed5661102
--- /dev/null
+++ b/ml/dlib/dlib/dnn/cpu_dlib.cpp
@@ -0,0 +1,2170 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CPU_cPP_
+#define DLIB_DNN_CPU_cPP_
+
+// This file contains CPU implementations of the GPU based functions in cuda_dlib.h
+
+#include "cpu_dlib.h"
+#include "tensor_tools.h"
+#include "../image_transforms/interpolation.h"
+#include "../threads.h"
+
+namespace dlib
+{
+ namespace cpu
+ {
+
+ // -----------------------------------------------------------------------------------
+
+ void multiply (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ )
+ {
+ DLIB_CASSERT(dest.k() == src1.k() && src1.k() == src2.k() &&
+ dest.nr() == src1.nr() && src1.nr() == src2.nr() &&
+ dest.nc() == src1.nc() && src1.nc() == src2.nc() );
+ const long MD = std::max(std::max(dest.num_samples(),src1.num_samples()),src2.num_samples());
+ DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) &&
+ (src1.num_samples()==1 || src1.num_samples()==MD) &&
+ (src2.num_samples()==1 || src2.num_samples()==MD) );
+
+ if (dest.size() == 0)
+ return;
+
+ const size_t max_size = std::max(std::max(dest.size(),src1.size()),src2.size());
+ const auto d = dest.host();
+ const auto s1 = src1.host();
+ const auto s2 = src2.host();
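+            // Three cases are handled below: (1) all three tensors have the same size, so
+            // this is a plain elementwise multiply; (2) dest has a single sample, so the
+            // elementwise products are summed (reduced) into dest; (3) otherwise src1 and
+            // src2 are broadcast up to dest's size via the i%size() indexing.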
+ if (dest.size() == src1.size() && src1.size() == src2.size())
+ {
+ if (add_to)
+ {
+ for (size_t i = 0; i < src1.size(); ++i)
+ d[i] += s1[i]*s2[i];
+ }
+ else
+ {
+ for (size_t i = 0; i < src1.size(); ++i)
+ d[i] = s1[i]*s2[i];
+ }
+ }
+ else if (dest.num_samples() == 1)
+ {
+ if (!add_to)
+ {
+ for (size_t i = 0; i < dest.size(); ++i)
+ d[i] = 0;
+ }
+ for (size_t i = 0; i < max_size; ++i)
+ d[i%dest.size()] += s1[i%src1.size()]*s2[i%src2.size()];
+ }
+ else
+ {
+ if (add_to)
+ {
+ for (size_t i = 0; i < max_size; ++i)
+ d[i] += s1[i%src1.size()]*s2[i%src2.size()];
+ }
+ else
+ {
+ for (size_t i = 0; i < max_size; ++i)
+ d[i] = s1[i%src1.size()]*s2[i%src2.size()];
+ }
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void multiply_conv (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ )
+ {
+ auto d = dest.host();
+ auto s1 = src1.host();
+ auto s2 = src2.host();
+ if (have_same_dimensions(dest,src1))
+ {
+ DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 && src2.nc() == 1 && src2.k() == src1.k());
+
+ if (add_to)
+ {
+ for (long n = 0; n < dest.num_samples(); ++n)
+ {
+ for (long k = 0; k < dest.k(); ++k)
+ {
+ for (long r = 0; r < dest.nr(); ++r)
+ {
+ for (long c = 0; c < dest.nc(); ++c)
+ {
+ *d++ += (*s1++)*s2[k];
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ for (long n = 0; n < dest.num_samples(); ++n)
+ {
+ for (long k = 0; k < dest.k(); ++k)
+ {
+ for (long r = 0; r < dest.nr(); ++r)
+ {
+ for (long c = 0; c < dest.nc(); ++c)
+ {
+ *d++ = (*s1++)*s2[k];
+ }
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ DLIB_CASSERT(have_same_dimensions(src1,src2));
+ DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 && dest.nc() == 1 && dest.k() == src1.k());
+
+ if (!add_to)
+ {
+ for (long k = 0; k < src1.k(); ++k)
+ d[k] = 0;
+ }
+
+ for (long n = 0; n < src1.num_samples(); ++n)
+ {
+ for (long k = 0; k < src1.k(); ++k)
+ {
+ for (long r = 0; r < src1.nr(); ++r)
+ {
+ for (long c = 0; c < src1.nc(); ++c)
+ {
+ d[k] += (*s1++)*(*s2++);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void scale_channels (
+ bool add_to,
+ tensor& dest,
+ const tensor& src,
+ const tensor& scales
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(dest,src) &&
+ scales.num_samples() == src.num_samples() &&
+ scales.k() == src.k() &&
+ scales.nr() == 1 &&
+ scales.nc() == 1 );
+
+ if (dest.size() == 0)
+ return;
+
+ if (add_to)
+ {
+ auto d = dest.host();
+ auto s = src.host();
+ auto scal = scales.host();
+
+ for (long n = 0; n < src.num_samples(); ++n)
+ {
+ for (long k = 0; k < src.k(); ++k)
+ {
+ const auto scale = scal[n*scales.k() + k];
+ for (long r = 0; r < src.nr(); ++r)
+ {
+ for (long c = 0; c < src.nc(); ++c)
+ {
+ *d++ += (*s++) * scale;
+ }
+ }
+ }
+ }
+
+
+ }
+ else
+ {
+ auto d = dest.host_write_only();
+ auto s = src.host();
+ auto scal = scales.host();
+
+ for (long n = 0; n < src.num_samples(); ++n)
+ {
+ for (long k = 0; k < src.k(); ++k)
+ {
+ const auto scale = scal[n*scales.k() + k];
+ for (long r = 0; r < src.nr(); ++r)
+ {
+ for (long c = 0; c < src.nc(); ++c)
+ {
+ *d++ = (*s++) * scale;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void add(
+ float beta,
+ tensor& dest,
+ float alpha,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(
+ (have_same_dimensions(src, dest) ||
+ (src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1) ||
+ (src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc()) ||
+ (src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc()) ||
+ (src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1)) &&
+ is_same_object(src,dest) == false ,
+ "\n\t dest.num_samples(): " << dest.num_samples()
+ <<"\n\t dest.k(): " << dest.k()
+ <<"\n\t dest.nr(): " << dest.nr()
+ <<"\n\t dest.nc(): " << dest.nc()
+ <<"\n\t src.num_samples(): " << src.num_samples()
+ <<"\n\t src.k(): " << src.k()
+ <<"\n\t src.nr(): " << src.nr()
+ <<"\n\t src.nc(): " << src.nc()
+ );
+
+
+ if (beta == 0 && alpha == 0)
+ {
+ dest = 0;
+ return;
+ }
+
+ auto d = dest.host();
+ auto s = src.host();
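+            // The loops below compute dest = beta*dest + alpha*src, where any src
+            // dimension that equals 1 is broadcast across the corresponding dest
+            // dimension (that is what the sn/sk/sr/sc index clamping does).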
+ for (long n = 0; n < dest.num_samples(); ++n)
+ {
+ const auto sn = src.num_samples()==1 ? 0:n;
+ for (long k = 0; k < dest.k(); ++k)
+ {
+ const auto sk = src.k()==1 ? 0:k;
+ for (long r = 0; r < dest.nr(); ++r)
+ {
+ const auto sr = src.nr()==1 ? 0:r;
+ for (long c = 0; c < dest.nc(); ++c)
+ {
+ const auto sc = src.nc()==1 ? 0:c;
+
+ const auto s_idx = ((sn*src.k() + sk)*src.nr() + sr)*src.nc() + sc;
+ *d = beta*(*d) + alpha*s[s_idx];
+ ++d;
+ }
+ }
+ }
+ }
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ void add (
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ )
+ {
+ auto d = dest.host();
+ auto s1 = src1.host();
+ auto s2 = src2.host();
+
+ // Do the simple and fast version if everything has the same dimensions
+ if (have_same_dimensions(dest, src1) &&
+ have_same_dimensions(dest, src2))
+ {
+ for (size_t i = 0; i < dest.size(); ++i)
+ d[i] = s1[i] + s2[i];
+ return;
+ }
+
+ // Otherwise, do the more complex version with bounds checking.
+ for (long n = 0; n < dest.num_samples(); ++n)
+ {
+ for (long k = 0; k < dest.k(); ++k)
+ {
+ for (long r = 0; r < dest.nr(); ++r)
+ {
+ for (long c = 0; c < dest.nc(); ++c)
+ {
+ float v1 = 0;
+ float v2 = 0;
+
+ // if this index is inside src1
+ if (n < src1.num_samples() &&
+ k < src1.k() &&
+ r < src1.nr() &&
+ c < src1.nc() )
+ {
+ const auto s_idx = ((n*src1.k() + k)*src1.nr() + r)*src1.nc() + c;
+ v1 = s1[s_idx];
+ }
+
+ // if this index is inside src2
+ if (n < src2.num_samples() &&
+ k < src2.k() &&
+ r < src2.nr() &&
+ c < src2.nc() )
+ {
+ const auto s_idx = ((n*src2.k() + k)*src2.nr() + r)*src2.nc() + c;
+ v2 = s2[s_idx];
+ }
+
+ *d = v1 + v2;
+ ++d;
+ }
+ }
+ }
+ }
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ void multiply_zero_padded (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ )
+ {
+ auto d = dest.host();
+ auto s1 = src1.host();
+ auto s2 = src2.host();
+
+ // Do the simple and fast version if everything has the same dimensions
+ if (have_same_dimensions(dest, src1) &&
+ have_same_dimensions(dest, src2))
+ {
+ if (add_to)
+ {
+ for (size_t i = 0; i < dest.size(); ++i)
+ d[i] += s1[i] * s2[i];
+ }
+ else
+ {
+ for (size_t i = 0; i < dest.size(); ++i)
+ d[i] = s1[i] * s2[i];
+ }
+ return;
+ }
+
+ // Otherwise, do the more complex version with bounds checking.
+ for (long n = 0; n < dest.num_samples(); ++n)
+ {
+ for (long k = 0; k < dest.k(); ++k)
+ {
+ for (long r = 0; r < dest.nr(); ++r)
+ {
+ for (long c = 0; c < dest.nc(); ++c)
+ {
+ float v1 = 0;
+ float v2 = 0;
+
+ // if this index is inside src1
+ if (n < src1.num_samples() &&
+ k < src1.k() &&
+ r < src1.nr() &&
+ c < src1.nc() )
+ {
+ const auto s_idx = ((n*src1.k() + k)*src1.nr() + r)*src1.nc() + c;
+ v1 = s1[s_idx];
+ }
+
+ // if this index is inside src2
+ if (n < src2.num_samples() &&
+ k < src2.k() &&
+ r < src2.nr() &&
+ c < src2.nc() )
+ {
+ const auto s_idx = ((n*src2.k() + k)*src2.nr() + r)*src2.nc() + c;
+ v2 = s2[s_idx];
+ }
+
+ if (add_to)
+ *d += v1 * v2;
+ else
+ *d = v1 * v2;
+ ++d;
+ }
+ }
+ }
+ }
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ void assign_bias_gradient (
+ tensor& grad,
+ const tensor& gradient_input
+ )
+ {
+ DLIB_CASSERT(
+ grad.num_samples() == 1 &&
+ gradient_input.k() == grad.k() &&
+ gradient_input.nr() == grad.nr() &&
+ gradient_input.nc() == grad.nc() &&
+ gradient_input.size() > 0);
+
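+        // grad ends up holding gradient_input summed over the sample dimension.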
+ auto out = grad.host();
+ auto in = gradient_input.host();
+
+ for (size_t i = 0; i < grad.size(); ++i)
+ out[i] = *in++;
+
+ for (long j = 1; j < gradient_input.num_samples(); ++j)
+ {
+ for (size_t i = 0; i < grad.size(); ++i)
+ out[i] += *in++;
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void assign_conv_bias_gradient (
+ tensor& grad,
+ const tensor& gradient_input
+ )
+ {
+ DLIB_CASSERT(
+ grad.num_samples() == 1 &&
+ grad.k() >= 1 &&
+ grad.nr() == 1 &&
+ grad.nc() == 1 &&
+ gradient_input.k() == grad.k() &&
+ gradient_input.size() > 0 &&
+ is_same_object(grad,gradient_input) == false
+ );
+
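+        // For each channel k, grad(k) ends up holding the sum of gradient_input over all
+        // samples and all spatial locations of that channel.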
+ auto g = grad.host();
+ auto gi = gradient_input.host();
+
+ for (long k = 0; k < gradient_input.k(); ++k)
+ g[k] = 0;
+
+ for (long n = 0; n < gradient_input.num_samples(); ++n)
+ {
+ for (long k = 0; k < gradient_input.k(); ++k)
+ {
+ for (long r = 0; r < gradient_input.nr(); ++r)
+ {
+ for (long c = 0; c < gradient_input.nc(); ++c)
+ {
+ g[k] += (*gi++);
+ }
+ }
+ }
+ }
+ }
+
+ // -----------------------------------------------------------------------------------
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const float A,
+ const float B
+ )
+ {
+ DLIB_CASSERT(dest.size()==src.size());
+ const auto d = dest.host();
+ const auto s = src.host();
+ for (size_t i = 0; i < src.size(); ++i)
+ d[i] = A*s[i] + B;
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const float A,
+ const float B,
+ const float C
+ )
+ {
+ DLIB_CASSERT(dest.size()==src1.size());
+ DLIB_CASSERT(dest.size()==src2.size());
+ const auto d = dest.host();
+ const auto s1 = src1.host();
+ const auto s2 = src2.host();
+ for (size_t i = 0; i < src1.size(); ++i)
+ d[i] = A*s1[i] + B*s2[i] + C;
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ const float A,
+ const float B,
+ const float C,
+ const float D
+ )
+ {
+ DLIB_CASSERT(dest.size()==src1.size());
+ DLIB_CASSERT(dest.size()==src2.size());
+ DLIB_CASSERT(dest.size()==src3.size());
+ const auto d = dest.host();
+ const auto s1 = src1.host();
+ const auto s2 = src2.host();
+ const auto s3 = src3.host();
+ for (size_t i = 0; i < src1.size(); ++i)
+ d[i] = A*s1[i] + B*s2[i] + C*s3[i] + D;
+ }
+
+ void affine_transform_range(
+ size_t begin,
+ size_t end,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ const float A,
+ const float B,
+ const float C
+ )
+ {
+ DLIB_CASSERT(dest.size()==src1.size());
+ DLIB_CASSERT(dest.size()==src2.size());
+ DLIB_CASSERT(dest.size()==src3.size());
+ DLIB_CASSERT(begin <= end && end <= dest.size());
+ const auto d = dest.host();
+ const auto s1 = src1.host();
+ const auto s2 = src2.host();
+ const auto s3 = src3.host();
+ for (size_t i = begin; i < end; ++i)
+ d[i] = A*s1[i] + B*s2[i] + C*s3[i];
+ }
+
+ // -----------------------------------------------------------------------------------
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const tensor& A,
+ const tensor& B
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(dest,src));
+ DLIB_CASSERT(
+ ((A.num_samples()==1 && B.num_samples()==1) ||
+ (A.num_samples()==src.num_samples() && B.num_samples()==src.num_samples())) &&
+ A.nr()==B.nr() && B.nr()==src.nr() &&
+ A.nc()==B.nc() && B.nc()==src.nc() &&
+ A.k() ==B.k() && B.k()==src.k());
+
+ auto d = dest.host();
+ auto s = src.host();
+ const auto a = A.host();
+ const auto b = B.host();
+ if (A.num_samples() == 1)
+ {
+ const long num = src.size()/src.num_samples();
+ for (long i = 0; i < src.num_samples(); ++i)
+ {
+ for (long j = 0; j < num; ++j)
+ {
+ *d = a[j]*(*s) + b[j];
+ d++;
+ s++;
+ }
+ }
+ }
+ else
+ {
+ for (size_t i = 0; i < src.size(); ++i)
+ d[i] = a[i]*s[i] + b[i];
+ }
+ }
+
+ // -----------------------------------------------------------------------------------
+
+ void affine_transform_conv(
+ tensor& dest,
+ const tensor& src,
+ const tensor& A,
+ const tensor& B
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(dest,src));
+ DLIB_CASSERT(have_same_dimensions(A,B));
+ DLIB_CASSERT(A.num_samples() == 1 &&
+ A.nr() == 1 &&
+ A.nc() == 1 &&
+ A.k() == src.k());
+
+ auto d = dest.host();
+ auto s = src.host();
+ const auto a = A.host();
+ const auto b = B.host();
+ for (long n = 0; n < dest.num_samples(); ++n)
+ {
+ for (long k = 0; k < dest.k(); ++k)
+ {
+ for (long r = 0; r < dest.nr(); ++r)
+ {
+ for (long c = 0; c < dest.nc(); ++c)
+ {
+ *d++ = a[k]*(*s++) + b[k];
+ }
+ }
+ }
+ }
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ void affine_transform(
+ const rectangle& rect,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ float A,
+ float B,
+ float C
+ )
+ {
+ DLIB_CASSERT(dest.size() == src1.size());
+ DLIB_CASSERT(dest.size() == src2.size());
+ DLIB_CASSERT(dest.size() == src3.size());
+ DLIB_CASSERT(dest.num_samples() == src1.num_samples());
+ DLIB_CASSERT(dest.num_samples() == src2.num_samples());
+ DLIB_CASSERT(dest.num_samples() == src3.num_samples());
+ DLIB_CASSERT(rectangle(0,0, dest.size()/dest.num_samples()-1, dest.num_samples()-1).contains(rect));
+
+
+ auto d = dest.host();
+ auto s1 = src1.host();
+ auto s2 = src2.host();
+ auto s3 = src3.host();
+
+ const auto nc = dest.size()/dest.num_samples();
+
+ for (long r = rect.top(); r <= rect.bottom(); ++r)
+ {
+ for (long c = rect.left(); c <= rect.right(); ++c)
+ {
+ auto idx = r*nc + c;
+ d[idx] = s1[idx]*A + s2[idx]*B + s3[idx]*C;
+ }
+ }
+
+ }
+
+ // -----------------------------------------------------------------------------------
+
+ void compute_adam_update (
+ size_t begin,
+ size_t end,
+ tensor& s,
+ tensor& m,
+ tensor& v,
+ const float t,
+ const float learning_rate,
+ const float weight_decay,
+ const float momentum1,
+ const float momentum2,
+ const tensor& params,
+ const tensor& params_grad
+ )
+ {
+ DLIB_CASSERT(s.size() == m.size() &&
+ s.size() == v.size() &&
+ s.size() == params.size() &&
+ s.size() == params_grad.size());
+ DLIB_CASSERT(begin <= end && end <= params.size());
+ const float eps = 1e-8;
+ const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t));
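+        // alpha is the learning rate with Adam's bias-correction terms for the first and
+        // second moment estimates folded in; eps guards against division by zero below.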
+
+ // The loop is equivalent to doing this:
+ // m = momentum1*m + (1-momentum1) * (weight_decay*params + params_grad);
+ // v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad);
+ // s = -alpha*m/(sqrt(v) + eps);
+ auto pm = m.host();
+ auto pv = v.host();
+ auto ps = s.host_write_only();
+ auto pparams = params.host();
+ auto ppgrad = params_grad.host();
+ for (size_t i = begin; i < end; ++i)
+ {
+ float g = weight_decay*pparams[i] + ppgrad[i];
+ pm[i] = momentum1*pm[i] + (1-momentum1)*g;
+ pv[i] = momentum2*pv[i] + (1-momentum2)*g*g;
+ ps[i] = -alpha*pm[i]/(std::sqrt(pv[i]) + eps);
+ }
+ }
+
+ // -----------------------------------------------------------------------------------
+
+ void batch_normalize_inference (
+ const double eps,
+ resizable_tensor& dest,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta,
+ const tensor& running_means,
+ const tensor& running_variances
+ )
+ {
+ DLIB_CASSERT(
+ gamma.num_samples() == 1 &&
+ gamma.nr() == src.nr() &&
+ gamma.nc() == src.nc() &&
+ gamma.k() == src.k() &&
+ have_same_dimensions(gamma, beta) &&
+ have_same_dimensions(gamma, running_means) &&
+ have_same_dimensions(gamma, running_variances) &&
+ eps > 0,
+ "\ngamma.num_samples(): " << gamma.num_samples() <<
+ "\ngamma.k(): " << gamma.k() <<
+ "\ngamma.nr(): " << gamma.nr() <<
+ "\ngamma.nc(): " << gamma.nc() <<
+ "\nbeta.num_samples(): " << beta.num_samples() <<
+ "\nbeta.k(): " << beta.k() <<
+ "\nbeta.nr(): " << beta.nr() <<
+ "\nbeta.nc(): " << beta.nc() <<
+ "\nrunning_means.num_samples(): " << running_means.num_samples() <<
+ "\nrunning_means.k(): " << running_means.k() <<
+ "\nrunning_means.nr(): " << running_means.nr() <<
+ "\nrunning_means.nc(): " << running_means.nc() <<
+ "\nrunning_variances.num_samples(): " << running_variances.num_samples() <<
+ "\nrunning_variances.k(): " << running_variances.k() <<
+ "\nrunning_variances.nr(): " << running_variances.nr() <<
+ "\nrunning_variances.nc(): " << running_variances.nc() <<
+ "\nsrc.k(): " << src.k() <<
+ "\nsrc.nr(): " << src.nr() <<
+ "\nsrc.nc(): " << src.nc() <<
+ "\neps: " << eps
+ );
+ dest.copy_size(src);
+
+ auto d = dest.host();
+ auto s = src.host();
+ auto g = gamma.host();
+ auto b = beta.host();
+ auto m = running_means.host();
+ auto v = running_variances.host();
+
+ const long num = src.k()*src.nr()*src.nc();
+ for (long n = 0; n < src.num_samples(); ++n)
+ {
+ for (long k = 0; k < num; ++k)
+ {
+ *d = g[k]*(*s - m[k])/std::sqrt(v[k]+eps) + b[k];
+ ++d;
+ ++s;
+ }
+ }
+ }
+
+ void batch_normalize (
+ const double eps,
+ resizable_tensor& dest,
+ resizable_tensor& means,
+ resizable_tensor& invstds,
+ const double averaging_factor,
+ resizable_tensor& running_means,
+ resizable_tensor& running_variances,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta
+ )
+ {
+ DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor);
+ DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means));
+ DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds));
+ DLIB_CASSERT(
+ src.num_samples() > 1 &&
+ gamma.num_samples() == 1 &&
+ beta.num_samples() == 1 &&
+ gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
+ gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
+ gamma.k() == beta.k() && beta.k() == src.k() &&
+ eps > 0,
+ "\ngamma.num_samples(): " << gamma.num_samples() <<
+ "\ngamma.k(): " << gamma.k() <<
+ "\ngamma.nr(): " << gamma.nr() <<
+ "\ngamma.nc(): " << gamma.nc() <<
+ "\nbeta.num_samples(): " << beta.num_samples() <<
+ "\nbeta.k(): " << beta.k() <<
+ "\nbeta.nr(): " << beta.nr() <<
+ "\nbeta.nc(): " << beta.nc() <<
+ "\nsrc.k(): " << src.k() <<
+ "\nsrc.nr(): " << src.nr() <<
+ "\nsrc.nc(): " << src.nc() <<
+ "\neps: " << eps
+ );
+
+ dest.copy_size(src);
+ means.set_size(1, src.k(), src.nr(), src.nc());
+ invstds.set_size(1, src.k(), src.nr(), src.nc());
+
+ // first compute means and invstds
+ means = 0;
+ invstds = 0;
+ const auto p_invstds = invstds.host();
+ const auto p_means = means.host();
+ auto p_src = src.host();
+ const long num = src.k()*src.nr()*src.nc();
+ // compute means, and sum of squares
+ for (long i = 0; i < num; ++i)
+ {
+ for (long n = 0; n < src.num_samples(); ++n)
+ {
+ float val = p_src[n*num+i];
+ p_means[i] += val;
+ p_invstds[i] += val*val;
+ }
+ }
+ means /= src.num_samples();
+ invstds /= src.num_samples();
+ // copy data back to host
+ invstds.host(); means.host();
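+        // At this point means holds E[x] and invstds holds E[x^2] for each feature.  The
+        // loop below converts invstds into 1/sqrt(var+eps) using var = E[x^2] - E[x]^2.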
+
+ // compute variances
+ running_variances.copy_size(invstds);
+ auto rvar = running_variances.host();
+ // This scale makes the running variances unbiased.
+ const double scale = (src.num_samples())/(src.num_samples()-1.0);
+ for (long i = 0; i < num; ++i)
+ {
+ auto actual_var = p_invstds[i] - p_means[i]*p_means[i];
+ if (averaging_factor == 1)
+ rvar[i] = scale*actual_var;
+ else
+ rvar[i] = (1-averaging_factor)*rvar[i] + scale*averaging_factor*actual_var;
+
+ p_invstds[i] = 1.0f/std::sqrt(actual_var + eps);
+ }
+
+ p_src = src.host();
+ auto p_dest = dest.host();
+ const auto p_gamma = gamma.host();
+ const auto p_beta = beta.host();
+ for (long n = 0; n < src.num_samples(); ++n)
+ {
+ for (long i = 0; i < num; ++i)
+ {
+ *p_dest = (*p_src - p_means[i])*p_invstds[i];
+ *p_dest = (*p_dest)*p_gamma[i] + p_beta[i];
+ ++p_src;
+ ++p_dest;
+ }
+ }
+
+ // now keep track of the running means
+ running_means.copy_size(means);
+ if (averaging_factor != 1)
+ running_means = (1-averaging_factor)*mat(running_means) + averaging_factor*mat(means);
+ else
+ running_means = means;
+ }
+
+ void batch_normalize_gradient (
+ const double eps,
+ const tensor& gradient_input,
+ const tensor& means,
+ const tensor& invstds,
+ const tensor& src,
+ const tensor& gamma,
+ tensor& src_grad,
+ tensor& gamma_grad,
+ tensor& beta_grad
+ )
+ {
+
+ const long num = src.k()*src.nr()*src.nc();
+ DLIB_CASSERT(src.num_samples() > 1);
+ DLIB_CASSERT(num == (long)means.size());
+ DLIB_CASSERT(num == (long)invstds.size());
+ DLIB_CASSERT(num == (long)gamma.size());
+ DLIB_CASSERT(num == (long)gamma_grad.size());
+ DLIB_CASSERT(num == (long)beta_grad.size());
+ DLIB_CASSERT(have_same_dimensions(gradient_input, src));
+ DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad));
+ DLIB_CASSERT(eps > 0);
+
+ beta_grad = 0;
+ gamma_grad = 0;
+ auto p_grad = gradient_input.host();
+ auto p_src = src.host();
+ const auto p_gamma = gamma.host();
+ const auto p_gamma_grad = gamma_grad.host();
+ const auto p_beta_grad = beta_grad.host();
+ const auto p_invstds = invstds.host();
+ const auto p_means = means.host();
+
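+        // dvars and dmeans will accumulate the gradients of the loss with respect to the
+        // per-feature variance and mean, following the usual batch norm chain rule.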
+ resizable_tensor dvars, dmeans;
+ dvars.copy_size(invstds);
+ dmeans.copy_size(means);
+ dvars = 0;
+ dmeans = 0;
+ const auto p_dvars = dvars.host();
+ const auto p_dmeans = dmeans.host();
+
+ for (long n = 0; n < src.num_samples(); ++n)
+ {
+ for (long i = 0; i < num; ++i)
+ {
+ const float x_hat = (*p_src - p_means[i])*p_invstds[i];
+ p_beta_grad[i] += *p_grad;
+ p_gamma_grad[i] += (*p_grad)*x_hat;
+
+ const float dx = *p_grad * p_gamma[i];
+
+ p_dvars[i] += dx*(*p_src - p_means[i])*-0.5*std::pow(p_invstds[i], 3.0f);
+
+ ++p_grad;
+ ++p_src;
+ }
+ }
+
+ const float invnum = 1.0f/src.num_samples();
+ p_grad = gradient_input.host();
+ p_src = src.host();
+ for (long n = 0; n < src.num_samples(); ++n)
+ {
+ for (long i = 0; i < num; ++i)
+ {
+ const float dx = *p_grad * p_gamma[i];
+
+ p_dmeans[i] += dx*-p_invstds[i] + p_dvars[i] * -2*(*p_src - p_means[i])*invnum;
+
+ ++p_grad;
+ ++p_src;
+ }
+ }
+ p_grad = gradient_input.host();
+ p_src = src.host();
+ auto p_src_grad = src_grad.host();
+ for (long n = 0; n < src.num_samples(); ++n)
+ {
+ for (long i = 0; i < num; ++i)
+ {
+ const float dx = *p_grad * p_gamma[i];
+
+ *p_src_grad += dx*p_invstds[i] +
+ p_dvars[i] *2*(*p_src - p_means[i])*invnum +
+ p_dmeans[i]*invnum;
+
+
+ ++p_grad;
+ ++p_src;
+ ++p_src_grad;
+ }
+ }
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ void batch_normalize_conv_inference (
+ const double eps,
+ resizable_tensor& dest,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta,
+ const tensor& running_means,
+ const tensor& running_variances
+ )
+ {
+ DLIB_CASSERT(
+ gamma.num_samples() == 1 &&
+ gamma.nr() == 1 &&
+ gamma.nc() == 1 &&
+ gamma.k() == src.k() &&
+ have_same_dimensions(gamma, beta) &&
+ have_same_dimensions(gamma, running_means) &&
+ have_same_dimensions(gamma, running_variances) &&
+ eps > 0,
+ "\ngamma.num_samples(): " << gamma.num_samples() <<
+ "\ngamma.k(): " << gamma.k() <<
+ "\ngamma.nr(): " << gamma.nr() <<
+ "\ngamma.nc(): " << gamma.nc() <<
+ "\nbeta.num_samples(): " << beta.num_samples() <<
+ "\nbeta.k(): " << beta.k() <<
+ "\nbeta.nr(): " << beta.nr() <<
+ "\nbeta.nc(): " << beta.nc() <<
+ "\nrunning_means.num_samples(): " << running_means.num_samples() <<
+ "\nrunning_means.k(): " << running_means.k() <<
+ "\nrunning_means.nr(): " << running_means.nr() <<
+ "\nrunning_means.nc(): " << running_means.nc() <<
+ "\nrunning_variances.num_samples(): " << running_variances.num_samples() <<
+ "\nrunning_variances.k(): " << running_variances.k() <<
+ "\nrunning_variances.nr(): " << running_variances.nr() <<
+ "\nrunning_variances.nc(): " << running_variances.nc() <<
+ "\nsrc.k(): " << src.k() <<
+ "\nsrc.nr(): " << src.nr() <<
+ "\nsrc.nc(): " << src.nc() <<
+ "\neps: " << eps
+ );
+ dest.copy_size(src);
+
+ auto d = dest.host();
+ auto s = src.host();
+ auto g = gamma.host();
+ auto b = beta.host();
+ auto m = running_means.host();
+ auto v = running_variances.host();
+
+ const long num = src.nr()*src.nc();
+ for (long n = 0; n < src.num_samples(); ++n)
+ {
+ for (long k = 0; k < src.k(); ++k)
+ {
+ const float invstd = 1.0f/std::sqrt(v[k] + eps);
+ for (long j = 0; j < num; ++j)
+ {
+ *d = g[k]*(*s - m[k])*invstd + b[k];
+ ++d;
+ ++s;
+ }
+ }
+ }
+ }
+
+ void batch_normalize_conv (
+ const double eps,
+ resizable_tensor& dest,
+ resizable_tensor& means,
+ resizable_tensor& invstds,
+ const double averaging_factor,
+ resizable_tensor& running_means,
+ resizable_tensor& running_variances,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta
+ )
+ {
+ DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor);
+ DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means));
+ DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds));
+ DLIB_CASSERT(
+ src.num_samples() > 1 &&
+ gamma.num_samples() == 1 &&
+ beta.num_samples() == 1 &&
+ gamma.nr() == 1 &&
+ beta.nr() == 1 &&
+ gamma.nc() == 1 &&
+ beta.nc() == 1 &&
+ gamma.k() == beta.k() && beta.k() == src.k() &&
+ eps > 0,
+ "\ngamma.num_samples(): " << gamma.num_samples() <<
+ "\ngamma.k(): " << gamma.k() <<
+ "\ngamma.nr(): " << gamma.nr() <<
+ "\ngamma.nc(): " << gamma.nc() <<
+ "\nbeta.num_samples(): " << beta.num_samples() <<
+ "\nbeta.k(): " << beta.k() <<
+ "\nbeta.nr(): " << beta.nr() <<
+ "\nbeta.nc(): " << beta.nc() <<
+ "\nsrc.k(): " << src.k() <<
+ "\nsrc.nr(): " << src.nr() <<
+ "\nsrc.nc(): " << src.nc() <<
+ "\neps: " << eps
+ );
+
+ dest.copy_size(src);
+ means.set_size(1, src.k());
+ invstds.set_size(1, src.k());
+
+ // first compute means and invstds
+ means = 0;
+ invstds = 0;
+ const auto p_invstds = invstds.host();
+ const auto p_means = means.host();
+ const auto p_gamma = gamma.host();
+ const auto p_beta = beta.host();
+ auto p_src = src.host();
+ const long num = src.nr()*src.nc();
+ // compute means, and sum of squares
+ for (long n = 0; n < src.num_samples(); ++n)
+ {
+ for (long k = 0; k < src.k(); ++k)
+ {
+ for (long i = 0; i < num; ++i)
+ {
+ p_means[k] += *p_src;
+ p_invstds[k] += (*p_src)*(*p_src);
+ ++p_src;
+ }
+ }
+ }
+ means /= src.num_samples()*num;
+ invstds /= src.num_samples()*num;
+ // copy data back to host
+ invstds.host(); means.host();
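+        // Here the statistics are pooled over both the sample and spatial dimensions, so
+        // means/invstds hold one E[x] and one E[x^2] value per channel.  The loop below
+        // turns invstds into 1/sqrt(var+eps).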
+
+ p_src = src.host();
+ // compute variances
+ running_variances.copy_size(invstds);
+ auto rvar = running_variances.host();
+ // This scale makes the running variances unbiased.
+ const double scale = (src.num_samples()*num)/(src.num_samples()*num-1.0);
+ for (long k = 0; k < src.k(); ++k)
+ {
+ float actual_var = p_invstds[k] - p_means[k]*p_means[k];
+ if (averaging_factor == 1)
+ rvar[k] = scale*actual_var;
+ else
+ rvar[k] = (1-averaging_factor)*rvar[k] + scale*averaging_factor*actual_var;
+
+ p_invstds[k] = 1.0f/std::sqrt(actual_var + eps);
+ }
+
+ p_src = src.host();
+ auto p_dest = dest.host();
+ for (long n = 0; n < src.num_samples(); ++n)
+ {
+ for (long k = 0; k < src.k(); ++k)
+ {
+ for (long i = 0; i < num; ++i)
+ {
+ *p_dest = (*p_src - p_means[k])*p_invstds[k];
+ *p_dest = (*p_dest)*p_gamma[k] + p_beta[k];
+ ++p_src;
+ ++p_dest;
+ }
+ }
+ }
+
+ // now keep track of the running means
+ running_means.copy_size(means);
+ if (averaging_factor != 1)
+ running_means = (1-averaging_factor)*mat(running_means) + averaging_factor*mat(means);
+ else
+ running_means = means;
+ }
+
+ void batch_normalize_conv_gradient(
+ const double eps,
+ const tensor& gradient_input,
+ const tensor& means,
+ const tensor& invstds,
+ const tensor& src,
+ const tensor& gamma,
+ tensor& src_grad,
+ tensor& gamma_grad,
+ tensor& beta_grad
+ )
+ {
+
+ const long num = src.nr()*src.nc();
+ DLIB_CASSERT(src.num_samples() > 1);
+ DLIB_CASSERT(src.k() == (long)means.size());
+ DLIB_CASSERT(src.k() == (long)invstds.size());
+ DLIB_CASSERT(src.k() == (long)gamma.size());
+ DLIB_CASSERT(src.k() == (long)gamma_grad.size());
+ DLIB_CASSERT(src.k() == (long)beta_grad.size());
+ DLIB_CASSERT(have_same_dimensions(gradient_input, src));
+ DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad));
+ DLIB_CASSERT(eps > 0);
+
+ beta_grad = 0;
+ gamma_grad = 0;
+
+ auto p_grad = gradient_input.host();
+ auto p_src = src.host();
+ const auto p_gamma = gamma.host();
+ const auto p_gamma_grad = gamma_grad.host();
+ const auto p_beta_grad = beta_grad.host();
+ const auto p_invstds = invstds.host();
+ const auto p_means = means.host();
+
+ resizable_tensor dvars, dmeans;
+ dvars.copy_size(invstds);
+ dmeans.copy_size(means);
+ dvars = 0;
+ dmeans = 0;
+ const auto p_dvars = dvars.host();
+ const auto p_dmeans = dmeans.host();
+
+ for (long n = 0; n < src.num_samples(); ++n)
+ {
+ for (long k = 0; k < src.k(); ++k)
+ {
+ const float invstd_pow = -0.5*std::pow(p_invstds[k], 3.0f);
+ for (long i = 0; i < num; ++i)
+ {
+ const float x_hat = (*p_src - p_means[k])*p_invstds[k];
+ p_beta_grad[k] += *p_grad;
+ p_gamma_grad[k] += (*p_grad)*x_hat;
+
+ const float dx = *p_grad * p_gamma[k];
+
+ p_dvars[k] += dx*(*p_src - p_means[k])*invstd_pow;
+
+ ++p_grad;
+ ++p_src;
+ }
+ }
+ }
+
+ p_grad = gradient_input.host();
+ p_src = src.host();
+ const float invnum = 1.0f/(src.num_samples()*num);
+ for (long n = 0; n < src.num_samples(); ++n)
+ {
+ for (long k = 0; k < src.k(); ++k)
+ {
+ for (long i = 0; i < num; ++i)
+ {
+ const float dx = *p_grad * p_gamma[k];
+
+ p_dmeans[k] += -dx*p_invstds[k] + p_dvars[k] * -2*(*p_src - p_means[k])*invnum;
+
+ ++p_grad;
+ ++p_src;
+ }
+ }
+ }
+ p_grad = gradient_input.host();
+ p_src = src.host();
+ auto p_src_grad = src_grad.host();
+ for (long n = 0; n < src.num_samples(); ++n)
+ {
+ for (long k = 0; k < src.k(); ++k)
+ {
+ for (long i = 0; i < num; ++i)
+ {
+ const float dx = *p_grad * p_gamma[k];
+
+ *p_src_grad += dx*p_invstds[k] +
+ p_dvars[k]*2*(*p_src - p_means[k])*invnum +
+ p_dmeans[k]*invnum;
+
+
+ ++p_grad;
+ ++p_src;
+ ++p_src_grad;
+ }
+ }
+ }
+ }
+
+ // -----------------------------------------------------------------------------------
+
+ void threshold (
+ tensor& data,
+ float thresh
+ )
+ {
+ const auto d = data.host();
+ for (size_t i = 0; i < data.size(); ++i)
+ d[i] = d[i]>thresh ? 1:0;
+ }
+
+ void dot (
+ const tensor& a,
+ const tensor& b,
+ tensor& result,
+ size_t idx
+ )
+ {
+ DLIB_CASSERT(a.size() == b.size());
+ DLIB_CASSERT(idx < result.size());
+
+ const auto aa = a.host();
+ const auto bb = b.host();
+ auto r = result.host();
+ for (size_t i = 0; i < a.size(); ++i)
+ r[idx] += aa[i]*bb[i];
+ }
+
+ // -----------------------------------------------------------------------------------
+ // -----------------------------------------------------------------------------------
+ // -----------------------------------------------------------------------------------
+
+ namespace ttimpl
+ {
+ void softmax (
+ const long num_locations,
+ const long num_channels,
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_ASSERT(num_channels*num_locations == src.nr()*src.nc()*src.k());
+ DLIB_CASSERT(have_same_dimensions(dest,src));
+ const auto d = dest.host();
+ const auto s = src.host();
+
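+            // Tensors are stored plane by plane, so within a sample the value for channel
+            // k at spatial location i lives at offset k*num_locations + i.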
+            // Note that we subtract out the per-location maximum over the channels before
+            // applying exp() to avoid numeric overflow in the subsequent computations.
+            // Doing this doesn't change the resulting output, it just makes the
+            // computation more numerically stable.
+ for (long n = 0; n < src.num_samples(); ++n)
+ {
+ auto ss = s + num_locations*num_channels*n;
+ auto dd = d + num_locations*num_channels*n;
+ for (long i = 0; i < num_locations; ++i)
+ {
+ float max_val = -std::numeric_limits<float>::infinity();
+ for (long k = 0; k < num_channels; ++k)
+ max_val = std::max(max_val, ss[k*num_locations]);
+
+ for (long k = 0; k < num_channels; ++k)
+ dd[k*num_locations] = std::exp(ss[k*num_locations]-max_val);
+
+ ++ss;
+ ++dd;
+ }
+ }
+
+            // Now normalize so that, at each location, the values sum to 1 across the channels.
+ for (long n = 0; n < src.num_samples(); ++n)
+ {
+ const auto dd = d + num_locations*num_channels*n;
+ for (long i = 0; i < num_locations; ++i)
+ {
+ const auto ddd = dd+i;
+
+ float temp = 0;
+ for (long k = 0; k < num_channels; ++k)
+ temp += ddd[k*num_locations];
+ for (long k = 0; k < num_channels; ++k)
+ ddd[k*num_locations] /= temp;
+ }
+ }
+ }
+
+ void softmax_gradient (
+ const long num_locations,
+ const long num_channels,
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ )
+ {
+ DLIB_ASSERT(num_channels*num_locations == grad.nr()*grad.nc()*grad.k());
+ DLIB_CASSERT(have_same_dimensions(grad,dest));
+ DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
+ const auto d = dest.host();
+ const auto g = grad.host();
+ const auto in = gradient_input.host();
+
+
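+            // For each location this computes the softmax Jacobian-vector product
+            // g_k = d_k*(in_k - sum_j d_j*in_j), assigning it when grad and gradient_input
+            // alias and accumulating into grad otherwise.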
+ for (long n = 0; n < grad.num_samples(); ++n)
+ {
+ const auto d2 = d + num_locations*num_channels*n;
+ const auto g2 = g + num_locations*num_channels*n;
+ const auto in2 = in + num_locations*num_channels*n;
+ for (long i = 0; i < num_locations; ++i)
+ {
+ const auto d3 = d2+i;
+ const auto g3 = g2+i;
+ const auto in3 = in2+i;
+
+ float temp = 0;
+ for (long k = 0; k < num_channels; ++k)
+ temp += -d3[k*num_locations]*in3[k*num_locations];
+ if (is_same_object(gradient_input, grad))
+ {
+ for (long k = 0; k < num_channels; ++k)
+ g3[k*num_locations] = d3[k*num_locations]*(temp+in3[k*num_locations]);
+ }
+ else
+ {
+ for (long k = 0; k < num_channels; ++k)
+ g3[k*num_locations] += d3[k*num_locations]*(temp+in3[k*num_locations]);
+ }
+ }
+ }
+ }
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ void softmax (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(dest,src));
+ ttimpl::softmax(src.nr()*src.nc(), src.k(), dest, src);
+ }
+
+ void softmax_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(grad,dest));
+ DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
+ ttimpl::softmax_gradient(grad.nr()*grad.nc(), grad.k(), grad, dest, gradient_input);
+ }
+
+ // ------------------------------------------------------------------------------------
+
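+    // softmax_all() is like softmax() except that it normalizes over the entire
+    // k*nr*nc volume of each sample rather than over the channels at each spatial location.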
+ void softmax_all (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(dest,src));
+ ttimpl::softmax(1, src.nr()*src.nc()*src.k(), dest, src);
+ }
+
+ void softmax_all_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(grad,dest));
+ DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
+ ttimpl::softmax_gradient(1, grad.nr()*grad.nc()*grad.k(), grad, dest, gradient_input);
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void sigmoid (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ const auto d = dest.host();
+ const auto s = src.host();
+ for (size_t i = 0; i < src.size(); ++i)
+ d[i] = 1/(1+std::exp(-s[i]));
+ }
+
+ void sigmoid_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ )
+ {
+ const auto g = grad.host();
+ const auto d = dest.host();
+ const auto in = gradient_input.host();
+ if (is_same_object(gradient_input, grad))
+ {
+ for (size_t i = 0; i < dest.size(); ++i)
+ g[i] = in[i]*d[i]*(1-d[i]);
+ }
+ else
+ {
+ for (size_t i = 0; i < dest.size(); ++i)
+ g[i] += in[i]*d[i]*(1-d[i]);
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void relu (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ dest = lowerbound(mat(src), 0);
+ }
+
+ void relu_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ )
+ {
+ const float* gi = gradient_input.host();
+ const float* in = dest.host();
+ float* out = grad.host();
+ if (is_same_object(grad, gradient_input))
+ {
+ for (size_t i = 0; i < dest.size(); ++i)
+ {
+ if (in[i] > 0)
+ out[i] = gi[i];
+ else
+ out[i] = 0;
+ }
+ }
+ else
+ {
+ for (size_t i = 0; i < dest.size(); ++i)
+ {
+ if (in[i] > 0)
+ out[i] += gi[i];
+ }
+ }
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ void prelu (
+ tensor& dest,
+ const tensor& src,
+ const tensor& param
+ )
+ {
+ const float p = param.host()[0];
+ const float* s = src.host();
+ float* d = dest.host();
+ for (size_t i = 0; i < dest.size(); ++i)
+ {
+ if (s[i] > 0)
+ d[i] = s[i];
+ else
+ d[i] = p*s[i];
+ }
+ }
+
+ void prelu_gradient (
+ tensor& grad,
+ const tensor& src,
+ const tensor& gradient_input,
+ const tensor& param,
+ tensor& params_grad
+ )
+ {
+ DLIB_CASSERT(is_same_object(grad, gradient_input) == false);
+ const float p = param.host()[0];
+ const float* gi = gradient_input.host();
+ const float* s = src.host();
+ float* out = grad.host();
+ float pgrad = 0;
+ for (size_t i = 0; i < src.size(); ++i)
+ {
+ if (s[i] > 0)
+ {
+ out[i] += gi[i];
+ }
+ else
+ {
+ out[i] += p*gi[i];
+ pgrad += gi[i]*s[i];
+ }
+ }
+ params_grad.host()[0] = pgrad;
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void tanh (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ const auto d = dest.host();
+ const auto s = src.host();
+ for (size_t i = 0; i < src.size(); ++i)
+ d[i] = std::tanh(s[i]);
+ }
+
+ void tanh_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ )
+ {
+ const auto g = grad.host();
+ const auto d = dest.host();
+ const auto in = gradient_input.host();
+ if (is_same_object(grad, gradient_input))
+ {
+ for (size_t i = 0; i < dest.size(); ++i)
+ g[i] = in[i]*(1-d[i]*d[i]);
+ }
+ else
+ {
+ for (size_t i = 0; i < dest.size(); ++i)
+ g[i] += in[i]*(1-d[i]*d[i]);
+ }
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ void resize_bilinear (
+ tensor& dest,
+ long dest_row_stride,
+ long dest_channel_stride,
+ const tensor& src,
+ long src_row_stride,
+ long src_channel_stride
+ )
+ {
+ DLIB_CASSERT(is_same_object(dest, src)==false);
+ DLIB_CASSERT(dest.num_samples() == src.num_samples());
+ DLIB_CASSERT(dest.k() == src.k());
+
+ if (dest.size() == 0 || src.size() == 0)
+ return;
+
+ const float* s = src.host();
+ float* d = dest.host();
+
+ parallel_for(0, dest.k()*dest.num_samples(), [&](long i)
+ {
+ auto simg = sub_image(s+i*src_channel_stride, src.nr(), src.nc(), src_row_stride);
+ auto dimg = sub_image(d+i*dest_channel_stride, dest.nr(), dest.nc(), dest_row_stride);
+
+ resize_image(simg, dimg);
+ });
+ }
+
+ void resize_bilinear_gradient (
+ tensor& grad,
+ long grad_row_stride,
+ long grad_channel_stride,
+ const tensor& gradient_input,
+ long gradient_input_row_stride,
+ long gradient_input_channel_stride
+ )
+ {
+ DLIB_CASSERT(is_same_object(grad, gradient_input)==false);
+ DLIB_CASSERT(gradient_input.num_samples() == grad.num_samples());
+ DLIB_CASSERT(gradient_input.k() == grad.k());
+
+ if (gradient_input.size() == 0 || grad.size() == 0)
+ return;
+
+ const float* gi = gradient_input.host();
+ float* g = grad.host();
+ const float x_scale = (grad.nc()-1)/(float)std::max<long>((gradient_input.nc()-1),1);
+ const float y_scale = (grad.nr()-1)/(float)std::max<long>((gradient_input.nr()-1),1);
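+        // Each gradient_input value is distributed to the four neighboring cells of grad
+        // using the same bilinear interpolation weights used in the forward pass.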
+ for (long long samp = 0; samp < gradient_input.num_samples(); ++samp)
+ {
+ for (long long k = 0; k < gradient_input.k(); ++k)
+ {
+ for (long long r = 0; r < gradient_input.nr(); ++r)
+ {
+ const float y = r*y_scale;
+ const long long top = static_cast<long long>(std::floor(y));
+ const long long bottom = std::min(top+1, grad.nr()-1);
+ const float tb_frac = y - top;
+ for (long long c = 0; c < gradient_input.nc(); ++c)
+ {
+ const float x = c*x_scale;
+ const long long left = static_cast<long long>(std::floor(x));
+ const long long right = std::min(left+1, grad.nc()-1);
+ const float lr_frac = x - left;
+
+ const float tmp = gi[r*gradient_input_row_stride+c];
+
+ g[top*grad_row_stride+left] += tmp*(1-tb_frac)*(1-lr_frac);
+ g[top*grad_row_stride+right] += tmp*(1-tb_frac)*(lr_frac);
+ g[bottom*grad_row_stride+left] += tmp*(tb_frac)*(1-lr_frac);
+ g[bottom*grad_row_stride+right] += tmp*(tb_frac)*(lr_frac);
+ }
+ }
+
+ g += grad_channel_stride;
+ gi += gradient_input_channel_stride;
+ }
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+
+ pooling::pooling (
+ ) : window_height(0),window_width(0),stride_y(0),stride_x(0),padding_y(0),padding_x(0),do_max_pooling(true)
+ {
+ }
+
+ void pooling::
+ clear(
+ )
+ {
+ window_height = 0;
+ window_width = 0;
+ stride_y = 0;
+ stride_x = 0;
+ padding_y = 0;
+ padding_x = 0;
+ }
+
+ void pooling::
+ setup_max_pooling(
+ int window_height_,
+ int window_width_,
+ int stride_y_,
+ int stride_x_,
+ int padding_y_,
+ int padding_x_
+ )
+ {
+ DLIB_CASSERT(window_width_ > 0);
+ DLIB_CASSERT(window_height_ > 0);
+ DLIB_CASSERT(stride_y_ > 0);
+ DLIB_CASSERT(stride_x_ > 0);
+ DLIB_CASSERT(0 <= padding_y_ && padding_y_ < window_height_);
+ DLIB_CASSERT(0 <= padding_x_ && padding_x_ < window_width_);
+
+ window_height = window_height_;
+ window_width = window_width_;
+ stride_y = stride_y_;
+ stride_x = stride_x_;
+ padding_y = padding_y_;
+ padding_x = padding_x_;
+ do_max_pooling = true;
+ }
+
+ void pooling::
+ setup_avg_pooling(
+ int window_height_,
+ int window_width_,
+ int stride_y_,
+ int stride_x_,
+ int padding_y_,
+ int padding_x_
+ )
+ {
+ DLIB_CASSERT(window_width_ > 0);
+ DLIB_CASSERT(window_height_ > 0);
+ DLIB_CASSERT(stride_y_ > 0);
+ DLIB_CASSERT(stride_x_ > 0);
+ DLIB_CASSERT(0 <= padding_y_ && padding_y_ < window_height_);
+ DLIB_CASSERT(0 <= padding_x_ && padding_x_ < window_width_);
+
+ window_height = window_height_;
+ window_width = window_width_;
+ stride_y = stride_y_;
+ stride_x = stride_x_;
+ padding_y = padding_y_;
+ padding_x = padding_x_;
+ do_max_pooling = false;
+ }
+
+ void pooling::
+ operator() (
+ resizable_tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(window_width > 0);
+ DLIB_CASSERT(window_height > 0);
+ DLIB_CASSERT(stride_y > 0);
+ DLIB_CASSERT(stride_x > 0);
+ DLIB_CASSERT(0 <= padding_y && padding_y < window_height);
+ DLIB_CASSERT(0 <= padding_x && padding_x < window_width);
+ DLIB_CASSERT(window_width <= src.nc() + 2*padding_x,
+ "Pooling windows must be small enough to fit into the padded image.");
+ DLIB_CASSERT(window_height <= src.nr() + 2*padding_y,
+ "Pooling windows must be small enough to fit into the padded image.");
+
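+        // The output spatial size follows the usual convolution arithmetic:
+        // 1 + (input_size + 2*padding - window_size)/stride.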
+ dest.set_size(
+ src.num_samples(),
+ src.k(),
+ 1+(src.nr()+2*padding_y-window_height)/stride_y,
+ 1+(src.nc()+2*padding_x-window_width)/stride_x
+ );
+
+ if (src.size() == 0)
+ {
+ dest = 0;
+ return;
+ }
+
+
+ auto d = dest.host();
+ const long x_offset = window_width/2 - padding_x;
+ const long y_offset = window_height/2 - padding_y;
+ if (does_max_pooling())
+ {
+ for (long n = 0; n < dest.num_samples(); ++n)
+ {
+ for (long k = 0; k < dest.k(); ++k)
+ {
+ auto simg = image_plane(src,n,k);
+ auto dimg = d + (n*dest.k() + k)*dest.nr()*dest.nc();
+
+ for (long r = 0; r < dest.nr(); ++r)
+ {
+ for (long c = 0; c < dest.nc(); ++c)
+ {
+ auto win = centered_rect(c*stride_x+x_offset,
+ r*stride_y+y_offset,
+ window_width,
+ window_height);
+ dimg[r*dest.nc() + c] = max(subm_clipped(simg,win));
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ for (long n = 0; n < dest.num_samples(); ++n)
+ {
+ for (long k = 0; k < dest.k(); ++k)
+ {
+ auto simg = image_plane(src,n,k);
+ auto dimg = d + (n*dest.k() + k)*dest.nr()*dest.nc();
+
+ for (long r = 0; r < dest.nr(); ++r)
+ {
+ for (long c = 0; c < dest.nc(); ++c)
+ {
+ auto win = centered_rect(c*stride_x+x_offset,
+ r*stride_y+y_offset,
+ window_width,
+ window_height);
+ dimg[r*dest.nc() + c] = mean(subm_clipped(simg,win));
+ }
+ }
+ }
+ }
+ }
+
+ }
+
+ void pooling::get_gradient(
+ const tensor& gradient_input,
+ const tensor& dest,
+ const tensor& src,
+ tensor& grad
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(gradient_input,dest));
+ DLIB_CASSERT(have_same_dimensions(src,grad));
+
+
+ if (src.size() == 0)
+ {
+ return;
+ }
+
+
+ auto gi = gradient_input.host();
+ auto g = grad.host();
+ const long x_offset = window_width/2 - padding_x;
+ const long y_offset = window_height/2 - padding_y;
+ if (does_max_pooling())
+ {
+ for (long n = 0; n < dest.num_samples(); ++n)
+ {
+ for (long k = 0; k < dest.k(); ++k)
+ {
+ auto simg = image_plane(src,n,k);
+ auto gimg = g + (n*grad.k() + k)*grad.nr()*grad.nc();
+ auto giimg = gi + (n*dest.k() + k)*dest.nr()*dest.nc();
+ auto imgbox = get_rect(simg);
+
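+                    // For max pooling, each output's gradient is routed back to the
+                    // location of the maximum element inside its pooling window.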
+ for (long r = 0; r < dest.nr(); ++r)
+ {
+ for (long c = 0; c < dest.nc(); ++c)
+ {
+ auto win = centered_rect(c*stride_x+x_offset,
+ r*stride_y+y_offset,
+ window_width,
+ window_height).intersect(imgbox);
+ auto p = max_point(subm(simg,win))+win.tl_corner();
+ gimg[p.y()*grad.nc()+p.x()] += giimg[r*dest.nc()+c];
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ for (long n = 0; n < dest.num_samples(); ++n)
+ {
+ for (long k = 0; k < dest.k(); ++k)
+ {
+ auto simg = image_plane(src,n,k);
+ auto gimg = g + (n*grad.k() + k)*grad.nr()*grad.nc();
+ auto giimg = gi + (n*dest.k() + k)*dest.nr()*dest.nc();
+ auto imgbox = get_rect(simg);
+
+ for (long r = 0; r < dest.nr(); ++r)
+ {
+ for (long c = 0; c < dest.nc(); ++c)
+ {
+ auto win = centered_rect(c*stride_x+x_offset,
+ r*stride_y+y_offset,
+ window_width,
+ window_height).intersect(imgbox);
+ const float delta = giimg[r*dest.nc()+c]/win.area();
+ for (long y = win.top(); y <= win.bottom(); ++y)
+ {
+ for (long x = win.left(); x <= win.right(); ++x)
+ {
+ gimg[y*grad.nc()+x] += delta;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ }
+
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+
+ void img2col(
+ matrix<float>& output,
+ const tensor& data,
+ long n,
+ long filter_nr,
+ long filter_nc,
+ long stride_y,
+ long stride_x,
+ long padding_y,
+ long padding_x
+ )
+ {
+ const auto d = data.host() + data.k()*data.nr()*data.nc()*n;
+ const rectangle boundary = get_rect(data);
+
+ const long out_nr = 1+(data.nr()+2*padding_y-filter_nr)/stride_y;
+ const long out_nc = 1+(data.nc()+2*padding_x-filter_nc)/stride_x;
+
+ output.set_size(out_nr*out_nc,
+ data.k()*filter_nr*filter_nc);
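+        // Each row of output corresponds to one output pixel position and each column to
+        // one (channel, filter row, filter column) tap, so a convolution over data becomes
+        // a single matrix multiply against this matrix.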
+ DLIB_CASSERT(output.size() != 0);
+ float* t = &output(0,0);
+
+ // now fill in the Toeplitz output matrix for the n-th sample in data.
+ size_t cnt = 0;
+ const long max_r = data.nr() + padding_y-(filter_nr-1);
+ const long max_c = data.nc() + padding_x-(filter_nc-1);
+ for (long r = -padding_y; r < max_r; r+=stride_y)
+ {
+ for (long c = -padding_x; c < max_c; c+=stride_x)
+ {
+ for (long k = 0; k < data.k(); ++k)
+ {
+ for (long y = 0; y < filter_nr; ++y)
+ {
+ for (long x = 0; x < filter_nc; ++x)
+ {
+ DLIB_ASSERT(cnt < output.size());
+ long xx = c+x;
+ long yy = r+y;
+ if (boundary.contains(xx,yy))
+ *t = d[(k*data.nr() + yy)*data.nc() + xx];
+ else
+ *t = 0;
+ ++t;
+ ++cnt;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ void col2img(
+ const matrix<float>& output,
+ tensor& data,
+ long n,
+ long filter_nr,
+ long filter_nc,
+ long stride_y,
+ long stride_x,
+ long padding_y,
+ long padding_x
+ )
+ {
+ const auto d = data.host() + data.k()*data.nr()*data.nc()*n;
+ const rectangle boundary = get_rect(data);
+
+ DLIB_CASSERT(output.size() != 0);
+ const float* t = &output(0,0);
+
+        // now scatter the Toeplitz matrix back into the n-th sample in data.
+ const long max_r = data.nr() + padding_y-(filter_nr-1);
+ const long max_c = data.nc() + padding_x-(filter_nc-1);
+ for (long r = -padding_y; r < max_r; r+=stride_y)
+ {
+ for (long c = -padding_x; c < max_c; c+=stride_x)
+ {
+ for (long k = 0; k < data.k(); ++k)
+ {
+ for (long y = 0; y < filter_nr; ++y)
+ {
+ for (long x = 0; x < filter_nc; ++x)
+ {
+ long xx = c+x;
+ long yy = r+y;
+ if (boundary.contains(xx,yy))
+ d[(k*data.nr() + yy)*data.nc() + xx] += *t;
+ ++t;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ void tensor_conv::operator() (
+ const bool add_to_output,
+ resizable_tensor& output,
+ const tensor& data,
+ const tensor& filters
+ )
+ {
+ DLIB_CASSERT(last_stride_y > 0 && last_stride_x > 0, "You must call setup() before calling this function.");
+ output.set_size(data.num_samples(),
+ filters.num_samples(),
+ 1+(data.nr()+2*last_padding_y-filters.nr())/last_stride_y,
+ 1+(data.nc()+2*last_padding_x-filters.nc())/last_stride_x);
+ (*this)(add_to_output, static_cast<tensor&>(output),data,filters);
+ }
+
+ void tensor_conv::operator() (
+ const bool add_to_output,
+ tensor& output,
+ const tensor& data,
+ const tensor& filters
+ )
+ {
+ DLIB_CASSERT(is_same_object(output,data) == false);
+ DLIB_CASSERT(is_same_object(output,filters) == false);
+ DLIB_CASSERT(filters.k() == data.k());
+ DLIB_CASSERT(last_stride_y > 0 && last_stride_x > 0, "You must call setup() before calling this function.");
+ DLIB_CASSERT(filters.nr() <= data.nr() + 2*last_padding_y,
+ "Filter windows must be small enough to fit into the padded image.");
+ DLIB_CASSERT(filters.nc() <= data.nc() + 2*last_padding_x,
+ "Filter windows must be small enough to fit into the padded image.");
+
+ DLIB_CASSERT(output.num_samples() == data.num_samples());
+ DLIB_CASSERT(output.k() == filters.num_samples());
+ DLIB_CASSERT(output.nr() == 1+(data.nr()+2*last_padding_y-filters.nr())/last_stride_y);
+ DLIB_CASSERT(output.nc() == 1+(data.nc()+2*last_padding_x-filters.nc())/last_stride_x);
+
+
+ matrix<float> temp;
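+        // The convolution is computed as a GEMM: the filters, viewed as a
+        // (num_filters x k*nr*nc) matrix, are multiplied by the transposed im2col matrix
+        // built for each sample.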
+ for (long n = 0; n < data.num_samples(); ++n)
+ {
+ img2col(temp, data, n, filters.nr(), filters.nc(), last_stride_y, last_stride_x, last_padding_y, last_padding_x);
+
+ if (add_to_output)
+ output.add_to_sample(n, mat(filters)*trans(temp));
+ else
+ output.set_sample(n, mat(filters)*trans(temp));
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void tensor_conv::
+ get_gradient_for_data (
+ const bool add_to_output,
+ const tensor& gradient_input,
+ const tensor& filters,
+ tensor& data_gradient
+ )
+ {
+ matrix<float> temp;
+ if (!add_to_output)
+ data_gradient = 0;
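+        // The gradient with respect to the data is the filter matrix applied to each
+        // sample's output gradient, scattered back into image form by col2img().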
+ for (long n = 0; n < gradient_input.num_samples(); ++n)
+ {
+ auto gi = mat(gradient_input.host()+gradient_input.k()*gradient_input.nr()*gradient_input.nc()*n,
+ gradient_input.k(),
+ gradient_input.nr()*gradient_input.nc());
+
+
+ temp = trans(gi)*mat(filters);
+ col2img(temp, data_gradient, n, filters.nr(), filters.nc(), last_stride_y, last_stride_x, last_padding_y, last_padding_x);
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void tensor_conv::
+ get_gradient_for_filters (
+ const bool add_to_output,
+ const tensor& gradient_input,
+ const tensor& data,
+ tensor& filters_gradient
+ )
+ {
+ matrix<float> temp;
+ for (long n = 0; n < gradient_input.num_samples(); ++n)
+ {
+ auto gi = mat(gradient_input.host()+gradient_input.k()*gradient_input.nr()*gradient_input.nc()*n,
+ gradient_input.k(),
+ gradient_input.nr()*gradient_input.nc());
+
+
+ img2col(temp, data, n, filters_gradient.nr(), filters_gradient.nc(), last_stride_y, last_stride_x, last_padding_y, last_padding_x);
+ if (n == 0)
+ {
+ if (add_to_output)
+ filters_gradient += gi*temp;
+ else
+ filters_gradient = gi*temp;
+ }
+ else
+ {
+ filters_gradient += gi*temp;
+ }
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void copy_tensor(
+ bool add_to,
+ tensor& dest,
+ size_t dest_k_offset,
+ const tensor& src,
+ size_t src_k_offset,
+ size_t count_k
+ )
+ {
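+        // Copies (or adds, when add_to is true) count_k channel planes per sample from
+        // src, starting at channel src_k_offset, into dest starting at channel
+        // dest_k_offset.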
+ const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
+ const size_t src_sample_size = static_cast<size_t>(src.nc() * src.nr() * src.k());
+
+ const size_t block_size = count_k * dest.nc() * dest.nr();
+
+        DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
+                     dest.nc() == src.nc() && dest.nr() == src.nr(),
+                     "dest and src must have the same number of samples, rows, and columns");
+ DLIB_CASSERT(dest.k() - dest_k_offset >= count_k, "Not enough space in dest tensor");
+ DLIB_CASSERT(src.k() - src_k_offset >= count_k, "Not enough space in src tensor");
+
+ float* dest_p = dest.host() + dest_k_offset * dest.nc() * dest.nr();
+ const float* src_p = src.host() + src_k_offset * src.nc() * src.nr();
+
+ for (long i = 0; i < src.num_samples(); ++i)
+ {
+ if (add_to)
+ {
+ for (size_t j = 0; j < block_size; ++j)
+ dest_p[j] += src_p[j];
+ }
+ else
+ {
+ ::memcpy(dest_p, src_p, block_size * sizeof(float));
+ }
+
+ dest_p += dest_sample_size;
+ src_p += src_sample_size;
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+
+ }
+}
+
+
+#endif // DLIB_DNN_CPU_cPP_
+
+
diff --git a/ml/dlib/dlib/dnn/cpu_dlib.h b/ml/dlib/dlib/dnn/cpu_dlib.h
new file mode 100644
index 000000000..330df01a2
--- /dev/null
+++ b/ml/dlib/dlib/dnn/cpu_dlib.h
@@ -0,0 +1,505 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CPU_H_
+#define DLIB_DNN_CPU_H_
+
+// This file contains CPU implementations of the GPU based functions in cuda_dlib.h
+// and cudnn_dlibapi.h
+
+#include "tensor.h"
+#include "../geometry/rectangle.h"
+
+namespace dlib
+{
+ namespace cpu
+ {
+
+ // -----------------------------------------------------------------------------------
+
+ void multiply (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ );
+
+ void multiply_conv (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ );
+
+ void multiply_zero_padded (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ );
+
+ void scale_channels (
+ bool add_to,
+ tensor& dest,
+ const tensor& src,
+ const tensor& scales
+ );
+
+ void add(
+ float beta,
+ tensor& dest,
+ float alpha,
+ const tensor& src
+ );
+
+ void assign_bias_gradient (
+ tensor& grad,
+ const tensor& gradient_input
+ );
+
+ void add (
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ );
+
+ void assign_conv_bias_gradient (
+ tensor& grad,
+ const tensor& gradient_input
+ );
+
+ // -----------------------------------------------------------------------------------
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const float A,
+ const float B
+ );
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const float A,
+ const float B,
+ const float C
+ );
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ const float A,
+ const float B,
+ const float C,
+ const float D
+ );
+
+ void affine_transform_range(
+ size_t begin,
+ size_t end,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ const float A,
+ const float B,
+ const float C
+ );
+
+ // -----------------------------------------------------------------------------------
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const tensor& A,
+ const tensor& B
+ );
+
+ // -----------------------------------------------------------------------------------
+
+ void affine_transform_conv(
+ tensor& dest,
+ const tensor& src,
+ const tensor& A,
+ const tensor& B
+ );
+
+ // -----------------------------------------------------------------------------------
+
+ void affine_transform(
+ const rectangle& rect,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ float A,
+ float B,
+ float C
+ );
+
+ // -----------------------------------------------------------------------------------
+
+ void compute_adam_update (
+ size_t begin,
+ size_t end,
+ tensor& s,
+ tensor& m,
+ tensor& v,
+ const float t,
+ const float learning_rate,
+ const float weight_decay,
+ const float momentum1,
+ const float momentum2,
+ const tensor& params,
+ const tensor& params_grad
+ );
+
+ // -----------------------------------------------------------------------------------
+
+ void batch_normalize_inference (
+ const double eps,
+ resizable_tensor& dest,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta,
+ const tensor& running_means,
+ const tensor& running_variances
+ );
+
+ void batch_normalize (
+ const double eps,
+ resizable_tensor& dest,
+ resizable_tensor& means,
+ resizable_tensor& invstds,
+ const double averaging_factor,
+ resizable_tensor& running_means,
+ resizable_tensor& running_variances,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta
+ );
+
+ void batch_normalize_gradient (
+ const double eps,
+ const tensor& gradient_input,
+ const tensor& means,
+ const tensor& invstds,
+ const tensor& src,
+ const tensor& gamma,
+ tensor& src_grad,
+ tensor& gamma_grad,
+ tensor& beta_grad
+ );
+
+ void batch_normalize_conv_inference (
+ const double eps,
+ resizable_tensor& dest,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta,
+ const tensor& running_means,
+ const tensor& running_variances
+ );
+
+ void batch_normalize_conv (
+ const double eps,
+ resizable_tensor& dest,
+ resizable_tensor& means,
+ resizable_tensor& invstds,
+ const double averaging_factor,
+ resizable_tensor& running_means,
+ resizable_tensor& running_variances,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta
+ );
+
+ void batch_normalize_conv_gradient (
+ const double eps,
+ const tensor& gradient_input,
+ const tensor& means,
+ const tensor& invstds,
+ const tensor& src,
+ const tensor& gamma,
+ tensor& src_grad,
+ tensor& gamma_grad,
+ tensor& beta_grad
+ );
+
+ // -----------------------------------------------------------------------------------
+
+ void threshold (
+ tensor& data,
+ float thresh
+ );
+
+ void dot (
+ const tensor& a,
+ const tensor& b,
+ tensor& result,
+ size_t idx
+ );
+
+ // -----------------------------------------------------------------------------------
+
+ void softmax (
+ tensor& dest,
+ const tensor& src
+ );
+
+ void softmax_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ );
+
+ // ------------------------------------------------------------------------------------
+
+ void softmax_all (
+ tensor& dest,
+ const tensor& src
+ );
+
+ void softmax_all_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ );
+
+ // ------------------------------------------------------------------------------------
+
+ void sigmoid (
+ tensor& dest,
+ const tensor& src
+ );
+
+ void sigmoid_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ );
+
+ // ------------------------------------------------------------------------------------
+
+ void relu (
+ tensor& dest,
+ const tensor& src
+ );
+
+ void relu_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ );
+
+ // ----------------------------------------------------------------------------------------
+
+ void prelu (
+ tensor& dest,
+ const tensor& src,
+ const tensor& param
+ );
+
+ void prelu_gradient (
+ tensor& grad,
+ const tensor& src,
+ const tensor& gradient_input,
+ const tensor& param,
+ tensor& params_grad
+ );
+
+ // ------------------------------------------------------------------------------------
+
+ void tanh (
+ tensor& dest,
+ const tensor& src
+ );
+
+ void tanh_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ );
+
+ // ----------------------------------------------------------------------------------------
+
+ void resize_bilinear (
+ tensor& dest,
+ long dest_row_stride,
+ long dest_channel_stride,
+ const tensor& src,
+ long src_row_stride,
+ long src_channel_stride
+ );
+
+ void resize_bilinear_gradient (
+ tensor& grad,
+ long grad_row_stride,
+ long grad_channel_stride,
+ const tensor& gradient_input,
+ long gradient_input_row_stride,
+ long gradient_input_channel_stride
+ );
+
+ inline void resize_bilinear (
+ tensor& dest,
+ const tensor& src
+ ) { resize_bilinear(dest, dest.nc(), dest.nr()*dest.nc(), src, src.nc(), src.nr()*src.nc()); }
+
+ inline void resize_bilinear_gradient (
+ tensor& grad,
+ const tensor& gradient_input
+ ) { resize_bilinear_gradient(grad, grad.nc(), grad.nr()*grad.nc(), gradient_input, gradient_input.nc(), gradient_input.nr()*gradient_input.nc()); }
+
+ // -----------------------------------------------------------------------------------
+
+ class pooling
+ {
+ public:
+
+ pooling(const pooling&) = delete;
+ pooling& operator=(const pooling&) = delete;
+
+ pooling (
+ );
+
+ void clear(
+ );
+
+ void setup_max_pooling(
+ int window_height,
+ int window_width,
+ int stride_y,
+ int stride_x,
+ int padding_y,
+ int padding_x
+ );
+
+ void setup_avg_pooling(
+ int window_height,
+ int window_width,
+ int stride_y,
+ int stride_x,
+ int padding_y,
+ int padding_x
+ );
+
+ bool does_max_pooling(
+ ) const { return do_max_pooling; }
+
+ void operator() (
+ resizable_tensor& dest,
+ const tensor& src
+ );
+
+ void get_gradient(
+ const tensor& gradient_input,
+ const tensor& dest,
+ const tensor& src,
+ tensor& grad
+ );
+
+ private:
+ int window_height;
+ int window_width;
+ int stride_y;
+ int stride_x;
+ int padding_y;
+ int padding_x;
+ bool do_max_pooling;
+
+ };
+
+ // -----------------------------------------------------------------------------------
+
+ class tensor_conv
+ {
+ public:
+ tensor_conv(const tensor_conv&) = delete;
+ tensor_conv& operator=(const tensor_conv&) = delete;
+
+ tensor_conv() {}
+
+ void clear(
+ ) {}
+
+ void setup(
+ const tensor& data, /* not used but required for interface */
+            const tensor& filters, /* used only by the assertions below */
+ int stride_y,
+ int stride_x,
+ int padding_y,
+ int padding_x
+ )
+ {
+ (void)data; /* silence compiler */
+ DLIB_CASSERT(stride_y > 0 && stride_x > 0);
+ DLIB_CASSERT(0 <= padding_y && padding_y < filters.nr());
+ DLIB_CASSERT(0 <= padding_x && padding_x < filters.nc());
+ last_stride_y = stride_y;
+ last_stride_x = stride_x;
+ last_padding_y = padding_y;
+ last_padding_x = padding_x;
+ }
+
+ void operator() (
+ const bool add_to_output,
+ resizable_tensor& output,
+ const tensor& data,
+ const tensor& filters
+ );
+
+ void operator() (
+ const bool add_to_output,
+ tensor& output,
+ const tensor& data,
+ const tensor& filters
+ );
+
+ void get_gradient_for_data (
+ const bool add_to_output,
+ const tensor& gradient_input,
+ const tensor& filters,
+ tensor& data_gradient
+ );
+
+ void get_gradient_for_filters (
+ const bool add_to_output,
+ const tensor& gradient_input,
+ const tensor& data,
+ tensor& filters_gradient
+ );
+
+ private:
+
+ long last_stride_y = 0;
+ long last_stride_x = 0;
+ long last_padding_y = 0;
+ long last_padding_x = 0;
+ };
+
+ // -----------------------------------------------------------------------------------
+
+ void copy_tensor(
+ bool add_to,
+ tensor& dest,
+ size_t dest_k_offset,
+ const tensor& src,
+ size_t src_k_offset,
+ size_t count_k
+ );
+
+ // -----------------------------------------------------------------------------------
+
+ }
+}
+
+#ifdef NO_MAKEFILE
+#include "cpu_dlib.cpp"
+#endif
+
+#endif // DLIB_DNN_CPU_H_
+
+
diff --git a/ml/dlib/dlib/dnn/cublas_dlibapi.cpp b/ml/dlib/dlib/dnn/cublas_dlibapi.cpp
new file mode 100644
index 000000000..376cc9f00
--- /dev/null
+++ b/ml/dlib/dlib/dnn/cublas_dlibapi.cpp
@@ -0,0 +1,165 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuBLAS_CPP_
+#define DLIB_DNN_CuBLAS_CPP_
+
+#ifdef DLIB_USE_CUDA
+
+#include "cublas_dlibapi.h"
+#include "cuda_utils.h"
+
+#include <cublas_v2.h>
+#include <vector>
+
+static const char* cublas_get_error_string(cublasStatus_t s)
+{
+ switch(s)
+ {
+ case CUBLAS_STATUS_NOT_INITIALIZED:
+ return "CUDA Runtime API initialization failed.";
+ case CUBLAS_STATUS_ALLOC_FAILED:
+ return "CUDA Resources could not be allocated.";
+ default:
+ return "A call to cuBLAS failed";
+ }
+}
+
+// Check the return value of a call to the cuBLAS runtime for an error condition.
+#define CHECK_CUBLAS(call) \
+do{ \
+ const cublasStatus_t error = call; \
+ if (error != CUBLAS_STATUS_SUCCESS) \
+ { \
+ std::ostringstream sout; \
+ sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
+ sout << "code: " << error << ", reason: " << cublas_get_error_string(error);\
+ throw dlib::cublas_error(sout.str()); \
+ } \
+}while(false)
+
+namespace dlib
+{
+ namespace cuda
+ {
+
+ // -----------------------------------------------------------------------------------
+
+ class cublas_context
+ {
+ public:
+ // not copyable
+ cublas_context(const cublas_context&) = delete;
+ cublas_context& operator=(const cublas_context&) = delete;
+
+ cublas_context()
+ {
+ handles.resize(16);
+ }
+ ~cublas_context()
+ {
+ for (auto h : handles)
+ {
+ if (h)
+ cublasDestroy(h);
+ }
+ }
+
+ cublasHandle_t get_handle (
+ )
+ {
+ int new_device_id;
+ CHECK_CUDA(cudaGetDevice(&new_device_id));
+ // make room for more devices if needed
+ if (new_device_id >= (long)handles.size())
+ handles.resize(new_device_id+16);
+
+ // If we don't have a handle already for this device then make one
+ if (!handles[new_device_id])
+ CHECK_CUBLAS(cublasCreate(&handles[new_device_id]));
+
+ // Finally, return the handle for the current device
+ return handles[new_device_id];
+ }
+
+ private:
+
+ std::vector<cublasHandle_t> handles;
+ };
+
+ static cublasHandle_t context()
+ {
+ thread_local cublas_context c;
+ return c.get_handle();
+ }
+
+ // -----------------------------------------------------------------------------------
+
+ void gemm (
+ float beta,
+ tensor& dest,
+ float alpha,
+ const tensor& lhs,
+ bool trans_lhs,
+ const tensor& rhs,
+ bool trans_rhs
+ )
+ {
+ // Recall that BLAS uses column major order so to deal with that we flip the
+ // order of the lhs and rhs arguments.
+ const auto transa = trans_lhs ? CUBLAS_OP_T : CUBLAS_OP_N;
+ const auto transb = trans_rhs ? CUBLAS_OP_T : CUBLAS_OP_N;
+
+ const int dest_nr = dest.num_samples();
+ const int dest_nc = dest.size()/dest_nr;
+ const int lhs_nr = lhs.num_samples();
+ const int lhs_nc = lhs.size()/lhs_nr;
+ const int rhs_nr = rhs.num_samples();
+ const int rhs_nc = rhs.size()/rhs_nr;
+ if (trans_lhs && trans_rhs)
+ {
+ DLIB_ASSERT( dest_nr == lhs_nc &&
+ dest_nc == rhs_nr &&
+ lhs_nr == rhs_nc)
+ }
+ else if (!trans_lhs && trans_rhs)
+ {
+ DLIB_ASSERT( dest_nr == lhs_nr &&
+ dest_nc == rhs_nr &&
+ lhs_nc == rhs_nc)
+ }
+ else if (trans_lhs && !trans_rhs)
+ {
+ DLIB_ASSERT( dest_nr == lhs_nc &&
+ dest_nc == rhs_nc &&
+ lhs_nr == rhs_nr)
+ }
+ else
+ {
+ DLIB_ASSERT( dest_nr == lhs_nr &&
+ dest_nc == rhs_nc &&
+ lhs_nc == rhs_nr)
+ }
+
+ const int k = trans_rhs ? rhs_nc : rhs_nr;
+ CHECK_CUBLAS(cublasSgemm(context(),
+ transb,
+ transa,
+ dest_nc, dest_nr, k,
+ &alpha,
+ rhs.device(), rhs_nc,
+ lhs.device(), lhs_nc,
+ &beta,
+ dest.device(),dest_nc));
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ }
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuBLAS_CPP_
+
+
+
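A note on the row/column-major flip in gemm() above: cuBLAS assumes column-major storage while dlib tensors are row-major. A row-major matrix M occupies the same memory as the column-major matrix M^T, so dest = lhs*rhs is obtained by asking cuBLAS for dest^T = rhs^T * lhs^T; that is why rhs is passed first and the leading dimensions are the row-major column counts. A minimal usage sketch (requires a DLIB_USE_CUDA build; dest must be preallocated with the product's dimensions, per the contract in cublas_dlibapi.h):

    dlib::resizable_tensor L(2,3), R(3,4), D(2,4);  // treated as 2x3, 3x4 and 2x4 matrices
    L = 1;  R = 2;
    dlib::cuda::gemm(0, D, 1, L, false, R, false);  // D = 1*L*R + 0*D
    // every element of D is now 6 (inner dimension 3 times 1*2)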
diff --git a/ml/dlib/dlib/dnn/cublas_dlibapi.h b/ml/dlib/dlib/dnn/cublas_dlibapi.h
new file mode 100644
index 000000000..b46fd25ca
--- /dev/null
+++ b/ml/dlib/dlib/dnn/cublas_dlibapi.h
@@ -0,0 +1,50 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuBLAS_H_
+#define DLIB_DNN_CuBLAS_H_
+
+#ifdef DLIB_USE_CUDA
+
+#include "tensor.h"
+#include "cuda_errors.h"
+
+namespace dlib
+{
+ namespace cuda
+ {
+
+ // -----------------------------------------------------------------------------------
+
+ void gemm (
+ float beta,
+ tensor& dest,
+ float alpha,
+ const tensor& lhs,
+ bool trans_lhs,
+ const tensor& rhs,
+ bool trans_rhs
+ );
+ /*!
+ requires
+ - The dimensions of lhs and rhs must be compatible for matrix
+ multiplication. In particular:
+ - Let L == trans_lhs ? trans(mat(lhs)) : mat(lhs)
+ - Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs)
+ - Let D == mat(dest)
+ - D.nr() == L.nr() && D.nc() == R.nc()
+ (i.e. dest must be preallocated and have the correct output dimensions)
+ - L.nc() == R.nr()
+ ensures
+ - performs: dest = alpha*L*R + beta*mat(dest)
+ !*/
+
+ // ------------------------------------------------------------------------------------
+
+ }
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuBLAS_H_
+
+
diff --git a/ml/dlib/dlib/dnn/cuda_data_ptr.cpp b/ml/dlib/dlib/dnn/cuda_data_ptr.cpp
new file mode 100644
index 000000000..8abce0695
--- /dev/null
+++ b/ml/dlib/dlib/dnn/cuda_data_ptr.cpp
@@ -0,0 +1,71 @@
+// Copyright (C) 2017 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuDA_DATA_PTR_CPP_
+#define DLIB_DNN_CuDA_DATA_PTR_CPP_
+
+#ifdef DLIB_USE_CUDA
+
+#include "cuda_data_ptr.h"
+#include "cuda_utils.h"
+
+namespace dlib
+{
+ namespace cuda
+ {
+
+ // -----------------------------------------------------------------------------------
+
+ cuda_data_void_ptr::
+ cuda_data_void_ptr(
+ size_t n
+ ) : num(n)
+ {
+ if (n == 0)
+ return;
+
+ void* data = nullptr;
+
+ CHECK_CUDA(cudaMalloc(&data, n));
+ pdata.reset(data, [](void* ptr){
+ auto err = cudaFree(ptr);
+ if(err!=cudaSuccess)
+ std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
+ });
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void memcpy(
+ void* dest,
+ const cuda_data_void_ptr& src
+ )
+ {
+ if (src.size() != 0)
+ {
+ CHECK_CUDA(cudaMemcpy(dest, src.data(), src.size(), cudaMemcpyDefault));
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void memcpy(
+ cuda_data_void_ptr& dest,
+ const void* src
+ )
+ {
+ if (dest.size() != 0)
+ {
+ CHECK_CUDA(cudaMemcpy(dest.data(), src, dest.size(), cudaMemcpyDefault));
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ }
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuDA_DATA_PTR_CPP_
+
+
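The constructor above pairs cudaMalloc() with a std::shared_ptr whose custom deleter calls cudaFree() and only logs on failure, since throwing from a deleter would be unsafe. A minimal standalone sketch of the same ownership pattern, assuming only the CUDA runtime headers (cuda_data_void_ptr additionally tracks the buffer size and uses dlib's CHECK_CUDA error handling):

    #include <cuda_runtime.h>
    #include <iostream>
    #include <memory>
    #include <new>

    std::shared_ptr<void> make_device_buffer(size_t nbytes)
    {
        void* p = nullptr;
        if (cudaMalloc(&p, nbytes) != cudaSuccess)
            throw std::bad_alloc();
        // the deleter only reports the error; it never throws
        return std::shared_ptr<void>(p, [](void* q) {
            if (cudaFree(q) != cudaSuccess)
                std::cerr << "cudaFree() failed\n";
        });
    }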
diff --git a/ml/dlib/dlib/dnn/cuda_data_ptr.h b/ml/dlib/dlib/dnn/cuda_data_ptr.h
new file mode 100644
index 000000000..7eca608a0
--- /dev/null
+++ b/ml/dlib/dlib/dnn/cuda_data_ptr.h
@@ -0,0 +1,184 @@
+// Copyright (C) 2017 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuDA_DATA_PTR_H_
+#define DLIB_DNN_CuDA_DATA_PTR_H_
+
+#ifdef DLIB_USE_CUDA
+
+#include <memory>
+#include <vector>
+
+namespace dlib
+{
+ namespace cuda
+ {
+
+ // ------------------------------------------------------------------------------------
+
+ class cuda_data_void_ptr
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is a block of memory on a CUDA device.
+ !*/
+ public:
+
+ cuda_data_void_ptr() = default;
+
+ cuda_data_void_ptr(size_t n);
+ /*!
+ ensures
+ - This object will allocate a device memory buffer of n bytes.
+ - #size() == n
+ !*/
+
+ void* data() { return pdata.get(); }
+ const void* data() const { return pdata.get(); }
+ operator void*() { return pdata.get(); }
+ operator const void*() const { return pdata.get(); }
+
+ void reset() { pdata.reset(); }
+
+ size_t size() const { return num; }
+ /*!
+ ensures
+ - returns the length of this buffer, in bytes.
+ !*/
+
+ private:
+
+ size_t num = 0;
+ std::shared_ptr<void> pdata;
+ };
+
+ // ------------------------------------------------------------------------------------
+
+ void memcpy(
+ void* dest,
+ const cuda_data_void_ptr& src
+ );
+ /*!
+ requires
+ - dest == a pointer to at least src.size() bytes on the host machine.
+ ensures
+ - copies the GPU data from src into dest.
+ !*/
+
+ // ------------------------------------------------------------------------------------
+
+ void memcpy(
+ cuda_data_void_ptr& dest,
+ const void* src
+ );
+ /*!
+ requires
+ - src == a pointer to at least dest.size() bytes on the host machine.
+ ensures
+ - copies the host data from src to the GPU memory buffer dest.
+ !*/
+
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+
+ template <typename T>
+ class cuda_data_ptr
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is a block of memory on a CUDA device. It is just a type safe
+ version of cuda_data_void_ptr.
+ !*/
+
+ public:
+
+ static_assert(std::is_standard_layout<T>::value, "You can only create basic standard layout types on the GPU");
+
+ cuda_data_ptr() = default;
+ cuda_data_ptr(size_t n) : num(n)
+ /*!
+ ensures
+ - This object will allocate a device memory buffer of n T objects.
+ - #size() == n
+ !*/
+ {
+ if (n == 0)
+ return;
+
+ pdata = cuda_data_void_ptr(n*sizeof(T));
+ }
+
+ T* data() { return (T*)pdata.data(); }
+ const T* data() const { return (T*)pdata.data(); }
+
+ operator T*() { return (T*)pdata.data(); }
+ operator const T*() const { return (T*)pdata.data(); }
+
+ void reset() { pdata.reset(); }
+
+ size_t size() const { return num; }
+
+
+ friend void memcpy(
+ std::vector<T>& dest,
+ const cuda_data_ptr& src
+ )
+ {
+ dest.resize(src.size());
+ if (src.size() != 0)
+ memcpy(dest.data(), src.pdata);
+ }
+
+ friend void memcpy(
+ cuda_data_ptr& dest,
+ const std::vector<T>& src
+ )
+ {
+ if (dest.size() != src.size())
+ dest = cuda_data_ptr<T>(src.size());
+
+ if (src.size() != 0)
+ memcpy(dest.pdata, src.data());
+ }
+
+ private:
+
+ size_t num = 0;
+ cuda_data_void_ptr pdata;
+ };
+
+ // ------------------------------------------------------------------------------------
+
+ class resizable_cuda_buffer
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is a block of memory on a CUDA device that is automatically
+ reallocated whenever the requested size exceeds the currently allocated size.
+ !*/
+ public:
+ cuda_data_void_ptr get(size_t size)
+ /*!
+ ensures
+ - This object will return a buffer of the requested size or larger
+ - buffer.size() >= size
+ !*/
+ {
+ if (buffer.size() < size)
+ {
+ buffer.reset();
+ buffer = cuda_data_void_ptr(size);
+ }
+ return buffer;
+ }
+ private:
+ cuda_data_void_ptr buffer;
+ };
+
+ }
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuDA_DATA_PTR_H_
+
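A round-trip usage sketch for the typed pointer and the friend memcpy() overloads above (DLIB_USE_CUDA build assumed; the overloads are found via argument-dependent lookup):

    std::vector<float> host = {1, 2, 3, 4};
    dlib::cuda::cuda_data_ptr<float> dev(host.size());
    memcpy(dev, host);       // host -> device, reallocating dev if the sizes differ
    std::vector<float> back;
    memcpy(back, dev);       // device -> host, back is resized to dev.size()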
diff --git a/ml/dlib/dlib/dnn/cuda_dlib.cu b/ml/dlib/dlib/dnn/cuda_dlib.cu
new file mode 100644
index 000000000..6c37593f1
--- /dev/null
+++ b/ml/dlib/dlib/dnn/cuda_dlib.cu
@@ -0,0 +1,1630 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+
+#include "cuda_utils.h"
+#include "cuda_dlib.h"
+
+
+namespace dlib
+{
+ namespace cuda
+ {
+
+ // -----------------------------------------------------------------------------------
+
+ void set_device (
+ int dev
+ )
+ {
+ CHECK_CUDA(cudaSetDevice(dev));
+ }
+
+ int get_device (
+ )
+ {
+ int dev = 0;
+ CHECK_CUDA(cudaGetDevice(&dev));
+ return dev;
+ }
+
+ std::string get_device_name (
+ int device
+ )
+ {
+ cudaDeviceProp props;
+ CHECK_CUDA(cudaGetDeviceProperties(&props, device));
+ return props.name;
+ }
+
+ void set_current_device_blocking_sync(
+ )
+ {
+ CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
+ }
+
+ int get_num_devices (
+ )
+ {
+ int num_devices;
+ CHECK_CUDA(cudaGetDeviceCount(&num_devices));
+ return num_devices;
+ }
+
+ bool can_access_peer (int device_id, int peer_device_id)
+ {
+ int can_access;
+ CHECK_CUDA(cudaDeviceCanAccessPeer(&can_access, device_id, peer_device_id));
+ return can_access != 0;
+ }
+ bool can_access_peer (const tensor& device, const tensor& peer_device)
+ {
+ return can_access_peer(device.device_id(), peer_device.device_id());
+ }
+
+ void device_synchronize (int dev)
+ {
+ raii_set_device set_dev(dev);
+ CHECK_CUDA(cudaDeviceSynchronize());
+ }
+ void device_synchronize (const tensor& dev) { device_synchronize(dev.device_id()); }
+
+ enable_peer_access::
+ enable_peer_access(
+ int device_id,
+ int peer_device_id
+ ) : call_disable(false), device_id(device_id), peer_device_id(peer_device_id)
+ {
+ raii_set_device set_dev(device_id);
+
+ auto err = cudaDeviceEnablePeerAccess(peer_device_id, 0);
+ if (err == cudaSuccess)
+ {
+ call_disable = true;
+ }
+ else if (err == cudaErrorPeerAccessAlreadyEnabled)
+ {
+ // call cudaGetLastError() to dispose of this error since we don't
+ // care.
+ auto err2 = cudaGetLastError();
+ if (err2 != cudaErrorPeerAccessAlreadyEnabled)
+ CHECK_CUDA(err2);
+ }
+ else
+ {
+ CHECK_CUDA(err);
+ }
+ }
+
+
+ enable_peer_access::
+ ~enable_peer_access() noexcept(false)
+ {
+ if (call_disable)
+ {
+ raii_set_device set_dev(device_id);
+ CHECK_CUDA(cudaDeviceDisablePeerAccess(peer_device_id));
+ }
+ }
+
+ // -----------------------------------------------------------------------------------
+ // -----------------------------------------------------------------------------------
+ // -----------------------------------------------------------------------------------
+
+ __global__ void _cuda_inverse_norms(float* invnorms, const float* data, size_t nr, size_t nc, const float eps)
+ {
+ // initialize invnorms before we begin.
+ for (auto i : grid_stride_range_y(0, nr))
+ for (auto j : grid_stride_range(0, 1))
+ invnorms[i] = eps;
+ __syncthreads();
+
+ for (auto i : grid_stride_range_y(0, nr))
+ {
+ auto p = data + i*nc;
+ float temp = 0;
+ for (auto j : grid_stride_range(0, nc))
+ temp += p[j]*p[j];
+
+ // and store the sum into invnorms[i]
+ warp_reduce_atomic_add(invnorms[i], temp);
+ }
+ __syncthreads();
+
+ for (auto i : grid_stride_range_y(0, nr))
+ for (auto j : grid_stride_range(0, 1))
+ invnorms[i] = 1.0/std::sqrt(invnorms[i]);
+ }
+
+ void inverse_norms (
+ resizable_tensor& invnorms,
+ const tensor& data,
+ const double eps
+ )
+ {
+ invnorms.set_size(data.num_samples());
+ launch_kernel(_cuda_inverse_norms, max_jobs(data.size()/data.num_samples(), data.num_samples()),
+ invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps);
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_dot_prods(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc)
+ {
+ // initialize out before we begin.
+ for (auto i : grid_stride_range_y(0, nr))
+ for (auto j : grid_stride_range(0, 1))
+ out[i] = 0;
+ __syncthreads();
+
+ for (auto i : grid_stride_range_y(0, nr))
+ {
+ auto l = lhs + i*nc;
+ auto r = rhs + i*nc;
+ float temp = 0;
+ for (auto j : grid_stride_range(0, nc))
+ temp += l[j]*r[j];
+
+ // and store the sum into out[i]
+ warp_reduce_atomic_add(out[i], temp);
+ }
+ }
+
+ __global__ void _cuda_dot_prods_add_to(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc)
+ {
+ for (auto i : grid_stride_range_y(0, nr))
+ {
+ auto l = lhs + i*nc;
+ auto r = rhs + i*nc;
+ float temp = 0;
+ for (auto j : grid_stride_range(0, nc))
+ temp += l[j]*r[j];
+
+ // and store the sum into out[i]
+ warp_reduce_atomic_add(out[i], temp);
+ }
+ }
+
+ void dot_prods (
+ resizable_tensor& out,
+ const tensor& lhs,
+ const tensor& rhs
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(lhs,rhs));
+
+ out.set_size(lhs.num_samples());
+ if (out.size() == 0)
+ return;
+
+ const auto nr = lhs.num_samples();
+ const auto nc = lhs.size()/lhs.num_samples();
+
+ launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc);
+ }
+
+ void dot_prods (
+ bool add_to,
+ tensor& out,
+ const tensor& lhs,
+ const tensor& rhs
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(lhs,rhs));
+ DLIB_CASSERT(out.k() == 1 && out.nr() == 1 && out.nc() == 1);
+ DLIB_CASSERT(out.size() == lhs.num_samples());
+
+ const auto nr = lhs.num_samples();
+ const auto nc = lhs.size()/lhs.num_samples();
+
+ if (add_to)
+ launch_kernel(_cuda_dot_prods_add_to, max_jobs(nc,nr), out.device(), lhs.device(), rhs.device(), nr, nc);
+ else
+ launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc);
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_scale_columns(float* out, const float* m, const float* v, size_t nr, size_t nc)
+ {
+ for (auto j : grid_stride_range(0, nr*nc))
+ {
+ out[j] = m[j]*v[j%nc];
+ }
+ }
+
+ void scale_columns (
+ tensor& out,
+ const tensor& m,
+ const tensor& v
+ )
+ {
+ launch_kernel(_cuda_scale_columns, max_jobs(m.size()), out.device(), m.device(), v.device(), m.num_samples(), m.size()/m.num_samples());
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_scale_rows(float* out, const float* m, const float* v, size_t nr, size_t nc)
+ {
+ for (auto j : grid_stride_range(0, nr*nc))
+ {
+ out[j] = m[j]*v[j/nc];
+ }
+ }
+
+ void scale_rows (
+ tensor& out,
+ const tensor& m,
+ const tensor& v
+ )
+ {
+ launch_kernel(_cuda_scale_rows, max_jobs(m.size()), out.device(), m.device(), v.device(), m.num_samples(), m.size()/m.num_samples());
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_scale_rows2(float* out, const float* m1, const float* m2, const float* v1, const float* v2, size_t nr, size_t nc)
+ {
+ for (auto j : grid_stride_range(0, nr*nc))
+ {
+ out[j] = (m1[j] - m2[j]*v1[j/nc]) * v2[j/nc];
+ }
+ }
+
+ __global__ void _cuda_scale_rows2_beta(const float beta, float* out, const float* m1, const float* m2, const float* v1, const float* v2, size_t nr, size_t nc)
+ {
+ for (auto j : grid_stride_range(0, nr*nc))
+ {
+ out[j] = beta*out[j] + (m1[j] - m2[j]*v1[j/nc]) * v2[j/nc];
+ }
+ }
+
+ void scale_rows2 (
+ float beta,
+ tensor& out,
+ const tensor& m1,
+ const tensor& m2,
+ const tensor& v1,
+ const tensor& v2
+ )
+ {
+ if (beta == 0)
+ {
+ launch_kernel(_cuda_scale_rows2, max_jobs(m1.size()), out.device(),
+ m1.device(), m2.device(), v1.device(), v2.device(), m1.num_samples(),
+ m1.size()/m1.num_samples());
+ }
+ else
+ {
+ launch_kernel(_cuda_scale_rows2_beta, max_jobs(m1.size()), beta,
+ out.device(), m1.device(), m2.device(), v1.device(), v2.device(),
+ m1.num_samples(), m1.size()/m1.num_samples());
+ }
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_exp(float* dest, const float* src, size_t n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ dest[i] = ::exp(src[i]);
+ }
+
+ void exp (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_ASSERT(dest.size() == src.size());
+ launch_kernel(_cuda_exp, max_jobs(src.size()), dest.device(), src.device(), src.size());
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_log(float* dest, const float* src, size_t n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ dest[i] = ::log(src[i]);
+ }
+
+ void log (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_ASSERT(dest.size() == src.size());
+ launch_kernel(_cuda_log, max_jobs(src.size()), dest.device(), src.device(), src.size());
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_log10(float* dest, const float* src, size_t n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ dest[i] = ::log10(src[i]);
+ }
+
+ void log10 (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_ASSERT(dest.size() == src.size());
+ launch_kernel(_cuda_log10, max_jobs(src.size()), dest.device(), src.device(), src.size());
+ }
+
+ // -----------------------------------------------------------------------------------
+
+ __global__ void _cuda_multiply1(float* d, const float* s1, const float* s2, size_t n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = s1[i]*s2[i];
+ }
+ }
+ __global__ void _cuda_multiply2(float* d, const float* s1, const float* s2,
+ size_t n, size_t s1_n, size_t s2_n, size_t max_size)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = 0;
+ for (size_t j = i; j < max_size; j += n)
+ d[i] += s1[j%s1_n]*s2[j%s2_n];
+ }
+ }
+
+ __global__ void _cuda_multiply3(float* d, const float* s1, const float* s2,
+ size_t n, size_t s1_n, size_t s2_n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = s1[i%s1_n]*s2[i%s2_n];
+ }
+ }
+
+ __global__ void _cuda_multiply1_add_to(float* d, const float* s1, const float* s2, size_t n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] += s1[i]*s2[i];
+ }
+ }
+ __global__ void _cuda_multiply2_add_to(float* d, const float* s1, const float* s2,
+ size_t n, size_t s1_n, size_t s2_n, size_t max_size)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ for (size_t j = i; j < max_size; j += n)
+ d[i] += s1[j%s1_n]*s2[j%s2_n];
+ }
+ }
+
+ __global__ void _cuda_multiply3_add_to(float* d, const float* s1, const float* s2,
+ size_t n, size_t s1_n, size_t s2_n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] += s1[i%s1_n]*s2[i%s2_n];
+ }
+ }
+
+ void multiply (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ )
+ {
+
+ DLIB_CASSERT(dest.k() == src1.k() && src1.k() == src2.k() &&
+ dest.nr() == src1.nr() && src1.nr() == src2.nr() &&
+ dest.nc() == src1.nc() && src1.nc() == src2.nc() );
+ const long MD = std::max(std::max(dest.num_samples(),src1.num_samples()),src2.num_samples());
+ DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) &&
+ (src1.num_samples()==1 || src1.num_samples()==MD) &&
+ (src2.num_samples()==1 || src2.num_samples()==MD) );
+
+ if (dest.size() == 0)
+ return;
+
+ const size_t max_size = std::max(std::max(dest.size(),src1.size()),src2.size());
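+ // Dispatch: identically sized tensors use the plain elementwise kernel. If dest
+ // has a single sample, the kernel accumulates every batch element that maps onto
+ // d[i]; otherwise the inputs are broadcast across the batch via modulo indexing.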
+ if (dest.size() == src1.size() && src1.size() == src2.size())
+ {
+ if (add_to)
+ launch_kernel(_cuda_multiply1_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size());
+ else
+ launch_kernel(_cuda_multiply1,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size());
+ }
+ else if (dest.num_samples() == 1)
+ {
+ if (add_to)
+ launch_kernel(_cuda_multiply2_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
+ dest.size(), src1.size(), src2.size(), max_size);
+ else
+ launch_kernel(_cuda_multiply2,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
+ dest.size(), src1.size(), src2.size(), max_size);
+ }
+ else
+ {
+ if (add_to)
+ launch_kernel(_cuda_multiply3_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
+ dest.size(), src1.size(), src2.size());
+ else
+ launch_kernel(_cuda_multiply3,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
+ dest.size(), src1.size(), src2.size());
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ __global__ void _cuda_multiply_conv(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ auto k = (i/bs)%ks;
+ d[i] = s1[i]*s2[k];
+ }
+ }
+
+ __global__ void _cuda_multiply_conv2(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks)
+ {
+ // zero initialize d before we begin.
+ for (auto i : grid_stride_range_y(0, ks))
+ for (auto j : grid_stride_range(0, 1))
+ d[i] = 0;
+ __syncthreads();
+
+ // loop over all the image planes
+ for (auto i : grid_stride_range_y(0, n))
+ {
+ // sum all the elements in the i-th image plane
+ float temp = 0;
+ for (auto j : grid_stride_range(i*bs, (i+1)*bs))
+ temp += s1[j]*s2[j];
+ auto k = i%ks;
+ // and store the sum into d[k]
+ warp_reduce_atomic_add(d[k], temp);
+ }
+ }
+
+ __global__ void _cuda_multiply_conv_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ auto k = (i/bs)%ks;
+ d[i] += s1[i]*s2[k];
+ }
+ }
+
+ __global__ void _cuda_multiply_conv2_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks)
+ {
+ // loop over all the image planes
+ for (auto i : grid_stride_range_y(0, n))
+ {
+ // sum all the elements in the i-th image plane
+ float temp = 0;
+ for (auto j : grid_stride_range(i*bs, (i+1)*bs))
+ temp += s1[j]*s2[j];
+ auto k = i%ks;
+ // and store the sum into d[k]
+ warp_reduce_atomic_add(d[k], temp);
+ }
+ }
+
+
+ void multiply_conv (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ )
+ {
+ if (have_same_dimensions(dest,src1))
+ {
+ DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 && src2.nc() == 1 && src2.k() == src1.k());
+ if (dest.size() == 0)
+ return;
+
+ if (add_to)
+ launch_kernel(_cuda_multiply_conv_add_to,max_jobs(dest.size()),
+ dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k());
+ else
+ launch_kernel(_cuda_multiply_conv,max_jobs(dest.size()),
+ dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k());
+ }
+ else
+ {
+ DLIB_CASSERT(have_same_dimensions(src1,src2));
+ DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 && dest.nc() == 1 && dest.k() == src1.k());
+ if (dest.size() == 0)
+ return;
+
+
+ const auto bs = src1.nr()*src1.nc();
+ const auto n = src1.num_samples()*src1.k();
+ if (add_to)
+ launch_kernel(_cuda_multiply_conv2_add_to, max_jobs(bs,n),
+ dest.device(), src1.device(), n, src2.device(), bs, src1.k());
+ else
+ launch_kernel(_cuda_multiply_conv2, max_jobs(bs,n),
+ dest.device(), src1.device(), n, src2.device(), bs, src1.k());
+ }
+
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ __global__ void _cuda_scale_channels_add_to(float* d, const float* src, size_t n, const float* scales, size_t bs)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ auto k = i/bs;
+ d[i] += src[i]*scales[k];
+ }
+ }
+
+ __global__ void _cuda_scale_channels(float* d, const float* src, size_t n, const float* scales, size_t bs)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ auto k = i/bs;
+ d[i] = src[i]*scales[k];
+ }
+ }
+
+ void scale_channels (
+ bool add_to,
+ tensor& dest,
+ const tensor& src,
+ const tensor& scales
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(dest,src) &&
+ scales.num_samples() == src.num_samples() &&
+ scales.k() == src.k() &&
+ scales.nr() == 1 &&
+ scales.nc() == 1 );
+
+ if (dest.size() == 0)
+ return;
+
+ if (add_to)
+ launch_kernel(_cuda_scale_channels_add_to,max_jobs(dest.size()),
+ dest.device(), src.device(), src.size(), scales.device(), src.nr()*src.nc());
+ else
+ launch_kernel(_cuda_scale_channels,max_jobs(dest.size()),
+ dest.device_write_only(), src.device(), src.size(), scales.device(), src.nr()*src.nc());
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ __global__ void _cuda_mult1(float* d, const float* s1, const float* s2, size_t n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = s1[i]*s2[i];
+ }
+ }
+
+ __global__ void _cuda_mult1_add_to(float* d, const float* s1, const float* s2, size_t n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] += s1[i]*s2[i];
+ }
+ }
+
+ __global__ void _cuda_mult2(float* d, const float* s1, const float* s2,
+ size_t dn, size_t dk, size_t dr, size_t dc,
+ size_t s1n, size_t s1k, size_t s1r, size_t s1c,
+ size_t s2n, size_t s2k, size_t s2r, size_t s2c)
+ {
+ for (auto i : grid_stride_range(0, dn*dk*dr*dc))
+ {
+ size_t n,k,r,c;
+ unpack_idx(i, dk,dr,dc, n,k,r,c);
+
+ float v1 = 0;
+ float v2 = 0;
+
+ if (n < s1n &&
+ k < s1k &&
+ r < s1r &&
+ c < s1c )
+ {
+ v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)];
+ }
+
+ if (n < s2n &&
+ k < s2k &&
+ r < s2r &&
+ c < s2c )
+ {
+ v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)];
+ }
+
+ d[i] = v1*v2;
+ }
+ }
+
+ __global__ void _cuda_mult2_add_to(float* d, const float* s1, const float* s2,
+ size_t dn, size_t dk, size_t dr, size_t dc,
+ size_t s1n, size_t s1k, size_t s1r, size_t s1c,
+ size_t s2n, size_t s2k, size_t s2r, size_t s2c)
+ {
+ for (auto i : grid_stride_range(0, dn*dk*dr*dc))
+ {
+ size_t n,k,r,c;
+ unpack_idx(i, dk,dr,dc, n,k,r,c);
+
+ float v1 = 0;
+ float v2 = 0;
+
+ if (n < s1n &&
+ k < s1k &&
+ r < s1r &&
+ c < s1c )
+ {
+ v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)];
+ }
+
+ if (n < s2n &&
+ k < s2k &&
+ r < s2r &&
+ c < s2c )
+ {
+ v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)];
+ }
+
+ d[i] += v1*v2;
+ }
+ }
+
+ void multiply_zero_padded (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ )
+ {
+ if (dest.size() == 0)
+ return;
+
+ // Do the simple and fast version if everything has the same dimensions
+ if (have_same_dimensions(dest, src1) &&
+ have_same_dimensions(dest, src2))
+ {
+ if (add_to)
+ launch_kernel(_cuda_mult1_add_to,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size());
+ else
+ launch_kernel(_cuda_mult1,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size());
+ }
+ else
+ {
+ if (add_to)
+ {
+ // Otherwise, do the more complex version with bounds checking.
+ launch_kernel(_cuda_mult2_add_to,max_jobs(dest.size()),
+ dest.device(), src1.device(), src2.device(),
+ dest.num_samples(), dest.k(), dest.nr(), dest.nc(),
+ src1.num_samples(), src1.k(), src1.nr(), src1.nc(),
+ src2.num_samples(), src2.k(), src2.nr(), src2.nc()
+ );
+ }
+ else
+ {
+ // Otherwise, do the more complex version with bounds checking.
+ launch_kernel(_cuda_mult2,max_jobs(dest.size()),
+ dest.device(), src1.device(), src2.device(),
+ dest.num_samples(), dest.k(), dest.nr(), dest.nc(),
+ src1.num_samples(), src1.k(), src1.nr(), src1.nc(),
+ src2.num_samples(), src2.k(), src2.nr(), src2.nc()
+ );
+ }
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ __global__ void _cuda_add1(float* d, const float* s1, const float* s2, size_t n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = s1[i]+s2[i];
+ }
+ }
+
+ __global__ void _cuda_add2(float* d, const float* s1, const float* s2,
+ size_t dn, size_t dk, size_t dr, size_t dc,
+ size_t s1n, size_t s1k, size_t s1r, size_t s1c,
+ size_t s2n, size_t s2k, size_t s2r, size_t s2c)
+ {
+ for (auto i : grid_stride_range(0, dn*dk*dr*dc))
+ {
+ size_t n,k,r,c;
+ unpack_idx(i, dk,dr,dc, n,k,r,c);
+
+ float v1 = 0;
+ float v2 = 0;
+
+ if (n < s1n &&
+ k < s1k &&
+ r < s1r &&
+ c < s1c )
+ {
+ v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)];
+ }
+
+ if (n < s2n &&
+ k < s2k &&
+ r < s2r &&
+ c < s2c )
+ {
+ v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)];
+ }
+
+ d[i] = v1+v2;
+ }
+ }
+
+ void add (
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ )
+ {
+ if (dest.size() == 0)
+ return;
+
+ // Do the simple and fast version if everything has the same dimensions
+ if (have_same_dimensions(dest, src1) &&
+ have_same_dimensions(dest, src2))
+ {
+ launch_kernel(_cuda_add1,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size());
+ }
+ else
+ {
+ // Otherwise, do the more complex version with bounds checking.
+ launch_kernel(_cuda_add2,max_jobs(dest.size()),
+ dest.device(), src1.device(), src2.device(),
+ dest.num_samples(), dest.k(), dest.nr(), dest.nc(),
+ src1.num_samples(), src1.k(), src1.nr(), src1.nc(),
+ src2.num_samples(), src2.k(), src2.nr(), src2.nc()
+ );
+ }
+
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ __global__ void _cuda_affine_transform1(float* d, const float* s, size_t n, float A, float B)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = A*s[i] + B;
+ }
+ }
+
+ __global__ void _cuda_affine_transform1_0(float* d, const float* s, size_t n, float A)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = A*s[i];
+ }
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const float A,
+ const float B
+ )
+ {
+ DLIB_CASSERT(dest.size()==src.size());
+ if (B != 0)
+ launch_kernel(_cuda_affine_transform1,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A, B);
+ else
+ launch_kernel(_cuda_affine_transform1_0,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A);
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const float A
+ )
+ {
+ DLIB_CASSERT(dest.size()==src.size());
+ launch_kernel(_cuda_affine_transform1_0,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A);
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_affine_transform_rect(
+ float* d,
+ const float* s1,
+ const float* s2,
+ const float* s3,
+ float A,
+ float B,
+ float C,
+ size_t start_idx,
+ size_t n,
+ size_t rect_nc,
+ size_t total_nc
+ )
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ size_t r = i/rect_nc;
+ size_t c = i%rect_nc;
+ size_t idx = r*total_nc + c + start_idx;
+ d[idx] = A*s1[idx] + B*s2[idx] + C*s3[idx];
+ }
+ }
+
+ void affine_transform(
+ const rectangle& rect,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ float A,
+ float B,
+ float C
+ )
+ {
+ DLIB_CASSERT(dest.size() == src1.size());
+ DLIB_CASSERT(dest.size() == src2.size());
+ DLIB_CASSERT(dest.size() == src3.size());
+ DLIB_CASSERT(dest.num_samples() == src1.num_samples());
+ DLIB_CASSERT(dest.num_samples() == src2.num_samples());
+ DLIB_CASSERT(dest.num_samples() == src3.num_samples());
+ DLIB_CASSERT(rectangle(0,0, dest.size()/dest.num_samples()-1, dest.num_samples()-1).contains(rect));
+ launch_kernel(_cuda_affine_transform_rect,max_jobs(rect.area()),
+ dest.device(), src1.device(), src2.device(), src3.device(), A, B, C,
+ rect.left() + rect.top()*(dest.size()/dest.num_samples()),
+ rect.area(),
+ rect.width(),
+ dest.size()/dest.num_samples());
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_affine_transform4(float* d, const float* s1, const float* s2, size_t n, float A, float B, float C)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = A*s1[i] + B*s2[i] + C;
+ }
+ }
+
+ __global__ void _cuda_affine_transform4_0(float* d, const float* s1, const float* s2, size_t n, float A, float B)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = A*s1[i] + B*s2[i];
+ }
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const float A,
+ const float B,
+ const float C
+ )
+ {
+ DLIB_CASSERT(dest.size()==src1.size());
+ DLIB_CASSERT(dest.size()==src2.size());
+ if (C != 0)
+ launch_kernel(_cuda_affine_transform4,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B, C);
+ else
+ launch_kernel(_cuda_affine_transform4_0,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B);
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const float A,
+ const float B
+ )
+ {
+ DLIB_CASSERT(dest.size()==src1.size());
+ DLIB_CASSERT(dest.size()==src2.size());
+ launch_kernel(_cuda_affine_transform4_0,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B);
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_add_scaled(float* d, const float* s, size_t n, float scale)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] += scale*s[i];
+ }
+ }
+
+ void add_scaled(
+ tensor& dest,
+ const float scale,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(dest.size()==src.size());
+ launch_kernel(_cuda_add_scaled,max_jobs(dest.size()),dest.device(), src.device(), dest.size(), scale);
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_add_cv_to_all_columns(float beta, float* dest, float alpha, const float* src, size_t size, size_t stride)
+ {
+ for (auto i : grid_stride_range(0, size))
+ {
+ dest[i] = beta*dest[i] + alpha*src[i/stride];
+ }
+ }
+
+ __global__ void _cuda_add_cv_to_all_columns_no_beta(float* dest, float alpha, const float* src, size_t size, size_t stride)
+ {
+ for (auto i : grid_stride_range(0, size))
+ {
+ dest[i] = alpha*src[i/stride];
+ }
+ }
+
+ void add_cv_to_all_columns(
+ float beta,
+ tensor& dest,
+ float alpha,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(dest.num_samples() == src.num_samples() && src.num_samples() == src.size());
+ if (beta == 0)
+ launch_kernel(_cuda_add_cv_to_all_columns_no_beta, max_jobs(dest.size()), dest.device(), alpha, src.device(), dest.size(), dest.size()/dest.num_samples());
+ else
+ launch_kernel(_cuda_add_cv_to_all_columns, max_jobs(dest.size()), beta, dest.device(), alpha, src.device(), dest.size(), dest.size()/dest.num_samples());
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_affine_transform5(
+ float* d, const float* s1, const float* s2, const float* s3, size_t n, float A, float B, float C, float D
+ )
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = A*s1[i] + B*s2[i] + C*s3[i] + D;
+ }
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ const float A,
+ const float B,
+ const float C,
+ const float D
+ )
+ {
+ DLIB_CASSERT(dest.size()==src1.size());
+ DLIB_CASSERT(dest.size()==src2.size());
+ DLIB_CASSERT(dest.size()==src3.size());
+ launch_kernel(_cuda_affine_transform5,max_jobs(dest.size()),dest.device(), src1.device(),
+ src2.device(), src3.device(), dest.size(), A, B, C, D);
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_affine_transform_range(
+ float* d, const float* s1, const float* s2, const float* s3, size_t begin, size_t end, float A, float B, float C
+ )
+ {
+ for (auto i : grid_stride_range(begin, end))
+ {
+ d[i] = A*s1[i] + B*s2[i] + C*s3[i];
+ }
+ }
+
+
+ void affine_transform_range(
+ size_t begin,
+ size_t end,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ const float A,
+ const float B,
+ const float C
+ )
+ {
+ DLIB_CASSERT(dest.size()==src1.size());
+ DLIB_CASSERT(dest.size()==src2.size());
+ DLIB_CASSERT(dest.size()==src3.size());
+ DLIB_CASSERT(begin <= end && end <= dest.size());
+ launch_kernel(_cuda_affine_transform_range,max_jobs(end-begin),
+ dest.device(), src1.device(),
+ src2.device(), src3.device(), begin, end, A, B, C);
+ }
+
+ // -----------------------------------------------------------------------------------
+
+ __global__ void _cuda_affine_transform2(float* d, const float* s, size_t n, const float* A, const float* B)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = A[i]*s[i] + B[i];
+ }
+ }
+ __global__ void _cuda_affine_transform3(float* d, const float* s, size_t n, const float* A, const float* B, size_t bs)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = A[i%bs]*s[i] + B[i%bs];
+ }
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const tensor& A,
+ const tensor& B
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(dest, src));
+ DLIB_CASSERT(
+ ((A.num_samples()==1 && B.num_samples()==1) ||
+ (A.num_samples()==src.num_samples() && B.num_samples()==src.num_samples())));
+ DLIB_CASSERT(
+ A.nr()==B.nr() && B.nr()==src.nr() &&
+ A.nc()==B.nc() && B.nc()==src.nc() &&
+ A.k() ==B.k() && B.k()==src.k(),
+ "\nA.nr(): " << A.nr() << "\nB.nr(): " << B.nr() << "\nsrc.nr(): " << src.nr()
+ <<"\nA.nc(): " << A.nc() << "\nB.nc(): " << B.nc() << "\nsrc.nc(): " << src.nc()
+ <<"\nA.k(): " << A.k() << "\nB.k(): " << B.k() << "\nsrc.k(): " << src.k()
+ );
+
+ if (A.num_samples() == 1)
+ {
+ launch_kernel(_cuda_affine_transform3,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A.device(), B.device(), A.size());
+ }
+ else
+ {
+ launch_kernel(_cuda_affine_transform2,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A.device(), B.device());
+ }
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_compute_adam_update(
+ size_t begin,
+ size_t end,
+ float* s,
+ float* m,
+ float* v,
+ const float alpha,
+ const float weight_decay,
+ const float momentum1,
+ const float momentum2,
+ const float* params,
+ const float* params_grad
+ )
+ {
+ const float eps = 1e-8;
+ // The loop is equivalent to doing this:
+ // m = momentum1*m + (1-momentum1) * (weight_decay*params + params_grad);
+ // v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad);
+ // s = -alpha*m/(sqrt(v) + eps);
+ for (auto i : grid_stride_range(begin, end))
+ {
+ float g = (weight_decay*params[i] + params_grad[i]);
+ m[i] = momentum1*m[i] + (1-momentum1)*g;
+ v[i] = momentum2*v[i] + (1-momentum2)*g*g;
+ s[i] = -alpha*m[i]/(std::sqrt(v[i]) + eps);
+ }
+ }
+
+ void compute_adam_update (
+ size_t begin,
+ size_t end,
+ tensor& s,
+ tensor& m,
+ tensor& v,
+ const float t,
+ const float learning_rate,
+ const float weight_decay,
+ const float momentum1,
+ const float momentum2,
+ const tensor& params,
+ const tensor& params_grad
+ )
+ {
+ DLIB_CASSERT(s.size() == m.size() &&
+ s.size() == v.size() &&
+ s.size() == params.size() &&
+ s.size() == params_grad.size());
+ DLIB_CASSERT(begin <= end && end <= params.size());
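+ // alpha folds Adam's bias correction into the step size, i.e.
+ //   alpha = learning_rate * sqrt(1 - momentum2^t) / (1 - momentum1^t),
+ // which compensates for the zero-initialized m and v running averages during
+ // the first few updates.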
+ const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t));
+
+ launch_kernel(_cuda_compute_adam_update,max_jobs(end-begin),
+ begin, end, s.device(), m.device(), v.device(), alpha, weight_decay,
+ momentum1, momentum2, params.device(), params_grad.device());
+ }
+
+ // -----------------------------------------------------------------------------------
+
+ __global__ void _cuda_affine_transform_conv(float* d, const float* s, size_t n, const float* A, const float* B, size_t bs, size_t ks)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ auto k = (i/bs)%ks;
+ d[i] = A[k]*s[i] + B[k];
+ }
+ }
+
+ void affine_transform_conv(
+ tensor& dest,
+ const tensor& src,
+ const tensor& A,
+ const tensor& B
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(dest, src));
+ DLIB_CASSERT(have_same_dimensions(A, B));
+ DLIB_CASSERT(A.num_samples() == 1 && A.nr() == 1 && A.nc() == 1 && A.k() == src.k());
+
+ launch_kernel(_cuda_affine_transform_conv,max_jobs(dest.size()),
+ dest.device(), src.device(), src.size(), A.device(), B.device(), src.nr()*src.nc(), src.k());
+ }
+
+ // -----------------------------------------------------------------------------------
+
+ __global__ void _add_bias_gradient(float* out, const float* in, size_t n, size_t total_n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ out[i] = in[i];
+ for (size_t j = i+n; j < total_n; j+=n)
+ out[i] += in[j];
+ }
+ }
+
+ void assign_bias_gradient (
+ tensor& grad,
+ const tensor& gradient_input
+ )
+ {
+ DLIB_CASSERT(
+ grad.num_samples() == 1 &&
+ gradient_input.k() == grad.k() &&
+ gradient_input.nr() == grad.nr() &&
+ gradient_input.nc() == grad.nc() &&
+ gradient_input.size() > 0);
+
+ launch_kernel(_add_bias_gradient,max_jobs(grad.size()),grad.device(), gradient_input.device(), grad.size(), gradient_input.size());
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _set_tensor(float* out, size_t n, const float val)
+ {
+ for (auto i : grid_stride_range(0, n))
+ out[i] = val;
+ }
+
+ void set_tensor (
+ tensor& t,
+ float value
+ )
+ {
+ launch_kernel(_set_tensor, max_jobs(t.size()), t.device(), t.size(), value);
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _scale_tensor(float* out, size_t n, const float val)
+ {
+ for (auto i : grid_stride_range(0, n))
+ out[i] *= val;
+ }
+
+ void scale_tensor (
+ tensor& t,
+ float value
+ )
+ {
+ launch_kernel(_scale_tensor, max_jobs(t.size()), t.device(), t.size(), value);
+ }
+
+ // -----------------------------------------------------------------------------------
+ // -----------------------------------------------------------------------------------
+
+ __global__ void _cuda_threshold(float* d, size_t n, float thresh)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = d[i]>thresh ? 1:0;
+ }
+ }
+
+ void threshold (
+ tensor& data,
+ float thresh
+ )
+ {
+ launch_kernel(_cuda_threshold,max_jobs(data.size()),data.device(), data.size(), thresh);
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ __global__ void _cuda_dot(const float* a, const float* b, size_t n, float* result)
+ {
+ // Parallel sum everything into local temp variables.
+ float temp = 0;
+ for(auto i : grid_stride_range(0, n))
+ temp += a[i]*b[i];
+
+ // Then do the warp reduce add thing to merge into one output value.
+ warp_reduce_atomic_add(*result, temp);
+ }
+
+
+ void dot (
+ const tensor& a,
+ const tensor& b,
+ tensor& result,
+ size_t idx
+ )
+ {
+ DLIB_CASSERT(a.size() == b.size());
+ DLIB_CASSERT(idx < result.size());
+
+ launch_kernel(_cuda_dot, max_jobs(a.size()), a.device(), b.device(), a.size(), result.device()+idx);
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_prelu(const float* s, float* d, size_t n, const float* pp)
+ {
+ const float p = *pp;
+ for (auto i : grid_stride_range(0, n))
+ {
+ if (s[i] > 0)
+ d[i] = s[i];
+ else
+ d[i] = p*s[i];
+ }
+ }
+
+ void prelu (
+ tensor& dest,
+ const tensor& src,
+ const tensor& param
+ )
+ {
+ launch_kernel(_cuda_prelu, max_jobs(dest.size()),
+ src.device(), dest.device(), src.size(), param.device());
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_prelu_gradient(float* out, const float* s, const float* gi, size_t n, const float* pp, float* ppgrad)
+ {
+ const float p = *pp;
+ float pgrad = 0;
+ for(auto i : grid_stride_range(0, n))
+ {
+ if (s[i] > 0)
+ {
+ out[i] += gi[i];
+ }
+ else
+ {
+ out[i] += p*gi[i];
+ pgrad += gi[i]*s[i];
+ }
+ }
+
+ // Then do the warp reduce add thing to merge into one output value.
+ warp_reduce_atomic_add(*ppgrad, pgrad);
+ }
+
+ void prelu_gradient (
+ tensor& grad,
+ const tensor& src,
+ const tensor& gradient_input,
+ const tensor& param,
+ tensor& params_grad
+ )
+ {
+ params_grad = 0;
+ launch_kernel(_cuda_prelu_gradient, max_jobs(grad.size()),
+ grad.device(), src.device(), gradient_input.device(), grad.size(),
+ param.device(), params_grad.device());
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_resize_bilinear(size_t dsize, size_t dchan_size, size_t dnc, float* d,
+ size_t schan_size, int snr, int snc, const float* s,
+ const float x_scale, const float y_scale)
+ {
+ for(auto i : grid_stride_range(0, dsize))
+ {
+ const int idx = i%dchan_size;
+ const int channel = i/dchan_size;
+ const int sidx = channel*schan_size;
+ const int r = idx/dnc;
+ const int c = idx%dnc;
+
+ const float y = r*y_scale;
+ const int top = static_cast<int>(::floor(y));
+ const int bottom = ::min(top+1, snr-1);
+ const float tb_frac = y - top;
+
+ const float x = c*x_scale;
+ const int left = static_cast<int>(::floor(x));
+ const int right = ::min(left+1, snc-1);
+ const float lr_frac = x - left;
+
+ float tl = s[sidx+top*snc+left];
+ float tr = s[sidx+top*snc+right];
+ float bl = s[sidx+bottom*snc+left];
+ float br = s[sidx+bottom*snc+right];
+
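+ // standard bilinear blend: interpolate horizontally along the top and bottom
+ // rows, then vertically between the two results.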
+ float temp = (1-tb_frac)*((1-lr_frac)*tl + lr_frac*tr) +
+ tb_frac*((1-lr_frac)*bl + lr_frac*br);
+
+ d[i] = temp;
+ }
+ }
+
+ __global__ void _cuda_resize_bilinear_strided(size_t dsize, size_t dchan_size, size_t dnc, float* d,
+ size_t schan_size, int snr, int snc, const float* s,
+ const float x_scale, const float y_scale,
+ size_t dest_row_stride, size_t src_row_stride, size_t dest_chan_size_strided
+ )
+ {
+ for(auto i : grid_stride_range(0, dsize))
+ {
+ const int idx = i%dchan_size;
+ const int channel = i/dchan_size;
+ const int sidx = channel*schan_size;
+ const int r = idx/dnc;
+ const int c = idx%dnc;
+ const int didx = channel*dest_chan_size_strided + r*dest_row_stride+c;
+
+ const float y = r*y_scale;
+ const int top = static_cast<int>(::floor(y));
+ const int bottom = ::min(top+1, snr-1);
+ const float tb_frac = y - top;
+
+ const float x = c*x_scale;
+ const int left = static_cast<int>(::floor(x));
+ const int right = ::min(left+1, snc-1);
+ const float lr_frac = x - left;
+
+ float tl = s[sidx+top*src_row_stride+left];
+ float tr = s[sidx+top*src_row_stride+right];
+ float bl = s[sidx+bottom*src_row_stride+left];
+ float br = s[sidx+bottom*src_row_stride+right];
+
+ float temp = (1-tb_frac)*((1-lr_frac)*tl + lr_frac*tr) +
+ tb_frac*((1-lr_frac)*bl + lr_frac*br);
+
+ d[didx] = temp;
+ }
+ }
+
+ void resize_bilinear (
+ tensor& dest,
+ long dest_row_stride,
+ long dest_channel_stride,
+ const tensor& src,
+ long src_row_stride,
+ long src_channel_stride
+ )
+ {
+ DLIB_CASSERT(is_same_object(dest, src)==false);
+ DLIB_CASSERT(dest.num_samples() == src.num_samples());
+ DLIB_CASSERT(dest.k() == src.k());
+
+ if (dest.size() == 0 || src.size() == 0)
+ return;
+
+ const float x_scale = (src.nc()-1)/(float)std::max<long>((dest.nc()-1),1);
+ const float y_scale = (src.nr()-1)/(float)std::max<long>((dest.nr()-1),1);
+
+ if (dest.nc() == dest_row_stride && dest.nr()*dest.nc()==dest_channel_stride &&
+ src.nc() == src_row_stride && src.nr()*src.nc()==src_channel_stride)
+ {
+ launch_kernel(_cuda_resize_bilinear,
+ dest.size(), dest.nr()*dest.nc(), dest.nc(), dest.device(),
+ src.nr()*src.nc(), src.nr(), src.nc(), src.device(),
+ x_scale, y_scale);
+ }
+ else
+ {
+ launch_kernel(_cuda_resize_bilinear_strided,
+ dest.size(), dest.nr()*dest.nc(), dest.nc(), dest.device(),
+ src_channel_stride, src.nr(), src.nc(), src.device(),
+ x_scale, y_scale, dest_row_stride, src_row_stride, dest_channel_stride);
+ }
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_resize_bilinear_gradient(size_t dsize, size_t dchan_size, size_t dnc, const float* d,
+ size_t schan_size, int snr, int snc, float* s,
+ const float x_scale, const float y_scale)
+ {
+ for(auto i : grid_stride_range(0, dsize))
+ {
+ const float tmp = d[i];
+
+ const int idx = i%dchan_size;
+ const int channel = i/dchan_size;
+ const int sidx = channel*schan_size;
+ const int r = idx/dnc;
+ const int c = idx%dnc;
+
+ const float y = r*y_scale;
+ const int top = static_cast<int>(::floor(y));
+ const int bottom = ::min(top+1, snr-1);
+ const float tb_frac = y - top;
+
+ const float x = c*x_scale;
+ const int left = static_cast<int>(::floor(x));
+ const int right = ::min(left+1, snc-1);
+ const float lr_frac = x - left;
+
+
+ atomicAdd(s+sidx+top*snc+left, tmp*(1-tb_frac)*(1-lr_frac));
+ atomicAdd(s+sidx+top*snc+right, tmp*(1-tb_frac)*(lr_frac));
+ atomicAdd(s+sidx+bottom*snc+left, tmp*(tb_frac)*(1-lr_frac));
+ atomicAdd(s+sidx+bottom*snc+right, tmp*(tb_frac)*(lr_frac));
+ }
+ }
+
+ __global__ void _cuda_resize_bilinear_gradient_strided(size_t dsize, size_t dchan_size, size_t dnc, const float* d,
+ size_t schan_size, int snr, int snc, float* s,
+ const float x_scale, const float y_scale,
+ size_t dest_row_stride, size_t src_row_stride, size_t dest_chan_size_strided
+ )
+ {
+ for(auto i : grid_stride_range(0, dsize))
+ {
+
+ const int idx = i%dchan_size;
+ const int channel = i/dchan_size;
+ const int didx = channel*dest_chan_size_strided;
+ const int sidx = channel*schan_size;
+ const int r = idx/dnc;
+ const int c = idx%dnc;
+
+ const float tmp = d[didx + r*dest_row_stride+c];
+
+ const float y = r*y_scale;
+ const int top = static_cast<int>(::floor(y));
+ const int bottom = ::min(top+1, snr-1);
+ const float tb_frac = y - top;
+
+ const float x = c*x_scale;
+ const int left = static_cast<int>(::floor(x));
+ const int right = ::min(left+1, snc-1);
+ const float lr_frac = x - left;
+
+
+ atomicAdd(s+sidx+top*src_row_stride+left, tmp*(1-tb_frac)*(1-lr_frac));
+ atomicAdd(s+sidx+top*src_row_stride+right, tmp*(1-tb_frac)*(lr_frac));
+ atomicAdd(s+sidx+bottom*src_row_stride+left, tmp*(tb_frac)*(1-lr_frac));
+ atomicAdd(s+sidx+bottom*src_row_stride+right, tmp*(tb_frac)*(lr_frac));
+ }
+ }
+
+ void resize_bilinear_gradient (
+ tensor& grad,
+ long grad_row_stride,
+ long grad_channel_stride,
+ const tensor& gradient_input,
+ long gradient_input_row_stride,
+ long gradient_input_channel_stride
+ )
+ {
+ DLIB_CASSERT(is_same_object(grad, gradient_input)==false);
+ DLIB_CASSERT(gradient_input.num_samples() == grad.num_samples());
+ DLIB_CASSERT(gradient_input.k() == grad.k());
+
+ if (grad.size() == 0 || gradient_input.size() == 0)
+ return;
+
+ const float x_scale = (grad.nc()-1)/(float)std::max<long>((gradient_input.nc()-1),1);
+ const float y_scale = (grad.nr()-1)/(float)std::max<long>((gradient_input.nr()-1),1);
+
+ if (grad.nc() == grad_row_stride && grad.nr()*grad.nc()==grad_channel_stride &&
+ gradient_input.nc() == gradient_input_row_stride && gradient_input.nr()*gradient_input.nc()==gradient_input_channel_stride)
+ {
+ launch_kernel(_cuda_resize_bilinear_gradient,
+ gradient_input.size(), gradient_input.nr()*gradient_input.nc(), gradient_input.nc(), gradient_input.device(),
+ grad.nr()*grad.nc(), grad.nr(), grad.nc(), grad.device(),
+ x_scale, y_scale);
+ }
+ else
+ {
+ launch_kernel(_cuda_resize_bilinear_gradient_strided,
+ gradient_input.size(), gradient_input.nr()*gradient_input.nc(), gradient_input.nc(), gradient_input.device(),
+ grad_channel_stride, grad.nr(), grad.nc(), grad.device(),
+ x_scale, y_scale, gradient_input_row_stride, grad_row_stride, gradient_input_channel_stride);
+ }
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_copy_tensor_add_to (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size)
+ {
+ for(auto i : grid_stride_range(0, size))
+ {
+ size_t blk = i/block_size;
+ size_t j = i%block_size;
+ dest[blk*dest_stride + j] += src[blk*src_stride + j];
+ }
+ }
+
+ __global__ void _cuda_copy_tensor (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size)
+ {
+ for(auto i : grid_stride_range(0, size))
+ {
+ size_t blk = i/block_size;
+ size_t j = i%block_size;
+ dest[blk*dest_stride + j] = src[blk*src_stride + j];
+ }
+ }
+
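+ // Copies count_k consecutive channels (k-planes) from src, starting at channel
+ // src_k_offset, into dest starting at channel dest_k_offset, for every sample in
+ // the batch. add_to selects accumulate vs. overwrite.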
+ void copy_tensor(
+ bool add_to,
+ tensor& dest,
+ size_t dest_k_offset,
+ const tensor& src,
+ size_t src_k_offset,
+ size_t count_k
+ )
+ {
+ const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
+ const size_t src_sample_size = static_cast<size_t>(src.nc() * src.nr() * src.k());
+
+ const size_t block_size = count_k * dest.nc() * dest.nr();
+
+ DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
+ dest.nc() == src.nc() && dest.nr() == src.nr(), "All sources should fit into dest tensor size");
+ DLIB_CASSERT(dest.k() - dest_k_offset >= count_k, "Not enough space in dest tensor");
+ DLIB_CASSERT(src.k() - src_k_offset >= count_k, "Not enough space in src tensor");
+
+ float* dest_p = dest.device() + dest_k_offset * dest.nc() * dest.nr();
+ const float* src_p = src.device() + src_k_offset * src.nc() * src.nr();
+
+ if (add_to)
+ {
+ launch_kernel(_cuda_copy_tensor_add_to, max_jobs(dest.size()),
+ dest_p, block_size*dest.num_samples(),
+ src_p, dest_sample_size, src_sample_size, block_size);
+ }
+ else
+ {
+ launch_kernel(_cuda_copy_tensor, max_jobs(dest.size()),
+ dest_p, block_size*dest.num_samples(),
+ src_p, dest_sample_size, src_sample_size, block_size);
+ }
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ }
+}
+
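Every kernel in this file iterates with dlib's grid_stride_range()/grid_stride_range_y() helpers from cuda_utils.h, with launch_kernel()/max_jobs() choosing the launch configuration. Conceptually each of those loops is the classic grid-stride idiom, which lets a launch of modest size cover an array of any length. A minimal standalone sketch of a kernel written without the helpers (illustrative, not dlib's implementation):

    // y[i] += a*x[i] over n elements, using a raw grid-stride loop
    __global__ void axpy_kernel(float* y, const float* x, float a, size_t n)
    {
        for (size_t i = blockIdx.x*blockDim.x + threadIdx.x; i < n;
             i += (size_t)gridDim.x*blockDim.x)
        {
            y[i] += a*x[i];
        }
    }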
diff --git a/ml/dlib/dlib/dnn/cuda_dlib.h b/ml/dlib/dlib/dnn/cuda_dlib.h
new file mode 100644
index 000000000..3a057ffc4
--- /dev/null
+++ b/ml/dlib/dlib/dnn/cuda_dlib.h
@@ -0,0 +1,469 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuDA_H_
+#define DLIB_DNN_CuDA_H_
+
+
+#include "tensor.h"
+#include "../geometry/rectangle.h"
+
+namespace dlib
+{
+ namespace cuda
+ {
+
+ // ----------------------------------------------------------------------------------------
+
+ void set_device (
+ int dev
+ );
+
+ int get_device (
+ );
+
+ int get_num_devices (
+ );
+
+ std::string get_device_name (
+ int device
+ );
+
+ void set_current_device_blocking_sync(
+ );
+
+ bool can_access_peer (int device_id, int peer_device_id);
+ bool can_access_peer (const tensor& device, const tensor& peer_device);
+
+ void device_synchronize (int dev);
+ void device_synchronize (const tensor& dev);
+
+
+ class raii_set_device
+ {
+ public:
+ raii_set_device() = delete;
+ raii_set_device(const raii_set_device&) = delete;
+ raii_set_device& operator=(const raii_set_device&) = delete;
+
+ raii_set_device(int dev)
+ {
+ prev_dev = get_device();
+ set_device(dev);
+ }
+
+ raii_set_device(const tensor& dev)
+ {
+ prev_dev = get_device();
+ set_device(dev.device_id());
+ }
+
+ void operator() (int dev)
+ {
+ set_device(dev);
+ }
+
+ void operator() (const tensor& dev)
+ {
+ set_device(dev.device_id());
+ }
+
+ ~raii_set_device() noexcept(false)
+ {
+ set_device(prev_dev);
+ }
+
+ private:
+ int prev_dev;
+ };
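+
+        /*!
+            A minimal usage sketch for raii_set_device (illustrative only; the
+            function process_on_device_of() is hypothetical):
+
+                void process_on_device_of (const tensor& t)
+                {
+                    raii_set_device guard(t);  // switch to the device holding t
+                    // ... launch kernels that operate on t ...
+                }   // guard's destructor restores the previously active device
+        !*/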
+
+
+#ifdef DLIB_USE_CUDA
+
+ class enable_peer_access
+ {
+ public:
+
+ enable_peer_access() = delete;
+ enable_peer_access(const enable_peer_access&) = delete;
+ enable_peer_access& operator=(const enable_peer_access&) = delete;
+
+ enable_peer_access(
+ int device_id,
+ int peer_device_id
+ );
+
+ enable_peer_access(
+ const tensor& device,
+ const tensor& peer_device
+ ) : enable_peer_access(device.device_id(), peer_device.device_id())
+ {}
+
+ ~enable_peer_access() noexcept(false);
+
+ private:
+
+ bool call_disable;
+ int device_id;
+ int peer_device_id;
+ };
+
+ // -----------------------------------------------------------------------------------
+
+ void inverse_norms (
+ resizable_tensor& invnorms,
+ const tensor& data,
+ const double eps
+ );
+
+ void dot_prods (
+ resizable_tensor& out,
+ const tensor& lhs,
+ const tensor& rhs
+ );
+
+ void dot_prods (
+ bool add_to,
+ tensor& out,
+ const tensor& lhs,
+ const tensor& rhs
+ );
+
+ void scale_columns (
+ tensor& out,
+ const tensor& m,
+ const tensor& v
+ );
+
+ void scale_rows (
+ tensor& out,
+ const tensor& m,
+ const tensor& v
+ );
+
+ void scale_rows2 (
+ float beta,
+ tensor& out,
+ const tensor& m1,
+ const tensor& m2,
+ const tensor& v1,
+ const tensor& v2
+ );
+
+ void exp (
+ tensor& dest,
+ const tensor& src
+ );
+
+ void log (
+ tensor& dest,
+ const tensor& src
+ );
+
+ void log10 (
+ tensor& dest,
+ const tensor& src
+ );
+
+ // ------------------------------------------------------------------------------------
+
+ void set_tensor (
+ tensor& t,
+ float value
+ );
+
+ void scale_tensor (
+ tensor& t,
+ float value
+ );
+
+ // ------------------------------------------------------------------------------------
+
+ void multiply (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ );
+
+ void multiply_conv (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ );
+
+ void multiply_zero_padded (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ );
+
+ void scale_channels (
+ bool add_to,
+ tensor& dest,
+ const tensor& src,
+ const tensor& scales
+ );
+
+ void add (
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ );
+
+ // -----------------------------------------------------------------------------------
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const float A,
+ const float B
+ );
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const float A
+ );
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const float A,
+ const float B,
+ const float C
+ );
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const float A,
+ const float B
+ );
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ const float A,
+ const float B,
+ const float C,
+ const float D
+ );
+
+ void affine_transform_range(
+ size_t begin,
+ size_t end,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ const float A,
+ const float B,
+ const float C
+ );
+
+ void affine_transform(
+ const rectangle& rect,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ float A,
+ float B,
+ float C
+ );
+
+ // Note that this function isn't in the tt:: namespace because add_scaled() is
+ // called by cuda::add() so we don't need a tt:: version of add_scaled().
+ void add_scaled(
+ tensor& dest,
+ const float scale,
+ const tensor& src
+ );
+
+ void add_cv_to_all_columns(
+ float beta,
+ tensor& dest,
+ float alpha,
+ const tensor& src
+ );
+
+ // -----------------------------------------------------------------------------------
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const tensor& A,
+ const tensor& B
+ );
+
+ // -----------------------------------------------------------------------------------
+
+ void affine_transform_conv(
+ tensor& dest,
+ const tensor& src,
+ const tensor& A,
+ const tensor& B
+ );
+
+ // ----------------------------------------------------------------------------------------
+
+ void compute_adam_update (
+ size_t begin,
+ size_t end,
+ tensor& s,
+ tensor& m,
+ tensor& v,
+ const float t,
+ const float learning_rate,
+ const float weight_decay,
+ const float momentum1,
+ const float momentum2,
+ const tensor& params,
+ const tensor& params_grad
+ );
+
+ // -----------------------------------------------------------------------------------
+
+ void assign_bias_gradient (
+ tensor& grad,
+ const tensor& gradient_input
+ );
+
+ // -----------------------------------------------------------------------------------
+
+ void threshold (
+ tensor& data,
+ float thresh
+ );
+
+ // ----------------------------------------------------------------------------------------
+
+ void dot (
+ const tensor& a,
+ const tensor& b,
+ tensor& result,
+ size_t idx
+ );
+
+ // ----------------------------------------------------------------------------------------
+
+ void prelu (
+ tensor& dest,
+ const tensor& src,
+ const tensor& param
+ );
+
+ void prelu_gradient (
+ tensor& grad,
+ const tensor& src,
+ const tensor& gradient_input,
+ const tensor& param,
+ tensor& params_grad
+ );
+
+
+ // ----------------------------------------------------------------------------------------
+
+ void resize_bilinear (
+ tensor& dest,
+ long dest_row_stride,
+ long dest_channel_stride,
+ const tensor& src,
+ long src_row_stride,
+ long src_channel_stride
+ );
+
+ void resize_bilinear_gradient (
+ tensor& grad,
+ long grad_row_stride,
+ long grad_channel_stride,
+ const tensor& gradient_input,
+ long gradient_input_row_stride,
+ long gradient_input_channel_stride
+ );
+
+ inline void resize_bilinear (
+ tensor& dest,
+ const tensor& src
+ ) { resize_bilinear(dest, dest.nc(), dest.nr()*dest.nc(), src, src.nc(), src.nr()*src.nc()); }
+
+ inline void resize_bilinear_gradient (
+ tensor& grad,
+ const tensor& gradient_input
+ ) { resize_bilinear_gradient(grad, grad.nc(), grad.nr()*grad.nc(), gradient_input, gradient_input.nc(), gradient_input.nr()*gradient_input.nc()); }
+
+ // ----------------------------------------------------------------------------------------
+
+ void copy_tensor(
+ bool add_to,
+ tensor& dest,
+ size_t dest_k_offset,
+ const tensor& src,
+ size_t src_k_offset,
+ size_t count_k
+ );
+
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+
+#else // if DLIB_USE_CUDA NOT DEFINED
+
+ inline void set_device (
+ int id
+ )
+ {
+ DLIB_CASSERT(id == 0, "dlib::cuda::set_device(id) called with an invalid device id.");
+ }
+
+ inline int get_device (
+ ){ return 0; }
+
+ inline int get_num_devices (
+ ) { return 1; }
+
+ inline std::string get_device_name (
+ int device
+ )
+ {
+            DLIB_CASSERT(device == 0, "dlib::cuda::get_device_name(device) called with an invalid device id.");
+ return "CUDA_DISABLED";
+ }
+
+ inline void set_current_device_blocking_sync(
+ ) {}
+
+
+ inline bool can_access_peer (int , int )
+ { return false; }
+ inline bool can_access_peer (const tensor& , const tensor& )
+ { return false; }
+
+ inline void device_synchronize (int ){}
+ inline void device_synchronize (const tensor& ){}
+
+ class enable_peer_access
+ {
+ public:
+ enable_peer_access() = delete;
+ enable_peer_access(const enable_peer_access&) = delete;
+ enable_peer_access& operator=(const enable_peer_access&) = delete;
+ enable_peer_access( int, int ){}
+ enable_peer_access( const tensor&, const tensor& ) {}
+ };
+
+#endif // DLIB_USE_CUDA
+
+ }
+}
+
+
+#endif // DLIB_DNN_CuDA_H_
+
diff --git a/ml/dlib/dlib/dnn/cuda_errors.h b/ml/dlib/dlib/dnn/cuda_errors.h
new file mode 100644
index 000000000..fd28693c2
--- /dev/null
+++ b/ml/dlib/dlib/dnn/cuda_errors.h
@@ -0,0 +1,70 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_CUDA_ERRORs_H_
+#define DLIB_CUDA_ERRORs_H_
+
+
+#include "../error.h"
+
+namespace dlib
+{
+ struct cuda_error : public error
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+                This is the exception thrown if any call to the NVIDIA CUDA runtime
+                returns an error.
+ !*/
+
+ cuda_error(const std::string& message): error(message) {}
+ };
+
+
+ struct cudnn_error : public cuda_error
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+                This is the exception thrown if any call to the NVIDIA cuDNN library
+                returns an error.
+ !*/
+
+ cudnn_error(const std::string& message): cuda_error(message) {}
+ };
+
+ struct curand_error : public cuda_error
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+                This is the exception thrown if any call to the NVIDIA cuRAND library
+                returns an error.
+ !*/
+
+ curand_error(const std::string& message): cuda_error(message) {}
+ };
+
+ struct cublas_error : public cuda_error
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+                This is the exception thrown if any call to the NVIDIA cuBLAS library
+                returns an error.
+ !*/
+
+ cublas_error(const std::string& message): cuda_error(message) {}
+ };
+
+ struct cusolver_error : public cuda_error
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+                This is the exception thrown if any call to the NVIDIA cuSolver library
+                returns an error.
+ !*/
+
+ cusolver_error(const std::string& message): cuda_error(message) {}
+ };
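+
+    /*!
+        A minimal sketch of how these exception types are typically handled
+        (illustrative only; run_training() is a hypothetical function):
+
+            try
+            {
+                run_training();
+            }
+            catch (const dlib::cudnn_error& e)  // most specific type first
+            {
+                std::cerr << "cuDNN error: " << e.what() << std::endl;
+            }
+            catch (const dlib::cuda_error& e)   // also catches cuRAND/cuBLAS/cuSolver errors
+            {
+                std::cerr << "CUDA error: " << e.what() << std::endl;
+            }
+    !*/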
+}
+
+
+#endif // DLIB_CUDA_ERRORs_H_
+
diff --git a/ml/dlib/dlib/dnn/cuda_utils.h b/ml/dlib/dlib/dnn/cuda_utils.h
new file mode 100644
index 000000000..673a4e8ad
--- /dev/null
+++ b/ml/dlib/dlib/dnn/cuda_utils.h
@@ -0,0 +1,413 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_CUDA_UtILS_H_
+#define DLIB_CUDA_UtILS_H_
+
+#ifndef DLIB_USE_CUDA
+#error "This file shouldn't be #included unless DLIB_USE_CUDA is #defined"
+#endif
+
+#include "cuda_errors.h"
+#include "../algs.h"
+#include <cmath>
+
+#include <cuda_runtime.h>
+#include <sstream>
+#include <iostream>
+#include <memory>
+#include <vector>
+#include <type_traits>
+
+
+// Check the return value of a call to the CUDA runtime for an error condition.
+#define CHECK_CUDA(call) \
+do{ \
+ const cudaError_t error = call; \
+ if (error != cudaSuccess) \
+ { \
+ std::ostringstream sout; \
+ sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
+ sout << "code: " << error << ", reason: " << cudaGetErrorString(error);\
+ throw dlib::cuda_error(sout.str()); \
+ } \
+}while(false)
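+
+// A minimal usage sketch for CHECK_CUDA() (illustrative only; dev_ptr and nbytes
+// are made-up names):
+//
+//     float* dev_ptr = nullptr;
+//     const size_t nbytes = 1024*sizeof(float);
+//     CHECK_CUDA(cudaMalloc(&dev_ptr, nbytes));   // throws dlib::cuda_error on failure
+//     CHECK_CUDA(cudaMemset(dev_ptr, 0, nbytes));
+//     CHECK_CUDA(cudaFree(dev_ptr));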
+
+// ----------------------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+namespace dlib
+{
+ namespace cuda
+ {
+
+ // ------------------------------------------------------------------------------------
+
+ __inline__ __device__ size_t pack_idx (
+ size_t dim_size3,
+ size_t dim_size2,
+ size_t dim_size1,
+ size_t idx4,
+ size_t idx3,
+ size_t idx2,
+ size_t idx1
+ )
+ /*!
+ ensures
+ - Converts a 4D array index into a 1D index assuming row major layout. To
+ understand precisely what this function does, imagine we had an array
+ declared like this:
+ int ARRAY[anything][dim_size3][dim_size2][dim_size1];
+ Then we could index it like this:
+ ARRAY[idx4][idx3][idx2][idx1]
+ or equivalently like this:
+ ((int*)ARRAY)[pack_idx(dim_size3,dim_size2,dim_size1, idx4,idx3,idx2,idx1)]
+ !*/
+ {
+ return ((idx4*dim_size3 + idx3)*dim_size2 + idx2)*dim_size1 + idx1;
+ }
+
+ __inline__ __device__ void unpack_idx (
+ size_t idx,
+ size_t dim_size3,
+ size_t dim_size2,
+ size_t dim_size1,
+ size_t& idx4,
+ size_t& idx3,
+ size_t& idx2,
+ size_t& idx1
+ )
+ /*!
+ ensures
+ - This function computes the inverse of pack_idx(). Therefore,
+ if PACKED == pack_idx(dim_size3,dim_size2,dim_size1, idx4,idx3,idx2,idx1)
+ then unpack_idx(PACKED,dim_size3,dim_size2,dim_size1, IDX4,IDX3,IDX2,IDX1)
+ results in:
+ - IDX1 == idx1
+ - IDX2 == idx2
+ - IDX3 == idx3
+ - IDX4 == idx4
+ !*/
+ {
+ idx1 = idx%dim_size1;
+
+ idx /= dim_size1;
+ idx2 = idx%dim_size2;
+
+ idx /= dim_size2;
+ idx3 = idx%dim_size3;
+
+ idx /= dim_size3;
+ idx4 = idx;
+ }
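+
+        /*!
+            A small worked example of the two functions above (values chosen
+            arbitrarily): for an array declared as int A[anything][4][5][6], the
+            element A[2][3][4][5] has flat index
+                pack_idx(4,5,6, 2,3,4,5) == ((2*4 + 3)*5 + 4)*6 + 5 == 359
+            and unpack_idx(359, 4,5,6, i4,i3,i2,i1) recovers i4==2, i3==3,
+            i2==4, and i1==5.
+        !*/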
+
+ // ------------------------------------------------------------------------------------
+
+ // This function is from the article:
+ // http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
+ __inline__ __device__ float warp_reduce_sum(float val)
+ {
+ for (int offset = warpSize/2; offset > 0; offset /= 2)
+#if CUDART_VERSION >= 9000
+ val += __shfl_down_sync(0xFFFFFFFF,val, offset);
+#else
+ val += __shfl_down(val, offset);
+#endif
+ return val;
+ }
+
+ __inline__ __device__ bool is_first_thread_in_warp()
+ {
+ return (threadIdx.x & (warpSize - 1)) == 0;
+ }
+
+ __inline__ __device__ void warp_reduce_atomic_add(
+ float& out,
+ float val
+ )
+ /*!
+ ensures
+ - Atomically adds all the val variables in the current warp to out.
+ See this page for an extended discussion:
+ http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
+ !*/
+ {
+ val = warp_reduce_sum(val);
+ if (is_first_thread_in_warp())
+ atomicAdd(&out, val);
+ }
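+
+        /*!
+            A minimal kernel sketch using warp_reduce_atomic_add() (illustrative
+            only; it assumes blockDim.x is a multiple of 32, a single block is
+            launched, and *out was set to 0 before the launch):
+
+                __global__ void example_block_sum(const float* data, size_t n, float* out)
+                {
+                    float temp = 0;
+                    for (size_t i = threadIdx.x; i < n; i += blockDim.x)
+                        temp += data[i];
+                    warp_reduce_atomic_add(*out, temp);
+                }
+        !*/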
+
+ // ------------------------------------------------------------------------------------
+
+ struct max_jobs
+ {
+ max_jobs(int x) : num_x(x) {}
+ max_jobs(int x, int y) : num_x(x), num_y(y) {}
+ int num_x;
+ int num_y = 1;
+ };
+
+ template <typename Kernel, typename... T>
+ void launch_kernel (
+ Kernel K,
+ T ...args
+ )
+ /*!
+ ensures
+ - launches the given kernel K(args...). The point of this function is to
+ automatically set the kernel launch parameters to something reasonable
+ based on the properties of the kernel and the current GPU card.
+ !*/
+ {
+ int num_blocks, num_threads;
+ CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&num_blocks,&num_threads,K));
+ K<<<num_blocks,num_threads>>>(args...);
+ }
+
+ template <typename Kernel, typename... T>
+ void launch_kernel (
+ Kernel K,
+ max_jobs m,
+ T ...args
+ )
+ /*!
+ ensures
+ - This function is just like launch_kernel(K,args...) except that you can
+ additionally supply a max_jobs number that tells it how many possible
+ total threads could be used. This is useful when launching potentially
+ small jobs that might not need the number of threads suggested by
+ launch_kernel().
+ !*/
+ {
+ if (m.num_x == 0 || m.num_y == 0)
+ return;
+ int num_blocks, num_threads;
+ CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&num_blocks,&num_threads,K));
+            // If the job is small then we don't need to launch a kernel with this many
+            // blocks and threads, so cap the number of blocks accordingly.
+ if (num_blocks*num_threads > m.num_x*m.num_y)
+ num_blocks = (m.num_x*m.num_y+num_threads-1)/num_threads;
+
+ if (m.num_y == 1)
+ {
+ K<<<num_blocks,num_threads>>>(args...);
+ }
+ else
+ {
+ /*
+                    In general, the reason m.num_y!=1 (i.e. the reason you are in this
+                    code path) is that we are using nested grid-stride loops. There are
+ two important things to note about what we are doing here. To
+ illustrate them we will talk about this little CUDA code snippet:
+
+ // initialize out before we begin.
+ for (auto i : grid_stride_range_y(0, nr))
+ for (auto j : grid_stride_range(0, 1))
+ out[i] = 0;
+
+ __syncthreads(); // synchronize threads in block
+
+ // loop over some 2D thing and sum and store things into out.
+ for (auto i : grid_stride_range_y(0, nr))
+ {
+ float temp = 0;
+ for (auto j : grid_stride_range(0, nc))
+ temp += whatever[i*nc+j];
+
+ // store the sum into out[i]
+ warp_reduce_atomic_add(out[i], temp);
+ }
+
+ First, we make sure the number of x threads is a multiple of 32 so that
+ you can use warp_reduce_atomic_add() inside the y loop.
+
+ Second, we put the x block size to 1 so inter-block synchronization is
+ easier. For example, if the number of x blocks wasn't 1 the above code
+ would have a race condition in it. This is because the execution of
+ out[i]=0 would be done by blocks with blockIdx.x==0, but then in the
+ second set of loops, *all* the x blocks use out[i]. Since
+ __syncthreads() doesn't do any synchronization between blocks some of
+ the blocks might begin before the out[i]=0 statements finished and that
+ would be super bad.
+ */
+
+            // Try to make sure that the ratio of x to y threads is reasonable based
+            // on the respective sizes of our loops.
+ int x_threads = 32;
+ int y_threads = num_threads/32;
+ const int ratio = static_cast<int>(std::round(put_in_range(1, y_threads, m.num_x/(double)m.num_y)));
+ x_threads *= ratio;
+ y_threads /= ratio;
+
+ dim3 blocks(1,num_blocks);
+ dim3 threads(x_threads,y_threads);
+ K<<<blocks,threads>>>(args...);
+ }
+ }
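+
+        /*!
+            Usage sketch for launch_kernel() (illustrative only; add_arrays is
+            the example kernel shown in the grid_stride_range documentation
+            below):
+
+                // Let the CUDA occupancy API pick the launch configuration:
+                launch_kernel(add_arrays, a, b, out, n);
+
+                // Or additionally cap the total number of threads for a small job:
+                launch_kernel(add_arrays, max_jobs(n), a, b, out, n);
+        !*/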
+
+ // ------------------------------------------------------------------------------------
+
+ class grid_stride_range
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is a tool for making a for loop that loops over an entire block of
+ memory inside a kernel, but doing so in a way that parallelizes
+ appropriately across all the threads in a kernel launch. For example,
+ the following kernel would add the vector a to the vector b and store
+ the output in out (assuming all vectors are of dimension n):
+ __global__ void add_arrays(
+ const float* a,
+ const float* b,
+ float* out,
+ size_t n
+ )
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ out[i] = a[i]+b[i];
+ }
+ }
+ !*/
+
+ public:
+ __device__ grid_stride_range(
+ size_t ibegin_,
+ size_t iend_
+ ) :
+ ibegin(ibegin_),
+ iend(iend_)
+ {}
+
+ class iterator
+ {
+ public:
+ __device__ iterator() {}
+ __device__ iterator(size_t pos_) : pos(pos_) {}
+
+ __device__ size_t operator*() const
+ {
+ return pos;
+ }
+
+ __device__ iterator& operator++()
+ {
+ pos += gridDim.x * blockDim.x;
+ return *this;
+ }
+
+ __device__ bool operator!=(const iterator& item) const
+ { return pos < item.pos; }
+
+ private:
+ size_t pos;
+ };
+
+ __device__ iterator begin() const
+ {
+ return iterator(ibegin+blockDim.x * blockIdx.x + threadIdx.x);
+ }
+ __device__ iterator end() const
+ {
+ return iterator(iend);
+ }
+ private:
+
+ size_t ibegin;
+ size_t iend;
+ };
+
+ // ------------------------------------------------------------------------------------
+
+ class grid_stride_range_y
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object is just like grid_stride_range except that it looks at
+ CUDA's y thread index (e.g. threadIdx.y) instead of the x index.
+ Therefore, if you launch a cuda kernel with a statement like:
+ dim3 blocks(1,10);
+ dim3 threads(32,32); // You need to have x and y not equal to 1 to get parallelism over both loops.
+ add_arrays<<<blocks,threads>>>(a,b,out,nr,nc);
+ You can perform a nested 2D parallel for loop rather than doing just a
+ 1D for loop.
+
+ So the code in the kernel would look like this if you wanted to add two
+ 2D matrices:
+ __global__ void add_arrays(
+ const float* a,
+ const float* b,
+ float* out,
+ size_t nr,
+ size_t nc
+ )
+ {
+ for (auto r : grid_stride_range_y(0, nr))
+ {
+ for (auto c : grid_stride_range(0, nc))
+ {
+ auto i = r*nc+c;
+ out[i] = a[i]+b[i];
+ }
+ }
+ }
+ !*/
+
+ public:
+ __device__ grid_stride_range_y(
+ size_t ibegin_,
+ size_t iend_
+ ) :
+ ibegin(ibegin_),
+ iend(iend_)
+ {}
+
+ class iterator
+ {
+ public:
+ __device__ iterator() {}
+ __device__ iterator(size_t pos_) : pos(pos_) {}
+
+ __device__ size_t operator*() const
+ {
+ return pos;
+ }
+
+ __device__ iterator& operator++()
+ {
+ pos += gridDim.y * blockDim.y;
+ return *this;
+ }
+
+ __device__ bool operator!=(const iterator& item) const
+ { return pos < item.pos; }
+
+ private:
+ size_t pos;
+ };
+
+ __device__ iterator begin() const
+ {
+ return iterator(ibegin+blockDim.y * blockIdx.y + threadIdx.y);
+ }
+ __device__ iterator end() const
+ {
+ return iterator(iend);
+ }
+ private:
+
+ size_t ibegin;
+ size_t iend;
+ };
+
+ // ------------------------------------------------------------------------------------
+
+ }
+}
+
+#endif // __CUDACC__
+
+// ----------------------------------------------------------------------------------------
+
+#endif // DLIB_CUDA_UtILS_H_
+
diff --git a/ml/dlib/dlib/dnn/cudnn_dlibapi.cpp b/ml/dlib/dlib/dnn/cudnn_dlibapi.cpp
new file mode 100644
index 000000000..6926561f1
--- /dev/null
+++ b/ml/dlib/dlib/dnn/cudnn_dlibapi.cpp
@@ -0,0 +1,1604 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuDNN_CPP_
+#define DLIB_DNN_CuDNN_CPP_
+
+#ifdef DLIB_USE_CUDA
+
+#include "cudnn_dlibapi.h"
+#include "tensor.h"
+#include <cudnn.h>
+#include <iostream>
+#include <string>
+#include <vector>
+#include "cuda_utils.h"
+#include "cpu_dlib.h"
+#include "cuda_dlib.h"
+#include "tensor_tools.h"
+
+static const char* cudnn_get_error_string(cudnnStatus_t s)
+{
+ switch(s)
+ {
+ case CUDNN_STATUS_NOT_INITIALIZED:
+ return "CUDA Runtime API initialization failed.";
+ case CUDNN_STATUS_ALLOC_FAILED:
+ return "CUDA Resources could not be allocated.";
+ case CUDNN_STATUS_BAD_PARAM:
+ return "CUDNN_STATUS_BAD_PARAM";
+ case CUDNN_STATUS_EXECUTION_FAILED:
+ return "CUDNN_STATUS_EXECUTION_FAILED";
+ case CUDNN_STATUS_NOT_SUPPORTED:
+ return "CUDNN_STATUS_NOT_SUPPORTED";
+ case CUDNN_STATUS_ARCH_MISMATCH:
+ return "CUDNN_STATUS_ARCH_MISMATCH: Your GPU is too old and not supported by cuDNN";
+ default:
+ return "A call to cuDNN failed";
+ }
+}
+
+// Check the return value of a call to the cuDNN runtime for an error condition.
+#define CHECK_CUDNN(call) \
+do{ \
+ const cudnnStatus_t error = call; \
+ if (error != CUDNN_STATUS_SUCCESS) \
+ { \
+ std::ostringstream sout; \
+ sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
+ sout << "code: " << error << ", reason: " << cudnn_get_error_string(error);\
+ throw dlib::cudnn_error(sout.str()); \
+ } \
+}while(false)
+
+
+namespace dlib
+{
+
+ namespace cuda
+ {
+
+ // ------------------------------------------------------------------------------------
+
+ static cudnnTensorDescriptor_t descriptor(const tensor& t)
+ {
+ return (const cudnnTensorDescriptor_t)t.get_cudnn_tensor_descriptor().get_handle();
+ }
+ static cudnnTensorDescriptor_t descriptor(const tensor_descriptor& t)
+ {
+ return (const cudnnTensorDescriptor_t)t.get_handle();
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ class cudnn_context
+ {
+ public:
+ // not copyable
+ cudnn_context(const cudnn_context&) = delete;
+ cudnn_context& operator=(const cudnn_context&) = delete;
+
+ cudnn_context()
+ {
+ handles.resize(16);
+ }
+ ~cudnn_context()
+ {
+ for (auto h : handles)
+ {
+ if (h)
+ cudnnDestroy(h);
+ }
+ }
+
+ cudnnHandle_t get_handle (
+ )
+ {
+ int new_device_id;
+ CHECK_CUDA(cudaGetDevice(&new_device_id));
+ // make room for more devices if needed
+ if (new_device_id >= (long)handles.size())
+ handles.resize(new_device_id+16);
+
+ // If we don't have a handle already for this device then make one
+ if (!handles[new_device_id])
+ CHECK_CUDNN(cudnnCreate(&handles[new_device_id]));
+
+ // Finally, return the handle for the current device
+ return handles[new_device_id];
+ }
+
+ private:
+
+ std::vector<cudnnHandle_t> handles;
+ };
+
+ static cudnnHandle_t context()
+ {
+ thread_local cudnn_context c;
+ return c.get_handle();
+ }
+ // ------------------------------------------------------------------------------------
+
+ class cudnn_device_buffer
+ {
+ public:
+ // not copyable
+ cudnn_device_buffer(const cudnn_device_buffer&) = delete;
+ cudnn_device_buffer& operator=(const cudnn_device_buffer&) = delete;
+
+ cudnn_device_buffer()
+ {
+ buffers.resize(16);
+ }
+ ~cudnn_device_buffer()
+ {
+ }
+
+ std::shared_ptr<resizable_cuda_buffer> get_buffer (
+ )
+ {
+ int new_device_id;
+ CHECK_CUDA(cudaGetDevice(&new_device_id));
+ // make room for more devices if needed
+ if (new_device_id >= (long)buffers.size())
+ buffers.resize(new_device_id+16);
+
+ // If we don't have a buffer already for this device then make one
+ std::shared_ptr<resizable_cuda_buffer> buff = buffers[new_device_id].lock();
+ if (!buff)
+ {
+ buff = std::make_shared<resizable_cuda_buffer>();
+ buffers[new_device_id] = buff;
+ }
+
+ // Finally, return the buffer for the current device
+ return buff;
+ }
+
+ private:
+
+ std::vector<std::weak_ptr<resizable_cuda_buffer>> buffers;
+ };
+
+
+ static std::shared_ptr<resizable_cuda_buffer> device_global_buffer()
+ {
+ thread_local cudnn_device_buffer buffer;
+ return buffer.get_buffer();
+ }
+ // ------------------------------------------------------------------------------------
+
+ class cudnn_activation_descriptor
+ {
+ public:
+ // not copyable
+ cudnn_activation_descriptor(const cudnn_activation_descriptor&) = delete;
+ cudnn_activation_descriptor& operator=(const cudnn_activation_descriptor&) = delete;
+
+ cudnn_activation_descriptor(
+ cudnnActivationMode_t mode,
+ cudnnNanPropagation_t reluNanOpt,
+ double reluCeiling
+ )
+ {
+ CHECK_CUDNN(cudnnCreateActivationDescriptor(&handle));
+ CHECK_CUDNN(cudnnSetActivationDescriptor(handle, mode, reluNanOpt, reluCeiling));
+ }
+
+ ~cudnn_activation_descriptor()
+ {
+ cudnnDestroyActivationDescriptor(handle);
+ }
+
+ cudnnActivationDescriptor_t get_handle (
+ )
+ {
+ return handle;
+ }
+ private:
+ cudnnActivationDescriptor_t handle;
+ };
+
+ static cudnnActivationDescriptor_t relu_activation_descriptor()
+ {
+ thread_local cudnn_activation_descriptor des(CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN,0);
+ return des.get_handle();
+ }
+
+ static cudnnActivationDescriptor_t sigmoid_activation_descriptor()
+ {
+ thread_local cudnn_activation_descriptor des(CUDNN_ACTIVATION_SIGMOID, CUDNN_PROPAGATE_NAN,0);
+ return des.get_handle();
+ }
+
+ static cudnnActivationDescriptor_t tanh_activation_descriptor()
+ {
+ thread_local cudnn_activation_descriptor des(CUDNN_ACTIVATION_TANH, CUDNN_PROPAGATE_NAN,0);
+ return des.get_handle();
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ tensor_descriptor::
+ tensor_descriptor(
+ ) : handle(nullptr)
+ {
+ }
+
+ tensor_descriptor::
+ ~tensor_descriptor()
+ {
+ set_size(0,0,0,0);
+ }
+
+ void tensor_descriptor::
+ set_size(
+ int n,
+ int k,
+ int nr,
+ int nc
+ )
+ {
+ if (handle)
+ {
+ cudnnDestroyTensorDescriptor((cudnnTensorDescriptor_t)handle);
+ handle = nullptr;
+ }
+
+ if (n != 0 && nr != 0 && nc != 0 && k != 0)
+ {
+ cudnnTensorDescriptor_t h;
+ CHECK_CUDNN(cudnnCreateTensorDescriptor(&h));
+ handle = h;
+
+ CHECK_CUDNN(cudnnSetTensor4dDescriptor((cudnnTensorDescriptor_t)handle,
+ CUDNN_TENSOR_NCHW,
+ CUDNN_DATA_FLOAT,
+ n,
+ k,
+ nr,
+ nc));
+ }
+ }
+
+ void tensor_descriptor::
+ get_size (
+ int& n,
+ int& k,
+ int& nr,
+ int& nc
+ ) const
+ {
+ if (handle)
+ {
+ int nStride, cStride, hStride, wStride;
+ cudnnDataType_t datatype;
+ CHECK_CUDNN(cudnnGetTensor4dDescriptor((cudnnTensorDescriptor_t)handle,
+ &datatype,
+ &n,
+ &k,
+ &nr,
+ &nc,
+ &nStride,
+ &cStride,
+ &hStride,
+ &wStride));
+ }
+ else
+ {
+ n = 0;
+ k = 0;
+ nr = 0;
+ nc = 0;
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void add(
+ float beta,
+ tensor& dest,
+ float alpha,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(
+ (have_same_dimensions(src, dest) ||
+ (src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1) ||
+ (src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc()) ||
+ (src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc()) ||
+ (src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1)) &&
+ is_same_object(src,dest) == false ,
+ "\n\t dest.num_samples(): " << dest.num_samples()
+ <<"\n\t dest.k(): " << dest.k()
+ <<"\n\t dest.nr(): " << dest.nr()
+ <<"\n\t dest.nc(): " << dest.nc()
+ <<"\n\t src.num_samples(): " << src.num_samples()
+ <<"\n\t src.k(): " << src.k()
+ <<"\n\t src.nr(): " << src.nr()
+ <<"\n\t src.nc(): " << src.nc()
+ );
+
+ if (dest.size() == src.size() && beta == 1)
+ {
+ // Call the dlib function in this case since it's faster than the one that
+ // comes with cuDNN (at least as of cuDNN v4).
+ add_scaled(dest, alpha, src);
+ return;
+ }
+ else if (src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1)
+ {
+ add_cv_to_all_columns(beta, dest, alpha, src);
+ return;
+ }
+
+ CHECK_CUDNN(cudnnAddTensor(context(),
+ &alpha,
+ descriptor(src),
+ src.device(),
+ &beta,
+ descriptor(dest),
+ dest.device()));
+ }
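+
+        /*!
+            A typical broadcasting use of add() (illustrative only; the
+            dimensions are made up): adding a per-channel bias B of shape
+            (1,k,1,1) to every sample and pixel of T of shape (n,k,nr,nc).
+
+                resizable_tensor T(4, 16, 32, 32);
+                resizable_tensor B(1, 16, 1, 1);
+                T = 1;
+                B = 0.5;
+                add(1, T, 1, B);  // T(s,c,r,x) becomes T(s,c,r,x) + B(0,c,0,0)
+        !*/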
+
+ void assign_conv_bias_gradient (
+ tensor& grad,
+ const tensor& gradient_input
+ )
+ {
+ DLIB_CASSERT(
+ grad.num_samples() == 1 &&
+ grad.k() >= 1 &&
+ grad.nr() == 1 &&
+ grad.nc() == 1 &&
+ gradient_input.k() == grad.k() &&
+ gradient_input.size() > 0 &&
+ is_same_object(grad,gradient_input) == false
+ );
+
+ const float alpha = 1;
+ const float beta = 0;
+ CHECK_CUDNN(cudnnConvolutionBackwardBias(context(),
+ &alpha,
+ descriptor(gradient_input),
+ gradient_input.device(),
+ &beta,
+ descriptor(grad),
+ grad.device()));
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void batch_normalize_inference (
+ const double eps,
+ resizable_tensor& dest,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta,
+ const tensor& running_means,
+ const tensor& running_variances
+ )
+ {
+ DLIB_CASSERT(
+ gamma.num_samples() == 1 &&
+ gamma.nr() == src.nr() &&
+ gamma.nc() == src.nc() &&
+ gamma.k() == src.k() &&
+ have_same_dimensions(gamma, beta) &&
+ have_same_dimensions(gamma, running_means) &&
+ have_same_dimensions(gamma, running_variances) &&
+ eps > 0,
+ "\ngamma.num_samples(): " << gamma.num_samples() <<
+ "\ngamma.k(): " << gamma.k() <<
+ "\ngamma.nr(): " << gamma.nr() <<
+ "\ngamma.nc(): " << gamma.nc() <<
+ "\nbeta.num_samples(): " << beta.num_samples() <<
+ "\nbeta.k(): " << beta.k() <<
+ "\nbeta.nr(): " << beta.nr() <<
+ "\nbeta.nc(): " << beta.nc() <<
+ "\nrunning_means.num_samples(): " << running_means.num_samples() <<
+ "\nrunning_means.k(): " << running_means.k() <<
+ "\nrunning_means.nr(): " << running_means.nr() <<
+ "\nrunning_means.nc(): " << running_means.nc() <<
+ "\nrunning_variances.num_samples(): " << running_variances.num_samples() <<
+ "\nrunning_variances.k(): " << running_variances.k() <<
+ "\nrunning_variances.nr(): " << running_variances.nr() <<
+ "\nrunning_variances.nc(): " << running_variances.nc() <<
+ "\nsrc.k(): " << src.k() <<
+ "\nsrc.nr(): " << src.nr() <<
+ "\nsrc.nc(): " << src.nc() <<
+ "\neps: " << eps
+ );
+ const float in_scale = 1;
+ const float out_scale = 0;
+
+ dest.copy_size(src);
+
+ CHECK_CUDNN(cudnnBatchNormalizationForwardInference(
+ context(),
+ CUDNN_BATCHNORM_PER_ACTIVATION,
+ &in_scale,
+ &out_scale,
+ descriptor(src),
+ src.device(),
+ descriptor(dest),
+ dest.device(),
+ descriptor(gamma),
+ gamma.device(),
+ beta.device(),
+ running_means.device(),
+ running_variances.device(),
+ eps));
+ }
+
+ void batch_normalize (
+ const double eps,
+ resizable_tensor& dest,
+ resizable_tensor& means,
+ resizable_tensor& invstds,
+ const double averaging_factor,
+ resizable_tensor& running_means,
+ resizable_tensor& running_variances,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta
+ )
+ {
+ DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor);
+ DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means));
+ DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds));
+ DLIB_CASSERT(
+ src.num_samples() > 1 &&
+ gamma.num_samples() == 1 &&
+ beta.num_samples() == 1 &&
+ gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
+ gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
+ gamma.k() == beta.k() && beta.k() == src.k() &&
+ eps > 0,
+ "\ngamma.num_samples(): " << gamma.num_samples() <<
+ "\ngamma.k(): " << gamma.k() <<
+ "\ngamma.nr(): " << gamma.nr() <<
+ "\ngamma.nc(): " << gamma.nc() <<
+ "\nbeta.num_samples(): " << beta.num_samples() <<
+ "\nbeta.k(): " << beta.k() <<
+ "\nbeta.nr(): " << beta.nr() <<
+ "\nbeta.nc(): " << beta.nc() <<
+ "\nsrc.k(): " << src.k() <<
+ "\nsrc.nr(): " << src.nr() <<
+ "\nsrc.nc(): " << src.nc() <<
+ "\neps: " << eps
+ );
+
+ const float in_scale = 1;
+ const float out_scale = 0;
+
+ dest.copy_size(src);
+ means.set_size(1, src.k(), src.nr(), src.nc());
+ invstds.copy_size(means);
+ running_means.copy_size(means);
+ running_variances.copy_size(means);
+ // cuDNN requires that running_means and running_variances be initialized to
+ // some valid float values even if the averaging factor would have ignored
+ // them.
+ if (averaging_factor == 1)
+ {
+ running_means = 0;
+ running_variances = 1;
+ }
+
+ CHECK_CUDNN(cudnnBatchNormalizationForwardTraining(
+ context(),
+ CUDNN_BATCHNORM_PER_ACTIVATION,
+ &in_scale,
+ &out_scale,
+ descriptor(src),
+ src.device(),
+ descriptor(dest),
+ dest.device(),
+ descriptor(gamma),
+ gamma.device(),
+ beta.device(),
+ averaging_factor,
+ running_means.device(),
+ running_variances.device(),
+ eps,
+ means.device(),
+ invstds.device()));
+ }
+
+ void batch_normalize_gradient(
+ const double eps,
+ const tensor& gradient_input,
+ const tensor& means,
+ const tensor& invstds,
+ const tensor& src,
+ const tensor& gamma,
+ tensor& src_grad,
+ tensor& gamma_grad,
+ tensor& beta_grad
+ )
+ {
+ const long num = src.k()*src.nr()*src.nc();
+ DLIB_CASSERT(src.num_samples() > 1);
+ DLIB_CASSERT(num == (long)means.size());
+ DLIB_CASSERT(num == (long)invstds.size());
+ DLIB_CASSERT(num == (long)gamma.size());
+ DLIB_CASSERT(num == (long)gamma_grad.size());
+ DLIB_CASSERT(num == (long)beta_grad.size());
+ DLIB_CASSERT(have_same_dimensions(gradient_input, src));
+ DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad));
+ DLIB_CASSERT(eps > 0);
+
+ const float in_scale = 1;
+ const float out_scale = 1;
+ const float in_scale_params = 1;
+ const float out_scale_params = 0;
+
+ CHECK_CUDNN(cudnnBatchNormalizationBackward(
+ context(),
+ CUDNN_BATCHNORM_PER_ACTIVATION,
+ &in_scale,
+ &out_scale,
+ &in_scale_params,
+ &out_scale_params,
+ descriptor(src),
+ src.device(),
+ descriptor(gradient_input),
+ gradient_input.device(),
+ descriptor(src_grad),
+ src_grad.device(),
+ descriptor(gamma),
+ gamma.device(),
+ gamma_grad.device(),
+ beta_grad.device(),
+ eps,
+ means.device(),
+ invstds.device()));
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void batch_normalize_conv_inference (
+ const double eps,
+ resizable_tensor& dest,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta,
+ const tensor& running_means,
+ const tensor& running_variances
+ )
+ {
+ DLIB_CASSERT(
+ gamma.num_samples() == 1 &&
+ gamma.nr() == 1 &&
+ gamma.nc() == 1 &&
+ gamma.k() == src.k() &&
+ have_same_dimensions(gamma, beta) &&
+ have_same_dimensions(gamma, running_means) &&
+ have_same_dimensions(gamma, running_variances) &&
+ eps > 0,
+ "\ngamma.num_samples(): " << gamma.num_samples() <<
+ "\ngamma.k(): " << gamma.k() <<
+ "\ngamma.nr(): " << gamma.nr() <<
+ "\ngamma.nc(): " << gamma.nc() <<
+ "\nbeta.num_samples(): " << beta.num_samples() <<
+ "\nbeta.k(): " << beta.k() <<
+ "\nbeta.nr(): " << beta.nr() <<
+ "\nbeta.nc(): " << beta.nc() <<
+ "\nrunning_means.num_samples(): " << running_means.num_samples() <<
+ "\nrunning_means.k(): " << running_means.k() <<
+ "\nrunning_means.nr(): " << running_means.nr() <<
+ "\nrunning_means.nc(): " << running_means.nc() <<
+ "\nrunning_variances.num_samples(): " << running_variances.num_samples() <<
+ "\nrunning_variances.k(): " << running_variances.k() <<
+ "\nrunning_variances.nr(): " << running_variances.nr() <<
+ "\nrunning_variances.nc(): " << running_variances.nc() <<
+ "\nsrc.k(): " << src.k() <<
+ "\nsrc.nr(): " << src.nr() <<
+ "\nsrc.nc(): " << src.nc() <<
+ "\neps: " << eps
+ );
+ const float in_scale = 1;
+ const float out_scale = 0;
+
+ dest.copy_size(src);
+
+ CHECK_CUDNN(cudnnBatchNormalizationForwardInference(
+ context(),
+ CUDNN_BATCHNORM_SPATIAL,
+ &in_scale,
+ &out_scale,
+ descriptor(src),
+ src.device(),
+ descriptor(dest),
+ dest.device(),
+ descriptor(gamma),
+ gamma.device(),
+ beta.device(),
+ running_means.device(),
+ running_variances.device(),
+ eps));
+ }
+
+ void batch_normalize_conv (
+ const double eps,
+ resizable_tensor& dest,
+ resizable_tensor& means,
+ resizable_tensor& invstds,
+ const double averaging_factor,
+ resizable_tensor& running_means,
+ resizable_tensor& running_variances,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta
+ )
+ {
+ DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor);
+ DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means));
+ DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds));
+ DLIB_CASSERT(
+ src.num_samples() > 1 &&
+ gamma.num_samples() == 1 &&
+ beta.num_samples() == 1 &&
+ gamma.nr() == 1 &&
+ beta.nr() == 1 &&
+ gamma.nc() == 1 &&
+ beta.nc() == 1 &&
+ gamma.k() == beta.k() && beta.k() == src.k() &&
+ eps > 0,
+ "\ngamma.num_samples(): " << gamma.num_samples() <<
+ "\ngamma.k(): " << gamma.k() <<
+ "\ngamma.nr(): " << gamma.nr() <<
+ "\ngamma.nc(): " << gamma.nc() <<
+ "\nbeta.num_samples(): " << beta.num_samples() <<
+ "\nbeta.k(): " << beta.k() <<
+ "\nbeta.nr(): " << beta.nr() <<
+ "\nbeta.nc(): " << beta.nc() <<
+ "\nsrc.k(): " << src.k() <<
+ "\nsrc.nr(): " << src.nr() <<
+ "\nsrc.nc(): " << src.nc() <<
+ "\neps: " << eps
+ );
+ const float in_scale = 1;
+ const float out_scale = 0;
+
+ dest.copy_size(src);
+ means.set_size(1, src.k());
+ invstds.copy_size(means);
+ running_means.copy_size(means);
+ running_variances.copy_size(means);
+ // cuDNN requires that running_means and running_variances be initialized to
+ // some valid float values even if the averaging factor would have ignored
+ // them.
+ if (averaging_factor == 1)
+ {
+ running_means = 0;
+ running_variances = 1;
+ }
+
+ CHECK_CUDNN(cudnnBatchNormalizationForwardTraining(
+ context(),
+ CUDNN_BATCHNORM_SPATIAL,
+ &in_scale,
+ &out_scale,
+ descriptor(src),
+ src.device(),
+ descriptor(dest),
+ dest.device(),
+ descriptor(gamma),
+ gamma.device(),
+ beta.device(),
+ averaging_factor,
+ running_means.device(),
+ running_variances.device(),
+ eps,
+ means.device(),
+ invstds.device()));
+ }
+
+ void batch_normalize_conv_gradient(
+ const double eps,
+ const tensor& gradient_input,
+ const tensor& means,
+ const tensor& invstds,
+ const tensor& src,
+ const tensor& gamma,
+ tensor& src_grad,
+ tensor& gamma_grad,
+ tensor& beta_grad
+ )
+ {
+ DLIB_CASSERT(src.k() == (long)means.size());
+ DLIB_CASSERT(src.k() == (long)invstds.size());
+ DLIB_CASSERT(src.k() == (long)gamma.size());
+ DLIB_CASSERT(src.k() == (long)gamma_grad.size());
+ DLIB_CASSERT(src.k() == (long)beta_grad.size());
+ DLIB_CASSERT(have_same_dimensions(gradient_input, src));
+ DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad));
+ DLIB_CASSERT(eps > 0);
+
+ const float in_scale = 1;
+ const float out_scale = 1;
+ const float in_scale_params = 1;
+ const float out_scale_params = 0;
+
+ CHECK_CUDNN(cudnnBatchNormalizationBackward(
+ context(),
+ CUDNN_BATCHNORM_SPATIAL,
+ &in_scale,
+ &out_scale,
+ &in_scale_params,
+ &out_scale_params,
+ descriptor(src),
+ src.device(),
+ descriptor(gradient_input),
+ gradient_input.device(),
+ descriptor(src_grad),
+ src_grad.device(),
+ descriptor(gamma),
+ gamma.device(),
+ gamma_grad.device(),
+ beta_grad.device(),
+ eps,
+ means.device(),
+ invstds.device()));
+ }
+
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+
+ tensor_conv::
+ tensor_conv(
+ ) :
+ filter_handle(nullptr),
+ conv_handle(nullptr),
+ forward_algo(0),
+ backward_data_algo(0),
+ backward_filters_algo(0)
+ {
+ clear();
+ }
+
+ void tensor_conv::
+ clear (
+ )
+ {
+ if (filter_handle)
+ cudnnDestroyFilterDescriptor((cudnnFilterDescriptor_t)filter_handle);
+ if (conv_handle)
+ cudnnDestroyConvolutionDescriptor((cudnnConvolutionDescriptor_t)conv_handle);
+ filter_handle = nullptr;
+ conv_handle = nullptr;
+ out_num_samples = 0;
+ out_k = 0;
+ out_nr = 0;
+ out_nc = 0;
+
+ stride_y = 0;
+ stride_x = 0;
+ padding_y = 0;
+ padding_x = 0;
+ data_num_samples = 0;
+ data_k = 0;
+ data_nr = 0;
+ data_nc = 0;
+ filters_num_samples = 0;
+ filters_k = 0;
+ filters_nr = 0;
+ filters_nc = 0;
+
+ forward_algo = 0;
+ backward_data_algo = 0;
+ backward_filters_algo = 0;
+
+ forward_workspace_size_in_bytes = 0;
+ backward_data_workspace_size_in_bytes = 0;
+ backward_filters_workspace_size_in_bytes = 0;
+
+ forward_workspace.reset();
+ backward_data_workspace.reset();
+ backward_filters_workspace.reset();
+ workspace.reset();
+ }
+
+ void tensor_conv::
+ setup(
+ const tensor& data,
+ const tensor& filters,
+ int stride_y_,
+ int stride_x_,
+ int padding_y_,
+ int padding_x_
+ )
+ {
+ DLIB_CASSERT(data.k() == filters.k());
+
+            // If the last call to setup() used exactly the same settings then don't do
+            // anything.
+ if (stride_y_ == stride_y &&
+ stride_x_ == stride_x &&
+ padding_y_ == padding_y &&
+ padding_x_ == padding_x &&
+ data_num_samples == data.num_samples() &&
+ data_k == data.k() &&
+ data_nr == data.nr() &&
+ data_nc == data.nc() &&
+ filters_num_samples == filters.num_samples() &&
+ filters_k == filters.k() &&
+ filters_nr == filters.nr() &&
+ filters_nc == filters.nc())
+ {
+ return;
+ }
+
+ clear();
+ try
+ {
+ stride_y = stride_y_;
+ stride_x = stride_x_;
+ padding_y = padding_y_;
+ padding_x = padding_x_;
+ data_num_samples = data.num_samples();
+ data_k = data.k();
+ data_nr = data.nr();
+ data_nc = data.nc();
+ filters_num_samples = filters.num_samples();
+ filters_k = filters.k();
+ filters_nr = filters.nr();
+ filters_nc = filters.nc();
+
+ CHECK_CUDNN(cudnnCreateFilterDescriptor((cudnnFilterDescriptor_t*)&filter_handle));
+ CHECK_CUDNN(cudnnSetFilter4dDescriptor((cudnnFilterDescriptor_t)filter_handle,
+ CUDNN_DATA_FLOAT,
+ CUDNN_TENSOR_NCHW,
+ filters.num_samples(),
+ filters.k(),
+ filters.nr(),
+ filters.nc()));
+
+ CHECK_CUDNN(cudnnCreateConvolutionDescriptor((cudnnConvolutionDescriptor_t*)&conv_handle));
+#if CUDNN_MAJOR >= 6
+ CHECK_CUDNN(cudnnSetConvolution2dDescriptor((cudnnConvolutionDescriptor_t)conv_handle,
+ padding_y, // vertical padding
+ padding_x, // horizontal padding
+ stride_y,
+ stride_x,
+ 1, 1, // must be 1,1
+ CUDNN_CROSS_CORRELATION,
+ CUDNN_DATA_FLOAT)); // could also be CUDNN_CONVOLUTION
+#else
+ CHECK_CUDNN(cudnnSetConvolution2dDescriptor((cudnnConvolutionDescriptor_t)conv_handle,
+ padding_y, // vertical padding
+ padding_x, // horizontal padding
+ stride_y,
+ stride_x,
+ 1, 1, // must be 1,1
+ CUDNN_CROSS_CORRELATION)); // could also be CUDNN_CONVOLUTION
+#endif
+
+ CHECK_CUDNN(cudnnGetConvolution2dForwardOutputDim(
+ (const cudnnConvolutionDescriptor_t)conv_handle,
+ descriptor(data),
+ (const cudnnFilterDescriptor_t)filter_handle,
+ &out_num_samples,
+ &out_k,
+ &out_nr,
+ &out_nc));
+
+ tensor_descriptor dest_desc;
+ dest_desc.set_size(out_num_samples,out_k,out_nr,out_nc);
+
+ // Pick which forward algorithm we will use and allocate the necessary
+ // workspace buffer.
+ cudnnConvolutionFwdAlgo_t forward_best_algo;
+ CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithm(
+ context(),
+ descriptor(data),
+ (const cudnnFilterDescriptor_t)filter_handle,
+ (const cudnnConvolutionDescriptor_t)conv_handle,
+ descriptor(dest_desc),
+ dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_FWD_PREFER_FASTEST:CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
+ std::numeric_limits<size_t>::max(),
+ &forward_best_algo));
+ forward_algo = forward_best_algo;
+ CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(
+ context(),
+ descriptor(data),
+ (const cudnnFilterDescriptor_t)filter_handle,
+ (const cudnnConvolutionDescriptor_t)conv_handle,
+ descriptor(dest_desc),
+ forward_best_algo,
+ &forward_workspace_size_in_bytes));
+
+ // Pick which backward data algorithm we will use and allocate the
+ // necessary workspace buffer.
+ cudnnConvolutionBwdDataAlgo_t backward_data_best_algo;
+ CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithm(
+ context(),
+ (const cudnnFilterDescriptor_t)filter_handle,
+ descriptor(dest_desc),
+ (const cudnnConvolutionDescriptor_t)conv_handle,
+ descriptor(data),
+ dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST:CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE,
+ std::numeric_limits<size_t>::max(),
+ &backward_data_best_algo));
+ backward_data_algo = backward_data_best_algo;
+
+ CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(
+ context(),
+ (const cudnnFilterDescriptor_t)filter_handle,
+ descriptor(dest_desc),
+ (const cudnnConvolutionDescriptor_t)conv_handle,
+ descriptor(data),
+ backward_data_best_algo,
+ &backward_data_workspace_size_in_bytes));
+
+ // Pick which backward filters algorithm we will use and allocate the
+ // necessary workspace buffer.
+ cudnnConvolutionBwdFilterAlgo_t backward_filters_best_algo;
+ CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithm(
+ context(),
+ descriptor(data),
+ descriptor(dest_desc),
+ (const cudnnConvolutionDescriptor_t)conv_handle,
+ (const cudnnFilterDescriptor_t)filter_handle,
+ dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST:CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE,
+ std::numeric_limits<size_t>::max(),
+ &backward_filters_best_algo));
+ // cuDNN 5.1 has a bug that causes
+ // cudnnGetConvolutionBackwardFilterAlgorithm() to pick the winograd
+ // algorithm even for cases where cuDNN doesn't support it, leading to
+ // incorrect outputs. So here we check if we are in a case where winograd
+ // isn't supported and manually overrule
+ // cudnnGetConvolutionBackwardFilterAlgorithm() by picking a safe
+ // algorithm.
+ if (dnn_prefer_fastest_algorithms() &&
+ !(stride_x == 1 && stride_y == 1 && ((filters_nr==3&&filters_nc==3) || (filters_nr==5&&filters_nc==5)))
+ )
+ {
+ backward_filters_best_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+ }
+ backward_filters_algo = backward_filters_best_algo;
+
+ CHECK_CUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+ context(),
+ descriptor(data),
+ descriptor(dest_desc),
+ (const cudnnConvolutionDescriptor_t)conv_handle,
+ (const cudnnFilterDescriptor_t)filter_handle,
+ backward_filters_best_algo,
+ &backward_filters_workspace_size_in_bytes));
+
+ workspace = device_global_buffer();
+ }
+ catch(...)
+ {
+ clear();
+ throw;
+ }
+ }
+
+ tensor_conv::
+ ~tensor_conv (
+ )
+ {
+ clear();
+ }
+
+ void tensor_conv::operator() (
+ const bool add_to_output,
+ resizable_tensor& output,
+ const tensor& data,
+ const tensor& filters
+ )
+ {
+ DLIB_CASSERT(stride_y > 0 && stride_x > 0, "You must call setup() before calling this function");
+
+ output.set_size(out_num_samples, out_k, out_nr, out_nc);
+ (*this)(add_to_output, static_cast<tensor&>(output), data, filters);
+ }
+
+ void tensor_conv::operator() (
+ const bool add_to_output,
+ tensor& output,
+ const tensor& data,
+ const tensor& filters
+ )
+ {
+ DLIB_CASSERT(is_same_object(output,data) == false);
+ DLIB_CASSERT(is_same_object(output,filters) == false);
+ DLIB_CASSERT(filters.k() == data.k());
+ DLIB_CASSERT(stride_y > 0 && stride_x > 0, "You must call setup() before calling this function");
+ DLIB_CASSERT(filters.nc() <= data.nc() + 2*padding_x,
+ "Filter windows must be small enough to fit into the padded image."
+ << "\n\t filters.nc(): " << filters.nc()
+ << "\n\t data.nc(): " << data.nc()
+ << "\n\t padding_x: " << padding_x
+ );
+ DLIB_CASSERT(filters.nr() <= data.nr() + 2*padding_y,
+ "Filter windows must be small enough to fit into the padded image."
+ << "\n\t filters.nr(): " << filters.nr()
+ << "\n\t data.nr(): " << data.nr()
+ << "\n\t padding_y: " << padding_y
+ );
+
+
+ DLIB_CASSERT(output.num_samples() == data.num_samples(),out_num_samples << " " << data.num_samples());
+ DLIB_CASSERT(output.k() == filters.num_samples());
+ DLIB_CASSERT(output.nr() == 1+(data.nr()+2*padding_y-filters.nr())/stride_y);
+ DLIB_CASSERT(output.nc() == 1+(data.nc()+2*padding_x-filters.nc())/stride_x);
+
+
+
+ const float alpha = 1;
+ const float beta = add_to_output ? 1 : 0;
+
+ // Since cudnnConvolutionForward() is an asynchronous call, we need to hold a
+ // reference to the workspace buffer so we can be sure it isn't reallocated
+ // while the function is still executing on the device. But each time we come
+ // here, we make sure to grab the latest workspace buffer so that, globally, we
+ // minimize the number of such buffers.
+ forward_workspace = workspace->get(forward_workspace_size_in_bytes);
+
+ CHECK_CUDNN(cudnnConvolutionForward(
+ context(),
+ &alpha,
+ descriptor(data),
+ data.device(),
+ (const cudnnFilterDescriptor_t)filter_handle,
+ filters.device(),
+ (const cudnnConvolutionDescriptor_t)conv_handle,
+ (cudnnConvolutionFwdAlgo_t)forward_algo,
+ forward_workspace,
+ forward_workspace_size_in_bytes,
+ &beta,
+ descriptor(output),
+ output.device()));
+ }
+
+ void tensor_conv::get_gradient_for_data (
+ const bool add_to_output,
+ const tensor& gradient_input,
+ const tensor& filters,
+ tensor& data_gradient
+ )
+ {
+ const float alpha = 1;
+ const float beta = add_to_output ? 1 : 0;
+
+ // Since cudnnConvolutionBackwardData() is an asynchronous call, we need to hold a
+ // reference to the workspace buffer so we can be sure it isn't reallocated
+ // while the function is still executing on the device. But each time we come
+ // here, we make sure to grab the latest workspace buffer so that, globally, we
+ // minimize the number of such buffers.
+ backward_data_workspace = workspace->get(backward_data_workspace_size_in_bytes);
+
+
+ CHECK_CUDNN(cudnnConvolutionBackwardData(context(),
+ &alpha,
+ (const cudnnFilterDescriptor_t)filter_handle,
+ filters.device(),
+ descriptor(gradient_input),
+ gradient_input.device(),
+ (const cudnnConvolutionDescriptor_t)conv_handle,
+ (cudnnConvolutionBwdDataAlgo_t)backward_data_algo,
+ backward_data_workspace,
+ backward_data_workspace_size_in_bytes,
+ &beta,
+ descriptor(data_gradient),
+ data_gradient.device()));
+ }
+
+ void tensor_conv::
+ get_gradient_for_filters (
+ const bool add_to_output,
+ const tensor& gradient_input,
+ const tensor& data,
+ tensor& filters_gradient
+ )
+ {
+ const float alpha = 1;
+ const float beta = add_to_output ? 1 : 0;
+
+ // Since cudnnConvolutionBackwardFilter() is an asynchronous call, we need to hold a
+ // reference to the workspace buffer so we can be sure it isn't reallocated
+ // while the function is still executing on the device. But each time we come
+ // here, we make sure to grab the latest workspace buffer so that, globally, we
+ // minimize the number of such buffers.
+ backward_filters_workspace = workspace->get(backward_filters_workspace_size_in_bytes);
+
+ CHECK_CUDNN(cudnnConvolutionBackwardFilter(context(),
+ &alpha,
+ descriptor(data),
+ data.device(),
+ descriptor(gradient_input),
+ gradient_input.device(),
+ (const cudnnConvolutionDescriptor_t)conv_handle,
+ (cudnnConvolutionBwdFilterAlgo_t)backward_filters_algo,
+ backward_filters_workspace,
+ backward_filters_workspace_size_in_bytes,
+ &beta,
+ (const cudnnFilterDescriptor_t)filter_handle,
+ filters_gradient.device()));
+ }
+
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+
+ pooling::pooling (
+ ) : handle(nullptr),window_height(0),window_width(0),stride_y(0),stride_x(0),padding_y(0), padding_x(0)
+ {
+ }
+
+ pooling::~pooling(
+ )
+ {
+ clear();
+ }
+
+ void pooling::
+ clear(
+ )
+ {
+ if (handle)
+ cudnnDestroyPoolingDescriptor((cudnnPoolingDescriptor_t)handle);
+ handle = nullptr;
+ window_height = 0;
+ window_width = 0;
+ stride_y = 0;
+ stride_x = 0;
+ padding_y = 0;
+ padding_x = 0;
+ }
+
+ void pooling::
+ setup_max_pooling(
+ int window_height_,
+ int window_width_,
+ int stride_y_,
+ int stride_x_,
+ int padding_y_,
+ int padding_x_
+ )
+ {
+ setup(window_height_, window_width_, stride_y_, stride_x_, padding_y_, padding_x_, CUDNN_POOLING_MAX);
+ do_max_pooling = true;
+ }
+
+ void pooling::
+ setup_avg_pooling(
+ int window_height_,
+ int window_width_,
+ int stride_y_,
+ int stride_x_,
+ int padding_y_,
+ int padding_x_
+ )
+ {
+ setup(window_height_, window_width_, stride_y_, stride_x_, padding_y_, padding_x_, CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING);
+ do_max_pooling = false;
+ }
+
+ void pooling::
+ setup(
+ int window_height_,
+ int window_width_,
+ int stride_y_,
+ int stride_x_,
+ int padding_y_,
+ int padding_x_,
+ int pooling_mode
+ )
+ {
+ DLIB_CASSERT (window_height_ > 0 && window_width_ > 0 &&
+ stride_y_ > 0 && stride_x_ > 0 ,
+ "window_height_: " << window_height_
+ << "\t\n window_width_: " << window_width_
+ << "\t\n stride_y_: " << stride_y_
+ << "\t\n stride_x_: " << stride_x_ );
+ DLIB_CASSERT( 0 <= padding_y_ && padding_y_ < window_height_ &&
+ 0 <= padding_x_ && padding_x_ < window_width_,
+ "window_height_: " << window_height_
+ << "\t\n window_width_: " << window_width_
+ << "\t\n padding_y_: " << padding_y_
+ << "\t\n padding_x_: " << padding_x_ );
+
+ if (window_height == window_height_ &&
+ window_width == window_width_ &&
+ stride_y == stride_y_ &&
+ stride_x == stride_x_ &&
+ padding_y == padding_y_ &&
+ padding_x == padding_x_
+ )
+ {
+ return;
+ }
+
+ clear();
+ try
+ {
+ window_height = window_height_;
+ window_width = window_width_;
+ stride_x = stride_x_;
+ stride_y = stride_y_;
+ padding_y = padding_y_;
+ padding_x = padding_x_;
+ cudnnPoolingDescriptor_t poolingDesc;
+ CHECK_CUDNN(cudnnCreatePoolingDescriptor(&poolingDesc));
+ handle = poolingDesc;
+
+ CHECK_CUDNN(cudnnSetPooling2dDescriptor(poolingDesc,
+ (cudnnPoolingMode_t)pooling_mode,
+ CUDNN_PROPAGATE_NAN,
+ window_height,
+ window_width,
+ padding_y,
+ padding_x,
+ stride_y,
+ stride_x));
+ }
+ catch(...)
+ {
+ clear();
+ throw;
+ }
+ }
+
+ void pooling::
+ operator() (
+ resizable_tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(window_width <= src.nc() + 2*padding_x,
+ "Pooling windows must be small enough to fit into the padded image."
+ << "\n\t window_width: " << window_width
+ << "\n\t src.nc(): " << src.nc()
+ << "\n\t padding_x: " << padding_x
+ );
+ DLIB_CASSERT(window_height <= src.nr() + 2*padding_y,
+ "Pooling windows must be small enough to fit into the padded image."
+ << "\n\t window_height: " << window_height
+ << "\n\t src.nr(): " << src.nr()
+ << "\n\t padding_y: " << padding_y
+ );
+ const float alpha = 1;
+ const float beta = 0;
+ int outN;
+ int outC;
+ int outH;
+ int outW;
+ CHECK_CUDNN(cudnnGetPooling2dForwardOutputDim((const cudnnPoolingDescriptor_t)handle,
+ descriptor(src),
+ &outN,
+ &outC,
+ &outH,
+ &outW));
+
+
+ dest.set_size(outN,outC,outH,outW);
+
+ DLIB_CASSERT(dest.num_samples() == src.num_samples());
+ DLIB_CASSERT(dest.k() == src.k());
+ DLIB_CASSERT(dest.nr() == 1 + (src.nr() + 2*padding_y - window_height)/stride_y,
+ "\n stride_y: " << stride_y <<
+ "\n padding_y: " << padding_y <<
+ "\n window_height: " << window_height <<
+ "\n src.nr(): " << src.nr() <<
+ "\n dest.nr(): " << dest.nr() <<
+ "\n src.nr()/stride_y: " << src.nr()/stride_y);
+ DLIB_CASSERT(dest.nc() == 1 + (src.nc() + 2*padding_x - window_width)/stride_x,
+ "\n stride_x: " << stride_x <<
+ "\n padding_x: " << padding_x <<
+ "\n window_width: " << window_width <<
+ "\n src.nc(): " << src.nc() <<
+ "\n dest.nc(): " << dest.nc() <<
+ "\n src.nc()/stride_x: " << src.nc()/stride_x);
+
+ CHECK_CUDNN(cudnnPoolingForward(context(),
+ (const cudnnPoolingDescriptor_t)handle,
+ &alpha,
+ descriptor(src),
+ src.device(),
+ &beta,
+ descriptor(dest),
+ dest.device()));
+ }
+
+ void pooling::get_gradient(
+ const tensor& gradient_input,
+ const tensor& dest,
+ const tensor& src,
+ tensor& grad
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(gradient_input,dest));
+ DLIB_CASSERT(have_same_dimensions(src,grad));
+
+ const float alpha = 1;
+ const float beta = 1;
+ CHECK_CUDNN(cudnnPoolingBackward(context(),
+ (const cudnnPoolingDescriptor_t)handle,
+ &alpha,
+ descriptor(dest),
+ dest.device(),
+ descriptor(gradient_input),
+ gradient_input.device(),
+ descriptor(src),
+ src.device(),
+ &beta,
+ descriptor(grad),
+ grad.device()));
+ }
+
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+
+ void softmax (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(dest,src));
+ if (src.size() == 0)
+ return;
+
+ const float alpha = 1;
+ const float beta = 0;
+
+ CHECK_CUDNN(cudnnSoftmaxForward(context(),
+ CUDNN_SOFTMAX_ACCURATE,
+ CUDNN_SOFTMAX_MODE_CHANNEL,
+ &alpha,
+ descriptor(src),
+ src.device(),
+ &beta,
+ descriptor(dest),
+ dest.device()));
+ }
+
+
+ void softmax_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ )
+ {
+ DLIB_CASSERT(
+ have_same_dimensions(dest,gradient_input) == true &&
+ have_same_dimensions(dest,grad) == true );
+ if (dest.size() == 0)
+ return;
+
+ const float alpha = 1;
+ const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
+ CHECK_CUDNN(cudnnSoftmaxBackward(context(),
+ CUDNN_SOFTMAX_ACCURATE,
+ CUDNN_SOFTMAX_MODE_CHANNEL,
+ &alpha,
+ descriptor(dest),
+ dest.device(),
+ descriptor(gradient_input),
+ gradient_input.device(),
+ &beta,
+ descriptor(grad),
+ grad.device()));
+ }
+
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+
+ void softmax_all (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(dest,src));
+ if (src.size() == 0)
+ return;
+
+ const float alpha = 1;
+ const float beta = 0;
+
+ CHECK_CUDNN(cudnnSoftmaxForward(context(),
+ CUDNN_SOFTMAX_ACCURATE,
+ CUDNN_SOFTMAX_MODE_INSTANCE,
+ &alpha,
+ descriptor(src),
+ src.device(),
+ &beta,
+ descriptor(dest),
+ dest.device()));
+ }
+
+
+ void softmax_all_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ )
+ {
+ DLIB_CASSERT(
+ have_same_dimensions(dest,gradient_input) == true &&
+ have_same_dimensions(dest,grad) == true );
+ if (dest.size() == 0)
+ return;
+
+ const float alpha = 1;
+ const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
+ CHECK_CUDNN(cudnnSoftmaxBackward(context(),
+ CUDNN_SOFTMAX_ACCURATE,
+ CUDNN_SOFTMAX_MODE_INSTANCE,
+ &alpha,
+ descriptor(dest),
+ dest.device(),
+ descriptor(gradient_input),
+ gradient_input.device(),
+ &beta,
+ descriptor(grad),
+ grad.device()));
+ }
+
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+
+ void sigmoid (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(dest,src));
+ if (src.size() == 0)
+ return;
+
+ const float alpha = 1;
+ const float beta = 0;
+ CHECK_CUDNN(cudnnActivationForward(context(),
+ sigmoid_activation_descriptor(),
+ &alpha,
+ descriptor(src),
+ src.device(),
+ &beta,
+ descriptor(dest),
+ dest.device()));
+ }
+
+ void sigmoid_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ )
+ {
+ DLIB_CASSERT(
+ have_same_dimensions(dest,gradient_input) == true &&
+ have_same_dimensions(dest,grad) == true );
+ if (dest.size() == 0)
+ return;
+
+ const float alpha = 1;
+ const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
+ CHECK_CUDNN(cudnnActivationBackward(context(),
+ sigmoid_activation_descriptor(),
+ &alpha,
+ descriptor(dest),
+ dest.device(),
+ descriptor(gradient_input),
+ gradient_input.device(),
+ descriptor(dest),
+ dest.device(),
+ &beta,
+ descriptor(grad),
+ grad.device()));
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void relu (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(dest,src));
+ if (src.size() == 0)
+ return;
+
+ const float alpha = 1;
+ const float beta = 0;
+ CHECK_CUDNN(cudnnActivationForward(context(),
+ relu_activation_descriptor(),
+ &alpha,
+ descriptor(src),
+ src.device(),
+ &beta,
+ descriptor(dest),
+ dest.device()));
+ }
+
+ void relu_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ )
+ {
+ DLIB_CASSERT(
+ have_same_dimensions(dest,gradient_input) == true &&
+ have_same_dimensions(dest,grad) == true );
+ if (dest.size() == 0)
+ return;
+
+ const float alpha = 1;
+ const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
+ CHECK_CUDNN(cudnnActivationBackward(context(),
+ relu_activation_descriptor(),
+ &alpha,
+ descriptor(dest),
+ dest.device(),
+ descriptor(gradient_input),
+ gradient_input.device(),
+ descriptor(dest),
+ dest.device(),
+ &beta,
+ descriptor(grad),
+ grad.device()));
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void tanh (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(dest,src));
+ if (src.size() == 0)
+ return;
+
+ const float alpha = 1;
+ const float beta = 0;
+ CHECK_CUDNN(cudnnActivationForward(context(),
+ tanh_activation_descriptor(),
+ &alpha,
+ descriptor(src),
+ src.device(),
+ &beta,
+ descriptor(dest),
+ dest.device()));
+ }
+
+ void tanh_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ )
+ {
+ DLIB_CASSERT(
+ have_same_dimensions(dest,gradient_input) == true &&
+ have_same_dimensions(dest,grad) == true);
+ if (dest.size() == 0)
+ return;
+
+ const float alpha = 1;
+ const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
+ CHECK_CUDNN(cudnnActivationBackward(context(),
+ tanh_activation_descriptor(),
+ &alpha,
+ descriptor(dest),
+ dest.device(),
+ descriptor(gradient_input),
+ gradient_input.device(),
+ descriptor(dest),
+ dest.device(),
+ &beta,
+ descriptor(grad),
+ grad.device()));
+ }
+
+ // ------------------------------------------------------------------------------------
+ }
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuDNN_CPP_
+
+
diff --git a/ml/dlib/dlib/dnn/cudnn_dlibapi.h b/ml/dlib/dlib/dnn/cudnn_dlibapi.h
new file mode 100644
index 000000000..e9ffe5f6d
--- /dev/null
+++ b/ml/dlib/dlib/dnn/cudnn_dlibapi.h
@@ -0,0 +1,518 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuDNN_H_
+#define DLIB_DNN_CuDNN_H_
+
+#ifdef DLIB_USE_CUDA
+
+#include "cuda_errors.h"
+#include <memory>
+#include "cuda_data_ptr.h"
+
+namespace dlib
+{
+ class tensor;
+ class resizable_tensor;
+
+ namespace cuda
+ {
+
+ // -----------------------------------------------------------------------------------
+
+ class tensor_descriptor
+ {
+ /*!
+ Each tensor object will carry a tensor_descriptor in it when compiled with
+ CUDA.
+ !*/
+
+ public:
+ // not copyable
+ tensor_descriptor(const tensor_descriptor&) = delete;
+ tensor_descriptor& operator=(const tensor_descriptor&) = delete;
+ // but is movable
+ tensor_descriptor(tensor_descriptor&& item) : tensor_descriptor() { swap(item); }
+ tensor_descriptor& operator=(tensor_descriptor&& item) { swap(item); return *this; }
+
+ tensor_descriptor();
+ ~tensor_descriptor();
+
+ void set_size(
+ int n,
+ int k,
+ int nr,
+ int nc
+ );
+ /*!
+ ensures
+ - if any of the arguments are 0 then they are all set to 0 in the tensor.
+ !*/
+
+ void get_size (
+ int& n,
+ int& k,
+ int& nr,
+ int& nc
+ ) const;
+
+ const void* get_handle (
+ ) const { return handle; }
+
+ private:
+
+ void swap(tensor_descriptor& item) { std::swap(handle, item.handle); }
+
+ void* handle;
+ };
+
+ // ------------------------------------------------------------------------------------
+
+ void add(
+ float beta,
+ tensor& dest,
+ float alpha,
+ const tensor& src
+ );
+ /*!
+ requires
+ - One of the following is true:
+ - have_same_dimensions(src, dest)
+ - src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1
+ - src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc()
+ - src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc()
+ - is_same_object(src,dest) == false
+ ensures
+ - performs: dest = beta*dest + alpha*src
+ However, how the addition happens depends on the dimensions of src. In
+ particular, this function adds the scaled values of one src tensor to
+ dest. Each dimension of the src tensor must match the corresponding
+ dimension of the dest tensor or must be equal to 1. In the latter case,
+ the same value from the src tensor, for those dimensions, will be used to
+ add into the dest tensor.
+ !*/
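A minimal sketch of the second broadcasting case listed above (a per-channel bias of shape 1 x k x 1 x 1 added to every sample, row, and column). The tensor shapes are made up for illustration; the call goes through tt::add() from tensor_tools.h, which dispatches to this add() on CUDA builds:

#include <dlib/dnn.h>

int main()
{
    dlib::resizable_tensor dest(2, 3, 4, 4);   // 2 samples, 3 channels, 4x4 each
    dlib::resizable_tensor bias(1, 3, 1, 1);   // one value per channel
    dest = 0;
    bias = 1;

    // dest = 1*dest + 1*bias, with bias broadcast over samples, rows, and columns.
    dlib::tt::add(1, dest, 1, bias);
}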
+
+ // ------------------------------------------------------------------------------------
+
+ void assign_conv_bias_gradient (
+ tensor& grad,
+ const tensor& gradient_input
+ );
+ /*!
+ requires
+ - grad.num_samples() == 1
+ - grad.k() >= 1
+ - grad.nr() == 1
+ - grad.nc() == 1
+ - gradient_input.k() == grad.k()
+ - gradient_input.size() > 0
+ - is_same_object(grad,gradient_input) == false
+ ensures
+ - let BIAS be a tensor with all dimensions equal to 1 except for k which is >= 1.
+ - let OUT be the output of add(1,OUT,1,BIAS)
+ - let f(gradient_input,BIAS) == dot(gradient_input,OUT)
+ - Then this function computes the gradient of f() with respect to BIAS and
+ assigns it to grad.
+ !*/
+
+ // ------------------------------------------------------------------------------------
+
+ void batch_normalize_inference (
+ const double eps,
+ resizable_tensor& dest,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta,
+ const tensor& running_means,
+ const tensor& running_variances
+ );
+
+ void batch_normalize (
+ const double eps,
+ resizable_tensor& dest,
+ resizable_tensor& means,
+ resizable_tensor& invstds,
+ const double averaging_factor,
+ resizable_tensor& running_means,
+ resizable_tensor& running_variances,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta
+ );
+
+ void batch_normalize_gradient(
+ const double eps,
+ const tensor& gradient_input,
+ const tensor& means,
+ const tensor& invstds,
+ const tensor& src,
+ const tensor& gamma,
+ tensor& src_grad,
+ tensor& gamma_grad,
+ tensor& beta_grad
+ );
+
+ // ------------------------------------------------------------------------------------
+
+ void batch_normalize_conv_inference (
+ const double eps,
+ resizable_tensor& dest,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta,
+ const tensor& running_means,
+ const tensor& running_variances
+ );
+
+ void batch_normalize_conv (
+ const double eps,
+ resizable_tensor& dest,
+ resizable_tensor& means,
+ resizable_tensor& invstds,
+ const double averaging_factor,
+ resizable_tensor& running_means,
+ resizable_tensor& running_variances,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta
+ );
+
+ void batch_normalize_conv_gradient(
+ const double eps,
+ const tensor& gradient_input,
+ const tensor& means,
+ const tensor& invstds,
+ const tensor& src,
+ const tensor& gamma,
+ tensor& src_grad,
+ tensor& gamma_grad,
+ tensor& beta_grad
+ );
+
+ // ------------------------------------------------------------------------------------
+
+ class tensor_conv
+ {
+ public:
+ tensor_conv(const tensor_conv&) = delete;
+ tensor_conv& operator=(const tensor_conv&) = delete;
+
+ tensor_conv();
+
+ void clear(
+ );
+
+ ~tensor_conv (
+ );
+
+ void operator() (
+ const bool add_to_output,
+ tensor& output,
+ const tensor& data,
+ const tensor& filters
+ );
+
+ void operator() (
+ const bool add_to_output,
+ resizable_tensor& output,
+ const tensor& data,
+ const tensor& filters
+ );
+
+ void get_gradient_for_data (
+ const bool add_to_output,
+ const tensor& gradient_input,
+ const tensor& filters,
+ tensor& data_gradient
+ );
+
+ void get_gradient_for_filters (
+ const bool add_to_output,
+ const tensor& gradient_input,
+ const tensor& data,
+ tensor& filters_gradient
+ );
+
+ void setup(
+ const tensor& data,
+ const tensor& filters,
+ int stride_y,
+ int stride_x,
+ int padding_y,
+ int padding_x
+ );
+
+ private:
+
+ // These variables record the type of data given to the last call to setup().
+ int stride_y;
+ int stride_x;
+ int padding_y;
+ int padding_x;
+ long data_num_samples, data_k, data_nr, data_nc;
+ long filters_num_samples, filters_k, filters_nr, filters_nc;
+
+
+ void* filter_handle;
+ void* conv_handle;
+
+ // dimensions of the output tensor from operator()
+ int out_num_samples;
+ int out_k;
+ int out_nr;
+ int out_nc;
+
+ int forward_algo;
+ int backward_data_algo;
+ int backward_filters_algo;
+
+ size_t forward_workspace_size_in_bytes;
+ size_t backward_data_workspace_size_in_bytes;
+ size_t backward_filters_workspace_size_in_bytes;
+ std::shared_ptr<resizable_cuda_buffer> workspace;
+ cuda_data_void_ptr forward_workspace;
+ cuda_data_void_ptr backward_data_workspace;
+ cuda_data_void_ptr backward_filters_workspace;
+ };
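A rough usage sketch for the interface above, assuming a CUDA-enabled dlib build; the shapes and filter values are made up:

#include <dlib/dnn.h>
using namespace dlib;

int main()
{
    // 1 sample with 3 channels of 32x32, convolved with 16 filters of size 3x3x3.
    resizable_tensor data(1, 3, 32, 32), filters(16, 3, 3, 3), output;
    data = 1;
    filters = 0.1;

    cuda::tensor_conv conv;
    conv.setup(data, filters, /*stride_y*/1, /*stride_x*/1, /*padding_y*/1, /*padding_x*/1);
    conv(/*add_to_output*/false, output, data, filters);   // output is sized by this call
}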
+
+ // ------------------------------------------------------------------------------------
+
+ class pooling
+ {
+ public:
+
+ pooling(const pooling&) = delete;
+ pooling& operator=(const pooling&) = delete;
+
+ pooling (
+ );
+
+ ~pooling(
+ );
+
+ void clear(
+ );
+
+ void setup_max_pooling(
+ int window_height,
+ int window_width,
+ int stride_y,
+ int stride_x,
+ int padding_y,
+ int padding_x
+ );
+
+ void setup_avg_pooling(
+ int window_height,
+ int window_width,
+ int stride_y,
+ int stride_x,
+ int padding_y,
+ int padding_x
+ );
+
+ bool does_max_pooling(
+ ) const { return do_max_pooling; }
+
+ void operator() (
+ resizable_tensor& dest,
+ const tensor& src
+ );
+
+ void get_gradient(
+ const tensor& gradient_input,
+ const tensor& dest,
+ const tensor& src,
+ tensor& grad
+ );
+
+ private:
+
+ void setup(
+ int window_height,
+ int window_width,
+ int stride_y,
+ int stride_x,
+ int padding_y,
+ int padding_x,
+ int pooling_mode
+ );
+
+ void* handle;
+ int window_height;
+ int window_width;
+ int stride_y;
+ int stride_x;
+ int padding_y;
+ int padding_x;
+ bool do_max_pooling;
+ };
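And a similar sketch for the pooling object, again with made-up sizes; with these settings a 28x28 input becomes 14x14, per the formula checked in the .cpp file above:

#include <dlib/dnn.h>
using namespace dlib;

int main()
{
    resizable_tensor src(1, 1, 28, 28), dest;
    src = 1;

    cuda::pooling p;
    p.setup_max_pooling(/*window_height*/3, /*window_width*/3,
                        /*stride_y*/2, /*stride_x*/2,
                        /*padding_y*/1, /*padding_x*/1);
    p(dest, src);   // dest ends up 1x1x14x14 for these settings
}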
+
+ // ------------------------------------------------------------------------------------
+
+ void softmax (
+ tensor& dest,
+ const tensor& src
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest, src) == true
+ ensures
+ - Note that the softmax function is a vector valued function:
+ s(x) == exp(x)/sum(exp(x))
+ - Computes the softmax function on src and writes the results to dest. The
+ softmax is computed per spatial location across the different channels at
+ each location. That is, softmax() outputs a new tensor, #dest, where
+ each of the spatial locations in dest (i.e. image idx, row idx, and
+ column idx) contains the output of s() evaluated over the channel values
+ at each location.
+ - This function supports in-place operation, i.e. having
+ is_same_object(dest, src)==true
+ !*/
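To make the per-location behavior concrete, here is a small CPU-only restatement of the spec above, computing s() over the channel values at a single (sample, row, column) location. It illustrates the definition only, not the cuDNN code path:

#include <cmath>
#include <iostream>
#include <vector>

int main()
{
    // Channel values at one spatial location (made-up numbers).
    std::vector<double> x = {1.0, 2.0, 0.5};
    double sum = 0;
    for (double v : x)
        sum += std::exp(v);
    for (double v : x)
        std::cout << std::exp(v)/sum << " ";   // the outputs are positive and sum to 1
    std::cout << "\n";
}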
+
+ void softmax_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest,gradient_input) == true
+ - have_same_dimensions(dest,grad) == true
+ - is_same_object(grad, dest)==false
+ ensures
+            - We interpret dest as the output of softmax(dest,SRC) for some SRC tensor.
+              Then let f(SRC) == dot(gradient_input,dest). Then this function computes
+              the gradient of f() with respect to SRC and assigns it to grad.
+ - This function supports in-place operation, i.e. having
+ is_same_object(grad, gradient_input)==true
+ !*/
+
+ // ------------------------------------------------------------------------------------
+
+ void softmax_all (
+ tensor& dest,
+ const tensor& src
+ );
+
+ void softmax_all_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ );
+
+ // ------------------------------------------------------------------------------------
+
+ void sigmoid (
+ tensor& dest,
+ const tensor& src
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest, src) == true
+ ensures
+ - for all valid i:
+ - #dest.host()[i] == 1/(1+std::exp(-src.host()[i]))
+ - This function supports in-place operation, i.e. having
+ is_same_object(dest, src)==true
+ !*/
+
+ void sigmoid_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest,gradient_input) == true
+ - have_same_dimensions(dest,grad) == true
+ - is_same_object(grad,dest) == false
+ ensures
+ - Recalling that dest is the output of sigmoid(dest,SRC) for some SRC tensor,
+ let f(SRC) == dot(gradient_input,dest)
+ - Then this function computes the gradient of f() with respect to SRC and
+ assigns it to grad.
+ - This function supports in-place operation, i.e. having
+ is_same_object(grad, gradient_input)==true
+ !*/
+
+ // ------------------------------------------------------------------------------------
+
+ void relu (
+ tensor& dest,
+ const tensor& src
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest, src) == true
+ ensures
+ - for all valid i:
+ - #dest.host()[i] == std::max(0,src.host()[i])
+ - This function supports in-place operation, i.e. having
+ is_same_object(dest, src)==true
+ !*/
+
+ void relu_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest,gradient_input) == true
+ - have_same_dimensions(dest,grad) == true
+ - is_same_object(grad,dest) == false
+ ensures
+ - Recalling that dest is the output of relu(dest,SRC) for some SRC tensor,
+ let f(SRC) == dot(gradient_input,dest)
+ - Then this function computes the gradient of f() with respect to SRC and
+ assigns it to grad.
+ - This function supports in-place operation, i.e. having
+ is_same_object(grad, gradient_input)==true
+ !*/
+
+ // ------------------------------------------------------------------------------------
+
+ void tanh (
+ tensor& dest,
+ const tensor& src
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest, src) == true
+ ensures
+ - for all valid i:
+ - #dest.host()[i] == std::tanh(src.host()[i])
+ - This function supports in-place operation, i.e. having
+ is_same_object(dest, src)==true
+ !*/
+
+ void tanh_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest,gradient_input) == true
+ - have_same_dimensions(dest,grad) == true
+ - is_same_object(grad,dest) == false
+ ensures
+ - Recalling that dest is the output of tanh(dest,SRC) for some SRC tensor,
+ let f(SRC) == dot(gradient_input,dest)
+ - Then this function computes the gradient of f() with respect to SRC and
+ assigns it to grad.
+ - This function supports in-place operation, i.e. having
+ is_same_object(grad, gradient_input)==true
+ !*/
+
+
+
+ // ------------------------------------------------------------------------------------
+
+ }
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuDNN_H_
+
diff --git a/ml/dlib/dlib/dnn/curand_dlibapi.cpp b/ml/dlib/dlib/dnn/curand_dlibapi.cpp
new file mode 100644
index 000000000..67828e664
--- /dev/null
+++ b/ml/dlib/dlib/dnn/curand_dlibapi.cpp
@@ -0,0 +1,113 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuRAND_CPP_
+#define DLIB_DNN_CuRAND_CPP_
+
+#ifdef DLIB_USE_CUDA
+
+#include "curand_dlibapi.h"
+#include <curand.h>
+#include "../string.h"
+
+static const char* curand_get_error_string(curandStatus_t s)
+{
+ switch(s)
+ {
+ case CURAND_STATUS_NOT_INITIALIZED:
+ return "CUDA Runtime API initialization failed.";
+ case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+ return "The requested length must be a multiple of two.";
+ default:
+ return "A call to cuRAND failed";
+ }
+}
+
+// Check the return value of a call to the cuRAND runtime for an error condition.
+#define CHECK_CURAND(call) \
+do{ \
+ const curandStatus_t error = call; \
+ if (error != CURAND_STATUS_SUCCESS) \
+ { \
+ std::ostringstream sout; \
+ sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
+ sout << "code: " << error << ", reason: " << curand_get_error_string(error);\
+ throw dlib::curand_error(sout.str()); \
+ } \
+}while(false)
+
+namespace dlib
+{
+ namespace cuda
+ {
+
+ // ----------------------------------------------------------------------------------------
+
+ curand_generator::
+ curand_generator(
+ unsigned long long seed
+ ) : handle(nullptr)
+ {
+ curandGenerator_t gen;
+ CHECK_CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
+ handle = gen;
+
+ CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(gen, seed));
+ }
+
+ curand_generator::
+ ~curand_generator()
+ {
+ if (handle)
+ {
+ curandDestroyGenerator((curandGenerator_t)handle);
+ }
+ }
+
+ void curand_generator::
+ fill_gaussian (
+ tensor& data,
+ float mean,
+ float stddev
+ )
+ {
+ if (data.size() == 0)
+ return;
+
+ CHECK_CURAND(curandGenerateNormal((curandGenerator_t)handle,
+ data.device(),
+ data.size(),
+ mean,
+ stddev));
+ }
+
+ void curand_generator::
+ fill_uniform (
+ tensor& data
+ )
+ {
+ if (data.size() == 0)
+ return;
+
+ CHECK_CURAND(curandGenerateUniform((curandGenerator_t)handle, data.device(), data.size()));
+ }
+
+ void curand_generator::
+ fill (
+ cuda_data_ptr<unsigned int>& data
+ )
+ {
+ if (data.size() == 0)
+ return;
+
+ CHECK_CURAND(curandGenerate((curandGenerator_t)handle, data, data.size()));
+ }
+
+ // -----------------------------------------------------------------------------------
+
+ }
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuRAND_CPP_
+
diff --git a/ml/dlib/dlib/dnn/curand_dlibapi.h b/ml/dlib/dlib/dnn/curand_dlibapi.h
new file mode 100644
index 000000000..cd51fecee
--- /dev/null
+++ b/ml/dlib/dlib/dnn/curand_dlibapi.h
@@ -0,0 +1,75 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuRAND_H_
+#define DLIB_DNN_CuRAND_H_
+
+#ifdef DLIB_USE_CUDA
+
+#include "tensor.h"
+#include "cuda_errors.h"
+#include "cuda_data_ptr.h"
+
+namespace dlib
+{
+ namespace cuda
+ {
+
+ // -----------------------------------------------------------------------------------
+
+ class curand_generator
+ {
+ public:
+ // not copyable
+ curand_generator(const curand_generator&) = delete;
+ curand_generator& operator=(const curand_generator&) = delete;
+
+ curand_generator() : curand_generator(0) {}
+ curand_generator(unsigned long long seed);
+ ~curand_generator();
+
+ void fill (
+ cuda_data_ptr<unsigned int>& data
+ );
+ /*!
+ ensures
+ - Fills data with random 32-bit unsigned integers.
+ !*/
+
+ void fill_gaussian (
+ tensor& data,
+ float mean = 0,
+ float stddev = 1
+ );
+ /*!
+ requires
+ - data.size()%2 == 0
+ - stddev >= 0
+ ensures
+ - Fills data with random numbers drawn from a Gaussian distribution
+ with the given mean and standard deviation.
+ !*/
+
+ void fill_uniform (
+ tensor& data
+ );
+ /*!
+ ensures
+ - Fills data with uniform random numbers in the range (0.0, 1.0].
+ !*/
+
+ private:
+
+ void* handle;
+ };
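A short usage sketch, assuming a CUDA-enabled build. Note that the tensor passed to fill_gaussian() must hold an even number of floats, which is why the made-up size below is even:

#include <dlib/dnn.h>
using namespace dlib;

int main()
{
    cuda::curand_generator rnd(1234);

    resizable_tensor t(2, 3, 8, 8);   // 384 floats, an even count as fill_gaussian() requires
    rnd.fill_gaussian(t, 0, 1);       // draws from N(0,1)
    rnd.fill_uniform(t);              // overwrites with uniform draws in (0.0, 1.0]
}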
+
+ // -----------------------------------------------------------------------------------
+
+ }
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuRAND_H_
+
+
+
diff --git a/ml/dlib/dlib/dnn/cusolver_dlibapi.cu b/ml/dlib/dlib/dnn/cusolver_dlibapi.cu
new file mode 100644
index 000000000..942613134
--- /dev/null
+++ b/ml/dlib/dlib/dnn/cusolver_dlibapi.cu
@@ -0,0 +1,204 @@
+// Copyright (C) 2017 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuSOLVER_CU_
+#define DLIB_DNN_CuSOLVER_CU_
+
+#ifdef DLIB_USE_CUDA
+
+#include "cusolver_dlibapi.h"
+#include <cublas_v2.h>
+#include <cusolverDn.h>
+#include "cuda_utils.h"
+
+// ----------------------------------------------------------------------------------------
+
+static const char* cusolver_get_error_string(cusolverStatus_t s)
+{
+ switch(s)
+ {
+ case CUSOLVER_STATUS_NOT_INITIALIZED:
+ return "CUDA Runtime API initialization failed.";
+ case CUSOLVER_STATUS_ALLOC_FAILED:
+ return "CUDA Resources could not be allocated.";
+ default:
+ return "A call to cuSolver failed";
+ }
+}
+
+// Check the return value of a call to the cuSolver runtime for an error condition.
+#define CHECK_CUSOLVER(call) \
+do{ \
+ const cusolverStatus_t error = call; \
+ if (error != CUSOLVER_STATUS_SUCCESS) \
+ { \
+ std::ostringstream sout; \
+ sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
+ sout << "code: " << error << ", reason: " << cusolver_get_error_string(error);\
+ throw dlib::cusolver_error(sout.str()); \
+ } \
+}while(false)
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+namespace dlib
+{
+ namespace cuda
+ {
+
+ // -----------------------------------------------------------------------------------
+
+ class cusolver_context
+ {
+ public:
+ // not copyable
+ cusolver_context(const cusolver_context&) = delete;
+ cusolver_context& operator=(const cusolver_context&) = delete;
+
+ cusolver_context()
+ {
+ handles.resize(16);
+ }
+ ~cusolver_context()
+ {
+ for (auto h : handles)
+ {
+ if (h)
+ cusolverDnDestroy(h);
+ }
+ }
+
+ cusolverDnHandle_t get_handle (
+ )
+ {
+ int new_device_id;
+ CHECK_CUDA(cudaGetDevice(&new_device_id));
+ // make room for more devices if needed
+ if (new_device_id >= (long)handles.size())
+ handles.resize(new_device_id+16);
+
+ // If we don't have a handle already for this device then make one
+ if (!handles[new_device_id])
+ CHECK_CUSOLVER(cusolverDnCreate(&handles[new_device_id]));
+
+ // Finally, return the handle for the current device
+ return handles[new_device_id];
+ }
+
+ private:
+
+ std::vector<cusolverDnHandle_t> handles;
+ };
+
+ static cusolverDnHandle_t context()
+ {
+ thread_local cusolver_context c;
+ return c.get_handle();
+ }
+
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+ // ------------------------------------------------------------------------------------
+
+ __global__ void _cuda_set_to_identity_matrix(float* m, size_t nr)
+ {
+ for (auto j : grid_stride_range(0, nr*nr))
+ {
+ if (j%(nr+1) == 0)
+ m[j] = 1;
+ else
+ m[j] = 0;
+ }
+ }
+
+ void set_to_identity_matrix (
+ tensor& m
+ )
+ {
+ DLIB_CASSERT(m.size() == m.num_samples()*m.num_samples());
+ launch_kernel(_cuda_set_to_identity_matrix, max_jobs(m.size()), m.device(), m.num_samples());
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ inv::~inv()
+ {
+ sync_if_needed();
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void inv::
+ operator() (
+ const tensor& m_,
+ resizable_tensor& out
+ )
+ {
+ DLIB_CASSERT(m_.size() == m_.num_samples()*m_.num_samples(), "Input matrix must be square if you want to invert it.");
+ m = m_;
+
+ out.copy_size(m);
+ set_to_identity_matrix(out);
+
+ const int nc = m.num_samples();
+ int Lwork;
+ CHECK_CUSOLVER(cusolverDnSgetrf_bufferSize(context(), nc , nc, m.device(), nc, &Lwork));
+
+ if (Lwork > (int)workspace.size())
+ {
+ sync_if_needed();
+ workspace = cuda_data_ptr<float>(Lwork);
+ }
+ if (nc > (int)Ipiv.size())
+ {
+ sync_if_needed();
+ Ipiv = cuda_data_ptr<int>(nc);
+ }
+ if (info.size() != 1)
+ {
+ info = cuda_data_ptr<int>(1);
+ }
+
+ CHECK_CUSOLVER(cusolverDnSgetrf(context(), nc, nc, m.device(), nc, workspace, Ipiv, info));
+ CHECK_CUSOLVER(cusolverDnSgetrs(context(), CUBLAS_OP_N, nc, nc, m.device(), nc, Ipiv, out.device(), nc, info));
+ did_work_lately = true;
+ }
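In other words, rather than forming an explicit inverse directly, operator() LU-factorizes the input with cusolverDnSgetrf() and then uses cusolverDnSgetrs() to solve A*X = I against the identity matrix written into out, so the returned X is inv(A); the P, L, U factors from the factorization are reused for every column of the right-hand side.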
+
+ // ------------------------------------------------------------------------------------
+
+ int inv::
+ get_last_status(
+ )
+ {
+ std::vector<int> linfo;
+ memcpy(linfo, info);
+ if (linfo.size() != 0)
+ return linfo[0];
+ else
+ return 0;
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ void inv::
+ sync_if_needed()
+ {
+ if (did_work_lately)
+ {
+ did_work_lately = false;
+ // make sure we wait until any previous kernel launches have finished
+ // before we do something like deallocate the GPU memory.
+ cudaDeviceSynchronize();
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ }
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuSOLVER_CU_
+
+
diff --git a/ml/dlib/dlib/dnn/cusolver_dlibapi.h b/ml/dlib/dlib/dnn/cusolver_dlibapi.h
new file mode 100644
index 000000000..e5c77c151
--- /dev/null
+++ b/ml/dlib/dlib/dnn/cusolver_dlibapi.h
@@ -0,0 +1,75 @@
+// Copyright (C) 2017 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuSOLVER_H_
+#define DLIB_DNN_CuSOLVER_H_
+
+#ifdef DLIB_USE_CUDA
+
+#include "tensor.h"
+#include "cuda_errors.h"
+#include "cuda_data_ptr.h"
+#include "../noncopyable.h"
+
+namespace dlib
+{
+ namespace cuda
+ {
+
+ // -----------------------------------------------------------------------------------
+
+ class inv : noncopyable
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is a functor for doing matrix inversion on the GPU. The only
+ reason it's an object is to avoid the reallocation of some GPU memory
+ blocks if you want to do a bunch of matrix inversions in a row.
+ !*/
+
+ public:
+
+ inv() = default;
+ ~inv();
+
+ void operator() (
+ const tensor& m,
+ resizable_tensor& out
+ );
+ /*!
+ requires
+ - m.size() == m.num_samples()*m.num_samples()
+ (i.e. mat(m) must be a square matrix)
+ ensures
+ - out == inv(mat(m));
+ !*/
+
+ int get_last_status(
+ );
+ /*!
+ ensures
+ - returns 0 if the last matrix inversion was successful and != 0
+ otherwise.
+ !*/
+
+ private:
+
+ void sync_if_needed();
+
+ bool did_work_lately = false;
+ resizable_tensor m;
+ cuda_data_ptr<float> workspace;
+ cuda_data_ptr<int> Ipiv;
+ cuda_data_ptr<int> info;
+ };
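A minimal sketch of the functor in use, assuming a CUDA build with cuSolver available and that the header is reachable as <dlib/dnn/cusolver_dlibapi.h>; the 4x4 matrix below is made up (2*I, whose inverse is 0.5*I):

#include <dlib/dnn.h>
#include <dlib/dnn/cusolver_dlibapi.h>
#include <iostream>
using namespace dlib;

int main()
{
    // A 4x4 matrix stored so that m.size() == m.num_samples()*m.num_samples().
    resizable_tensor m(4, 4), out;
    m = 0;
    for (long i = 0; i < 4; ++i)
        m.host()[i*4 + i] = 2;        // m == 2*I

    cuda::inv invert;
    invert(m, out);                   // out should now be approximately 0.5*I
    if (invert.get_last_status() != 0)
        std::cout << "inversion failed\n";
}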
+
+ // ------------------------------------------------------------------------------------
+
+ }
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuSOLVER_H_
+
+
+
diff --git a/ml/dlib/dlib/dnn/gpu_data.cpp b/ml/dlib/dlib/dnn/gpu_data.cpp
new file mode 100644
index 000000000..6e7cec6be
--- /dev/null
+++ b/ml/dlib/dlib/dnn/gpu_data.cpp
@@ -0,0 +1,228 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_GPU_DaTA_CPP_
+#define DLIB_GPU_DaTA_CPP_
+
+// Only things that require CUDA are declared in this cpp file. Everything else is in the
+// gpu_data.h header so that it can operate as "header-only" code when using just the CPU.
+#ifdef DLIB_USE_CUDA
+
+#include "gpu_data.h"
+#include <iostream>
+#include "cuda_utils.h"
+#include <cstring>
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ void memcpy (
+ gpu_data& dest,
+ const gpu_data& src
+ )
+ {
+ DLIB_CASSERT(dest.size() == src.size());
+ if (src.size() == 0 || &dest == &src)
+ return;
+
+ memcpy(dest,0, src, 0, src.size());
+ }
+
+ void memcpy (
+ gpu_data& dest,
+ size_t dest_offset,
+ const gpu_data& src,
+ size_t src_offset,
+ size_t num
+ )
+ {
+ DLIB_CASSERT(dest_offset + num <= dest.size());
+ DLIB_CASSERT(src_offset + num <= src.size());
+ if (num == 0)
+ return;
+
+ // if there is aliasing
+ if (&dest == &src && std::max(dest_offset, src_offset) < std::min(dest_offset,src_offset)+num)
+ {
+ // if they perfectly alias each other then there is nothing to do
+ if (dest_offset == src_offset)
+ return;
+ else
+ std::memmove(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num);
+ }
+ else
+ {
+ // if we write to the entire thing then we can use device_write_only()
+ if (dest_offset == 0 && num == dest.size())
+ {
+ // copy the memory efficiently based on which copy is current in each object.
+ if (src.device_ready())
+ CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice));
+ else
+ CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice));
+ }
+ else
+ {
+ // copy the memory efficiently based on which copy is current in each object.
+ if (dest.device_ready() && src.device_ready())
+ CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice));
+ else if (!dest.device_ready() && src.device_ready())
+ CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToHost));
+ else if (dest.device_ready() && !src.device_ready())
+ CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice));
+ else
+ CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToHost));
+ }
+ }
+ }
+// ----------------------------------------------------------------------------------------
+
+ void gpu_data::
+ wait_for_transfer_to_finish() const
+ {
+ if (have_active_transfer)
+ {
+ CHECK_CUDA(cudaStreamSynchronize((cudaStream_t)cuda_stream.get()));
+ have_active_transfer = false;
+ // Check for errors. These calls to cudaGetLastError() are what help us find
+ // out if our kernel launches have been failing.
+ CHECK_CUDA(cudaGetLastError());
+ }
+ }
+
+ void gpu_data::
+ copy_to_device() const
+ {
+ // We want transfers to the device to always be concurrent with any device
+ // computation. So we use our non-default stream to do the transfer.
+ async_copy_to_device();
+ wait_for_transfer_to_finish();
+ }
+
+ void gpu_data::
+ copy_to_host() const
+ {
+ if (!host_current)
+ {
+ wait_for_transfer_to_finish();
+ CHECK_CUDA(cudaMemcpy(data_host.get(), data_device.get(), data_size*sizeof(float), cudaMemcpyDeviceToHost));
+ host_current = true;
+ // At this point we know our RAM block isn't in use because cudaMemcpy()
+ // implicitly syncs with the device.
+ device_in_use = false;
+ // Check for errors. These calls to cudaGetLastError() are what help us find
+ // out if our kernel launches have been failing.
+ CHECK_CUDA(cudaGetLastError());
+ }
+ }
+
+ void gpu_data::
+ async_copy_to_device() const
+ {
+ if (!device_current)
+ {
+ if (device_in_use)
+ {
+ // Wait for any possible CUDA kernels that might be using our memory block to
+ // complete before we overwrite the memory.
+ CHECK_CUDA(cudaStreamSynchronize(0));
+ device_in_use = false;
+ }
+ CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get()));
+ have_active_transfer = true;
+ device_current = true;
+ }
+ }
+
+ void gpu_data::
+ set_size(
+ size_t new_size
+ )
+ {
+ if (new_size == 0)
+ {
+ if (device_in_use)
+ {
+ // Wait for any possible CUDA kernels that might be using our memory block to
+ // complete before we free the memory.
+ CHECK_CUDA(cudaStreamSynchronize(0));
+ device_in_use = false;
+ }
+ wait_for_transfer_to_finish();
+ data_size = 0;
+ host_current = true;
+ device_current = true;
+ device_in_use = false;
+ data_host.reset();
+ data_device.reset();
+ }
+ else if (new_size != data_size)
+ {
+ if (device_in_use)
+ {
+ // Wait for any possible CUDA kernels that might be using our memory block to
+ // complete before we free the memory.
+ CHECK_CUDA(cudaStreamSynchronize(0));
+ device_in_use = false;
+ }
+ wait_for_transfer_to_finish();
+ data_size = new_size;
+ host_current = true;
+ device_current = true;
+ device_in_use = false;
+
+ try
+ {
+ CHECK_CUDA(cudaGetDevice(&the_device_id));
+
+ // free memory blocks before we allocate new ones.
+ data_host.reset();
+ data_device.reset();
+
+ void* data;
+ CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
+ // Note that we don't throw exceptions since the free calls are invariably
+ // called in destructors. They also shouldn't fail anyway unless someone
+ // is resetting the GPU card in the middle of their program.
+ data_host.reset((float*)data, [](float* ptr){
+ auto err = cudaFreeHost(ptr);
+ if(err!=cudaSuccess)
+ std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl;
+ });
+
+ CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float)));
+ data_device.reset((float*)data, [](float* ptr){
+ auto err = cudaFree(ptr);
+ if(err!=cudaSuccess)
+ std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
+ });
+
+ if (!cuda_stream)
+ {
+ cudaStream_t cstream;
+ CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking));
+ cuda_stream.reset(cstream, [](void* ptr){
+ auto err = cudaStreamDestroy((cudaStream_t)ptr);
+ if(err!=cudaSuccess)
+ std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl;
+ });
+ }
+
+ }
+ catch(...)
+ {
+ set_size(0);
+ throw;
+ }
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_GPU_DaTA_CPP_
+
diff --git a/ml/dlib/dlib/dnn/gpu_data.h b/ml/dlib/dlib/dnn/gpu_data.h
new file mode 100644
index 000000000..022a05f71
--- /dev/null
+++ b/ml/dlib/dlib/dnn/gpu_data.h
@@ -0,0 +1,266 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_GPU_DaTA_H_
+#define DLIB_GPU_DaTA_H_
+
+#include "gpu_data_abstract.h"
+#include <memory>
+#include <cstring>
+#include "cuda_errors.h"
+#include "../serialize.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class gpu_data
+ {
+ /*!
+ CONVENTION
+ - if (size() != 0) then
+ - data_host == a pointer to size() floats in CPU memory.
+ - if (data_device) then
+ - data_device == a pointer to size() floats in device memory.
+
+ - if (there might be an active async transfer from host to device) then
+ - have_active_transfer == true
+
+ - We use the host_current and device_current bools to keep track of which
+ copy of the data (or both) are most current. e.g. if the CPU has
+ modified the data and it hasn't been copied to the device yet then
+ host_current==true and device_current==false.
+
+ Similarly, we use device_in_use==true to indicate that device() has been
+ called and no operation to wait for all CUDA kernel completion has been
+ executed. So if device_in_use==true then there might be a CUDA kernel
+ executing that is using the device memory block contained in this object.
+
+ !*/
+ public:
+
+ gpu_data(
+ ) : data_size(0), host_current(true), device_current(true),have_active_transfer(false),device_in_use(false), the_device_id(0)
+ {
+ }
+
+ // Not copyable
+ gpu_data(const gpu_data&) = delete;
+ gpu_data& operator=(const gpu_data&) = delete;
+
+ // but is movable
+ gpu_data(gpu_data&& item) : gpu_data() { swap(item); }
+ gpu_data& operator=(gpu_data&& item) { swap(item); return *this; }
+
+ int device_id() const { return the_device_id; }
+
+#ifdef DLIB_USE_CUDA
+ void async_copy_to_device() const;
+ void set_size(size_t new_size);
+#else
+ // Note that calls to host() or device() will block until any async transfers are complete.
+ void async_copy_to_device() const{}
+
+ void set_size(size_t new_size)
+ {
+ if (new_size == 0)
+ {
+ data_size = 0;
+ host_current = true;
+ device_current = true;
+ device_in_use = false;
+ data_host.reset();
+ data_device.reset();
+ }
+ else if (new_size != data_size)
+ {
+ data_size = new_size;
+ host_current = true;
+ device_current = true;
+ device_in_use = false;
+ data_host.reset(new float[new_size], std::default_delete<float[]>());
+ data_device.reset();
+ }
+ }
+#endif
+
+ const float* host() const
+ {
+ copy_to_host();
+ return data_host.get();
+ }
+
+ float* host()
+ {
+ copy_to_host();
+ device_current = false;
+ return data_host.get();
+ }
+
+ float* host_write_only()
+ {
+ host_current = true;
+ device_current = false;
+ return data_host.get();
+ }
+
+ const float* device() const
+ {
+#ifndef DLIB_USE_CUDA
+ DLIB_CASSERT(false, "CUDA NOT ENABLED");
+#endif
+ copy_to_device();
+ device_in_use = true;
+ return data_device.get();
+ }
+
+ float* device()
+ {
+#ifndef DLIB_USE_CUDA
+ DLIB_CASSERT(false, "CUDA NOT ENABLED");
+#endif
+ copy_to_device();
+ host_current = false;
+ device_in_use = true;
+ return data_device.get();
+ }
+
+ float* device_write_only()
+ {
+#ifndef DLIB_USE_CUDA
+ DLIB_CASSERT(false, "CUDA NOT ENABLED");
+#endif
+ wait_for_transfer_to_finish();
+ host_current = false;
+ device_current = true;
+ device_in_use = true;
+ return data_device.get();
+ }
+
+ bool host_ready (
+ ) const { return host_current; }
+
+ bool device_ready (
+ ) const { return device_current && !have_active_transfer; }
+
+ size_t size() const { return data_size; }
+
+ void swap (gpu_data& item)
+ {
+ std::swap(data_size, item.data_size);
+ std::swap(host_current, item.host_current);
+ std::swap(device_current, item.device_current);
+ std::swap(have_active_transfer, item.have_active_transfer);
+ std::swap(data_host, item.data_host);
+ std::swap(data_device, item.data_device);
+ std::swap(cuda_stream, item.cuda_stream);
+ std::swap(the_device_id, item.the_device_id);
+ }
+
+ private:
+
+#ifdef DLIB_USE_CUDA
+ void copy_to_device() const;
+ void copy_to_host() const;
+ void wait_for_transfer_to_finish() const;
+#else
+ void copy_to_device() const{}
+ void copy_to_host() const{}
+ void wait_for_transfer_to_finish() const{}
+#endif
+
+
+ size_t data_size;
+ mutable bool host_current;
+ mutable bool device_current;
+ mutable bool have_active_transfer;
+ mutable bool device_in_use;
+
+ std::shared_ptr<float> data_host;
+ std::shared_ptr<float> data_device;
+ std::shared_ptr<void> cuda_stream;
+ int the_device_id;
+ };
+
+ inline void serialize(const gpu_data& item, std::ostream& out)
+ {
+ int version = 1;
+ serialize(version, out);
+ serialize(item.size(), out);
+ auto data = item.host();
+ for (size_t i = 0; i < item.size(); ++i)
+ serialize(data[i], out);
+ }
+
+ inline void deserialize(gpu_data& item, std::istream& in)
+ {
+ int version;
+ deserialize(version, in);
+ if (version != 1)
+ throw serialization_error("Unexpected version found while deserializing dlib::gpu_data.");
+ size_t s;
+ deserialize(s, in);
+ item.set_size(s);
+ auto data = item.host();
+ for (size_t i = 0; i < item.size(); ++i)
+ deserialize(data[i], in);
+ }
+
+#ifdef DLIB_USE_CUDA
+ void memcpy (gpu_data& dest, const gpu_data& src);
+
+ void memcpy (
+ gpu_data& dest,
+ size_t dest_offset,
+ const gpu_data& src,
+ size_t src_offset,
+ size_t num
+ );
+
+#else
+
+ inline void memcpy (gpu_data& dest, const gpu_data& src)
+ {
+ DLIB_CASSERT(dest.size() == src.size());
+ if (src.size() == 0 || &dest == &src)
+ return;
+ std::memcpy(dest.host_write_only(), src.host(), sizeof(float)*src.size());
+ }
+
+ inline void memcpy (
+ gpu_data& dest,
+ size_t dest_offset,
+ const gpu_data& src,
+ size_t src_offset,
+ size_t num
+ )
+ {
+ DLIB_CASSERT(dest_offset + num <= dest.size());
+ DLIB_CASSERT(src_offset + num <= src.size());
+ if (num == 0)
+ return;
+ if (&dest == &src && std::max(dest_offset, src_offset) < std::min(dest_offset,src_offset)+num)
+ {
+ // if they perfectly alias each other then there is nothing to do
+ if (dest_offset == src_offset)
+ return;
+ else
+ std::memmove(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num);
+ }
+ else
+ {
+ // if we write to the entire thing then we can use host_write_only()
+ if (dest_offset == 0 && num == dest.size())
+ std::memcpy(dest.host_write_only(), src.host()+src_offset, sizeof(float)*num);
+ else
+ std::memcpy(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num);
+ }
+ }
+#endif
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_GPU_DaTA_H_
+
diff --git a/ml/dlib/dlib/dnn/gpu_data_abstract.h b/ml/dlib/dlib/dnn/gpu_data_abstract.h
new file mode 100644
index 000000000..f2423dee1
--- /dev/null
+++ b/ml/dlib/dlib/dnn/gpu_data_abstract.h
@@ -0,0 +1,266 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_GPU_DaTA_ABSTRACT_H_
+#ifdef DLIB_GPU_DaTA_ABSTRACT_H_
+
+#include "cuda_errors.h"
+#include "../serialize.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class gpu_data
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object is a block of size() floats, all stored contiguously in memory.
+ Importantly, it keeps two copies of the floats, one on the host CPU side
+ and another on the GPU device side. It automatically performs the necessary
+ host/device transfers to keep these two copies of the data in sync.
+
+ All transfers to the device happen asynchronously with respect to the
+ default CUDA stream so that CUDA kernel computations can overlap with data
+ transfers. However, any transfers from the device to the host happen
+ synchronously in the default CUDA stream. Therefore, you should perform
+ all your CUDA kernel launches on the default stream so that transfers back
+ to the host do not happen before the relevant computations have completed.
+
+ If DLIB_USE_CUDA is not #defined then this object will not use CUDA at all.
+ Instead, it will simply store one host side memory block of floats.
+
+ THREAD SAFETY
+ Instances of this object are not thread-safe. So don't touch one from
+ multiple threads at the same time.
+ !*/
+ public:
+
+ gpu_data(
+ );
+ /*!
+ ensures
+ - #size() == 0
+ - #host() == nullptr
+ - #device() == nullptr
+ - #host_ready() == true
+ - #device_ready() == true
+ - #device_id() == 0
+ !*/
+
+ // This object is not copyable, however, it is movable.
+ gpu_data(const gpu_data&) = delete;
+ gpu_data& operator=(const gpu_data&) = delete;
+ gpu_data(gpu_data&& item);
+ gpu_data& operator=(gpu_data&& item);
+
+ int device_id(
+ ) const;
+ /*!
+ ensures
+ - returns the ID of the CUDA device that allocated this memory. I.e. the
+ number returned by cudaGetDevice() when the memory was allocated.
+ - If CUDA is not being used then this function always returns 0.
+ !*/
+
+ void async_copy_to_device(
+ );
+ /*!
+ ensures
+ - if (!device_ready()) then
+ - Begins asynchronously copying host data to the device once it is safe
+ to do so. I.e. This function will wait until any previously
+ scheduled CUDA kernels, which are using the device() memory block,
+ have completed before transferring the new data to the device.
+ - A call to device() that happens before the transfer completes will
+ block until the transfer is complete. That is, it is safe to call
+ async_copy_to_device() and then immediately call device().
+ !*/
+
+ void set_size(
+ size_t new_size
+ );
+ /*!
+ ensures
+ - #size() == new_size
+ !*/
+
+ bool host_ready (
+ ) const;
+ /*!
+ ensures
+ - returns true if and only if the host's copy of the data is current. The
+ host's data is current if there aren't any modifications to the data
+ which were made on the device side that have yet to be copied to the
+ host.
+ !*/
+
+ bool device_ready (
+ ) const;
+ /*!
+ ensures
+ - returns true if and only if the device's copy of the data is current.
+ The device's data is current if there aren't any modifications to the
+ data which were made on the host side that have yet to be copied to the
+ device.
+ !*/
+
+ const float* host(
+ ) const;
+ /*!
+ ensures
+ - returns a pointer to the host memory block of size() contiguous float
+ values or nullptr if size()==0.
+ - if (!host_ready()) then
+ - copies the data from the device to the host, while this is happening
+ the call to host() blocks.
+ - #host_ready() == true
+ !*/
+
+ float* host(
+ );
+ /*!
+ ensures
+ - returns a pointer to the host memory block of size() contiguous float
+ values or nullptr if size()==0.
+ - if (!host_ready()) then
+ - copies the data from the device to the host, while this is happening
+ the call to host() blocks.
+ - #host_ready() == true
+ - #device_ready() == false
+ I.e. Marks the device side data as out of date so that the next call to
+ device() will perform a host to device transfer. If you want to begin
+ the transfer immediately then you can call async_copy_to_device() after
+ calling host().
+ !*/
+
+ float* host_write_only(
+ );
+ /*!
+ ensures
+ - This function returns the same pointer as host(), except that it never
+ performs a device to host memory copy. Instead, it immediately marks the
+ device side data as out of date, effectively discarding it. Therefore,
+ the values in the data pointed to by host_write_only() are undefined and
+ you should only call host_write_only() if you are going to assign to
+ every memory location in the returned memory block.
+ - #host_ready() == true
+ - #device_ready() == false
+ !*/
+
+ const float* device(
+ ) const;
+ /*!
+ requires
+ - DLIB_USE_CUDA is #defined
+ ensures
+ - returns a pointer to the device memory block of size() contiguous float
+ values or nullptr if size()==0.
+ - if (!device_ready()) then
+ - copies the data from the host to the device, while this is happening
+ the call to device() blocks.
+ - #device_ready() == true
+ !*/
+
+ float* device(
+ );
+ /*!
+ requires
+ - DLIB_USE_CUDA is #defined
+ ensures
+ - returns a pointer to the device memory block of size() contiguous float
+ values or nullptr if size()==0.
+ - if (!device_ready()) then
+ - copies the data from the host to the device, while this is happening
+ the call to device() blocks.
+ - #host_ready() == false
+ - #device_ready() == true
+ !*/
+
+ float* device_write_only(
+ );
+ /*!
+ requires
+ - DLIB_USE_CUDA is #defined
+ ensures
+ - This function returns the same pointer as device(), except that it never
+ performs a host to device memory copy. Instead, it immediately marks the
+ host side data as out of date, effectively discarding it. Therefore, the
+ values in the data pointed to by device_write_only() are undefined and
+ you should only call device_write_only() if you are going to assign to
+ every memory location in the returned memory block.
+ - #host_ready() == false
+ - #device_ready() == true
+ !*/
+
+
+ size_t size(
+ ) const;
+ /*!
+ ensures
+ - returns the number of floats contained in this object.
+ !*/
+
+ void swap (
+ gpu_data& item
+ );
+ /*!
+ ensures
+ - swaps the state of *this and item
+ !*/
+
+ };
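A small sketch of the typical round trip implied by the contract above. On a CPU-only build everything stays on the host; on a CUDA build the host()/device() calls trigger the transfers described in the spec:

#include <dlib/dnn.h>
#include <iostream>
using namespace dlib;

int main()
{
    gpu_data d;
    d.set_size(100);

    // Fill on the host; host_write_only() skips any device-to-host copy and
    // marks the device copy as out of date.
    float* h = d.host_write_only();
    for (size_t i = 0; i < d.size(); ++i)
        h[i] = static_cast<float>(i);

    // On a CUDA build, calling d.device() here would copy the data to the GPU;
    // reading through host() again is fine because host_ready() is still true.
    std::cout << d.host()[99] << "\n";   // prints 99
}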
+
+ void serialize(const gpu_data& item, std::ostream& out);
+ void deserialize(gpu_data& item, std::istream& in);
+ /*!
+ provides serialization support
+ !*/
+
+ void memcpy (
+ gpu_data& dest,
+ const gpu_data& src
+ );
+ /*!
+ requires
+ - dest.size() == src.size()
+ ensures
+ - Copies the data in src to dest. If the device data is current (i.e.
+ device_ready()==true) on both src and dest then the copy will happen entirely
+ on the device side.
+ - It doesn't matter what GPU device is selected by cudaSetDevice(). You can
+ always copy gpu_data objects to and from each other regardless.
+ - This function blocks until the copy has completed.
+ !*/
+
+ void memcpy (
+ gpu_data& dest,
+ size_t dest_offset,
+ const gpu_data& src,
+ size_t src_offset,
+ size_t num
+ );
+ /*!
+ requires
+ - dest_offset + num <= dest.size()
+ - src_offset + num <= src.size()
+ ensures
+ - Copies the data in src to dest, but only copies data in the range
+ [src.host()+src_offset, src.host()+src_offset+num) to
+ [dest.host()+dest_offset, dest.host()+dest_offset+num). Therefore, it is
+ just like the above memcpy() except that you can specify some subset of data
+ in a gpu_data object to be copied.
+ - Like the above version of memcpy(), the copy will happen in the most
+ efficient way, automatically using the appropriate type of host/device
+ transfers based on where data is currently resident.
+ - It doesn't matter what GPU device is selected by cudaSetDevice(). You can
+ always copy gpu_data objects to and from each other regardless.
+ - This function blocks until the copy has completed.
+ !*/
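For example, the ranged overload can copy a window of one buffer into another (the offsets and sizes below are made up):

#include <dlib/dnn.h>
using namespace dlib;

int main()
{
    gpu_data a, b;
    a.set_size(10);
    b.set_size(10);
    for (size_t i = 0; i < 10; ++i)
    {
        a.host()[i] = static_cast<float>(i);
        b.host()[i] = 0;
    }

    // Copy a[2..6] into b[0..4]; the remaining 5 elements of b are untouched.
    memcpy(b, 0, a, 2, 5);
}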
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_GPU_DaTA_ABSTRACT_H_
+
diff --git a/ml/dlib/dlib/dnn/input.h b/ml/dlib/dlib/dnn/input.h
new file mode 100644
index 000000000..3b5c954e6
--- /dev/null
+++ b/ml/dlib/dlib/dnn/input.h
@@ -0,0 +1,808 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_INPUT_H_
+#define DLIB_DNn_INPUT_H_
+
+#include "input_abstract.h"
+#include "../matrix.h"
+#include "../array2d.h"
+#include "../pixel.h"
+#include "../image_processing.h"
+#include <sstream>
+#include <array>
+#include "tensor_tools.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename T>
+ class input
+ {
+ const static bool always_false = sizeof(T)!=sizeof(T);
+ static_assert(always_false, "Unsupported type given to input<>. input<> only supports "
+ "dlib::matrix and dlib::array2d objects.");
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <size_t NR, size_t NC=NR>
+ class input_rgb_image_sized;
+
+ class input_rgb_image
+ {
+ public:
+ typedef matrix<rgb_pixel> input_type;
+
+ input_rgb_image (
+ ) :
+ avg_red(122.782),
+ avg_green(117.001),
+ avg_blue(104.298)
+ {
+ }
+
+ input_rgb_image (
+ float avg_red_,
+ float avg_green_,
+ float avg_blue_
+ ) : avg_red(avg_red_), avg_green(avg_green_), avg_blue(avg_blue_)
+ {}
+
+ template <size_t NR, size_t NC>
+ inline input_rgb_image (
+ const input_rgb_image_sized<NR,NC>& item
+ );
+
+ float get_avg_red() const { return avg_red; }
+ float get_avg_green() const { return avg_green; }
+ float get_avg_blue() const { return avg_blue; }
+
+ bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); }
+ drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; }
+ drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; }
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const
+ {
+ DLIB_CASSERT(std::distance(ibegin,iend) > 0);
+ const auto nr = ibegin->nr();
+ const auto nc = ibegin->nc();
+ // make sure all the input matrices have the same dimensions
+ for (auto i = ibegin; i != iend; ++i)
+ {
+ DLIB_CASSERT(i->nr()==nr && i->nc()==nc,
+ "\t input_rgb_image::to_tensor()"
+ << "\n\t All matrices given to to_tensor() must have the same dimensions."
+ << "\n\t nr: " << nr
+ << "\n\t nc: " << nc
+ << "\n\t i->nr(): " << i->nr()
+ << "\n\t i->nc(): " << i->nc()
+ );
+ }
+
+
+ // initialize data to the right size to contain the stuff in the iterator range.
+ data.set_size(std::distance(ibegin,iend), 3, nr, nc);
+
+
+ const size_t offset = nr*nc;
+ auto ptr = data.host();
+ for (auto i = ibegin; i != iend; ++i)
+ {
+ for (long r = 0; r < nr; ++r)
+ {
+ for (long c = 0; c < nc; ++c)
+ {
+ rgb_pixel temp = (*i)(r,c);
+ auto p = ptr++;
+ *p = (temp.red-avg_red)/256.0;
+ p += offset;
+ *p = (temp.green-avg_green)/256.0;
+ p += offset;
+ *p = (temp.blue-avg_blue)/256.0;
+ p += offset;
+ }
+ }
+ ptr += offset*(data.k()-1);
+ }
+
+ }
+
+ friend void serialize(const input_rgb_image& item, std::ostream& out)
+ {
+ serialize("input_rgb_image", out);
+ serialize(item.avg_red, out);
+ serialize(item.avg_green, out);
+ serialize(item.avg_blue, out);
+ }
+
+ friend void deserialize(input_rgb_image& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "input_rgb_image" && version != "input_rgb_image_sized")
+ throw serialization_error("Unexpected version found while deserializing dlib::input_rgb_image.");
+ deserialize(item.avg_red, in);
+ deserialize(item.avg_green, in);
+ deserialize(item.avg_blue, in);
+
+ // read and discard the sizes if this was really a sized input layer.
+ if (version == "input_rgb_image_sized")
+ {
+ size_t nr, nc;
+ deserialize(nr, in);
+ deserialize(nc, in);
+ }
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const input_rgb_image& item)
+ {
+ out << "input_rgb_image("<<item.avg_red<<","<<item.avg_green<<","<<item.avg_blue<<")";
+ return out;
+ }
+
+ friend void to_xml(const input_rgb_image& item, std::ostream& out)
+ {
+ out << "<input_rgb_image r='"<<item.avg_red<<"' g='"<<item.avg_green<<"' b='"<<item.avg_blue<<"'/>";
+ }
+
+ private:
+ float avg_red;
+ float avg_green;
+ float avg_blue;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <size_t NR, size_t NC>
+ class input_rgb_image_sized
+ {
+ public:
+ static_assert(NR != 0 && NC != 0, "The input image can't be empty.");
+
+ typedef matrix<rgb_pixel> input_type;
+
+ input_rgb_image_sized (
+ ) :
+ avg_red(122.782),
+ avg_green(117.001),
+ avg_blue(104.298)
+ {
+ }
+
+ input_rgb_image_sized (
+ const input_rgb_image& item
+ ) : avg_red(item.get_avg_red()),
+ avg_green(item.get_avg_green()),
+ avg_blue(item.get_avg_blue())
+ {}
+
+ input_rgb_image_sized (
+ float avg_red_,
+ float avg_green_,
+ float avg_blue_
+ ) : avg_red(avg_red_), avg_green(avg_green_), avg_blue(avg_blue_)
+ {}
+
+ float get_avg_red() const { return avg_red; }
+ float get_avg_green() const { return avg_green; }
+ float get_avg_blue() const { return avg_blue; }
+
+ bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); }
+ drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; }
+ drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; }
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const
+ {
+ DLIB_CASSERT(std::distance(ibegin,iend) > 0);
+ // make sure all input images have the correct size
+ for (auto i = ibegin; i != iend; ++i)
+ {
+ DLIB_CASSERT(i->nr()==NR && i->nc()==NC,
+ "\t input_rgb_image_sized::to_tensor()"
+ << "\n\t All input images must have "<<NR<<" rows and "<<NC<< " columns, but we got one with "<<i->nr()<<" rows and "<<i->nc()<<" columns."
+ );
+ }
+
+
+ // initialize data to the right size to contain the stuff in the iterator range.
+ data.set_size(std::distance(ibegin,iend), 3, NR, NC);
+
+
+ const size_t offset = NR*NC;
+ auto ptr = data.host();
+ for (auto i = ibegin; i != iend; ++i)
+ {
+ for (size_t r = 0; r < NR; ++r)
+ {
+ for (size_t c = 0; c < NC; ++c)
+ {
+ rgb_pixel temp = (*i)(r,c);
+ auto p = ptr++;
+ *p = (temp.red-avg_red)/256.0;
+ p += offset;
+ *p = (temp.green-avg_green)/256.0;
+ p += offset;
+ *p = (temp.blue-avg_blue)/256.0;
+ p += offset;
+ }
+ }
+ ptr += offset*(data.k()-1);
+ }
+
+ }
+
+ friend void serialize(const input_rgb_image_sized& item, std::ostream& out)
+ {
+ serialize("input_rgb_image_sized", out);
+ serialize(item.avg_red, out);
+ serialize(item.avg_green, out);
+ serialize(item.avg_blue, out);
+ serialize(NR, out);
+ serialize(NC, out);
+ }
+
+ friend void deserialize(input_rgb_image_sized& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "input_rgb_image_sized")
+ throw serialization_error("Unexpected version found while deserializing dlib::input_rgb_image_sized.");
+ deserialize(item.avg_red, in);
+ deserialize(item.avg_green, in);
+ deserialize(item.avg_blue, in);
+ size_t nr, nc;
+ deserialize(nr, in);
+ deserialize(nc, in);
+ if (nr != NR || nc != NC)
+ {
+ std::ostringstream sout;
+ sout << "Wrong image dimensions found while deserializing dlib::input_rgb_image_sized.\n";
+ sout << "Expected "<<NR<<" rows and "<<NC<< " columns, but found "<<nr<<" rows and "<<nc<<" columns.";
+ throw serialization_error(sout.str());
+ }
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const input_rgb_image_sized& item)
+ {
+ out << "input_rgb_image_sized("<<item.avg_red<<","<<item.avg_green<<","<<item.avg_blue<<") nr="<<NR<<" nc="<<NC;
+ return out;
+ }
+
+ friend void to_xml(const input_rgb_image_sized& item, std::ostream& out)
+ {
+ out << "<input_rgb_image_sized r='"<<item.avg_red<<"' g='"<<item.avg_green<<"' b='"<<item.avg_blue<<"' nr='"<<NR<<"' nc='"<<NC<<"'/>";
+ }
+
+ private:
+ float avg_red;
+ float avg_green;
+ float avg_blue;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <size_t NR, size_t NC>
+ input_rgb_image::
+ input_rgb_image (
+ const input_rgb_image_sized<NR,NC>& item
+ ) : avg_red(item.get_avg_red()),
+ avg_green(item.get_avg_green()),
+ avg_blue(item.get_avg_blue())
+ {}
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename T, long NR, long NC, typename MM, typename L>
+ class input<matrix<T,NR,NC,MM,L>>
+ {
+ public:
+ typedef matrix<T,NR,NC,MM,L> input_type;
+
+ input() {}
+ input(const input&) {}
+
+ template <typename mm>
+ input(const input<array2d<T,mm>>&) {}
+
+ bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); }
+ drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; }
+ drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; }
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const
+ {
+ DLIB_CASSERT(std::distance(ibegin,iend) > 0);
+ const auto nr = ibegin->nr();
+ const auto nc = ibegin->nc();
+ // make sure all the input matrices have the same dimensions
+ for (auto i = ibegin; i != iend; ++i)
+ {
+ DLIB_CASSERT(i->nr()==nr && i->nc()==nc,
+ "\t input::to_tensor()"
+ << "\n\t All matrices given to to_tensor() must have the same dimensions."
+ << "\n\t nr: " << nr
+ << "\n\t nc: " << nc
+ << "\n\t i->nr(): " << i->nr()
+ << "\n\t i->nc(): " << i->nc()
+ );
+ }
+
+
+ // initialize data to the right size to contain the stuff in the iterator range.
+ data.set_size(std::distance(ibegin,iend), pixel_traits<T>::num, nr, nc);
+
+ typedef typename pixel_traits<T>::basic_pixel_type bptype;
+
+ const size_t offset = nr*nc;
+ auto ptr = data.host();
+ for (auto i = ibegin; i != iend; ++i)
+ {
+ for (long r = 0; r < nr; ++r)
+ {
+ for (long c = 0; c < nc; ++c)
+ {
+ auto temp = pixel_to_vector<float>((*i)(r,c));
+ auto p = ptr++;
+ for (long j = 0; j < temp.size(); ++j)
+ {
+ if (is_same_type<bptype,unsigned char>::value)
+ *p = temp(j)/256.0;
+ else
+ *p = temp(j);
+ p += offset;
+ }
+ }
+ }
+ ptr += offset*(data.k()-1);
+ }
+
+ }
+
+ friend void serialize(const input& /*item*/, std::ostream& out)
+ {
+ serialize("input<matrix>", out);
+ }
+
+ friend void deserialize(input& /*item*/, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "input<matrix>")
+ throw serialization_error("Unexpected version found while deserializing dlib::input.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const input& /*item*/)
+ {
+ out << "input<matrix>";
+ return out;
+ }
+
+ friend void to_xml(const input& /*item*/, std::ostream& out)
+ {
+ out << "<input/>";
+ }
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename T, long NR, long NC, typename MM, typename L, size_t K>
+ class input<std::array<matrix<T,NR,NC,MM,L>,K>>
+ {
+ public:
+ typedef std::array<matrix<T,NR,NC,MM,L>,K> input_type;
+
+ input() {}
+ input(const input&) {}
+
+ bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); }
+ drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; }
+ drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; }
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const
+ {
+ DLIB_CASSERT(std::distance(ibegin,iend) > 0);
+ DLIB_CASSERT(ibegin->size() != 0, "When using std::array<matrix> inputs you can't give 0 sized arrays.");
+ const auto nr = (*ibegin)[0].nr();
+ const auto nc = (*ibegin)[0].nc();
+ // make sure all the input matrices have the same dimensions
+ for (auto i = ibegin; i != iend; ++i)
+ {
+ for (size_t k = 0; k < K; ++k)
+ {
+ const auto& arr = *i;
+ DLIB_CASSERT(arr[k].nr()==nr && arr[k].nc()==nc,
+ "\t input::to_tensor()"
+ << "\n\t When using std::array<matrix> as input, all matrices in a batch must have the same dimensions."
+ << "\n\t nr: " << nr
+ << "\n\t nc: " << nc
+ << "\n\t k: " << k
+ << "\n\t arr[k].nr(): " << arr[k].nr()
+ << "\n\t arr[k].nc(): " << arr[k].nc()
+ );
+ }
+ }
+
+
+ // initialize data to the right size to contain the stuff in the iterator range.
+ data.set_size(std::distance(ibegin,iend), K, nr, nc);
+
+ auto ptr = data.host();
+ for (auto i = ibegin; i != iend; ++i)
+ {
+ for (size_t k = 0; k < K; ++k)
+ {
+ for (long r = 0; r < nr; ++r)
+ {
+ for (long c = 0; c < nc; ++c)
+ {
+ if (is_same_type<T,unsigned char>::value)
+ *ptr++ = (*i)[k](r,c)/256.0;
+ else
+ *ptr++ = (*i)[k](r,c);
+ }
+ }
+ }
+ }
+
+ }
+
+ friend void serialize(const input& /*item*/, std::ostream& out)
+ {
+ serialize("input<array<matrix>>", out);
+ }
+
+ friend void deserialize(input& /*item*/, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "input<array<matrix>>")
+ throw serialization_error("Unexpected version found while deserializing dlib::input<array<matrix>>.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const input& /*item*/)
+ {
+ out << "input<array<matrix>>";
+ return out;
+ }
+
+ friend void to_xml(const input& /*item*/, std::ostream& out)
+ {
+ out << "<input/>";
+ }
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename T, typename MM>
+ class input<array2d<T,MM>>
+ {
+ public:
+ typedef array2d<T,MM> input_type;
+
+ input() {}
+ input(const input&) {}
+
+ template <long NR, long NC, typename mm, typename L>
+ input(const input<matrix<T,NR,NC,mm,L>>&) {}
+
+ bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); }
+ drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; }
+ drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; }
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const
+ {
+ DLIB_CASSERT(std::distance(ibegin,iend) > 0);
+ const auto nr = ibegin->nr();
+ const auto nc = ibegin->nc();
+ // make sure all the input matrices have the same dimensions
+ for (auto i = ibegin; i != iend; ++i)
+ {
+ DLIB_CASSERT(i->nr()==nr && i->nc()==nc,
+ "\t input::to_tensor()"
+ << "\n\t All array2d objects given to to_tensor() must have the same dimensions."
+ << "\n\t nr: " << nr
+ << "\n\t nc: " << nc
+ << "\n\t i->nr(): " << i->nr()
+ << "\n\t i->nc(): " << i->nc()
+ );
+ }
+
+
+ // initialize data to the right size to contain the stuff in the iterator range.
+ data.set_size(std::distance(ibegin,iend), pixel_traits<T>::num, nr, nc);
+ typedef typename pixel_traits<T>::basic_pixel_type bptype;
+
+ const size_t offset = nr*nc;
+ auto ptr = data.host();
+ for (auto i = ibegin; i != iend; ++i)
+ {
+ for (long r = 0; r < nr; ++r)
+ {
+ for (long c = 0; c < nc; ++c)
+ {
+ auto temp = pixel_to_vector<float>((*i)[r][c]);
+ auto p = ptr++;
+ for (long j = 0; j < temp.size(); ++j)
+ {
+ if (is_same_type<bptype,unsigned char>::value)
+ *p = temp(j)/256.0;
+ else
+ *p = temp(j);
+ p += offset;
+ }
+ }
+ }
+ ptr += offset*(data.k()-1);
+ }
+
+ }
+
+ friend void serialize(const input& /*item*/, std::ostream& out)
+ {
+ serialize("input<array2d>", out);
+ }
+
+ friend void deserialize(input& /*item*/, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "input<array2d>")
+ throw serialization_error("Unexpected version found while deserializing dlib::input.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const input& /*item*/)
+ {
+ out << "input<array2d>";
+ return out;
+ }
+
+ friend void to_xml(const input& /*item*/, std::ostream& out)
+ {
+ out << "<input/>";
+ }
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename PYRAMID_TYPE>
+ class input_rgb_image_pyramid
+ {
+ public:
+ typedef matrix<rgb_pixel> input_type;
+ typedef PYRAMID_TYPE pyramid_type;
+
+ input_rgb_image_pyramid (
+ ) :
+ avg_red(122.782),
+ avg_green(117.001),
+ avg_blue(104.298)
+ {
+ }
+
+ input_rgb_image_pyramid (
+ float avg_red_,
+ float avg_green_,
+ float avg_blue_
+ ) : avg_red(avg_red_), avg_green(avg_green_), avg_blue(avg_blue_)
+ {}
+
+ float get_avg_red() const { return avg_red; }
+ float get_avg_green() const { return avg_green; }
+ float get_avg_blue() const { return avg_blue; }
+
+ unsigned long get_pyramid_padding () const { return pyramid_padding; }
+ void set_pyramid_padding (unsigned long value) { pyramid_padding = value; }
+
+ unsigned long get_pyramid_outer_padding () const { return pyramid_outer_padding; }
+ void set_pyramid_outer_padding (unsigned long value) { pyramid_outer_padding = value; }
+
+ bool image_contained_point (
+ const tensor& data,
+ const point& p
+ ) const
+ {
+ auto&& rects = any_cast<std::vector<rectangle>>(data.annotation());
+ DLIB_CASSERT(rects.size() > 0);
+ return rects[0].contains(p+rects[0].tl_corner());
+ }
+
+ drectangle tensor_space_to_image_space (
+ const tensor& data,
+ drectangle r
+ ) const
+ {
+ auto&& rects = any_cast<std::vector<rectangle>>(data.annotation());
+ return tiled_pyramid_to_image<pyramid_type>(rects, r);
+ }
+
+ drectangle image_space_to_tensor_space (
+ const tensor& data,
+ double scale,
+ drectangle r
+ ) const
+ {
+ DLIB_CASSERT(0 < scale && scale <= 1 , "scale: "<< scale);
+ auto&& rects = any_cast<std::vector<rectangle>>(data.annotation());
+ return image_to_tiled_pyramid<pyramid_type>(rects, scale, r);
+ }
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const
+ {
+ DLIB_CASSERT(std::distance(ibegin,iend) > 0);
+ auto nr = ibegin->nr();
+ auto nc = ibegin->nc();
+ // make sure all the input matrices have the same dimensions
+ for (auto i = ibegin; i != iend; ++i)
+ {
+ DLIB_CASSERT(i->nr()==nr && i->nc()==nc,
+ "\t input_rgb_image_pyramid::to_tensor()"
+ << "\n\t All matrices given to to_tensor() must have the same dimensions."
+ << "\n\t nr: " << nr
+ << "\n\t nc: " << nc
+ << "\n\t i->nr(): " << i->nr()
+ << "\n\t i->nc(): " << i->nc()
+ );
+ }
+
+ long NR, NC;
+ pyramid_type pyr;
+ auto& rects = data.annotation().get<std::vector<rectangle>>();
+ impl::compute_tiled_image_pyramid_details(pyr, nr, nc, pyramid_padding, pyramid_outer_padding, rects, NR, NC);
+
+ // initialize data to the right size to contain the stuff in the iterator range.
+ data.set_size(std::distance(ibegin,iend), 3, NR, NC);
+
+ // We need to zero the image before doing the pyramid, since the pyramid
+ // creation code doesn't write to all parts of the image. We also take
+ // care to avoid triggering any device to hosts copies.
+ auto ptr = data.host_write_only();
+ for (size_t i = 0; i < data.size(); ++i)
+ ptr[i] = 0;
+
+ if (rects.size() == 0)
+ return;
+
+ // copy the first raw image into the top part of the tiled pyramid. We need to
+ // do this for each of the input images/samples in the tensor.
+ for (auto i = ibegin; i != iend; ++i)
+ {
+ auto& img = *i;
+ ptr += rects[0].top()*data.nc();
+ for (long r = 0; r < img.nr(); ++r)
+ {
+ auto p = ptr+rects[0].left();
+ for (long c = 0; c < img.nc(); ++c)
+ p[c] = (img(r,c).red-avg_red)/256.0;
+ ptr += data.nc();
+ }
+ ptr += data.nc()*(data.nr()-rects[0].bottom()-1);
+
+ ptr += rects[0].top()*data.nc();
+ for (long r = 0; r < img.nr(); ++r)
+ {
+ auto p = ptr+rects[0].left();
+ for (long c = 0; c < img.nc(); ++c)
+ p[c] = (img(r,c).green-avg_green)/256.0;
+ ptr += data.nc();
+ }
+ ptr += data.nc()*(data.nr()-rects[0].bottom()-1);
+
+ ptr += rects[0].top()*data.nc();
+ for (long r = 0; r < img.nr(); ++r)
+ {
+ auto p = ptr+rects[0].left();
+ for (long c = 0; c < img.nc(); ++c)
+ p[c] = (img(r,c).blue-avg_blue)/256.0;
+ ptr += data.nc();
+ }
+ ptr += data.nc()*(data.nr()-rects[0].bottom()-1);
+ }
+
+ // now build the image pyramid into data. This does the same thing as
+ // create_tiled_pyramid(), except we use the GPU if one is available.
+ for (size_t i = 1; i < rects.size(); ++i)
+ {
+ alias_tensor src(data.num_samples(),data.k(),rects[i-1].height(),rects[i-1].width());
+ alias_tensor dest(data.num_samples(),data.k(),rects[i].height(),rects[i].width());
+
+ auto asrc = src(data, data.nc()*rects[i-1].top() + rects[i-1].left());
+ auto adest = dest(data, data.nc()*rects[i].top() + rects[i].left());
+
+ tt::resize_bilinear(adest, data.nc(), data.nr()*data.nc(),
+ asrc, data.nc(), data.nr()*data.nc());
+ }
+ }
+
+ friend void serialize(const input_rgb_image_pyramid& item, std::ostream& out)
+ {
+ serialize("input_rgb_image_pyramid2", out);
+ serialize(item.avg_red, out);
+ serialize(item.avg_green, out);
+ serialize(item.avg_blue, out);
+ serialize(item.pyramid_padding, out);
+ serialize(item.pyramid_outer_padding, out);
+ }
+
+ friend void deserialize(input_rgb_image_pyramid& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "input_rgb_image_pyramid" && version != "input_rgb_image_pyramid2")
+ throw serialization_error("Unexpected version found while deserializing dlib::input_rgb_image_pyramid.");
+ deserialize(item.avg_red, in);
+ deserialize(item.avg_green, in);
+ deserialize(item.avg_blue, in);
+ if (version == "input_rgb_image_pyramid2")
+ {
+ deserialize(item.pyramid_padding, in);
+ deserialize(item.pyramid_outer_padding, in);
+ }
+ else
+ {
+ item.pyramid_padding = 10;
+ item.pyramid_outer_padding = 11;
+ }
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const input_rgb_image_pyramid& item)
+ {
+ out << "input_rgb_image_pyramid("<<item.avg_red<<","<<item.avg_green<<","<<item.avg_blue<<")";
+ out << " pyramid_padding="<<item.pyramid_padding;
+ out << " pyramid_outer_padding="<<item.pyramid_outer_padding;
+ return out;
+ }
+
+ friend void to_xml(const input_rgb_image_pyramid& item, std::ostream& out)
+ {
+ out << "<input_rgb_image_pyramid r='"<<item.avg_red<<"' g='"<<item.avg_green
+ <<"' b='"<<item.avg_blue
+ <<"' pyramid_padding='"<<item.pyramid_padding
+ <<"' pyramid_outer_padding='"<<item.pyramid_outer_padding
+ <<"'/>";
+ }
+
+ private:
+ float avg_red;
+ float avg_green;
+ float avg_blue;
+ unsigned long pyramid_padding = 10;
+ unsigned long pyramid_outer_padding = 11;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_INPUT_H_
+
diff --git a/ml/dlib/dlib/dnn/input_abstract.h b/ml/dlib/dlib/dnn/input_abstract.h
new file mode 100644
index 000000000..7130efb17
--- /dev/null
+++ b/ml/dlib/dlib/dnn/input_abstract.h
@@ -0,0 +1,467 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_DNn_INPUT_ABSTRACT_H_
+#ifdef DLIB_DNn_INPUT_ABSTRACT_H_
+
+#include "../matrix.h"
+#include "../pixel.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class EXAMPLE_INPUT_LAYER
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ Each deep neural network model in dlib begins with an input layer. The job
+ of the input layer is to convert an input_type into a tensor. Nothing more
+ and nothing less.
+
+ Note that there is no dlib::EXAMPLE_INPUT_LAYER type. It is shown here
+ purely to document the interface that an input layer object must implement.
+ If you are using some kind of image or matrix object as your input_type
+ then you can use the provided dlib::input layer defined below. Otherwise,
+ you need to define your own custom input layer.
+
+ THREAD SAFETY
+ to_tensor() must be thread safe. That is, multiple threads must be able to
+ make calls to to_tensor() on a single instance of this object at the same
+ time.
+ !*/
+ public:
+
+ EXAMPLE_INPUT_LAYER(
+ );
+ /*!
+ ensures
+ - Default constructs this object. This function is not required to do
+ anything in particular but it must exist, that is, it is required that
+ layer objects be default constructible.
+ !*/
+
+ EXAMPLE_INPUT_LAYER (
+ const EXAMPLE_INPUT_LAYER& item
+ );
+ /*!
+ ensures
+ - EXAMPLE_INPUT_LAYER objects are copy constructible
+ !*/
+
+ EXAMPLE_INPUT_LAYER(
+ const some_other_input_layer_type& item
+ );
+ /*!
+ ensures
+ - Constructs this object from item. This form of constructor is optional
+ but it allows you to provide a conversion from one input layer type to
+ another. For example, the following code is valid only if my_input_layer2 can
+ be constructed from my_input_layer1:
+ relu<fc<relu<fc<my_input_layer1>>>> my_dnn1;
+ relu<fc<relu<fc<my_input_layer2>>>> my_dnn2(my_dnn1);
+ This kind of pattern is useful if you want to use one type of input layer
+ during training but a different type of layer during testing since it
+ allows you to easily convert between related deep neural network types.
+ !*/
+
+ typedef whatever_type_to_tensor_expects input_type;
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const;
+ /*!
+ requires
+ - [ibegin, iend) is an iterator range over input_type objects.
+ - std::distance(ibegin,iend) > 0
+ ensures
+ - Converts the iterator range into a tensor and stores it into #data.
+ - #data.num_samples()%distance(ibegin,iend) == 0.
+ Normally you would have #data.num_samples() == distance(ibegin,iend) but
+ you can also expand the output by some integer factor so long as the loss
+ you use can deal with it correctly.
+ - The data in the ith sample of #data corresponds to the input_type object
+ *(ibegin+i/sample_expansion_factor).
+ where sample_expansion_factor==#data.num_samples()/distance(ibegin,iend).
+ !*/
+ };
+
+ std::ostream& operator<<(std::ostream& out, const EXAMPLE_INPUT_LAYER& item);
+ /*!
+ Prints a string describing this layer to out.
+ !*/
+
+ void to_xml(const EXAMPLE_INPUT_LAYER& item, std::ostream& out);
+ /*!
+ This function is optional, but it is required if you want to print your networks
+ with net_to_xml(). It writes an XML description of the layer to out.
+ !*/
+
+ void serialize(const EXAMPLE_INPUT_LAYER& item, std::ostream& out);
+ void deserialize(EXAMPLE_INPUT_LAYER& item, std::istream& in);
+ /*!
+ provides serialization support
+ !*/
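+
+ // Below is a minimal sketch of a custom input layer implementing the interface
+ // documented above. It is illustrative only and not part of dlib; the name
+ // my_vector_input and the use of std::vector<float> samples are assumptions.
+ // Each vector becomes one sample with k()==1, nr()==1, and nc()==ibegin->size():
+ //
+ //    class my_vector_input
+ //    {
+ //    public:
+ //        typedef std::vector<float> input_type;
+ //
+ //        template <typename forward_iterator>
+ //        void to_tensor (forward_iterator ibegin, forward_iterator iend,
+ //                        resizable_tensor& data) const
+ //        {
+ //            const long n = ibegin->size();
+ //            data.set_size(std::distance(ibegin,iend), 1, 1, n);
+ //            auto ptr = data.host();
+ //            for (auto i = ibegin; i != iend; ++i)
+ //                for (long j = 0; j < n; ++j)
+ //                    *ptr++ = (*i)[j];   // one sample per input vector
+ //        }
+ //    };
+ //
+ // To use such a layer with serialization or net_to_xml() you would also provide
+ // the serialize(), deserialize(), operator<<(), and to_xml() overloads documented
+ // above.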
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename T
+ >
+ class input
+ {
+ /*!
+ REQUIREMENTS ON T
+ One of the following must be true:
+ - T is a matrix or array2d object and it must contain some kind of
+ pixel type. I.e. pixel_traits<T::type> must be defined.
+ - T is a std::array<matrix<U>> where U is any built-in scalar type like
+ float, double, or unsigned char.
+
+ WHAT THIS OBJECT REPRESENTS
+ This is a basic input layer that simply copies images into a tensor.
+ !*/
+
+ public:
+ typedef T input_type;
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const;
+ /*!
+ requires
+ - [ibegin, iend) is an iterator range over input_type objects.
+ - std::distance(ibegin,iend) > 0
+ - The input range should contain image objects that all have the same
+ dimensions.
+ ensures
+ - Converts the iterator range into a tensor and stores it into #data. In
+ particular, if the input images have R rows, C columns, and K channels
+ (where K is given by pixel_traits::num or std::array::size() if
+ std::array inputs are used) then we will have:
+ - #data.num_samples() == std::distance(ibegin,iend)
+ - #data.nr() == R
+ - #data.nc() == C
+ - #data.k() == K
+ For example, a matrix<float,3,3> would turn into a tensor with 3 rows, 3
+ columns, and k()==1. Or a matrix<rgb_pixel,4,5> would turn into a tensor
+ with 4 rows, 5 columns, and k()==3 (since rgb_pixels have 3 channels).
+ Or a std::array<matrix<float,3,3>,5> would turn into a tensor with 3 rows
+ and columns, and k()==5 channels.
+ - If the input data contains pixels of type unsigned char, rgb_pixel, or
+ other pixel types with a basic_pixel_type of unsigned char then each
+ value written to the output tensor is first divided by 256.0 so that the
+ resulting outputs are all in the range [0,1].
+ !*/
+
+ // Provided for compatibility with input_rgb_image_pyramid's interface
+ bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); }
+ drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; }
+ drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; }
+ };
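+
+ // A brief usage sketch (illustrative only; imgs is a hypothetical, already
+ // populated std::vector<matrix<rgb_pixel>>):
+ //
+ //    input<matrix<rgb_pixel>> inp;
+ //    resizable_tensor data;
+ //    inp.to_tensor(imgs.begin(), imgs.end(), data);
+ //    // data.num_samples()==imgs.size(), data.k()==3, and every value has been
+ //    // divided by 256.0 since rgb_pixel channels are unsigned char.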
+
+// ----------------------------------------------------------------------------------------
+
+ class input_rgb_image
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This input layer works with RGB images of type matrix<rgb_pixel>. It is
+ very similar to the dlib::input layer except that it allows you to subtract
+ the average color value from each color channel when converting an image to
+ a tensor.
+ !*/
+ public:
+ typedef matrix<rgb_pixel> input_type;
+
+ input_rgb_image (
+ );
+ /*!
+ ensures
+ - #get_avg_red() == 122.782
+ - #get_avg_green() == 117.001
+ - #get_avg_blue() == 104.298
+ !*/
+
+ input_rgb_image (
+ float avg_red,
+ float avg_green,
+ float avg_blue
+ );
+ /*!
+ ensures
+ - #get_avg_red() == avg_red
+ - #get_avg_green() == avg_green
+ - #get_avg_blue() == avg_blue
+ !*/
+
+ float get_avg_red(
+ ) const;
+ /*!
+ ensures
+ - returns the value subtracted from the red color channel.
+ !*/
+
+ float get_avg_green(
+ ) const;
+ /*!
+ ensures
+ - returns the value subtracted from the green color channel.
+ !*/
+
+ float get_avg_blue(
+ ) const;
+ /*!
+ ensures
+ - returns the value subtracted from the blue color channel.
+ !*/
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const;
+ /*!
+ requires
+ - [ibegin, iend) is an iterator range over input_type objects.
+ - std::distance(ibegin,iend) > 0
+ - The input range should contain images that all have the same
+ dimensions.
+ ensures
+ - Converts the iterator range into a tensor and stores it into #data. In
+ particular, if the input images have R rows, C columns then we will have:
+ - #data.num_samples() == std::distance(ibegin,iend)
+ - #data.nr() == R
+ - #data.nc() == C
+ - #data.k() == 3
+ Moreover, each color channel is normalized by having its average value
+ subtracted (according to get_avg_red(), get_avg_green(), or
+ get_avg_blue()) and then is divided by 256.0.
+ !*/
+
+
+ // Provided for compatibility with input_rgb_image_pyramid's interface
+ bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); }
+ drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; }
+ drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; }
+ };
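+
+ // A brief usage sketch (illustrative only; the averages shown are arbitrary
+ // example values and imgs is a hypothetical std::vector<matrix<rgb_pixel>>):
+ //
+ //    input_rgb_image inp(110.0f, 115.0f, 120.0f);
+ //    resizable_tensor data;
+ //    inp.to_tensor(imgs.begin(), imgs.end(), data);
+ //    // Each output value is (channel - avg)/256.0, so e.g. a red value of 110
+ //    // maps to 0 in the tensor.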
+
+// ----------------------------------------------------------------------------------------
+
+ template <size_t NR, size_t NC=NR>
+ class input_rgb_image_sized
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This layer has an interface and behavior identical to input_rgb_image
+ except that it requires input images to have NR rows and NC columns. This
+ is checked by a DLIB_CASSERT inside to_tensor().
+
+ You can also convert between input_rgb_image and input_rgb_image_sized by
+ copy construction or assignment.
+ !*/
+
+ };
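+
+ // A brief sketch of the conversion mentioned above (illustrative only):
+ //
+ //    input_rgb_image_sized<150> sized_in;      // requires 150x150 inputs
+ //    input_rgb_image unsized_in(sized_in);     // keeps the same channel averages
+ //    input_rgb_image_sized<150> back(unsized_in);
+ //
+ // This is handy when copying a network trained with fixed-size inputs into an
+ // otherwise identical network type that accepts arbitrary image sizes.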
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename PYRAMID_TYPE
+ >
+ class input_rgb_image_pyramid
+ {
+ /*!
+ REQUIREMENTS ON PYRAMID_TYPE
+ PYRAMID_TYPE must be an instance of the dlib::pyramid_down template.
+
+ WHAT THIS OBJECT REPRESENTS
+ This input layer works with RGB images of type matrix<rgb_pixel>. It is
+ identical to input_rgb_image except that it outputs a tensor containing a
+ tiled image pyramid of each input image rather than a simple copy of each
+ image. The tiled image pyramid is created using create_tiled_pyramid().
+ !*/
+
+ public:
+
+ typedef matrix<rgb_pixel> input_type;
+ typedef PYRAMID_TYPE pyramid_type;
+
+ input_rgb_image_pyramid (
+ );
+ /*!
+ ensures
+ - #get_avg_red() == 122.782
+ - #get_avg_green() == 117.001
+ - #get_avg_blue() == 104.298
+ - #get_pyramid_padding() == 10
+ - #get_pyramid_outer_padding() == 11
+ !*/
+
+ input_rgb_image_pyramid (
+ float avg_red,
+ float avg_green,
+ float avg_blue
+ );
+ /*!
+ ensures
+ - #get_avg_red() == avg_red
+ - #get_avg_green() == avg_green
+ - #get_avg_blue() == avg_blue
+ - #get_pyramid_padding() == 10
+ - #get_pyramid_outer_padding() == 11
+ !*/
+
+ float get_avg_red(
+ ) const;
+ /*!
+ ensures
+ - returns the value subtracted from the red color channel.
+ !*/
+
+ float get_avg_green(
+ ) const;
+ /*!
+ ensures
+ - returns the value subtracted from the green color channel.
+ !*/
+
+ float get_avg_blue(
+ ) const;
+ /*!
+ ensures
+ - returns the value subtracted from the blue color channel.
+ !*/
+
+ unsigned long get_pyramid_padding (
+ ) const;
+ /*!
+ ensures
+ - When this object creates a pyramid it will call create_tiled_pyramid() and
+ set create_tiled_pyramid's pyramid_padding parameter to get_pyramid_padding().
+ !*/
+ void set_pyramid_padding (
+ unsigned long value
+ );
+ /*!
+ ensures
+ - #get_pyramid_padding() == value
+ !*/
+
+ unsigned long get_pyramid_outer_padding (
+ ) const;
+ /*!
+ ensures
+ - When this object creates a pyramid it will call create_tiled_pyramid()
+ and set create_tiled_pyramid's pyramid_outer_padding parameter to
+ get_pyramid_outer_padding().
+ !*/
+ void set_pyramid_outer_padding (
+ unsigned long value
+ );
+ /*!
+ ensures
+ - #get_pyramid_outer_padding() == value
+ !*/
+
+ template <typename forward_iterator>
+ void to_tensor (
+ forward_iterator ibegin,
+ forward_iterator iend,
+ resizable_tensor& data
+ ) const;
+ /*!
+ requires
+ - [ibegin, iend) is an iterator range over input_type objects.
+ - std::distance(ibegin,iend) > 0
+ - The input range should contain images that all have the same
+ dimensions.
+ ensures
+ - Converts the iterator range into a tensor and stores it into #data. In
+ particular, we will have:
+ - #data.num_samples() == std::distance(ibegin,iend)
+ - #data.k() == 3
+ - Each sample in #data contains a tiled image pyramid of the
+ corresponding input image. The tiled pyramid is created by
+ create_tiled_pyramid().
+ Moreover, each color channel is normalized by having its average value
+ subtracted (according to get_avg_red(), get_avg_green(), or
+ get_avg_blue()) and then is divided by 256.0.
+ !*/
+
+ bool image_contained_point (
+ const tensor& data,
+ const point& p
+ ) const;
+ /*!
+ requires
+ - data is a tensor that was produced by this->to_tensor()
+ ensures
+ - Since data is a tensor that is built from a bunch of identically sized
+ images, we can ask if those images were big enough to contain the point
+ p. This function returns the answer to that question.
+ !*/
+
+ drectangle image_space_to_tensor_space (
+ const tensor& data,
+ double scale,
+ drectangle r
+ ) const;
+ /*!
+ requires
+ - data is a tensor that was produced by this->to_tensor()
+ - 0 < scale <= 1
+ ensures
+ - This function maps from to_tensor()'s input image space to its output
+ tensor space. Therefore, given that data is a tensor produced by
+ to_tensor(), image_space_to_tensor_space() allows you to ask for the
+ rectangle in data that corresponds to a rectangle in the original image
+ space.
+
+ Note that since the output tensor contains an image pyramid, there are
+ multiple points in the output tensor that correspond to any input
+ location. So you must also specify a scale so we know what level of the
+ pyramid is needed. So given a rectangle r in an input image, you can
+ ask, what rectangle in data corresponds to r when things are scale times
+ smaller? That rectangle is returned by this function.
+ - A scale of 1 means we don't move anywhere in the pyramid scale space relative
+ to the input image while smaller values of scale mean we move down the
+ pyramid.
+ !*/
+
+ drectangle tensor_space_to_image_space (
+ const tensor& data,
+ drectangle r
+ ) const;
+ /*!
+ requires
+ - data is a tensor that was produced by this->to_tensor()
+ ensures
+ - This function maps from to_tensor()'s output tensor space to its input
+ image space. Therefore, given that data is a tensor produced by
+ to_tensor(), tensor_space_to_image_space() allows you to ask for the
+ rectangle in the input image that corresponds to a rectangle in data.
+ - It should be noted that this function isn't always an inverse of
+ image_space_to_tensor_space(). This is because you can ask
+ image_space_to_tensor_space() for the coordinates of points outside the input
+ image and they will be mapped to somewhere that doesn't have an inverse.
+ But for points actually inside the input image this function performs an
+ approximate inverse mapping. I.e. when image_contained_point(data,center(r))==true
+ there is an approximate inverse.
+ !*/
+
+ };
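+
+ // A brief usage sketch (illustrative only; pyramid_down<6> is just one common
+ // pyramid rate, imgs is a hypothetical std::vector<matrix<rgb_pixel>>, and
+ // some_rect is a hypothetical drectangle in the original image):
+ //
+ //    input_rgb_image_pyramid<pyramid_down<6>> inp;
+ //    inp.set_pyramid_padding(10);        // these happen to be the defaults,
+ //    inp.set_pyramid_outer_padding(11);  // shown only to illustrate the setters
+ //    resizable_tensor data;
+ //    inp.to_tensor(imgs.begin(), imgs.end(), data);
+ //    // Where does some_rect land in the tensor at full resolution (scale==1)?
+ //    drectangle r = inp.image_space_to_tensor_space(data, 1.0, some_rect);
+ //    // And back again:
+ //    drectangle r2 = inp.tensor_space_to_image_space(data, r);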
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_INPUT_ABSTRACT_H_
+
diff --git a/ml/dlib/dlib/dnn/layers.h b/ml/dlib/dlib/dnn/layers.h
new file mode 100644
index 000000000..91436f635
--- /dev/null
+++ b/ml/dlib/dlib/dnn/layers.h
@@ -0,0 +1,3244 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_LAYERS_H_
+#define DLIB_DNn_LAYERS_H_
+
+#include "layers_abstract.h"
+#include "tensor.h"
+#include "core.h"
+#include <iostream>
+#include <string>
+#include "../rand.h"
+#include "../string.h"
+#include "tensor_tools.h"
+#include "../vectorstream.h"
+#include "utilities.h"
+#include <sstream>
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ struct num_con_outputs
+ {
+ num_con_outputs(unsigned long n) : num_outputs(n) {}
+ unsigned long num_outputs;
+ };
+
+ template <
+ long _num_filters,
+ long _nr,
+ long _nc,
+ int _stride_y,
+ int _stride_x,
+ int _padding_y = _stride_y!=1? 0 : _nr/2,
+ int _padding_x = _stride_x!=1? 0 : _nc/2
+ >
+ class con_
+ {
+ public:
+
+ static_assert(_num_filters > 0, "The number of filters must be > 0");
+ static_assert(_nr >= 0, "The number of rows in a filter must be >= 0");
+ static_assert(_nc >= 0, "The number of columns in a filter must be >= 0");
+ static_assert(_stride_y > 0, "The filter stride must be > 0");
+ static_assert(_stride_x > 0, "The filter stride must be > 0");
+ static_assert(_nr==0 || (0 <= _padding_y && _padding_y < _nr), "The padding must be smaller than the filter size.");
+ static_assert(_nc==0 || (0 <= _padding_x && _padding_x < _nc), "The padding must be smaller than the filter size.");
+ static_assert(_nr!=0 || 0 == _padding_y, "If _nr==0 then the padding must be set to 0 as well.");
+ static_assert(_nc!=0 || 0 == _padding_x, "If _nc==0 then the padding must be set to 0 as well.");
+
+ con_(
+ num_con_outputs o
+ ) :
+ learning_rate_multiplier(1),
+ weight_decay_multiplier(1),
+ bias_learning_rate_multiplier(1),
+ bias_weight_decay_multiplier(0),
+ num_filters_(o.num_outputs),
+ padding_y_(_padding_y),
+ padding_x_(_padding_x)
+ {
+ DLIB_CASSERT(num_filters_ > 0);
+ }
+
+ con_() : con_(num_con_outputs(_num_filters)) {}
+
+ long num_filters() const { return num_filters_; }
+ long nr() const
+ {
+ if (_nr==0)
+ return filters.nr();
+ else
+ return _nr;
+ }
+ long nc() const
+ {
+ if (_nc==0)
+ return filters.nc();
+ else
+ return _nc;
+ }
+ long stride_y() const { return _stride_y; }
+ long stride_x() const { return _stride_x; }
+ long padding_y() const { return padding_y_; }
+ long padding_x() const { return padding_x_; }
+
+ void set_num_filters(long num)
+ {
+ DLIB_CASSERT(num > 0);
+ if (num != num_filters_)
+ {
+ DLIB_CASSERT(get_layer_params().size() == 0,
+ "You can't change the number of filters in con_ if the parameter tensor has already been allocated.");
+ num_filters_ = num;
+ }
+ }
+
+ double get_learning_rate_multiplier () const { return learning_rate_multiplier; }
+ double get_weight_decay_multiplier () const { return weight_decay_multiplier; }
+ void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
+ void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; }
+
+ double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; }
+ double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; }
+ void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; }
+ void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; }
+
+ inline dpoint map_input_to_output (
+ dpoint p
+ ) const
+ {
+ p.x() = (p.x()+padding_x()-nc()/2)/stride_x();
+ p.y() = (p.y()+padding_y()-nr()/2)/stride_y();
+ return p;
+ }
+
+ inline dpoint map_output_to_input (
+ dpoint p
+ ) const
+ {
+ p.x() = p.x()*stride_x() - padding_x() + nc()/2;
+ p.y() = p.y()*stride_y() - padding_y() + nr()/2;
+ return p;
+ }
+
+ con_ (
+ const con_& item
+ ) :
+ params(item.params),
+ filters(item.filters),
+ biases(item.biases),
+ learning_rate_multiplier(item.learning_rate_multiplier),
+ weight_decay_multiplier(item.weight_decay_multiplier),
+ bias_learning_rate_multiplier(item.bias_learning_rate_multiplier),
+ bias_weight_decay_multiplier(item.bias_weight_decay_multiplier),
+ num_filters_(item.num_filters_),
+ padding_y_(item.padding_y_),
+ padding_x_(item.padding_x_)
+ {
+ // this->conv is non-copyable and basically stateless, so we have to write our
+ // own copy to avoid trying to copy it and getting an error.
+ }
+
+ con_& operator= (
+ const con_& item
+ )
+ {
+ if (this == &item)
+ return *this;
+
+ // this->conv is non-copyable and basically stateless, so we have to write our
+ // own copy to avoid trying to copy it and getting an error.
+ params = item.params;
+ filters = item.filters;
+ biases = item.biases;
+ padding_y_ = item.padding_y_;
+ padding_x_ = item.padding_x_;
+ learning_rate_multiplier = item.learning_rate_multiplier;
+ weight_decay_multiplier = item.weight_decay_multiplier;
+ bias_learning_rate_multiplier = item.bias_learning_rate_multiplier;
+ bias_weight_decay_multiplier = item.bias_weight_decay_multiplier;
+ num_filters_ = item.num_filters_;
+ return *this;
+ }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& sub)
+ {
+ const long filt_nr = _nr!=0 ? _nr : sub.get_output().nr();
+ const long filt_nc = _nc!=0 ? _nc : sub.get_output().nc();
+
+ long num_inputs = filt_nr*filt_nc*sub.get_output().k();
+ long num_outputs = num_filters_;
+ // allocate params for the filters and also for the filter bias values.
+ params.set_size(num_inputs*num_filters_ + num_filters_);
+
+ dlib::rand rnd(std::rand());
+ randomize_parameters(params, num_inputs+num_outputs, rnd);
+
+ filters = alias_tensor(num_filters_, sub.get_output().k(), filt_nr, filt_nc);
+ biases = alias_tensor(1,num_filters_);
+
+ // set the initial bias values to zero
+ biases(params,filters.size()) = 0;
+ }
+
+ template <typename SUBNET>
+ void forward(const SUBNET& sub, resizable_tensor& output)
+ {
+ conv.setup(sub.get_output(),
+ filters(params,0),
+ _stride_y,
+ _stride_x,
+ padding_y_,
+ padding_x_);
+ conv(false, output,
+ sub.get_output(),
+ filters(params,0));
+
+ tt::add(1,output,1,biases(params,filters.size()));
+ }
+
+ template <typename SUBNET>
+ void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
+ {
+ conv.get_gradient_for_data (true, gradient_input, filters(params,0), sub.get_gradient_input());
+ // no point computing the parameter gradients if they won't be used.
+ if (learning_rate_multiplier != 0)
+ {
+ auto filt = filters(params_grad,0);
+ conv.get_gradient_for_filters (false, gradient_input, sub.get_output(), filt);
+ auto b = biases(params_grad, filters.size());
+ tt::assign_conv_bias_gradient(b, gradient_input);
+ }
+ }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const con_& item, std::ostream& out)
+ {
+ serialize("con_4", out);
+ serialize(item.params, out);
+ serialize(item.num_filters_, out);
+ serialize(_nr, out);
+ serialize(_nc, out);
+ serialize(_stride_y, out);
+ serialize(_stride_x, out);
+ serialize(item.padding_y_, out);
+ serialize(item.padding_x_, out);
+ serialize(item.filters, out);
+ serialize(item.biases, out);
+ serialize(item.learning_rate_multiplier, out);
+ serialize(item.weight_decay_multiplier, out);
+ serialize(item.bias_learning_rate_multiplier, out);
+ serialize(item.bias_weight_decay_multiplier, out);
+ }
+
+ friend void deserialize(con_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ long nr;
+ long nc;
+ int stride_y;
+ int stride_x;
+ if (version == "con_4")
+ {
+ deserialize(item.params, in);
+ deserialize(item.num_filters_, in);
+ deserialize(nr, in);
+ deserialize(nc, in);
+ deserialize(stride_y, in);
+ deserialize(stride_x, in);
+ deserialize(item.padding_y_, in);
+ deserialize(item.padding_x_, in);
+ deserialize(item.filters, in);
+ deserialize(item.biases, in);
+ deserialize(item.learning_rate_multiplier, in);
+ deserialize(item.weight_decay_multiplier, in);
+ deserialize(item.bias_learning_rate_multiplier, in);
+ deserialize(item.bias_weight_decay_multiplier, in);
+ if (item.padding_y_ != _padding_y) throw serialization_error("Wrong padding_y found while deserializing dlib::con_");
+ if (item.padding_x_ != _padding_x) throw serialization_error("Wrong padding_x found while deserializing dlib::con_");
+ if (nr != _nr) throw serialization_error("Wrong nr found while deserializing dlib::con_");
+ if (nc != _nc) throw serialization_error("Wrong nc found while deserializing dlib::con_");
+ if (stride_y != _stride_y) throw serialization_error("Wrong stride_y found while deserializing dlib::con_");
+ if (stride_x != _stride_x) throw serialization_error("Wrong stride_x found while deserializing dlib::con_");
+ }
+ else
+ {
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::con_.");
+ }
+ }
+
+
+ friend std::ostream& operator<<(std::ostream& out, const con_& item)
+ {
+ out << "con\t ("
+ << "num_filters="<<item.num_filters_
+ << ", nr="<<item.nr()
+ << ", nc="<<item.nc()
+ << ", stride_y="<<_stride_y
+ << ", stride_x="<<_stride_x
+ << ", padding_y="<<item.padding_y_
+ << ", padding_x="<<item.padding_x_
+ << ")";
+ out << " learning_rate_mult="<<item.learning_rate_multiplier;
+ out << " weight_decay_mult="<<item.weight_decay_multiplier;
+ out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier;
+ out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier;
+ return out;
+ }
+
+ friend void to_xml(const con_& item, std::ostream& out)
+ {
+ out << "<con"
+ << " num_filters='"<<item.num_filters_<<"'"
+ << " nr='"<<item.nr()<<"'"
+ << " nc='"<<item.nc()<<"'"
+ << " stride_y='"<<_stride_y<<"'"
+ << " stride_x='"<<_stride_x<<"'"
+ << " padding_y='"<<item.padding_y_<<"'"
+ << " padding_x='"<<item.padding_x_<<"'"
+ << " learning_rate_mult='"<<item.learning_rate_multiplier<<"'"
+ << " weight_decay_mult='"<<item.weight_decay_multiplier<<"'"
+ << " bias_learning_rate_mult='"<<item.bias_learning_rate_multiplier<<"'"
+ << " bias_weight_decay_mult='"<<item.bias_weight_decay_multiplier<<"'>\n";
+ out << mat(item.params);
+ out << "</con>";
+ }
+
+ private:
+
+ resizable_tensor params;
+ alias_tensor filters, biases;
+
+ tt::tensor_conv conv;
+ double learning_rate_multiplier;
+ double weight_decay_multiplier;
+ double bias_learning_rate_multiplier;
+ double bias_weight_decay_multiplier;
+ long num_filters_;
+
+ // These are here only because older versions of con (which you might encounter
+ // serialized to disk) used different padding settings.
+ int padding_y_;
+ int padding_x_;
+
+ };
+
+ template <
+ long num_filters,
+ long nr,
+ long nc,
+ int stride_y,
+ int stride_x,
+ typename SUBNET
+ >
+ using con = add_layer<con_<num_filters,nr,nc,stride_y,stride_x>, SUBNET>;
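+
+ // A brief usage sketch of the con layer alias (illustrative only; the network
+ // shown is arbitrary and relu/input are the dlib layers defined elsewhere in
+ // this library):
+ //
+ //    using feature_extractor = relu<con<32,3,3,1,1,
+ //                              relu<con<16,5,5,2,2,
+ //                              input<matrix<rgb_pixel>>>>>>;
+ //
+ // The innermost con uses 16 5x5 filters with stride 2, the outer one uses
+ // 32 3x3 filters with stride 1, which gets "same" padding by default.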
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ long _num_filters,
+ long _nr,
+ long _nc,
+ int _stride_y,
+ int _stride_x,
+ int _padding_y = _stride_y!=1? 0 : _nr/2,
+ int _padding_x = _stride_x!=1? 0 : _nc/2
+ >
+ class cont_
+ {
+ public:
+
+ static_assert(_num_filters > 0, "The number of filters must be > 0");
+ static_assert(_nr > 0, "The number of rows in a filter must be > 0");
+ static_assert(_nc > 0, "The number of columns in a filter must be > 0");
+ static_assert(_stride_y > 0, "The filter stride must be > 0");
+ static_assert(_stride_x > 0, "The filter stride must be > 0");
+ static_assert(0 <= _padding_y && _padding_y < _nr, "The padding must be smaller than the filter size.");
+ static_assert(0 <= _padding_x && _padding_x < _nc, "The padding must be smaller than the filter size.");
+
+ cont_(
+ num_con_outputs o
+ ) :
+ learning_rate_multiplier(1),
+ weight_decay_multiplier(1),
+ bias_learning_rate_multiplier(1),
+ bias_weight_decay_multiplier(0),
+ num_filters_(o.num_outputs),
+ padding_y_(_padding_y),
+ padding_x_(_padding_x)
+ {
+ DLIB_CASSERT(num_filters_ > 0);
+ }
+
+ cont_() : cont_(num_con_outputs(_num_filters)) {}
+
+ long num_filters() const { return num_filters_; }
+ long nr() const { return _nr; }
+ long nc() const { return _nc; }
+ long stride_y() const { return _stride_y; }
+ long stride_x() const { return _stride_x; }
+ long padding_y() const { return padding_y_; }
+ long padding_x() const { return padding_x_; }
+
+ void set_num_filters(long num)
+ {
+ DLIB_CASSERT(num > 0);
+ if (num != num_filters_)
+ {
+ DLIB_CASSERT(get_layer_params().size() == 0,
+ "You can't change the number of filters in cont_ if the parameter tensor has already been allocated.");
+ num_filters_ = num;
+ }
+ }
+
+ double get_learning_rate_multiplier () const { return learning_rate_multiplier; }
+ double get_weight_decay_multiplier () const { return weight_decay_multiplier; }
+ void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
+ void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; }
+
+ double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; }
+ double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; }
+ void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; }
+ void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; }
+
+ inline dpoint map_output_to_input (
+ dpoint p
+ ) const
+ {
+ p.x() = (p.x()+padding_x()-nc()/2)/stride_x();
+ p.y() = (p.y()+padding_y()-nr()/2)/stride_y();
+ return p;
+ }
+
+ inline dpoint map_input_to_output (
+ dpoint p
+ ) const
+ {
+ p.x() = p.x()*stride_x() - padding_x() + nc()/2;
+ p.y() = p.y()*stride_y() - padding_y() + nr()/2;
+ return p;
+ }
+
+ cont_ (
+ const cont_& item
+ ) :
+ params(item.params),
+ filters(item.filters),
+ biases(item.biases),
+ learning_rate_multiplier(item.learning_rate_multiplier),
+ weight_decay_multiplier(item.weight_decay_multiplier),
+ bias_learning_rate_multiplier(item.bias_learning_rate_multiplier),
+ bias_weight_decay_multiplier(item.bias_weight_decay_multiplier),
+ num_filters_(item.num_filters_),
+ padding_y_(item.padding_y_),
+ padding_x_(item.padding_x_)
+ {
+ // this->conv is non-copyable and basically stateless, so we have to write our
+ // own copy to avoid trying to copy it and getting an error.
+ }
+
+ cont_& operator= (
+ const cont_& item
+ )
+ {
+ if (this == &item)
+ return *this;
+
+ // this->conv is non-copyable and basically stateless, so we have to write our
+ // own copy to avoid trying to copy it and getting an error.
+ params = item.params;
+ filters = item.filters;
+ biases = item.biases;
+ padding_y_ = item.padding_y_;
+ padding_x_ = item.padding_x_;
+ learning_rate_multiplier = item.learning_rate_multiplier;
+ weight_decay_multiplier = item.weight_decay_multiplier;
+ bias_learning_rate_multiplier = item.bias_learning_rate_multiplier;
+ bias_weight_decay_multiplier = item.bias_weight_decay_multiplier;
+ num_filters_ = item.num_filters_;
+ return *this;
+ }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& sub)
+ {
+ long num_inputs = _nr*_nc*sub.get_output().k();
+ long num_outputs = num_filters_;
+ // allocate params for the filters and also for the filter bias values.
+ params.set_size(num_inputs*num_filters_ + num_filters_);
+
+ dlib::rand rnd(std::rand());
+ randomize_parameters(params, num_inputs+num_outputs, rnd);
+
+ filters = alias_tensor(sub.get_output().k(), num_filters_, _nr, _nc);
+ biases = alias_tensor(1,num_filters_);
+
+ // set the initial bias values to zero
+ biases(params,filters.size()) = 0;
+ }
+
+ template <typename SUBNET>
+ void forward(const SUBNET& sub, resizable_tensor& output)
+ {
+ auto filt = filters(params,0);
+ unsigned int gnr = _stride_y * (sub.get_output().nr() - 1) + filt.nr() - 2 * padding_y_;
+ unsigned int gnc = _stride_x * (sub.get_output().nc() - 1) + filt.nc() - 2 * padding_x_;
+ unsigned int gnsamps = sub.get_output().num_samples();
+ unsigned int gk = filt.k();
+ output.set_size(gnsamps,gk,gnr,gnc);
+ conv.setup(output,filt,_stride_y,_stride_x,padding_y_,padding_x_);
+ conv.get_gradient_for_data(false, sub.get_output(),filt,output);
+ tt::add(1,output,1,biases(params,filters.size()));
+ }
+
+ template <typename SUBNET>
+ void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
+ {
+ auto filt = filters(params,0);
+ conv(true, sub.get_gradient_input(),gradient_input, filt);
+ // no point computing the parameter gradients if they won't be used.
+ if (learning_rate_multiplier != 0)
+ {
+ auto filt = filters(params_grad,0);
+ conv.get_gradient_for_filters (false, sub.get_output(),gradient_input, filt);
+ auto b = biases(params_grad, filters.size());
+ tt::assign_conv_bias_gradient(b, gradient_input);
+ }
+ }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const cont_& item, std::ostream& out)
+ {
+ serialize("cont_1", out);
+ serialize(item.params, out);
+ serialize(item.num_filters_, out);
+ serialize(_nr, out);
+ serialize(_nc, out);
+ serialize(_stride_y, out);
+ serialize(_stride_x, out);
+ serialize(item.padding_y_, out);
+ serialize(item.padding_x_, out);
+ serialize(item.filters, out);
+ serialize(item.biases, out);
+ serialize(item.learning_rate_multiplier, out);
+ serialize(item.weight_decay_multiplier, out);
+ serialize(item.bias_learning_rate_multiplier, out);
+ serialize(item.bias_weight_decay_multiplier, out);
+ }
+
+ friend void deserialize(cont_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ long nr;
+ long nc;
+ int stride_y;
+ int stride_x;
+ if (version == "cont_1")
+ {
+ deserialize(item.params, in);
+ deserialize(item.num_filters_, in);
+ deserialize(nr, in);
+ deserialize(nc, in);
+ deserialize(stride_y, in);
+ deserialize(stride_x, in);
+ deserialize(item.padding_y_, in);
+ deserialize(item.padding_x_, in);
+ deserialize(item.filters, in);
+ deserialize(item.biases, in);
+ deserialize(item.learning_rate_multiplier, in);
+ deserialize(item.weight_decay_multiplier, in);
+ deserialize(item.bias_learning_rate_multiplier, in);
+ deserialize(item.bias_weight_decay_multiplier, in);
+ if (item.padding_y_ != _padding_y) throw serialization_error("Wrong padding_y found while deserializing dlib::cont_");
+ if (item.padding_x_ != _padding_x) throw serialization_error("Wrong padding_x found while deserializing dlib::cont_");
+ if (nr != _nr) throw serialization_error("Wrong nr found while deserializing dlib::cont_");
+ if (nc != _nc) throw serialization_error("Wrong nc found while deserializing dlib::cont_");
+ if (stride_y != _stride_y) throw serialization_error("Wrong stride_y found while deserializing dlib::cont_");
+ if (stride_x != _stride_x) throw serialization_error("Wrong stride_x found while deserializing dlib::cont_");
+ }
+ else
+ {
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::con_.");
+ }
+ }
+
+
+ friend std::ostream& operator<<(std::ostream& out, const cont_& item)
+ {
+ out << "cont\t ("
+ << "num_filters="<<item.num_filters_
+ << ", nr="<<_nr
+ << ", nc="<<_nc
+ << ", stride_y="<<_stride_y
+ << ", stride_x="<<_stride_x
+ << ", padding_y="<<item.padding_y_
+ << ", padding_x="<<item.padding_x_
+ << ")";
+ out << " learning_rate_mult="<<item.learning_rate_multiplier;
+ out << " weight_decay_mult="<<item.weight_decay_multiplier;
+ out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier;
+ out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier;
+ return out;
+ }
+
+ friend void to_xml(const cont_& item, std::ostream& out)
+ {
+ out << "<cont"
+ << " num_filters='"<<item.num_filters_<<"'"
+ << " nr='"<<_nr<<"'"
+ << " nc='"<<_nc<<"'"
+ << " stride_y='"<<_stride_y<<"'"
+ << " stride_x='"<<_stride_x<<"'"
+ << " padding_y='"<<item.padding_y_<<"'"
+ << " padding_x='"<<item.padding_x_<<"'"
+ << " learning_rate_mult='"<<item.learning_rate_multiplier<<"'"
+ << " weight_decay_mult='"<<item.weight_decay_multiplier<<"'"
+ << " bias_learning_rate_mult='"<<item.bias_learning_rate_multiplier<<"'"
+ << " bias_weight_decay_mult='"<<item.bias_weight_decay_multiplier<<"'>\n";
+ out << mat(item.params);
+ out << "</cont>";
+ }
+
+ private:
+
+ resizable_tensor params;
+ alias_tensor filters, biases;
+
+ tt::tensor_conv conv;
+ double learning_rate_multiplier;
+ double weight_decay_multiplier;
+ double bias_learning_rate_multiplier;
+ double bias_weight_decay_multiplier;
+ long num_filters_;
+
+ int padding_y_;
+ int padding_x_;
+
+ };
+
+ template <
+ long num_filters,
+ long nr,
+ long nc,
+ int stride_y,
+ int stride_x,
+ typename SUBNET
+ >
+ using cont = add_layer<cont_<num_filters,nr,nc,stride_y,stride_x>, SUBNET>;
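+
+ // A brief usage sketch; cont_ runs the convolution "backwards" (a transposed
+ // convolution), so it is typically used to learn an upsampling of feature maps.
+ // The alias below is illustrative only and the name deconv_2x is an assumption:
+ //
+ //    template <typename SUBNET>
+ //    using deconv_2x = cont<32,2,2,2,2,SUBNET>;  // doubles nr() and nc()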
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ int scale_y,
+ int scale_x
+ >
+ class upsample_
+ {
+ public:
+ static_assert(scale_y >= 1, "upsampling scale factor can't be less than 1.");
+ static_assert(scale_x >= 1, "upsampling scale factor can't be less than 1.");
+
+ upsample_()
+ {
+ }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& /*sub*/)
+ {
+ }
+
+ template <typename SUBNET>
+ void forward(const SUBNET& sub, resizable_tensor& output)
+ {
+ output.set_size(
+ sub.get_output().num_samples(),
+ sub.get_output().k(),
+ scale_y*sub.get_output().nr(),
+ scale_x*sub.get_output().nc());
+ tt::resize_bilinear(output, sub.get_output());
+ }
+
+ template <typename SUBNET>
+ void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/)
+ {
+ tt::resize_bilinear_gradient(sub.get_gradient_input(), gradient_input);
+ }
+
+ inline dpoint map_input_to_output (dpoint p) const
+ {
+ p.x() = p.x()*scale_x;
+ p.y() = p.y()*scale_y;
+ return p;
+ }
+ inline dpoint map_output_to_input (dpoint p) const
+ {
+ p.x() = p.x()/scale_x;
+ p.y() = p.y()/scale_y;
+ return p;
+ }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const upsample_& , std::ostream& out)
+ {
+ serialize("upsample_", out);
+ serialize(scale_y, out);
+ serialize(scale_x, out);
+ }
+
+ friend void deserialize(upsample_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "upsample_")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::upsample_.");
+
+ int _scale_y;
+ int _scale_x;
+ deserialize(_scale_y, in);
+ deserialize(_scale_x, in);
+ if (_scale_y != scale_y || _scale_x != scale_x)
+ throw serialization_error("Wrong scale found while deserializing dlib::upsample_");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const upsample_& )
+ {
+ out << "upsample\t ("
+ << "scale_y="<<scale_y
+ << ", scale_x="<<scale_x
+ << ")";
+ return out;
+ }
+
+ friend void to_xml(const upsample_& /*item*/, std::ostream& out)
+ {
+ out << "<upsample"
+ << " scale_y='"<<scale_y<<"'"
+ << " scale_x='"<<scale_x<<"'/>\n";
+ }
+
+ private:
+ resizable_tensor params;
+ };
+
+ template <
+ int scale,
+ typename SUBNET
+ >
+ using upsample = add_layer<upsample_<scale,scale>, SUBNET>;
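+
+ // A brief usage sketch; upsample_ has no learnable parameters and simply grows
+ // nr() and nc() by the given factors using bilinear interpolation (the alias
+ // below is illustrative only):
+ //
+ //    template <typename SUBNET>
+ //    using up2 = upsample<2,SUBNET>;  // 2x larger in both dimensions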
+
+// ----------------------------------------------------------------------------------------
+
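+    // Note on the default padding used below: when the stride is 1 the padding defaults
+    // to filter_size/2, i.e. "same"-style padding that preserves the spatial dimensions,
+    // while any other stride defaults to no padding at all.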
+ template <
+ long _nr,
+ long _nc,
+ int _stride_y,
+ int _stride_x,
+ int _padding_y = _stride_y!=1? 0 : _nr/2,
+ int _padding_x = _stride_x!=1? 0 : _nc/2
+ >
+ class max_pool_
+ {
+ static_assert(_nr >= 0, "The number of rows in a filter must be >= 0");
+ static_assert(_nc >= 0, "The number of columns in a filter must be >= 0");
+ static_assert(_stride_y > 0, "The filter stride must be > 0");
+ static_assert(_stride_x > 0, "The filter stride must be > 0");
+        static_assert(0 <= _padding_y && ((_nr==0 && _padding_y == 0) || (_nr!=0 && _padding_y < _nr)),
+                      "The padding must be smaller than the filter size, unless the filter size is 0.");
+        static_assert(0 <= _padding_x && ((_nc==0 && _padding_x == 0) || (_nc!=0 && _padding_x < _nc)),
+                      "The padding must be smaller than the filter size, unless the filter size is 0.");
+ public:
+
+
+ max_pool_(
+ ) :
+ padding_y_(_padding_y),
+ padding_x_(_padding_x)
+ {}
+
+ long nr() const { return _nr; }
+ long nc() const { return _nc; }
+ long stride_y() const { return _stride_y; }
+ long stride_x() const { return _stride_x; }
+ long padding_y() const { return padding_y_; }
+ long padding_x() const { return padding_x_; }
+
+ inline dpoint map_input_to_output (
+ dpoint p
+ ) const
+ {
+ p.x() = (p.x()+padding_x()-nc()/2)/stride_x();
+ p.y() = (p.y()+padding_y()-nr()/2)/stride_y();
+ return p;
+ }
+
+ inline dpoint map_output_to_input (
+ dpoint p
+ ) const
+ {
+ p.x() = p.x()*stride_x() - padding_x() + nc()/2;
+ p.y() = p.y()*stride_y() - padding_y() + nr()/2;
+ return p;
+ }
+
+ max_pool_ (
+ const max_pool_& item
+ ) :
+ padding_y_(item.padding_y_),
+ padding_x_(item.padding_x_)
+ {
+ // this->mp is non-copyable so we have to write our own copy to avoid trying to
+ // copy it and getting an error.
+ }
+
+ max_pool_& operator= (
+ const max_pool_& item
+ )
+ {
+ if (this == &item)
+ return *this;
+
+ padding_y_ = item.padding_y_;
+ padding_x_ = item.padding_x_;
+
+ // this->mp is non-copyable so we have to write our own copy to avoid trying to
+ // copy it and getting an error.
+ return *this;
+ }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& /*sub*/)
+ {
+ }
+
+ template <typename SUBNET>
+ void forward(const SUBNET& sub, resizable_tensor& output)
+ {
+ mp.setup_max_pooling(_nr!=0?_nr:sub.get_output().nr(),
+ _nc!=0?_nc:sub.get_output().nc(),
+ _stride_y, _stride_x, padding_y_, padding_x_);
+
+ mp(output, sub.get_output());
+ }
+
+ template <typename SUBNET>
+ void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/)
+ {
+ mp.setup_max_pooling(_nr!=0?_nr:sub.get_output().nr(),
+ _nc!=0?_nc:sub.get_output().nc(),
+ _stride_y, _stride_x, padding_y_, padding_x_);
+
+ mp.get_gradient(gradient_input, computed_output, sub.get_output(), sub.get_gradient_input());
+ }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const max_pool_& item, std::ostream& out)
+ {
+ serialize("max_pool_2", out);
+ serialize(_nr, out);
+ serialize(_nc, out);
+ serialize(_stride_y, out);
+ serialize(_stride_x, out);
+ serialize(item.padding_y_, out);
+ serialize(item.padding_x_, out);
+ }
+
+ friend void deserialize(max_pool_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ long nr;
+ long nc;
+ int stride_y;
+ int stride_x;
+ if (version == "max_pool_2")
+ {
+ deserialize(nr, in);
+ deserialize(nc, in);
+ deserialize(stride_y, in);
+ deserialize(stride_x, in);
+ deserialize(item.padding_y_, in);
+ deserialize(item.padding_x_, in);
+ }
+ else
+ {
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::max_pool_.");
+ }
+
+ if (item.padding_y_ != _padding_y) throw serialization_error("Wrong padding_y found while deserializing dlib::max_pool_");
+ if (item.padding_x_ != _padding_x) throw serialization_error("Wrong padding_x found while deserializing dlib::max_pool_");
+ if (_nr != nr) throw serialization_error("Wrong nr found while deserializing dlib::max_pool_");
+ if (_nc != nc) throw serialization_error("Wrong nc found while deserializing dlib::max_pool_");
+ if (_stride_y != stride_y) throw serialization_error("Wrong stride_y found while deserializing dlib::max_pool_");
+ if (_stride_x != stride_x) throw serialization_error("Wrong stride_x found while deserializing dlib::max_pool_");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const max_pool_& item)
+ {
+ out << "max_pool ("
+ << "nr="<<_nr
+ << ", nc="<<_nc
+ << ", stride_y="<<_stride_y
+ << ", stride_x="<<_stride_x
+ << ", padding_y="<<item.padding_y_
+ << ", padding_x="<<item.padding_x_
+ << ")";
+ return out;
+ }
+
+ friend void to_xml(const max_pool_& item, std::ostream& out)
+ {
+ out << "<max_pool"
+ << " nr='"<<_nr<<"'"
+ << " nc='"<<_nc<<"'"
+ << " stride_y='"<<_stride_y<<"'"
+ << " stride_x='"<<_stride_x<<"'"
+ << " padding_y='"<<item.padding_y_<<"'"
+ << " padding_x='"<<item.padding_x_<<"'"
+ << "/>\n";
+ }
+
+
+ private:
+
+
+ tt::pooling mp;
+ resizable_tensor params;
+
+ int padding_y_;
+ int padding_x_;
+ };
+
+ template <
+ long nr,
+ long nc,
+ int stride_y,
+ int stride_x,
+ typename SUBNET
+ >
+ using max_pool = add_layer<max_pool_<nr,nc,stride_y,stride_x>, SUBNET>;
+
+ template <
+ typename SUBNET
+ >
+ using max_pool_everything = add_layer<max_pool_<0,0,1,1>, SUBNET>;
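+
+    // A filter size of 0 means "pool over the entire spatial extent of the input" (note
+    // the _nr!=0 / _nc!=0 checks in forward() and backward() above), so
+    // max_pool_everything performs global max pooling, e.g. as the final spatial
+    // reduction before a fully connected layer:
+    //     fc<10, max_pool_everything<relu<con<32,3,3,1,1, input<matrix<float>>>>>>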
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ long _nr,
+ long _nc,
+ int _stride_y,
+ int _stride_x,
+ int _padding_y = _stride_y!=1? 0 : _nr/2,
+ int _padding_x = _stride_x!=1? 0 : _nc/2
+ >
+ class avg_pool_
+ {
+ public:
+ static_assert(_nr >= 0, "The number of rows in a filter must be >= 0");
+ static_assert(_nc >= 0, "The number of columns in a filter must be >= 0");
+ static_assert(_stride_y > 0, "The filter stride must be > 0");
+ static_assert(_stride_x > 0, "The filter stride must be > 0");
+        static_assert(0 <= _padding_y && ((_nr==0 && _padding_y == 0) || (_nr!=0 && _padding_y < _nr)),
+                      "The padding must be smaller than the filter size, unless the filter size is 0.");
+        static_assert(0 <= _padding_x && ((_nc==0 && _padding_x == 0) || (_nc!=0 && _padding_x < _nc)),
+                      "The padding must be smaller than the filter size, unless the filter size is 0.");
+
+ avg_pool_(
+ ) :
+ padding_y_(_padding_y),
+ padding_x_(_padding_x)
+ {}
+
+ long nr() const { return _nr; }
+ long nc() const { return _nc; }
+ long stride_y() const { return _stride_y; }
+ long stride_x() const { return _stride_x; }
+ long padding_y() const { return padding_y_; }
+ long padding_x() const { return padding_x_; }
+
+ inline dpoint map_input_to_output (
+ dpoint p
+ ) const
+ {
+ p.x() = (p.x()+padding_x()-nc()/2)/stride_x();
+ p.y() = (p.y()+padding_y()-nr()/2)/stride_y();
+ return p;
+ }
+
+ inline dpoint map_output_to_input (
+ dpoint p
+ ) const
+ {
+ p.x() = p.x()*stride_x() - padding_x() + nc()/2;
+ p.y() = p.y()*stride_y() - padding_y() + nr()/2;
+ return p;
+ }
+
+ avg_pool_ (
+ const avg_pool_& item
+ ) :
+ padding_y_(item.padding_y_),
+ padding_x_(item.padding_x_)
+ {
+ // this->ap is non-copyable so we have to write our own copy to avoid trying to
+ // copy it and getting an error.
+ }
+
+ avg_pool_& operator= (
+ const avg_pool_& item
+ )
+ {
+ if (this == &item)
+ return *this;
+
+ padding_y_ = item.padding_y_;
+ padding_x_ = item.padding_x_;
+
+ // this->ap is non-copyable so we have to write our own copy to avoid trying to
+ // copy it and getting an error.
+ return *this;
+ }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& /*sub*/)
+ {
+ }
+
+ template <typename SUBNET>
+ void forward(const SUBNET& sub, resizable_tensor& output)
+ {
+ ap.setup_avg_pooling(_nr!=0?_nr:sub.get_output().nr(),
+ _nc!=0?_nc:sub.get_output().nc(),
+ _stride_y, _stride_x, padding_y_, padding_x_);
+
+ ap(output, sub.get_output());
+ }
+
+ template <typename SUBNET>
+ void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/)
+ {
+ ap.setup_avg_pooling(_nr!=0?_nr:sub.get_output().nr(),
+ _nc!=0?_nc:sub.get_output().nc(),
+ _stride_y, _stride_x, padding_y_, padding_x_);
+
+ ap.get_gradient(gradient_input, computed_output, sub.get_output(), sub.get_gradient_input());
+ }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const avg_pool_& item, std::ostream& out)
+ {
+ serialize("avg_pool_2", out);
+ serialize(_nr, out);
+ serialize(_nc, out);
+ serialize(_stride_y, out);
+ serialize(_stride_x, out);
+ serialize(item.padding_y_, out);
+ serialize(item.padding_x_, out);
+ }
+
+ friend void deserialize(avg_pool_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+
+ long nr;
+ long nc;
+ int stride_y;
+ int stride_x;
+ if (version == "avg_pool_2")
+ {
+ deserialize(nr, in);
+ deserialize(nc, in);
+ deserialize(stride_y, in);
+ deserialize(stride_x, in);
+ deserialize(item.padding_y_, in);
+ deserialize(item.padding_x_, in);
+ }
+ else
+ {
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::avg_pool_.");
+ }
+
+ if (item.padding_y_ != _padding_y) throw serialization_error("Wrong padding_y found while deserializing dlib::avg_pool_");
+ if (item.padding_x_ != _padding_x) throw serialization_error("Wrong padding_x found while deserializing dlib::avg_pool_");
+ if (_nr != nr) throw serialization_error("Wrong nr found while deserializing dlib::avg_pool_");
+ if (_nc != nc) throw serialization_error("Wrong nc found while deserializing dlib::avg_pool_");
+ if (_stride_y != stride_y) throw serialization_error("Wrong stride_y found while deserializing dlib::avg_pool_");
+ if (_stride_x != stride_x) throw serialization_error("Wrong stride_x found while deserializing dlib::avg_pool_");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const avg_pool_& item)
+ {
+ out << "avg_pool ("
+ << "nr="<<_nr
+ << ", nc="<<_nc
+ << ", stride_y="<<_stride_y
+ << ", stride_x="<<_stride_x
+ << ", padding_y="<<item.padding_y_
+ << ", padding_x="<<item.padding_x_
+ << ")";
+ return out;
+ }
+
+ friend void to_xml(const avg_pool_& item, std::ostream& out)
+ {
+ out << "<avg_pool"
+ << " nr='"<<_nr<<"'"
+ << " nc='"<<_nc<<"'"
+ << " stride_y='"<<_stride_y<<"'"
+ << " stride_x='"<<_stride_x<<"'"
+ << " padding_y='"<<item.padding_y_<<"'"
+ << " padding_x='"<<item.padding_x_<<"'"
+ << "/>\n";
+ }
+ private:
+
+ tt::pooling ap;
+ resizable_tensor params;
+
+ int padding_y_;
+ int padding_x_;
+ };
+
+ template <
+ long nr,
+ long nc,
+ int stride_y,
+ int stride_x,
+ typename SUBNET
+ >
+ using avg_pool = add_layer<avg_pool_<nr,nc,stride_y,stride_x>, SUBNET>;
+
+ template <
+ typename SUBNET
+ >
+ using avg_pool_everything = add_layer<avg_pool_<0,0,1,1>, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ enum layer_mode
+ {
+ CONV_MODE = 0,
+ FC_MODE = 1
+ };
+
+ const double DEFAULT_BATCH_NORM_EPS = 0.0001;
+
+ template <
+ layer_mode mode
+ >
+ class bn_
+ {
+ public:
+ explicit bn_(
+ unsigned long window_size,
+ double eps_ = DEFAULT_BATCH_NORM_EPS
+ ) :
+ num_updates(0),
+ running_stats_window_size(window_size),
+ learning_rate_multiplier(1),
+ weight_decay_multiplier(0),
+ bias_learning_rate_multiplier(1),
+ bias_weight_decay_multiplier(1),
+ eps(eps_)
+ {
+ DLIB_CASSERT(window_size > 0, "The batch normalization running stats window size can't be 0.");
+ }
+
+ bn_() : bn_(100) {}
+
+ layer_mode get_mode() const { return mode; }
+ unsigned long get_running_stats_window_size () const { return running_stats_window_size; }
+ void set_running_stats_window_size (unsigned long new_window_size )
+ {
+ DLIB_CASSERT(new_window_size > 0, "The batch normalization running stats window size can't be 0.");
+ running_stats_window_size = new_window_size;
+ }
+ double get_eps() const { return eps; }
+
+ double get_learning_rate_multiplier () const { return learning_rate_multiplier; }
+ double get_weight_decay_multiplier () const { return weight_decay_multiplier; }
+ void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
+ void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; }
+
+ double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; }
+ double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; }
+ void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; }
+ void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; }
+
+ inline dpoint map_input_to_output (const dpoint& p) const { return p; }
+ inline dpoint map_output_to_input (const dpoint& p) const { return p; }
+
+
+ template <typename SUBNET>
+ void setup (const SUBNET& sub)
+ {
+ if (mode == FC_MODE)
+ {
+ gamma = alias_tensor(1,
+ sub.get_output().k(),
+ sub.get_output().nr(),
+ sub.get_output().nc());
+ }
+ else
+ {
+ gamma = alias_tensor(1, sub.get_output().k());
+ }
+ beta = gamma;
+
+ params.set_size(gamma.size()+beta.size());
+
+ gamma(params,0) = 1;
+ beta(params,gamma.size()) = 0;
+
+ running_means.copy_size(gamma(params,0));
+ running_variances.copy_size(gamma(params,0));
+ running_means = 0;
+ running_variances = 1;
+ num_updates = 0;
+ }
+
+ template <typename SUBNET>
+ void forward(const SUBNET& sub, resizable_tensor& output)
+ {
+ auto g = gamma(params,0);
+ auto b = beta(params,gamma.size());
+ if (sub.get_output().num_samples() > 1)
+ {
+ const double decay = 1.0 - num_updates/(num_updates+1.0);
+ ++num_updates;
+ if (num_updates > running_stats_window_size)
+ num_updates = running_stats_window_size;
+
+ if (mode == FC_MODE)
+ tt::batch_normalize(eps, output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
+ else
+ tt::batch_normalize_conv(eps, output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
+ }
+ else // we are running in testing mode so we just linearly scale the input tensor.
+ {
+ if (mode == FC_MODE)
+ tt::batch_normalize_inference(eps, output, sub.get_output(), g, b, running_means, running_variances);
+ else
+ tt::batch_normalize_conv_inference(eps, output, sub.get_output(), g, b, running_means, running_variances);
+ }
+ }
+
+ template <typename SUBNET>
+ void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
+ {
+ auto g = gamma(params,0);
+ auto g_grad = gamma(params_grad, 0);
+ auto b_grad = beta(params_grad, gamma.size());
+ if (mode == FC_MODE)
+ tt::batch_normalize_gradient(eps, gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
+ else
+ tt::batch_normalize_conv_gradient(eps, gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
+ }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const bn_& item, std::ostream& out)
+ {
+ if (mode == CONV_MODE)
+ serialize("bn_con2", out);
+ else // if FC_MODE
+ serialize("bn_fc2", out);
+ serialize(item.params, out);
+ serialize(item.gamma, out);
+ serialize(item.beta, out);
+ serialize(item.means, out);
+ serialize(item.invstds, out);
+ serialize(item.running_means, out);
+ serialize(item.running_variances, out);
+ serialize(item.num_updates, out);
+ serialize(item.running_stats_window_size, out);
+ serialize(item.learning_rate_multiplier, out);
+ serialize(item.weight_decay_multiplier, out);
+ serialize(item.bias_learning_rate_multiplier, out);
+ serialize(item.bias_weight_decay_multiplier, out);
+ serialize(item.eps, out);
+ }
+
+ friend void deserialize(bn_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (mode == CONV_MODE)
+ {
+ if (version != "bn_con2")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::bn_.");
+ }
+ else // must be in FC_MODE
+ {
+ if (version != "bn_fc2")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::bn_.");
+ }
+
+ deserialize(item.params, in);
+ deserialize(item.gamma, in);
+ deserialize(item.beta, in);
+ deserialize(item.means, in);
+ deserialize(item.invstds, in);
+ deserialize(item.running_means, in);
+ deserialize(item.running_variances, in);
+ deserialize(item.num_updates, in);
+ deserialize(item.running_stats_window_size, in);
+ deserialize(item.learning_rate_multiplier, in);
+ deserialize(item.weight_decay_multiplier, in);
+ deserialize(item.bias_learning_rate_multiplier, in);
+ deserialize(item.bias_weight_decay_multiplier, in);
+ deserialize(item.eps, in);
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const bn_& item)
+ {
+ if (mode == CONV_MODE)
+ out << "bn_con ";
+ else
+ out << "bn_fc ";
+ out << " eps="<<item.eps;
+ out << " running_stats_window_size="<<item.running_stats_window_size;
+ out << " learning_rate_mult="<<item.learning_rate_multiplier;
+ out << " weight_decay_mult="<<item.weight_decay_multiplier;
+ out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier;
+ out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier;
+ return out;
+ }
+
+ friend void to_xml(const bn_& item, std::ostream& out)
+ {
+ if (mode==CONV_MODE)
+ out << "<bn_con";
+ else
+ out << "<bn_fc";
+
+ out << " eps='"<<item.eps<<"'";
+ out << " running_stats_window_size='"<<item.running_stats_window_size<<"'";
+ out << " learning_rate_mult='"<<item.learning_rate_multiplier<<"'";
+ out << " weight_decay_mult='"<<item.weight_decay_multiplier<<"'";
+ out << " bias_learning_rate_mult='"<<item.bias_learning_rate_multiplier<<"'";
+ out << " bias_weight_decay_mult='"<<item.bias_weight_decay_multiplier<<"'";
+ out << ">\n";
+
+ out << mat(item.params);
+
+ if (mode==CONV_MODE)
+ out << "</bn_con>\n";
+ else
+ out << "</bn_fc>\n";
+ }
+
+ private:
+
+ friend class affine_;
+
+ resizable_tensor params;
+ alias_tensor gamma, beta;
+ resizable_tensor means, running_means;
+ resizable_tensor invstds, running_variances;
+ unsigned long num_updates;
+ unsigned long running_stats_window_size;
+ double learning_rate_multiplier;
+ double weight_decay_multiplier;
+ double bias_learning_rate_multiplier;
+ double bias_weight_decay_multiplier;
+ double eps;
+ };
+
+ template <typename SUBNET>
+ using bn_con = add_layer<bn_<CONV_MODE>, SUBNET>;
+ template <typename SUBNET>
+ using bn_fc = add_layer<bn_<FC_MODE>, SUBNET>;
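+
+    // Behavioural note: as forward() above shows, bn_ normalizes with the statistics of
+    // the current batch (and updates its running means/variances) whenever it sees more
+    // than one sample, and falls back to the stored running statistics when given a
+    // single sample, i.e. at inference time.  bn_con normalizes per channel and bn_fc
+    // per element, matching the CONV_MODE / FC_MODE gamma shapes chosen in setup().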
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ class visitor_bn_running_stats_window_size
+ {
+ public:
+
+ visitor_bn_running_stats_window_size(unsigned long new_window_size_) : new_window_size(new_window_size_) {}
+
+ template <typename T>
+ void set_window_size(T&) const
+ {
+ // ignore other layer detail types
+ }
+
+ template < layer_mode mode >
+ void set_window_size(bn_<mode>& l) const
+ {
+ l.set_running_stats_window_size(new_window_size);
+ }
+
+ template<typename input_layer_type>
+ void operator()(size_t , input_layer_type& ) const
+ {
+ // ignore other layers
+ }
+
+ template <typename T, typename U, typename E>
+ void operator()(size_t , add_layer<T,U,E>& l) const
+ {
+ set_window_size(l.layer_details());
+ }
+
+ private:
+
+ unsigned long new_window_size;
+ };
+ }
+
+ template <typename net_type>
+ void set_all_bn_running_stats_window_sizes (
+ net_type& net,
+ unsigned long new_window_size
+ )
+ {
+ visit_layers(net, impl::visitor_bn_running_stats_window_size(new_window_size));
+ }
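+
+    // Usage sketch, assuming net_type names some previously defined network:
+    //     net_type net;
+    //     set_all_bn_running_stats_window_sizes(net, 1000);
+    // visit_layers() walks every layer, and the visitor above only touches bn_ layers.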
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ enum fc_bias_mode
+ {
+ FC_HAS_BIAS = 0,
+ FC_NO_BIAS = 1
+ };
+
+ struct num_fc_outputs
+ {
+ num_fc_outputs(unsigned long n) : num_outputs(n) {}
+ unsigned long num_outputs;
+ };
+
+ template <
+ unsigned long num_outputs_,
+ fc_bias_mode bias_mode
+ >
+ class fc_
+ {
+ static_assert(num_outputs_ > 0, "The number of outputs from a fc_ layer must be > 0");
+
+ public:
+ fc_(num_fc_outputs o) : num_outputs(o.num_outputs), num_inputs(0),
+ learning_rate_multiplier(1),
+ weight_decay_multiplier(1),
+ bias_learning_rate_multiplier(1),
+ bias_weight_decay_multiplier(0)
+ {}
+
+ fc_() : fc_(num_fc_outputs(num_outputs_)) {}
+
+ double get_learning_rate_multiplier () const { return learning_rate_multiplier; }
+ double get_weight_decay_multiplier () const { return weight_decay_multiplier; }
+ void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
+ void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; }
+
+ double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; }
+ double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; }
+ void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; }
+ void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; }
+
+ unsigned long get_num_outputs (
+ ) const { return num_outputs; }
+
+ void set_num_outputs(long num)
+ {
+ DLIB_CASSERT(num > 0);
+ if (num != (long)num_outputs)
+ {
+                DLIB_CASSERT(get_layer_params().size() == 0,
+                    "You can't change the number of outputs in fc_ if the parameter tensor has already been allocated.");
+ num_outputs = num;
+ }
+ }
+
+ fc_bias_mode get_bias_mode (
+ ) const { return bias_mode; }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& sub)
+ {
+ num_inputs = sub.get_output().nr()*sub.get_output().nc()*sub.get_output().k();
+ if (bias_mode == FC_HAS_BIAS)
+ params.set_size(num_inputs+1, num_outputs);
+ else
+ params.set_size(num_inputs, num_outputs);
+
+ dlib::rand rnd(std::rand());
+ randomize_parameters(params, num_inputs+num_outputs, rnd);
+
+ weights = alias_tensor(num_inputs, num_outputs);
+
+ if (bias_mode == FC_HAS_BIAS)
+ {
+ biases = alias_tensor(1,num_outputs);
+ // set the initial bias values to zero
+ biases(params,weights.size()) = 0;
+ }
+ }
+
+ template <typename SUBNET>
+ void forward(const SUBNET& sub, resizable_tensor& output)
+ {
+ DLIB_CASSERT((long)num_inputs == sub.get_output().nr()*sub.get_output().nc()*sub.get_output().k(),
+ "The size of the input tensor to this fc layer doesn't match the size the fc layer was trained with.");
+ output.set_size(sub.get_output().num_samples(), num_outputs);
+
+ auto w = weights(params, 0);
+ tt::gemm(0,output, 1,sub.get_output(),false, w,false);
+ if (bias_mode == FC_HAS_BIAS)
+ {
+ auto b = biases(params, weights.size());
+ tt::add(1,output,1,b);
+ }
+ }
+
+ template <typename SUBNET>
+ void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
+ {
+ // no point computing the parameter gradients if they won't be used.
+ if (learning_rate_multiplier != 0)
+ {
+ // compute the gradient of the weight parameters.
+ auto pw = weights(params_grad, 0);
+ tt::gemm(0,pw, 1,sub.get_output(),true, gradient_input,false);
+
+ if (bias_mode == FC_HAS_BIAS)
+ {
+ // compute the gradient of the bias parameters.
+ auto pb = biases(params_grad, weights.size());
+ tt::assign_bias_gradient(pb, gradient_input);
+ }
+ }
+
+ // compute the gradient for the data
+ auto w = weights(params, 0);
+ tt::gemm(1,sub.get_gradient_input(), 1,gradient_input,false, w,true);
+ }
+
+ alias_tensor_instance get_weights()
+ {
+ return weights(params, 0);
+ }
+
+ alias_tensor_const_instance get_weights() const
+ {
+ return weights(params, 0);
+ }
+
+ alias_tensor_instance get_biases()
+ {
+ static_assert(bias_mode == FC_HAS_BIAS, "This fc_ layer doesn't have a bias vector "
+ "to be retrieved, as per template parameter 'bias_mode'.");
+ return biases(params, weights.size());
+ }
+
+ alias_tensor_const_instance get_biases() const
+ {
+ static_assert(bias_mode == FC_HAS_BIAS, "This fc_ layer doesn't have a bias vector "
+ "to be retrieved, as per template parameter 'bias_mode'.");
+ return biases(params, weights.size());
+ }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const fc_& item, std::ostream& out)
+ {
+ serialize("fc_2", out);
+ serialize(item.num_outputs, out);
+ serialize(item.num_inputs, out);
+ serialize(item.params, out);
+ serialize(item.weights, out);
+ serialize(item.biases, out);
+ serialize((int)bias_mode, out);
+ serialize(item.learning_rate_multiplier, out);
+ serialize(item.weight_decay_multiplier, out);
+ serialize(item.bias_learning_rate_multiplier, out);
+ serialize(item.bias_weight_decay_multiplier, out);
+ }
+
+ friend void deserialize(fc_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "fc_2")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::fc_.");
+
+ deserialize(item.num_outputs, in);
+ deserialize(item.num_inputs, in);
+ deserialize(item.params, in);
+ deserialize(item.weights, in);
+ deserialize(item.biases, in);
+ int bmode = 0;
+ deserialize(bmode, in);
+ if (bias_mode != (fc_bias_mode)bmode) throw serialization_error("Wrong fc_bias_mode found while deserializing dlib::fc_");
+ deserialize(item.learning_rate_multiplier, in);
+ deserialize(item.weight_decay_multiplier, in);
+ deserialize(item.bias_learning_rate_multiplier, in);
+ deserialize(item.bias_weight_decay_multiplier, in);
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const fc_& item)
+ {
+ if (bias_mode == FC_HAS_BIAS)
+ {
+ out << "fc\t ("
+ << "num_outputs="<<item.num_outputs
+ << ")";
+ out << " learning_rate_mult="<<item.learning_rate_multiplier;
+ out << " weight_decay_mult="<<item.weight_decay_multiplier;
+ out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier;
+ out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier;
+ }
+ else
+ {
+ out << "fc_no_bias ("
+ << "num_outputs="<<item.num_outputs
+ << ")";
+ out << " learning_rate_mult="<<item.learning_rate_multiplier;
+ out << " weight_decay_mult="<<item.weight_decay_multiplier;
+ }
+ return out;
+ }
+
+ friend void to_xml(const fc_& item, std::ostream& out)
+ {
+ if (bias_mode==FC_HAS_BIAS)
+ {
+ out << "<fc"
+ << " num_outputs='"<<item.num_outputs<<"'"
+ << " learning_rate_mult='"<<item.learning_rate_multiplier<<"'"
+ << " weight_decay_mult='"<<item.weight_decay_multiplier<<"'"
+ << " bias_learning_rate_mult='"<<item.bias_learning_rate_multiplier<<"'"
+ << " bias_weight_decay_mult='"<<item.bias_weight_decay_multiplier<<"'";
+ out << ">\n";
+ out << mat(item.params);
+ out << "</fc>\n";
+ }
+ else
+ {
+ out << "<fc_no_bias"
+ << " num_outputs='"<<item.num_outputs<<"'"
+ << " learning_rate_mult='"<<item.learning_rate_multiplier<<"'"
+ << " weight_decay_mult='"<<item.weight_decay_multiplier<<"'";
+ out << ">\n";
+ out << mat(item.params);
+ out << "</fc_no_bias>\n";
+ }
+ }
+
+ private:
+
+ unsigned long num_outputs;
+ unsigned long num_inputs;
+ resizable_tensor params;
+ alias_tensor weights, biases;
+ double learning_rate_multiplier;
+ double weight_decay_multiplier;
+ double bias_learning_rate_multiplier;
+ double bias_weight_decay_multiplier;
+ };
+
+ template <
+ unsigned long num_outputs,
+ typename SUBNET
+ >
+ using fc = add_layer<fc_<num_outputs,FC_HAS_BIAS>, SUBNET>;
+
+ template <
+ unsigned long num_outputs,
+ typename SUBNET
+ >
+ using fc_no_bias = add_layer<fc_<num_outputs,FC_NO_BIAS>, SUBNET>;
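+
+    // Usage sketch: fc_ flattens whatever it receives (setup() above sets num_inputs to
+    // nr*nc*k of the subnetwork output), so a small classifier head can be written as,
+    // for example,
+    //     using toy_head = fc<10, relu<fc<128, input<matrix<float>>>>>;
+    // with an arbitrary feature extractor substituted for the input layer in practice.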
+
+// ----------------------------------------------------------------------------------------
+
+ class dropout_
+ {
+ public:
+ explicit dropout_(
+ float drop_rate_ = 0.5
+ ) :
+ drop_rate(drop_rate_),
+ rnd(std::rand())
+ {
+ DLIB_CASSERT(0 <= drop_rate && drop_rate <= 1);
+ }
+
+ // We have to add a copy constructor and assignment operator because the rnd object
+ // is non-copyable.
+ dropout_(
+ const dropout_& item
+ ) : drop_rate(item.drop_rate), mask(item.mask), rnd(std::rand())
+ {}
+
+ dropout_& operator= (
+ const dropout_& item
+ )
+ {
+ if (this == &item)
+ return *this;
+
+ drop_rate = item.drop_rate;
+ mask = item.mask;
+ return *this;
+ }
+
+ float get_drop_rate (
+ ) const { return drop_rate; }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& /*sub*/)
+ {
+ }
+
+ void forward_inplace(const tensor& input, tensor& output)
+ {
+ // create a random mask and use it to filter the data
+ mask.copy_size(input);
+ rnd.fill_uniform(mask);
+ tt::threshold(mask, drop_rate);
+ tt::multiply(false, output, input, mask);
+ }
+
+ void backward_inplace(
+ const tensor& gradient_input,
+ tensor& data_grad,
+ tensor& /*params_grad*/
+ )
+ {
+ if (is_same_object(gradient_input, data_grad))
+ tt::multiply(false, data_grad, mask, gradient_input);
+ else
+ tt::multiply(true, data_grad, mask, gradient_input);
+ }
+
+ inline dpoint map_input_to_output (const dpoint& p) const { return p; }
+ inline dpoint map_output_to_input (const dpoint& p) const { return p; }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const dropout_& item, std::ostream& out)
+ {
+ serialize("dropout_", out);
+ serialize(item.drop_rate, out);
+ serialize(item.mask, out);
+ }
+
+ friend void deserialize(dropout_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "dropout_")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::dropout_.");
+ deserialize(item.drop_rate, in);
+ deserialize(item.mask, in);
+ }
+
+ void clean(
+ )
+ {
+ mask.clear();
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const dropout_& item)
+ {
+ out << "dropout\t ("
+ << "drop_rate="<<item.drop_rate
+ << ")";
+ return out;
+ }
+
+ friend void to_xml(const dropout_& item, std::ostream& out)
+ {
+ out << "<dropout"
+ << " drop_rate='"<<item.drop_rate<<"'";
+ out << "/>\n";
+ }
+
+ private:
+ float drop_rate;
+ resizable_tensor mask;
+
+ tt::tensor_rand rnd;
+ resizable_tensor params; // unused
+ };
+
+
+ template <typename SUBNET>
+ using dropout = add_layer<dropout_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class multiply_
+ {
+ public:
+ explicit multiply_(
+ float val_ = 0.5
+ ) :
+ val(val_)
+ {
+ }
+
+ multiply_ (
+ const dropout_& item
+ ) : val(1-item.get_drop_rate()) {}
+
+ float get_multiply_value (
+ ) const { return val; }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& /*sub*/)
+ {
+ }
+
+ void forward_inplace(const tensor& input, tensor& output)
+ {
+ tt::affine_transform(output, input, val);
+ }
+
+ inline dpoint map_input_to_output (const dpoint& p) const { return p; }
+ inline dpoint map_output_to_input (const dpoint& p) const { return p; }
+
+ void backward_inplace(
+ const tensor& gradient_input,
+ tensor& data_grad,
+ tensor& /*params_grad*/
+ )
+ {
+ if (is_same_object(gradient_input, data_grad))
+ tt::affine_transform(data_grad, gradient_input, val);
+ else
+ tt::affine_transform(data_grad, data_grad, gradient_input, 1, val);
+ }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const multiply_& item, std::ostream& out)
+ {
+ serialize("multiply_", out);
+ serialize(item.val, out);
+ }
+
+ friend void deserialize(multiply_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version == "dropout_")
+ {
+ // Since we can build a multiply_ from a dropout_ we check if that's what
+ // is in the stream and if so then just convert it right here.
+ unserialize sin(version, in);
+ dropout_ temp;
+ deserialize(temp, sin);
+ item = temp;
+ return;
+ }
+
+ if (version != "multiply_")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::multiply_.");
+ deserialize(item.val, in);
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const multiply_& item)
+ {
+ out << "multiply ("
+ << "val="<<item.val
+ << ")";
+ return out;
+ }
+
+ friend void to_xml(const multiply_& item, std::ostream& out)
+ {
+ out << "<multiply"
+ << " val='"<<item.val<<"'";
+ out << "/>\n";
+ }
+ private:
+ float val;
+ resizable_tensor params; // unused
+ };
+
+ template <typename SUBNET>
+ using multiply = add_layer<multiply_, SUBNET>;
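+
+    // Note on the dropout_ -> multiply_ conversion handled in deserialize() above: a
+    // network trained with dropout layers can be reloaded with multiply layers in their
+    // place, and each one then simply scales its input by (1 - drop_rate), the usual
+    // inference-time replacement for dropout.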
+
+// ----------------------------------------------------------------------------------------
+
+ class affine_
+ {
+ public:
+ affine_(
+ ) : mode(FC_MODE)
+ {
+ }
+
+ affine_(
+ layer_mode mode_
+ ) : mode(mode_)
+ {
+ }
+
+ template <
+ layer_mode bnmode
+ >
+ affine_(
+ const bn_<bnmode>& item
+ )
+ {
+ gamma = item.gamma;
+ beta = item.beta;
+ mode = bnmode;
+
+ params.copy_size(item.params);
+
+ auto g = gamma(params,0);
+ auto b = beta(params,gamma.size());
+
+ resizable_tensor temp(item.params);
+ auto sg = gamma(temp,0);
+ auto sb = beta(temp,gamma.size());
+
+ g = pointwise_multiply(mat(sg), 1.0f/sqrt(mat(item.running_variances)+item.get_eps()));
+ b = mat(sb) - pointwise_multiply(mat(g), mat(item.running_means));
+ }
+
+ layer_mode get_mode() const { return mode; }
+
+ inline dpoint map_input_to_output (const dpoint& p) const { return p; }
+ inline dpoint map_output_to_input (const dpoint& p) const { return p; }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& sub)
+ {
+ if (mode == FC_MODE)
+ {
+ gamma = alias_tensor(1,
+ sub.get_output().k(),
+ sub.get_output().nr(),
+ sub.get_output().nc());
+ }
+ else
+ {
+ gamma = alias_tensor(1, sub.get_output().k());
+ }
+ beta = gamma;
+
+ params.set_size(gamma.size()+beta.size());
+
+ gamma(params,0) = 1;
+ beta(params,gamma.size()) = 0;
+ }
+
+ void forward_inplace(const tensor& input, tensor& output)
+ {
+ auto g = gamma(params,0);
+ auto b = beta(params,gamma.size());
+ if (mode == FC_MODE)
+ tt::affine_transform(output, input, g, b);
+ else
+ tt::affine_transform_conv(output, input, g, b);
+ }
+
+ void backward_inplace(
+ const tensor& gradient_input,
+ tensor& data_grad,
+ tensor& /*params_grad*/
+ )
+ {
+ auto g = gamma(params,0);
+ auto b = beta(params,gamma.size());
+
+ // We are computing the gradient of dot(gradient_input, computed_output*g + b)
+ if (mode == FC_MODE)
+ {
+ if (is_same_object(gradient_input, data_grad))
+ tt::multiply(false, data_grad, gradient_input, g);
+ else
+ tt::multiply(true, data_grad, gradient_input, g);
+ }
+ else
+ {
+ if (is_same_object(gradient_input, data_grad))
+ tt::multiply_conv(false, data_grad, gradient_input, g);
+ else
+ tt::multiply_conv(true, data_grad, gradient_input, g);
+ }
+ }
+
+ const tensor& get_layer_params() const { return empty_params; }
+ tensor& get_layer_params() { return empty_params; }
+
+ friend void serialize(const affine_& item, std::ostream& out)
+ {
+ serialize("affine_", out);
+ serialize(item.params, out);
+ serialize(item.gamma, out);
+ serialize(item.beta, out);
+ serialize((int)item.mode, out);
+ }
+
+ friend void deserialize(affine_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version == "bn_con2")
+ {
+ // Since we can build an affine_ from a bn_ we check if that's what is in
+ // the stream and if so then just convert it right here.
+ unserialize sin(version, in);
+ bn_<CONV_MODE> temp;
+ deserialize(temp, sin);
+ item = temp;
+ return;
+ }
+ else if (version == "bn_fc2")
+ {
+ // Since we can build an affine_ from a bn_ we check if that's what is in
+ // the stream and if so then just convert it right here.
+ unserialize sin(version, in);
+ bn_<FC_MODE> temp;
+ deserialize(temp, sin);
+ item = temp;
+ return;
+ }
+
+ if (version != "affine_")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::affine_.");
+ deserialize(item.params, in);
+ deserialize(item.gamma, in);
+ deserialize(item.beta, in);
+ int mode;
+ deserialize(mode, in);
+ item.mode = (layer_mode)mode;
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const affine_& )
+ {
+ out << "affine";
+ return out;
+ }
+
+ friend void to_xml(const affine_& item, std::ostream& out)
+ {
+ if (item.mode==CONV_MODE)
+ out << "<affine_con>\n";
+ else
+ out << "<affine_fc>\n";
+
+ out << mat(item.params);
+
+ if (item.mode==CONV_MODE)
+ out << "</affine_con>\n";
+ else
+ out << "</affine_fc>\n";
+ }
+
+ private:
+ resizable_tensor params, empty_params;
+ alias_tensor gamma, beta;
+ layer_mode mode;
+ };
+
+ template <typename SUBNET>
+ using affine = add_layer<affine_, SUBNET>;
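+
+    // Note: affine_ is the inference-time stand-in for bn_.  The converting constructor
+    // above folds a trained bn_'s running statistics into fixed gamma/beta values,
+    //     g = gamma/sqrt(running_variances + eps),   b = beta - g*running_means,
+    // and get_layer_params() deliberately returns an empty tensor so a solver will never
+    // update these values.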
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ template<typename> class tag
+ >
+ class add_prev_
+ {
+ public:
+ const static unsigned long id = tag_id<tag>::id;
+
+ add_prev_()
+ {
+ }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& /*sub*/)
+ {
+ }
+
+ template <typename SUBNET>
+ void forward(const SUBNET& sub, resizable_tensor& output)
+ {
+ auto&& t1 = sub.get_output();
+ auto&& t2 = layer<tag>(sub).get_output();
+ output.set_size(std::max(t1.num_samples(),t2.num_samples()),
+ std::max(t1.k(),t2.k()),
+ std::max(t1.nr(),t2.nr()),
+ std::max(t1.nc(),t2.nc()));
+ tt::add(output, t1, t2);
+ }
+
+ template <typename SUBNET>
+ void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/)
+ {
+ // The gradient just flows backwards to the two layers that forward() added
+ // together.
+ tt::add(sub.get_gradient_input(), sub.get_gradient_input(), gradient_input);
+ tt::add(layer<tag>(sub).get_gradient_input(), layer<tag>(sub).get_gradient_input(), gradient_input);
+ }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ inline dpoint map_input_to_output (const dpoint& p) const { return p; }
+ inline dpoint map_output_to_input (const dpoint& p) const { return p; }
+
+ friend void serialize(const add_prev_& , std::ostream& out)
+ {
+ serialize("add_prev_", out);
+ }
+
+ friend void deserialize(add_prev_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "add_prev_")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::add_prev_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const add_prev_& item)
+ {
+ out << "add_prev"<<id;
+ return out;
+ }
+
+ friend void to_xml(const add_prev_& item, std::ostream& out)
+ {
+ out << "<add_prev tag='"<<id<<"'/>\n";
+ }
+
+ private:
+ resizable_tensor params;
+ };
+
+ template <
+ template<typename> class tag,
+ typename SUBNET
+ >
+ using add_prev = add_layer<add_prev_<tag>, SUBNET>;
+
+ template <typename SUBNET> using add_prev1 = add_prev<tag1, SUBNET>;
+ template <typename SUBNET> using add_prev2 = add_prev<tag2, SUBNET>;
+ template <typename SUBNET> using add_prev3 = add_prev<tag3, SUBNET>;
+ template <typename SUBNET> using add_prev4 = add_prev<tag4, SUBNET>;
+ template <typename SUBNET> using add_prev5 = add_prev<tag5, SUBNET>;
+ template <typename SUBNET> using add_prev6 = add_prev<tag6, SUBNET>;
+ template <typename SUBNET> using add_prev7 = add_prev<tag7, SUBNET>;
+ template <typename SUBNET> using add_prev8 = add_prev<tag8, SUBNET>;
+ template <typename SUBNET> using add_prev9 = add_prev<tag9, SUBNET>;
+ template <typename SUBNET> using add_prev10 = add_prev<tag10, SUBNET>;
+
+ using add_prev1_ = add_prev_<tag1>;
+ using add_prev2_ = add_prev_<tag2>;
+ using add_prev3_ = add_prev_<tag3>;
+ using add_prev4_ = add_prev_<tag4>;
+ using add_prev5_ = add_prev_<tag5>;
+ using add_prev6_ = add_prev_<tag6>;
+ using add_prev7_ = add_prev_<tag7>;
+ using add_prev8_ = add_prev_<tag8>;
+ using add_prev9_ = add_prev_<tag9>;
+ using add_prev10_ = add_prev_<tag10>;
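+
+    // Usage sketch: add_prev is the building block for residual connections.  A typical
+    // residual-style unit, in the spirit of the dlib ResNet examples, looks roughly like
+    //     template <typename SUBNET> using res_unit =
+    //         relu<add_prev1<bn_con<con<8,3,3,1,1,relu<bn_con<con<8,3,3,1,1,tag1<SUBNET>>>>>>>>;
+    // where tag1 marks the input that add_prev1 later adds back onto the convolutional
+    // branch.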
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ template<typename> class tag
+ >
+ class mult_prev_
+ {
+ public:
+ const static unsigned long id = tag_id<tag>::id;
+
+ mult_prev_()
+ {
+ }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& /*sub*/)
+ {
+ }
+
+ template <typename SUBNET>
+ void forward(const SUBNET& sub, resizable_tensor& output)
+ {
+ auto&& t1 = sub.get_output();
+ auto&& t2 = layer<tag>(sub).get_output();
+ output.set_size(std::max(t1.num_samples(),t2.num_samples()),
+ std::max(t1.k(),t2.k()),
+ std::max(t1.nr(),t2.nr()),
+ std::max(t1.nc(),t2.nc()));
+ tt::multiply_zero_padded(false, output, t1, t2);
+ }
+
+ template <typename SUBNET>
+ void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/)
+ {
+ auto&& t1 = sub.get_output();
+ auto&& t2 = layer<tag>(sub).get_output();
+ // The gradient just flows backwards to the two layers that forward()
+ // multiplied together.
+ tt::multiply_zero_padded(true, sub.get_gradient_input(), t2, gradient_input);
+ tt::multiply_zero_padded(true, layer<tag>(sub).get_gradient_input(), t1, gradient_input);
+ }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const mult_prev_& , std::ostream& out)
+ {
+ serialize("mult_prev_", out);
+ }
+
+ friend void deserialize(mult_prev_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "mult_prev_")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::mult_prev_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const mult_prev_& item)
+ {
+ out << "mult_prev"<<id;
+ return out;
+ }
+
+ friend void to_xml(const mult_prev_& item, std::ostream& out)
+ {
+ out << "<mult_prev tag='"<<id<<"'/>\n";
+ }
+
+ private:
+ resizable_tensor params;
+ };
+
+ template <
+ template<typename> class tag,
+ typename SUBNET
+ >
+ using mult_prev = add_layer<mult_prev_<tag>, SUBNET>;
+
+ template <typename SUBNET> using mult_prev1 = mult_prev<tag1, SUBNET>;
+ template <typename SUBNET> using mult_prev2 = mult_prev<tag2, SUBNET>;
+ template <typename SUBNET> using mult_prev3 = mult_prev<tag3, SUBNET>;
+ template <typename SUBNET> using mult_prev4 = mult_prev<tag4, SUBNET>;
+ template <typename SUBNET> using mult_prev5 = mult_prev<tag5, SUBNET>;
+ template <typename SUBNET> using mult_prev6 = mult_prev<tag6, SUBNET>;
+ template <typename SUBNET> using mult_prev7 = mult_prev<tag7, SUBNET>;
+ template <typename SUBNET> using mult_prev8 = mult_prev<tag8, SUBNET>;
+ template <typename SUBNET> using mult_prev9 = mult_prev<tag9, SUBNET>;
+ template <typename SUBNET> using mult_prev10 = mult_prev<tag10, SUBNET>;
+
+ using mult_prev1_ = mult_prev_<tag1>;
+ using mult_prev2_ = mult_prev_<tag2>;
+ using mult_prev3_ = mult_prev_<tag3>;
+ using mult_prev4_ = mult_prev_<tag4>;
+ using mult_prev5_ = mult_prev_<tag5>;
+ using mult_prev6_ = mult_prev_<tag6>;
+ using mult_prev7_ = mult_prev_<tag7>;
+ using mult_prev8_ = mult_prev_<tag8>;
+ using mult_prev9_ = mult_prev_<tag9>;
+ using mult_prev10_ = mult_prev_<tag10>;
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ template<typename> class tag
+ >
+ class scale_
+ {
+ public:
+ const static unsigned long id = tag_id<tag>::id;
+
+ scale_()
+ {
+ }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& /*sub*/)
+ {
+ }
+
+ template <typename SUBNET>
+ void forward(const SUBNET& sub, resizable_tensor& output)
+ {
+ auto&& scales = sub.get_output();
+ auto&& src = layer<tag>(sub).get_output();
+ DLIB_CASSERT(scales.num_samples() == src.num_samples() &&
+ scales.k() == src.k() &&
+ scales.nr() == 1 &&
+ scales.nc() == 1,
+ "scales.k(): " << scales.k() <<
+ "\nsrc.k(): " << src.k()
+ );
+
+ output.copy_size(src);
+ tt::scale_channels(false, output, src, scales);
+ }
+
+ template <typename SUBNET>
+ void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/)
+ {
+ auto&& scales = sub.get_output();
+ auto&& src = layer<tag>(sub).get_output();
+ // The gradient just flows backwards to the two layers that forward()
+ // read from.
+ tt::scale_channels(true, layer<tag>(sub).get_gradient_input(), gradient_input, scales);
+
+ if (reshape_src.num_samples() != src.num_samples())
+ {
+ reshape_scales = alias_tensor(src.num_samples()*src.k());
+ reshape_src = alias_tensor(src.num_samples()*src.k(),src.nr()*src.nc());
+ }
+
+ auto&& scales_grad = sub.get_gradient_input();
+ auto sgrad = reshape_scales(scales_grad);
+ tt::dot_prods(true, sgrad, reshape_src(src), reshape_src(gradient_input));
+ }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const scale_& item, std::ostream& out)
+ {
+ serialize("scale_", out);
+ serialize(item.reshape_scales, out);
+ serialize(item.reshape_src, out);
+ }
+
+ friend void deserialize(scale_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "scale_")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::scale_.");
+ deserialize(item.reshape_scales, in);
+ deserialize(item.reshape_src, in);
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const scale_& item)
+ {
+ out << "scale"<<id;
+ return out;
+ }
+
+ friend void to_xml(const scale_& item, std::ostream& out)
+ {
+ out << "<scale tag='"<<id<<"'/>\n";
+ }
+
+ private:
+ alias_tensor reshape_scales;
+ alias_tensor reshape_src;
+ resizable_tensor params;
+ };
+
+ template <
+ template<typename> class tag,
+ typename SUBNET
+ >
+ using scale = add_layer<scale_<tag>, SUBNET>;
+
+ template <typename SUBNET> using scale1 = scale<tag1, SUBNET>;
+ template <typename SUBNET> using scale2 = scale<tag2, SUBNET>;
+ template <typename SUBNET> using scale3 = scale<tag3, SUBNET>;
+ template <typename SUBNET> using scale4 = scale<tag4, SUBNET>;
+ template <typename SUBNET> using scale5 = scale<tag5, SUBNET>;
+ template <typename SUBNET> using scale6 = scale<tag6, SUBNET>;
+ template <typename SUBNET> using scale7 = scale<tag7, SUBNET>;
+ template <typename SUBNET> using scale8 = scale<tag8, SUBNET>;
+ template <typename SUBNET> using scale9 = scale<tag9, SUBNET>;
+ template <typename SUBNET> using scale10 = scale<tag10, SUBNET>;
+
+ using scale1_ = scale_<tag1>;
+ using scale2_ = scale_<tag2>;
+ using scale3_ = scale_<tag3>;
+ using scale4_ = scale_<tag4>;
+ using scale5_ = scale_<tag5>;
+ using scale6_ = scale_<tag6>;
+ using scale7_ = scale_<tag7>;
+ using scale8_ = scale_<tag8>;
+ using scale9_ = scale_<tag9>;
+ using scale10_ = scale_<tag10>;
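+
+    // Usage note: scale_ multiplies each channel of the tagged layer's output by a
+    // per-channel scalar produced by the subnetwork, which must therefore output a
+    // tensor with nr()==nc()==1 (see the assert in forward()).  This is the channel
+    // re-weighting used in squeeze-and-excitation style blocks, where the scales are
+    // typically produced by a small sigmoid/fc branch fed from global average pooling.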
+
+// ----------------------------------------------------------------------------------------
+
+ class relu_
+ {
+ public:
+ relu_()
+ {
+ }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& /*sub*/)
+ {
+ }
+
+ void forward_inplace(const tensor& input, tensor& output)
+ {
+ tt::relu(output, input);
+ }
+
+ void backward_inplace(
+ const tensor& computed_output,
+ const tensor& gradient_input,
+ tensor& data_grad,
+ tensor&
+ )
+ {
+ tt::relu_gradient(data_grad, computed_output, gradient_input);
+ }
+
+ inline dpoint map_input_to_output (const dpoint& p) const { return p; }
+ inline dpoint map_output_to_input (const dpoint& p) const { return p; }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const relu_& , std::ostream& out)
+ {
+ serialize("relu_", out);
+ }
+
+ friend void deserialize(relu_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "relu_")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::relu_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const relu_& )
+ {
+ out << "relu";
+ return out;
+ }
+
+ friend void to_xml(const relu_& /*item*/, std::ostream& out)
+ {
+ out << "<relu/>\n";
+ }
+
+ private:
+ resizable_tensor params;
+ };
+
+
+ template <typename SUBNET>
+ using relu = add_layer<relu_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class prelu_
+ {
+ public:
+ explicit prelu_(
+ float initial_param_value_ = 0.25
+ ) : initial_param_value(initial_param_value_)
+ {
+ }
+
+ float get_initial_param_value (
+ ) const { return initial_param_value; }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& /*sub*/)
+ {
+ params.set_size(1);
+ params = initial_param_value;
+ }
+
+ template <typename SUBNET>
+ void forward(
+ const SUBNET& sub,
+ resizable_tensor& data_output
+ )
+ {
+ data_output.copy_size(sub.get_output());
+ tt::prelu(data_output, sub.get_output(), params);
+ }
+
+ template <typename SUBNET>
+ void backward(
+ const tensor& gradient_input,
+ SUBNET& sub,
+ tensor& params_grad
+ )
+ {
+ tt::prelu_gradient(sub.get_gradient_input(), sub.get_output(),
+ gradient_input, params, params_grad);
+ }
+
+ inline dpoint map_input_to_output (const dpoint& p) const { return p; }
+ inline dpoint map_output_to_input (const dpoint& p) const { return p; }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const prelu_& item, std::ostream& out)
+ {
+ serialize("prelu_", out);
+ serialize(item.params, out);
+ serialize(item.initial_param_value, out);
+ }
+
+ friend void deserialize(prelu_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "prelu_")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::prelu_.");
+ deserialize(item.params, in);
+ deserialize(item.initial_param_value, in);
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const prelu_& item)
+ {
+ out << "prelu\t ("
+ << "initial_param_value="<<item.initial_param_value
+ << ")";
+ return out;
+ }
+
+ friend void to_xml(const prelu_& item, std::ostream& out)
+ {
+ out << "<prelu initial_param_value='"<<item.initial_param_value<<"'>\n";
+ out << mat(item.params);
+ out << "</prelu>\n";
+ }
+
+ private:
+ resizable_tensor params;
+ float initial_param_value;
+ };
+
+ template <typename SUBNET>
+ using prelu = add_layer<prelu_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class sig_
+ {
+ public:
+ sig_()
+ {
+ }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& /*sub*/)
+ {
+ }
+
+ void forward_inplace(const tensor& input, tensor& output)
+ {
+ tt::sigmoid(output, input);
+ }
+
+ void backward_inplace(
+ const tensor& computed_output,
+ const tensor& gradient_input,
+ tensor& data_grad,
+ tensor&
+ )
+ {
+ tt::sigmoid_gradient(data_grad, computed_output, gradient_input);
+ }
+
+ inline dpoint map_input_to_output (const dpoint& p) const { return p; }
+ inline dpoint map_output_to_input (const dpoint& p) const { return p; }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const sig_& , std::ostream& out)
+ {
+ serialize("sig_", out);
+ }
+
+ friend void deserialize(sig_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "sig_")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::sig_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const sig_& )
+ {
+ out << "sig";
+ return out;
+ }
+
+ friend void to_xml(const sig_& /*item*/, std::ostream& out)
+ {
+ out << "<sig/>\n";
+ }
+
+
+ private:
+ resizable_tensor params;
+ };
+
+
+ template <typename SUBNET>
+ using sig = add_layer<sig_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class htan_
+ {
+ public:
+ htan_()
+ {
+ }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& /*sub*/)
+ {
+ }
+
+ inline dpoint map_input_to_output (const dpoint& p) const { return p; }
+ inline dpoint map_output_to_input (const dpoint& p) const { return p; }
+
+ void forward_inplace(const tensor& input, tensor& output)
+ {
+ tt::tanh(output, input);
+ }
+
+ void backward_inplace(
+ const tensor& computed_output,
+ const tensor& gradient_input,
+ tensor& data_grad,
+ tensor&
+ )
+ {
+ tt::tanh_gradient(data_grad, computed_output, gradient_input);
+ }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const htan_& , std::ostream& out)
+ {
+ serialize("htan_", out);
+ }
+
+ friend void deserialize(htan_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "htan_")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::htan_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const htan_& )
+ {
+ out << "htan";
+ return out;
+ }
+
+ friend void to_xml(const htan_& /*item*/, std::ostream& out)
+ {
+ out << "<htan/>\n";
+ }
+
+
+ private:
+ resizable_tensor params;
+ };
+
+
+ template <typename SUBNET>
+ using htan = add_layer<htan_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class softmax_
+ {
+ public:
+ softmax_()
+ {
+ }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& /*sub*/)
+ {
+ }
+
+ void forward_inplace(const tensor& input, tensor& output)
+ {
+ tt::softmax(output, input);
+ }
+
+ void backward_inplace(
+ const tensor& computed_output,
+ const tensor& gradient_input,
+ tensor& data_grad,
+ tensor&
+ )
+ {
+ tt::softmax_gradient(data_grad, computed_output, gradient_input);
+ }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const softmax_& , std::ostream& out)
+ {
+ serialize("softmax_", out);
+ }
+
+ friend void deserialize(softmax_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "softmax_")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::softmax_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const softmax_& )
+ {
+ out << "softmax";
+ return out;
+ }
+
+ friend void to_xml(const softmax_& /*item*/, std::ostream& out)
+ {
+ out << "<softmax/>\n";
+ }
+
+ private:
+ resizable_tensor params;
+ };
+
+ template <typename SUBNET>
+ using softmax = add_layer<softmax_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class softmax_all_
+ {
+ public:
+ softmax_all_()
+ {
+ }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& /*sub*/)
+ {
+ }
+
+ void forward_inplace(const tensor& input, tensor& output)
+ {
+ tt::softmax_all(output, input);
+ }
+
+ void backward_inplace(
+ const tensor& computed_output,
+ const tensor& gradient_input,
+ tensor& data_grad,
+ tensor&
+ )
+ {
+ tt::softmax_all_gradient(data_grad, computed_output, gradient_input);
+ }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const softmax_all_& , std::ostream& out)
+ {
+ serialize("softmax_all_", out);
+ }
+
+ friend void deserialize(softmax_all_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "softmax_all_")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::softmax_all_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const softmax_all_& )
+ {
+ out << "softmax_all";
+ return out;
+ }
+
+ friend void to_xml(const softmax_all_& /*item*/, std::ostream& out)
+ {
+ out << "<softmax_all/>\n";
+ }
+
+ private:
+ resizable_tensor params;
+ };
+
+ template <typename SUBNET>
+ using softmax_all = add_layer<softmax_all_, SUBNET>;
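+
+    // Note on the two softmax variants above: softmax_ (tt::softmax) normalizes across
+    // the k channels independently at every spatial location of each sample, while
+    // softmax_all_ (tt::softmax_all) normalizes over all the elements of each sample at
+    // once.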
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ template <template<typename> class TAG_TYPE, template<typename> class... TAG_TYPES>
+ struct concat_helper_impl{
+
+ constexpr static size_t tag_count() {return 1 + concat_helper_impl<TAG_TYPES...>::tag_count();}
+ static void list_tags(std::ostream& out)
+ {
+ out << tag_id<TAG_TYPE>::id << (tag_count() > 1 ? "," : "");
+ concat_helper_impl<TAG_TYPES...>::list_tags(out);
+ }
+
+ template<typename SUBNET>
+ static void resize_out(resizable_tensor& out, const SUBNET& sub, long sum_k)
+ {
+ auto& t = layer<TAG_TYPE>(sub).get_output();
+ concat_helper_impl<TAG_TYPES...>::resize_out(out, sub, sum_k + t.k());
+ }
+ template<typename SUBNET>
+ static void concat(tensor& out, const SUBNET& sub, size_t k_offset)
+ {
+ auto& t = layer<TAG_TYPE>(sub).get_output();
+ tt::copy_tensor(false, out, k_offset, t, 0, t.k());
+ k_offset += t.k();
+ concat_helper_impl<TAG_TYPES...>::concat(out, sub, k_offset);
+ }
+ template<typename SUBNET>
+ static void split(const tensor& input, SUBNET& sub, size_t k_offset)
+ {
+ auto& t = layer<TAG_TYPE>(sub).get_gradient_input();
+ tt::copy_tensor(true, t, 0, input, k_offset, t.k());
+ k_offset += t.k();
+ concat_helper_impl<TAG_TYPES...>::split(input, sub, k_offset);
+ }
+ };
+ template <template<typename> class TAG_TYPE>
+ struct concat_helper_impl<TAG_TYPE>{
+ constexpr static size_t tag_count() {return 1;}
+ static void list_tags(std::ostream& out)
+ {
+ out << tag_id<TAG_TYPE>::id;
+ }
+
+ template<typename SUBNET>
+ static void resize_out(resizable_tensor& out, const SUBNET& sub, long sum_k)
+ {
+ auto& t = layer<TAG_TYPE>(sub).get_output();
+ out.set_size(t.num_samples(), t.k() + sum_k, t.nr(), t.nc());
+ }
+ template<typename SUBNET>
+ static void concat(tensor& out, const SUBNET& sub, size_t k_offset)
+ {
+ auto& t = layer<TAG_TYPE>(sub).get_output();
+ tt::copy_tensor(false, out, k_offset, t, 0, t.k());
+ }
+ template<typename SUBNET>
+ static void split(const tensor& input, SUBNET& sub, size_t k_offset)
+ {
+ auto& t = layer<TAG_TYPE>(sub).get_gradient_input();
+ tt::copy_tensor(true, t, 0, input, k_offset, t.k());
+ }
+ };
+ }
+ // concat layer
+ template<
+ template<typename> class... TAG_TYPES
+ >
+ class concat_
+ {
+ static void list_tags(std::ostream& out) { impl::concat_helper_impl<TAG_TYPES...>::list_tags(out);};
+
+ public:
+ constexpr static size_t tag_count() {return impl::concat_helper_impl<TAG_TYPES...>::tag_count();};
+
+ template <typename SUBNET>
+ void setup (const SUBNET&)
+ {
+ // do nothing
+ }
+ template <typename SUBNET>
+ void forward(const SUBNET& sub, resizable_tensor& output)
+ {
+            // the total depth of the result is the sum of the depths from all the tagged layers
+ impl::concat_helper_impl<TAG_TYPES...>::resize_out(output, sub, 0);
+
+            // copy the output from each tag layer into its own slice of the result
+ impl::concat_helper_impl<TAG_TYPES...>::concat(output, sub, 0);
+ }
+
+ template <typename SUBNET>
+ void backward(const tensor& gradient_input, SUBNET& sub, tensor&)
+ {
+            // The gradient is split into parts, one for each tagged layer.
+ impl::concat_helper_impl<TAG_TYPES...>::split(gradient_input, sub, 0);
+ }
+
+ dpoint map_input_to_output(dpoint p) const { return p; }
+ dpoint map_output_to_input(dpoint p) const { return p; }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const concat_& item, std::ostream& out)
+ {
+ serialize("concat_", out);
+ size_t count = tag_count();
+ serialize(count, out);
+ }
+
+ friend void deserialize(concat_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "concat_")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::concat_.");
+ size_t count_tags;
+ deserialize(count_tags, in);
+ if (count_tags != tag_count())
+ throw serialization_error("Invalid count of tags "+ std::to_string(count_tags) +", expecting " +
+ std::to_string(tag_count()) +
+ " found while deserializing dlib::concat_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const concat_& item)
+ {
+ out << "concat\t (";
+ list_tags(out);
+ out << ")";
+ return out;
+ }
+
+ friend void to_xml(const concat_& item, std::ostream& out)
+ {
+ out << "<concat tags='";
+ list_tags(out);
+ out << "'/>\n";
+ }
+
+ private:
+ resizable_tensor params; // unused
+ };
+
+
+ // concat layer definitions
+ template <template<typename> class TAG1,
+ template<typename> class TAG2,
+ typename SUBNET>
+ using concat2 = add_layer<concat_<TAG1, TAG2>, SUBNET>;
+
+ template <template<typename> class TAG1,
+ template<typename> class TAG2,
+ template<typename> class TAG3,
+ typename SUBNET>
+ using concat3 = add_layer<concat_<TAG1, TAG2, TAG3>, SUBNET>;
+
+ template <template<typename> class TAG1,
+ template<typename> class TAG2,
+ template<typename> class TAG3,
+ template<typename> class TAG4,
+ typename SUBNET>
+ using concat4 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4>, SUBNET>;
+
+ template <template<typename> class TAG1,
+ template<typename> class TAG2,
+ template<typename> class TAG3,
+ template<typename> class TAG4,
+ template<typename> class TAG5,
+ typename SUBNET>
+ using concat5 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4, TAG5>, SUBNET>;
+
+    // The inception layer uses tags internally.  If the user also uses tags, conflicts are
+    // possible.  To avoid them, the following tags are reserved specifically for inception layers.
+ template <typename SUBNET> using itag0 = add_tag_layer< 1000 + 0, SUBNET>;
+ template <typename SUBNET> using itag1 = add_tag_layer< 1000 + 1, SUBNET>;
+ template <typename SUBNET> using itag2 = add_tag_layer< 1000 + 2, SUBNET>;
+ template <typename SUBNET> using itag3 = add_tag_layer< 1000 + 3, SUBNET>;
+ template <typename SUBNET> using itag4 = add_tag_layer< 1000 + 4, SUBNET>;
+ template <typename SUBNET> using itag5 = add_tag_layer< 1000 + 5, SUBNET>;
+ // skip to inception input
+ template <typename SUBNET> using iskip = add_skip_layer< itag0, SUBNET>;
+
+    // Here are some templates to be used for creating inception layer groups.  A brief usage
+    // sketch follows the definitions below.
+ template <template<typename>class B1,
+ template<typename>class B2,
+ typename SUBNET>
+ using inception2 = concat2<itag1, itag2, itag1<B1<iskip< itag2<B2< itag0<SUBNET>>>>>>>;
+
+ template <template<typename>class B1,
+ template<typename>class B2,
+ template<typename>class B3,
+ typename SUBNET>
+ using inception3 = concat3<itag1, itag2, itag3, itag1<B1<iskip< itag2<B2<iskip< itag3<B3< itag0<SUBNET>>>>>>>>>>;
+
+ template <template<typename>class B1,
+ template<typename>class B2,
+ template<typename>class B3,
+ template<typename>class B4,
+ typename SUBNET>
+ using inception4 = concat4<itag1, itag2, itag3, itag4,
+ itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip< itag4<B4< itag0<SUBNET>>>>>>>>>>>>>;
+
+ template <template<typename>class B1,
+ template<typename>class B2,
+ template<typename>class B3,
+ template<typename>class B4,
+ template<typename>class B5,
+ typename SUBNET>
+ using inception5 = concat5<itag1, itag2, itag3, itag4, itag5,
+ itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip< itag4<B4<iskip< itag5<B5< itag0<SUBNET>>>>>>>>>>>>>>>>;
+
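+    // A brief usage sketch (not part of the original header, shown only for illustration;
+    // the names branch_a, branch_b, and my_incept are made up): assuming the con and relu
+    // layers defined elsewhere in this file, a two-branch inception block could be declared
+    // like this:
+    //
+    //     template <typename SUBNET> using branch_a  = relu<con<16,1,1,1,1,SUBNET>>;
+    //     template <typename SUBNET> using branch_b  = relu<con<16,3,3,1,1,SUBNET>>;
+    //     template <typename SUBNET> using my_incept = inception2<branch_a, branch_b, SUBNET>;
+    //
+    // Both branches see the same input (tagged with itag0) and their outputs are
+    // concatenated along the k() dimension by the underlying concat2 layer.
+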
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ const double DEFAULT_L2_NORM_EPS = 1e-5;
+
+ class l2normalize_
+ {
+ public:
+ explicit l2normalize_(
+ double eps_ = DEFAULT_L2_NORM_EPS
+ ) :
+ eps(eps_)
+ {
+ }
+
+ double get_eps() const { return eps; }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& /*sub*/)
+ {
+ }
+
+ void forward_inplace(const tensor& input, tensor& output)
+ {
+ tt::inverse_norms(norm, input, eps);
+ tt::scale_rows(output, input, norm);
+ }
+
+ void backward_inplace(
+ const tensor& computed_output,
+ const tensor& gradient_input,
+ tensor& data_grad,
+ tensor& /*params_grad*/
+ )
+ {
+ if (is_same_object(gradient_input, data_grad))
+ {
+ tt::dot_prods(temp, gradient_input, computed_output);
+ tt::scale_rows2(0, data_grad, gradient_input, computed_output, temp, norm);
+ }
+ else
+ {
+ tt::dot_prods(temp, gradient_input, computed_output);
+ tt::scale_rows2(1, data_grad, gradient_input, computed_output, temp, norm);
+ }
+ }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const l2normalize_& item, std::ostream& out)
+ {
+ serialize("l2normalize_", out);
+ serialize(item.eps, out);
+ }
+
+ friend void deserialize(l2normalize_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "l2normalize_")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::l2normalize_.");
+ deserialize(item.eps, in);
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const l2normalize_& item)
+ {
+ out << "l2normalize";
+ out << " eps="<<item.eps;
+ return out;
+ }
+
+ friend void to_xml(const l2normalize_& item, std::ostream& out)
+ {
+ out << "<l2normalize";
+ out << " eps='"<<item.eps<<"'";
+ out << "/>\n";
+ }
+ private:
+ double eps;
+
+ resizable_tensor params; // unused
+ // Here only to avoid reallocation and as a cache between forward/backward
+ // functions.
+ resizable_tensor norm;
+ resizable_tensor temp;
+ };
+
+ template <typename SUBNET>
+ using l2normalize = add_layer<l2normalize_, SUBNET>;
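+
+    // A usage sketch (an illustration, not from the original header; the name embed is made
+    // up): l2normalize is typically placed on top of a fully connected layer so that each
+    // output sample is a unit-length embedding vector, e.g.
+    //
+    //     template <typename SUBNET> using embed = l2normalize<fc_no_bias<128,SUBNET>>;
+    //
+    // Each row of the resulting tensor has Euclidean norm 1 (up to the eps term above).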
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ long _offset,
+ long _k,
+ long _nr,
+ long _nc
+ >
+ class extract_
+ {
+ static_assert(_offset >= 0, "The offset must be >= 0.");
+ static_assert(_k > 0, "The number of channels must be > 0.");
+ static_assert(_nr > 0, "The number of rows must be > 0.");
+ static_assert(_nc > 0, "The number of columns must be > 0.");
+ public:
+ extract_(
+ )
+ {
+ }
+
+ template <typename SUBNET>
+ void setup (const SUBNET& sub)
+ {
+ DLIB_CASSERT((long)sub.get_output().size() >= sub.get_output().num_samples()*(_offset+_k*_nr*_nc),
+ "The tensor we are trying to extract from the input tensor is too big to fit into the input tensor.");
+
+ aout = alias_tensor(sub.get_output().num_samples(), _k*_nr*_nc);
+ ain = alias_tensor(sub.get_output().num_samples(), sub.get_output().size()/sub.get_output().num_samples());
+ }
+
+ template <typename SUBNET>
+ void forward(const SUBNET& sub, resizable_tensor& output)
+ {
+ if (aout.num_samples() != sub.get_output().num_samples())
+ {
+ aout = alias_tensor(sub.get_output().num_samples(), _k*_nr*_nc);
+ ain = alias_tensor(sub.get_output().num_samples(), sub.get_output().size()/sub.get_output().num_samples());
+ }
+
+ output.set_size(sub.get_output().num_samples(), _k, _nr, _nc);
+ auto out = aout(output,0);
+ auto in = ain(sub.get_output(),0);
+ tt::copy_tensor(false, out, 0, in, _offset, _k*_nr*_nc);
+ }
+
+ template <typename SUBNET>
+ void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/)
+ {
+ auto out = ain(sub.get_gradient_input(),0);
+ auto in = aout(gradient_input,0);
+ tt::copy_tensor(true, out, _offset, in, 0, _k*_nr*_nc);
+ }
+
+ const tensor& get_layer_params() const { return params; }
+ tensor& get_layer_params() { return params; }
+
+ friend void serialize(const extract_& item, std::ostream& out)
+ {
+ serialize("extract_", out);
+ serialize(_offset, out);
+ serialize(_k, out);
+ serialize(_nr, out);
+ serialize(_nc, out);
+ }
+
+ friend void deserialize(extract_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "extract_")
+ throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::extract_.");
+
+ long offset;
+ long k;
+ long nr;
+ long nc;
+ deserialize(offset, in);
+ deserialize(k, in);
+ deserialize(nr, in);
+ deserialize(nc, in);
+
+ if (offset != _offset) throw serialization_error("Wrong offset found while deserializing dlib::extract_");
+ if (k != _k) throw serialization_error("Wrong k found while deserializing dlib::extract_");
+ if (nr != _nr) throw serialization_error("Wrong nr found while deserializing dlib::extract_");
+ if (nc != _nc) throw serialization_error("Wrong nc found while deserializing dlib::extract_");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const extract_& item)
+ {
+ out << "extract\t ("
+ << "offset="<<_offset
+ << ", k="<<_k
+ << ", nr="<<_nr
+ << ", nc="<<_nc
+ << ")";
+ return out;
+ }
+
+ friend void to_xml(const extract_& item, std::ostream& out)
+ {
+ out << "<extract";
+ out << " offset='"<<_offset<<"'";
+ out << " k='"<<_k<<"'";
+ out << " nr='"<<_nr<<"'";
+ out << " nc='"<<_nc<<"'";
+ out << "/>\n";
+ }
+ private:
+ alias_tensor aout, ain;
+
+ resizable_tensor params; // unused
+ };
+
+ template <
+ long offset,
+ long k,
+ long nr,
+ long nc,
+ typename SUBNET
+ >
+ using extract = add_layer<extract_<offset,k,nr,nc>, SUBNET>;
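+
+    // A usage sketch (an illustration, not from the original header; the name first_half is
+    // made up): extract_ views each sample as a flat array, copies _k*_nr*_nc elements
+    // starting at _offset, and reshapes them to (k,nr,nc).  For example,
+    //
+    //     template <typename SUBNET> using first_half = extract<0,5,1,1,fc<10,SUBNET>>;
+    //
+    // takes the first 5 of the 10 outputs of the fc layer and presents them as a 5x1x1
+    // tensor per sample.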
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_LAYERS_H_
+
+
diff --git a/ml/dlib/dlib/dnn/layers_abstract.h b/ml/dlib/dlib/dnn/layers_abstract.h
new file mode 100644
index 000000000..f07025ff8
--- /dev/null
+++ b/ml/dlib/dlib/dnn/layers_abstract.h
@@ -0,0 +1,2631 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_DNn_LAYERS_ABSTRACT_H_
+#ifdef DLIB_DNn_LAYERS_ABSTRACT_H_
+
+#include "tensor_abstract.h"
+#include "core_abstract.h"
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class SUBNET
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object represents a deep neural network. In particular, it is
+ the simplified interface through which layer objects interact with their
+ subnetworks. A layer's two important tasks are to (1) take outputs from its
+ subnetwork and forward propagate them through itself and (2) to backwards
+ propagate an error gradient through itself and onto its subnetwork.
+ The idea of a subnetwork is illustrated in the following diagram:
+
+ +---------------------------------------------------------+
+ | loss <-- layer1 <-- layer2 <-- ... <-- layern <-- input |
+ +---------------------------------------------------------+
+ ^ ^
+ \__ subnetwork for layer1 __/
+
+ Therefore, by "subnetwork" we mean the part of the network closer to the
+ input.
+
+ Note that there is no dlib::SUBNET type. It is shown here purely to
+ document the interface layer objects expect to see when they interact
+ with a network.
+ !*/
+
+ public:
+ // You aren't allowed to copy subnetworks from inside a layer.
+ SUBNET(const SUBNET&) = delete;
+ SUBNET& operator=(const SUBNET&) = delete;
+
+ const tensor& get_output(
+ ) const;
+ /*!
+ ensures
+ - returns the output of this subnetwork. This is the data that the next
+ layer in the network will take as input.
+ - have_same_dimensions(#get_gradient_input(), get_output()) == true
+ !*/
+
+ tensor& get_gradient_input(
+ );
+ /*!
+ ensures
+ - returns the error gradient for this subnetwork. That is, this is the
+ error gradient that this network will use to update itself. Therefore,
+ when performing back propagation, layers that sit on top of this
+ subnetwork write their back propagated error gradients into
+ get_gradient_input(). Or to put it another way, during back propagation,
+ layers take the contents of their get_gradient_input() and back propagate
+ it through themselves and store the results into their subnetwork's
+ get_gradient_input().
+ !*/
+
+ const NEXT_SUBNET& subnet(
+ ) const;
+ /*!
+ ensures
+ - returns the subnetwork of *this network. With respect to the diagram
+ above, if *this was layer1 then subnet() would return the network that
+ begins with layer2.
+ !*/
+
+ NEXT_SUBNET& subnet(
+ );
+ /*!
+ ensures
+ - returns the subnetwork of *this network. With respect to the diagram
+ above, if *this was layer1 then subnet() would return the network that
+ begins with layer2.
+ !*/
+
+ const layer_details_type& layer_details(
+ ) const;
+ /*!
+ ensures
+ - returns the layer_details_type instance that defines the behavior of the
+ layer at the top of this network. I.e. returns the layer details that
+ defines the behavior of the layer nearest to the network output rather
+ than the input layer. For computational layers, this is the object
+ implementing the EXAMPLE_COMPUTATIONAL_LAYER_ interface that defines the
+ layer's behavior.
+ !*/
+
+ unsigned int sample_expansion_factor (
+ ) const;
+ /*!
+ ensures
+ - When to_tensor() is invoked on this network's input layer it converts N
+ input objects into M samples, all stored inside a resizable_tensor. It
+ is always the case that M is some integer multiple of N.
+ sample_expansion_factor() returns the value of this multiplier. To be
+ very specific, it is always true that M==I*N where I is some integer.
+ This integer I is what is returned by sample_expansion_factor().
+
+ It should be noted that computational layers likely do not care about the
+ sample expansion factor. It is only really of concern inside a loss
+ layer where you need to know its value so that tensor samples can be
+ matched against truth objects. Moreover, in most cases the sample
+ expansion factor is 1.
+ !*/
+
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ class EXAMPLE_COMPUTATIONAL_LAYER_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ Each computational layer in a deep neural network can be thought of as a
+ function, f(data,parameters), that takes in a data tensor, some parameters,
+ and produces an output tensor. You create an entire deep network by
+ composing these functions. Importantly, you are able to use a wide range
+ of different functions to accommodate the task you are trying to
+ accomplish. Therefore, dlib includes a number of common layer types but if
+ you want to define your own then you simply implement a class with the same
+ interface as EXAMPLE_COMPUTATIONAL_LAYER_.
+
+ Note that there is no dlib::EXAMPLE_COMPUTATIONAL_LAYER_ type. It is shown
+ here purely to document the interface that a layer object must implement.
+
+ The central work of defining a layer is implementing the forward and backward
+ methods. When you do this you have four options:
+ - Implement the forward() and backward() methods according to the
+ specification shown below. Do not implement forward_inplace() and
+ backward_inplace().
+ - Implement the forward() and backward() methods according to the
+ specification shown below, except exclude the computed_output
+ parameter from backward(). Doing this will allow dlib to make some
+ layers execute in-place and therefore run a little faster and use
+ less memory. Do not implement forward_inplace() and
+ backward_inplace().
+ - Implement the forward_inplace() and backward_inplace() methods
+ according to the specification shown below. Do not implement
+ forward() and backward(). These in-place methods allow some types of
+ layers to be implemented more efficiently.
+ - Implement the forward_inplace() and backward_inplace() methods
+ according to the specification shown below, except exclude the
+ computed_output parameter from backward_inplace(). Doing this will
+ allow dlib to make some layers execute in-place and therefore run a
+ little faster and use less memory. Do not implement forward() and
+ backward().
+
+
+ It should also be noted that layers may define additional layer specific
+ fields and the solvers can use these fields as they see fit. For example,
+ some layers define get_learning_rate_multiplier() and
+ get_weight_decay_multiplier() methods. The solvers that come with dlib
+ look at these methods, if they exist, and adjust the learning rate or
+ weight decay for that layer according to the multiplier. Therefore, you
+ can add these methods to your layer types if you want, or even define new
+ fields and new solvers that use those fields in some way.
+ !*/
+
+ public:
+
+ EXAMPLE_COMPUTATIONAL_LAYER_(
+ );
+ /*!
+ ensures
+ - Default constructs this object. This function is not required to do
+ anything in particular but it must exist, that is, it is required that
+ layer objects be default constructable.
+ !*/
+
+ EXAMPLE_COMPUTATIONAL_LAYER_ (
+ const EXAMPLE_COMPUTATIONAL_LAYER_& item
+ );
+ /*!
+ ensures
+ - EXAMPLE_COMPUTATIONAL_LAYER_ objects are copy constructable
+ !*/
+
+ EXAMPLE_COMPUTATIONAL_LAYER_(
+ const some_other_layer_type& item
+ );
+ /*!
+ ensures
+ - Constructs this object from item. This form of constructor is optional
+ but it allows you to provide a conversion from one layer type to another.
+ For example, the following code is valid only if my_layer2 can be
+ constructed from my_layer1:
+ relu<fc<my_layer1<fc<input<matrix<float>>>>>> my_dnn1;
+ relu<fc<my_layer2<fc<input<matrix<float>>>>>> my_dnn2(my_dnn1);
+ This kind of pattern is useful if you want to use one type of layer
+ during training but a different type of layer during testing since it
+ allows you to easily convert between related deep neural network types.
+
+ Additionally, if you provide a constructor to build a layer from another
+ layer type you should also write your layer's deserialize() routine such
+ that it can read that other layer's serialized data in addition to your
+ own serialized data.
+ !*/
+
+ template <typename SUBNET>
+ void setup (
+ const SUBNET& sub
+ );
+ /*!
+ requires
+ - SUBNET implements the SUBNET interface defined at the top of this file.
+ ensures
+ - performs any necessary initial memory allocations and/or sets parameters
+ to their initial values prior to learning. Therefore, calling setup
+ destroys any previously learned parameters. Also, typically setup()
+ would look at the dimensions of the outputs of sub and configure the
+ number of parameters in *this accordingly.
+ !*/
+
+ template <typename SUBNET>
+ void forward(
+ const SUBNET& sub,
+ resizable_tensor& data_output
+ );
+ /*!
+ requires
+ - SUBNET implements the SUBNET interface defined at the top of this file.
+ - setup() has been called.
+ ensures
+ - Runs the output of the subnetwork through this layer and stores the
+ results into #data_output. In particular, forward() can use any of the
+ outputs in sub (e.g. sub.get_output(), sub.subnet().get_output(), etc.)
+ to compute whatever it wants.
+ !*/
+
+ template <typename SUBNET>
+ void backward(
+ const tensor& computed_output, // this parameter is optional
+ const tensor& gradient_input,
+ SUBNET& sub,
+ tensor& params_grad
+ );
+ /*!
+ requires
+ - SUBNET implements the SUBNET interface defined at the top of this file.
+ - setup() has been called.
+ - computed_output is the tensor resulting from calling forward(sub,computed_output).
+ Moreover, this was the most recent call to forward(). This means that
+ forward() is allowed to cache intermediate results so they can be used
+ during the backward computation.
+ - have_same_dimensions(gradient_input, computed_output) == true
+ - have_same_dimensions(sub.get_gradient_input(), sub.get_output()) == true
+ - have_same_dimensions(params_grad, get_layer_params()) == true
+ ensures
+ - This function outputs the gradients of this layer with respect to the
+ input data from sub and also with respect to this layer's parameters.
+ These gradients are stored into #sub and #params_grad, respectively. To be
+ precise, the gradients are taken of a function f(sub,get_layer_params())
+ which is defined thusly:
+ - Recalling that computed_output is a function of both sub and get_layer_params(),
+ since it is the result of calling forward(sub,computed_output):
+ let f(sub,get_layer_params()) == dot(computed_output, gradient_input)
+ Then we define the following gradient vectors:
+ - PARAMETER_GRADIENT == gradient of f(sub,get_layer_params()) with
+ respect to get_layer_params().
+ - for all valid I:
+ - DATA_GRADIENT_I == gradient of f(sub,get_layer_params()) with
+ respect to layer<I>(sub).get_output() (recall that forward() can
+ draw inputs from the immediate sub layer, sub.subnet(), or
+ any earlier layer. So you must consider the gradients with
+ respect to all inputs drawn from sub)
+ Finally, backward() outputs these gradients by performing:
+ - params_grad = PARAMETER_GRADIENT
+ - for all valid I:
+ - layer<I>(sub).get_gradient_input() += DATA_GRADIENT_I
+ !*/
+
+ void forward_inplace(
+ const tensor& data_input,
+ tensor& data_output
+ );
+ /*!
+ requires
+ - have_same_dimensions(data_input,data_output) == true
+ - setup() has been called.
+ ensures
+ - Runs the data_input tensor through this layer and stores the output into
+ #data_output.
+ - This function supports in-place operation, i.e. having
+ is_same_object(data_input, data_output)==true
+ !*/
+
+ void backward_inplace(
+ const tensor& computed_output, // this parameter is optional
+ const tensor& gradient_input,
+ tensor& data_grad,
+ tensor& params_grad
+ );
+ /*!
+ requires
+ - setup() has been called.
+ - computed_output is the tensor resulting from the most recent call to
+ forward_inplace(). This means that forward_inplace() is allowed to cache
+ intermediate results so they can be used during the backward computation.
+ - have_same_dimensions(gradient_input, data_grad) == true
+ - have_same_dimensions(gradient_input, computed_output) == true
+ - have_same_dimensions(params_grad, get_layer_params()) == true
+ ensures
+ - This function supports in-place operation, i.e. having
+ is_same_object(gradient_input, data_grad)==true
+ - This function outputs the gradients of this layer with respect to the
+ input data from a sublayer and also with respect to this layer's parameters.
+ These gradients are stored into #data_grad and #params_grad, respectively. To be
+ precise, the gradients are taken of a function f(data_input,get_layer_params())
+ which is defined thusly:
+ - Recalling that computed_output is a function of both the input to
+ forward_inplace() and get_layer_params(), since it is the result of
+ calling forward_inplace(data_input,computed_output):
+ let f(data_input,get_layer_params()) == dot(computed_output, gradient_input)
+ Then we define the following gradient vectors:
+ - PARAMETER_GRADIENT == gradient of f(data_input,get_layer_params()) with
+ respect to get_layer_params().
+ - DATA_GRADIENT == gradient of f(data_input,get_layer_params()) with respect
+ to data_input.
+ Finally, backward_inplace() outputs these gradients by performing:
+ - params_grad = PARAMETER_GRADIENT
+ - if (is_same_object(gradient_input, data_grad)) then
+ - data_grad = DATA_GRADIENT
+ - else
+ - data_grad += DATA_GRADIENT
+ !*/
+
+ const tensor& get_layer_params(
+ ) const;
+ /*!
+ ensures
+ - returns the parameters that define the behavior of forward().
+ !*/
+
+ tensor& get_layer_params(
+ );
+ /*!
+ ensures
+ - returns the parameters that define the behavior of forward().
+ !*/
+
+
+ dpoint map_input_to_output(dpoint p) const;
+ dpoint map_output_to_input(dpoint p) const;
+ /*!
+ These two functions are optional. If provided, they should map between
+ (column,row) coordinates in input and output tensors of forward(). Providing
+ these functions allows you to use global utility functions like
+ input_tensor_to_output_tensor().
+ !*/
+
+ void clean (
+ );
+ /*!
+ Implementing this function is optional. If you don't need it then you don't
+ have to provide a clean(). But if you do provide it then it must behave as
+ follows:
+
+ ensures
+                - calling clean() causes this object to forget about everything except its
+ parameters. This is useful if your layer caches information between
+ forward and backward passes and you want to clean out that cache
+ information before saving the network to disk.
+ !*/
+
+ };
+
+ std::ostream& operator<<(std::ostream& out, const EXAMPLE_COMPUTATIONAL_LAYER_& item);
+ /*!
+        prints a string describing this layer.
+ !*/
+
+ void to_xml(const EXAMPLE_COMPUTATIONAL_LAYER_& item, std::ostream& out);
+ /*!
+ This function is optional, but required if you want to print your networks with
+ net_to_xml(). Therefore, to_xml() prints a layer as XML.
+ !*/
+
+ void serialize(const EXAMPLE_COMPUTATIONAL_LAYER_& item, std::ostream& out);
+ void deserialize(EXAMPLE_COMPUTATIONAL_LAYER_& item, std::istream& in);
+ /*!
+ provides serialization support
+ !*/
+
+ // For each layer you define, always define an add_layer template so that layers can be
+ // easily composed. Moreover, the convention is that the layer class ends with an _
+ // while the add_layer template has the same name but without the trailing _.
+ template <typename SUBNET>
+ using EXAMPLE_COMPUTATIONAL_LAYER = add_layer<EXAMPLE_COMPUTATIONAL_LAYER_, SUBNET>;
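+
+    // The following is a minimal sketch (an illustration, not part of dlib's API; the name
+    // scale_by_two_ is made up) of a custom layer written against the interface documented
+    // above.  It scales its input by 2 and has no learnable parameters.  It uses the
+    // tt::affine_transform() tensor tools helpers.  serialize(), deserialize(), and
+    // operator<< are omitted for brevity, but a real layer should provide them.
+    //
+    //     class scale_by_two_
+    //     {
+    //     public:
+    //         template <typename SUBNET> void setup (const SUBNET&) {}
+    //
+    //         template <typename SUBNET>
+    //         void forward(const SUBNET& sub, resizable_tensor& output)
+    //         {
+    //             output.copy_size(sub.get_output());
+    //             // output = 2*input + 0
+    //             tt::affine_transform(output, sub.get_output(), 2.0f, 0.0f);
+    //         }
+    //
+    //         template <typename SUBNET>
+    //         void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/)
+    //         {
+    //             // The derivative of 2*x is 2, so accumulate 2*gradient_input into the
+    //             // subnetwork's gradient, as required by the backward() contract above.
+    //             tt::affine_transform(sub.get_gradient_input(),
+    //                                  sub.get_gradient_input(), gradient_input, 1.0f, 2.0f);
+    //         }
+    //
+    //         const tensor& get_layer_params() const { return params; }
+    //         tensor& get_layer_params() { return params; }
+    //
+    //     private:
+    //         resizable_tensor params;  // unused, but get_layer_params() must return something
+    //     };
+    //
+    //     template <typename SUBNET> using scale_by_two = add_layer<scale_by_two_, SUBNET>;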
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ enum fc_bias_mode
+ {
+ FC_HAS_BIAS = 0,
+ FC_NO_BIAS = 1
+ };
+
+ struct num_fc_outputs
+ {
+ num_fc_outputs(unsigned long n) : num_outputs(n) {}
+ unsigned long num_outputs;
+ };
+
+ template <
+ unsigned long num_outputs,
+ fc_bias_mode bias_mode
+ >
+ class fc_
+ {
+ /*!
+ REQUIREMENTS ON num_outputs
+ num_outputs > 0
+
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. In particular, it defines a fully connected layer that
+ takes an input tensor and multiplies it by a weight matrix and outputs the
+ results.
+
+ The dimensions of the tensors output by this layer are as follows (letting
+ IN be the input tensor and OUT the output tensor):
+ - OUT.num_samples() == IN.num_samples()
+ - OUT.k() == get_num_outputs()
+ - OUT.nr() == 1
+ - OUT.nc() == 1
+ !*/
+
+ public:
+
+ fc_(
+ );
+ /*!
+ ensures
+ - #get_num_outputs() == num_outputs
+ - #get_bias_mode() == bias_mode
+ - #get_learning_rate_multiplier() == 1
+ - #get_weight_decay_multiplier() == 1
+ - #get_bias_learning_rate_multiplier() == 1
+ - #get_bias_weight_decay_multiplier() == 0
+ !*/
+
+ fc_(
+ num_fc_outputs o
+ );
+ /*!
+ ensures
+ - #get_num_outputs() == o.num_outputs
+ - #get_bias_mode() == bias_mode
+ - #get_learning_rate_multiplier() == 1
+ - #get_weight_decay_multiplier() == 1
+ - #get_bias_learning_rate_multiplier() == 1
+ - #get_bias_weight_decay_multiplier() == 0
+ !*/
+
+ unsigned long get_num_outputs (
+ ) const;
+ /*!
+ ensures
+ - This layer outputs column vectors that contain get_num_outputs()
+ elements. That is, the output tensor T from forward() will be such that:
+ - T.num_samples() == however many samples were given to forward().
+ - T.k() == get_num_outputs()
+ - The rest of the dimensions of T will be 1.
+ !*/
+
+ void set_num_outputs(
+ long num
+ );
+ /*!
+ requires
+ - num > 0
+ - get_layer_params().size() == 0 || get_num_outputs() == num
+ (i.e. You can't change the number of outputs in fc_ if the parameter
+ tensor has already been allocated.)
+ ensures
+ - #get_num_outputs() == num
+ !*/
+
+ fc_bias_mode get_bias_mode (
+ ) const;
+ /*!
+ ensures
+ - returns the bias mode which determines if this layer includes bias terms.
+ That is, if the bias mode is FC_HAS_BIAS then a different constant scalar
+ is added to each of the outputs of this layer.
+ !*/
+
+ double get_learning_rate_multiplier(
+ ) const;
+ /*!
+ ensures
+ - returns a multiplier number. The interpretation is that this object is
+ requesting that the learning rate used to optimize its parameters be
+ multiplied by get_learning_rate_multiplier().
+ !*/
+
+ double get_weight_decay_multiplier(
+ ) const;
+ /*!
+ ensures
+ - returns a multiplier number. The interpretation is that this object is
+ requesting that the weight decay used to optimize its parameters be
+ multiplied by get_weight_decay_multiplier().
+ !*/
+
+ void set_learning_rate_multiplier(
+ double val
+ );
+ /*!
+ requires
+ - val >= 0
+ ensures
+ - #get_learning_rate_multiplier() == val
+ !*/
+
+ void set_weight_decay_multiplier(
+ double val
+ );
+ /*!
+ requires
+ - val >= 0
+ ensures
+ - #get_weight_decay_multiplier() == val
+ !*/
+
+ double get_bias_learning_rate_multiplier(
+ ) const;
+ /*!
+ ensures
+ - returns a multiplier number. The interpretation is that this object is
+ requesting that the learning rate used to optimize its bias parameters be
+ multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
+ !*/
+
+ double get_bias_weight_decay_multiplier(
+ ) const;
+ /*!
+ ensures
+ - returns a multiplier number. The interpretation is that this object is
+ requesting that the weight decay used to optimize its bias parameters be
+ multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
+ !*/
+
+ void set_bias_learning_rate_multiplier(
+ double val
+ );
+ /*!
+ requires
+ - val >= 0
+ ensures
+ - #get_bias_learning_rate_multiplier() == val
+ !*/
+
+ void set_bias_weight_decay_multiplier(
+ double val
+ );
+ /*!
+ requires
+ - val >= 0
+ ensures
+ - #get_bias_weight_decay_multiplier() == val
+ !*/
+
+ alias_tensor_const_instance get_weights(
+ ) const;
+ /*!
+ ensures
+ - returns an alias of get_layer_params(), containing the weights matrix of
+ the fully connected layer.
+            - #get_weights().num_samples() is the number of elements in an input sample,
+              i.e. the sublayer's output's k() * nr() * nc().
+            - #get_weights().k() == #get_num_outputs()
+ - if get_bias_mode() == FC_HAS_BIAS:
+ - #get_layer_params().size() == (#get_weights().size() + #get_biases().size())
+ - else:
+ - #get_layer_params().size() == #get_weights().size()
+ !*/
+
+ alias_tensor_instance get_weights(
+ );
+ /*!
+ ensures
+ - returns an alias of get_layer_params(), containing the weights matrix of
+ the fully connected layer.
+            - #get_weights().num_samples() is the number of elements in an input sample,
+              i.e. the sublayer's output's k() * nr() * nc().
+            - #get_weights().k() == #get_num_outputs()
+ - if get_bias_mode() == FC_HAS_BIAS:
+ - #get_layer_params().size() == (#get_weights().size() + #get_biases().size())
+ - else:
+ - #get_layer_params().size() == #get_weights().size()
+ !*/
+
+ alias_tensor_const_instance get_biases(
+ ) const;
+ /*!
+ requires
+ - #get_bias_mode() == FC_HAS_BIAS
+ ensures
+ - returns an alias of get_layer_params(), containing the bias vector of
+ the fully connected layer.
+            - #get_biases().num_samples() == 1
+            - #get_biases().k() == #get_num_outputs()
+ - #get_layer_params().size() == (#get_weights().size() + #get_biases().size())
+ !*/
+
+ alias_tensor_instance get_biases(
+ );
+ /*!
+ requires
+ - #get_bias_mode() == FC_HAS_BIAS
+ ensures
+ - returns an alias of get_layer_params(), containing the bias vector of
+ the fully connected layer.
+            - #get_biases().num_samples() == 1
+            - #get_biases().k() == #get_num_outputs()
+ - #get_layer_params().size() == (#get_weights().size() + #get_biases().size())
+ !*/
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
+ template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
+ !*/
+
+ };
+
+ template <
+ unsigned long num_outputs,
+ typename SUBNET
+ >
+ using fc = add_layer<fc_<num_outputs,FC_HAS_BIAS>, SUBNET>;
+
+ template <
+ unsigned long num_outputs,
+ typename SUBNET
+ >
+ using fc_no_bias = add_layer<fc_<num_outputs,FC_NO_BIAS>, SUBNET>;
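+
+    // A usage sketch (an illustration, not from the original header; the name mlp is made
+    // up): a small multilayer perceptron for column-vector inputs, assuming the relu,
+    // input, and loss_multiclass_log components defined elsewhere in dlib:
+    //
+    //     using mlp = loss_multiclass_log<fc<2,relu<fc<5,input<matrix<float,0,1>>>>>>;
+    //
+    // The inner fc<5,...> produces tensors with k()==5 and nr()==nc()==1, and the outer
+    // fc<2,...> maps each of those samples to 2 outputs.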
+
+// ----------------------------------------------------------------------------------------
+
+ struct num_con_outputs
+ {
+ num_con_outputs(unsigned long n) : num_outputs(n) {}
+ unsigned long num_outputs;
+ };
+
+ template <
+ long _num_filters,
+ long _nr,
+ long _nc,
+ int _stride_y,
+ int _stride_x,
+ int _padding_y = _stride_y!=1? 0 : _nr/2,
+ int _padding_x = _stride_x!=1? 0 : _nc/2
+ >
+ class con_
+ {
+ /*!
+ REQUIREMENTS ON TEMPLATE ARGUMENTS
+ - _num_filters > 0
+ - _nr >= 0
+ - _nc >= 0
+ - _stride_y > 0
+ - _stride_x > 0
+ - _padding_y >= 0
+ - _padding_x >= 0
+ - Also, we require that:
+ - if (_nr == 0) then
+ - _padding_y == 0
+ - else
+ - _padding_y < _nr
+ - if (_nc == 0) then
+ - _padding_x == 0
+ - else
+ - _padding_x < _nc
+
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. In particular, it defines a convolution layer that takes an
+ input tensor (nominally representing an image) and convolves it with a set
+ of filters and then outputs the results.
+
+ The dimensions of the tensors output by this layer are as follows (letting
+ IN be the input tensor and OUT the output tensor):
+ - OUT.num_samples() == IN.num_samples()
+ - OUT.k() == num_filters()
+ - OUT.nr() == 1+(IN.nr() + 2*padding_y() - nr())/stride_y()
+ - OUT.nc() == 1+(IN.nc() + 2*padding_x() - nc())/stride_x()
+
+ Note also that setting _nr or _nc to 0 has a special meaning of "set the
+ filter size equal to the input image size". Specifically, it means:
+ - if (_nr == 0) then
+ - nr() == IN.nr()
+ - OUT.nr() == 1
+ - if (_nc == 0) then
+ - nc() == IN.nc()
+ - OUT.nc() == 1
+ !*/
+
+ public:
+ con_(
+ );
+ /*!
+ ensures
+ - #num_filters() == _num_filters
+ - #nr() == _nr
+ - #nc() == _nc
+ - #stride_y() == _stride_y
+ - #stride_x() == _stride_x
+ - #padding_y() == _padding_y
+ - #padding_x() == _padding_x
+ - #get_learning_rate_multiplier() == 1
+ - #get_weight_decay_multiplier() == 1
+ - #get_bias_learning_rate_multiplier() == 1
+ - #get_bias_weight_decay_multiplier() == 0
+ !*/
+
+ con_(
+ num_con_outputs o
+ );
+ /*!
+ ensures
+ - #num_filters() == o.num_outputs
+ - #nr() == _nr
+ - #nc() == _nc
+ - #stride_y() == _stride_y
+ - #stride_x() == _stride_x
+ - #padding_y() == _padding_y
+ - #padding_x() == _padding_x
+ - #get_learning_rate_multiplier() == 1
+ - #get_weight_decay_multiplier() == 1
+ - #get_bias_learning_rate_multiplier() == 1
+ - #get_bias_weight_decay_multiplier() == 0
+ !*/
+
+ long num_filters(
+ ) const;
+ /*!
+ ensures
+ - returns the number of filters contained in this layer. The k dimension
+ of the output tensors produced by this layer will be equal to the number
+ of filters.
+ !*/
+
+ void set_num_filters(
+ long num
+ );
+ /*!
+ requires
+ - num > 0
+ - get_layer_params().size() == 0 || num_filters() == num
+ (i.e. You can't change the number of filters in con_ if the parameter
+ tensor has already been allocated.)
+ ensures
+ - #num_filters() == num
+ !*/
+
+ long nr(
+ ) const;
+ /*!
+ ensures
+ - returns the number of rows in the filters in this layer. Note that if
+ nr()==0 then it means the size of the filter is not yet assigned, but
+ once setup() is called nr() will be set to the input tensor's nr().
+ Therefore, nr()==0 has the special interpretation of "be the same size as
+ the input tensor".
+ !*/
+
+ long nc(
+ ) const;
+ /*!
+ ensures
+ - returns the number of columns in the filters in this layer. Note that if
+ nc()==0 then it means the size of the filter is not yet assigned, but
+ once setup() is called nc() will be set to the input tensor's nc().
+ Therefore, nc()==0 has the special interpretation of "be the same size as
+ the input tensor".
+ !*/
+
+ long stride_y(
+ ) const;
+ /*!
+ ensures
+ - returns the vertical stride used when convolving the filters over an
+ image. That is, each filter will be moved stride_y() pixels down at a
+ time when it moves over the image.
+ !*/
+
+ long stride_x(
+ ) const;
+ /*!
+ ensures
+ - returns the horizontal stride used when convolving the filters over an
+ image. That is, each filter will be moved stride_x() pixels right at a
+ time when it moves over the image.
+ !*/
+
+ long padding_y(
+ ) const;
+ /*!
+ ensures
+ - returns the number of pixels of zero padding added to the top and bottom
+ sides of the image.
+ !*/
+
+ long padding_x(
+ ) const;
+ /*!
+ ensures
+ - returns the number of pixels of zero padding added to the left and right
+ sides of the image.
+ !*/
+
+ double get_learning_rate_multiplier(
+ ) const;
+ /*!
+ ensures
+ - returns a multiplier number. The interpretation is that this object is
+ requesting that the learning rate used to optimize its parameters be
+ multiplied by get_learning_rate_multiplier().
+ !*/
+
+ double get_weight_decay_multiplier(
+ ) const;
+ /*!
+ ensures
+ - returns a multiplier number. The interpretation is that this object is
+ requesting that the weight decay used to optimize its parameters be
+ multiplied by get_weight_decay_multiplier().
+ !*/
+
+ void set_learning_rate_multiplier(
+ double val
+ );
+ /*!
+ requires
+ - val >= 0
+ ensures
+ - #get_learning_rate_multiplier() == val
+ !*/
+
+ void set_weight_decay_multiplier(
+ double val
+ );
+ /*!
+ requires
+ - val >= 0
+ ensures
+ - #get_weight_decay_multiplier() == val
+ !*/
+
+ double get_bias_learning_rate_multiplier(
+ ) const;
+ /*!
+ ensures
+ - returns a multiplier number. The interpretation is that this object is
+ requesting that the learning rate used to optimize its bias parameters be
+ multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
+ !*/
+
+ double get_bias_weight_decay_multiplier(
+ ) const;
+ /*!
+ ensures
+ - returns a multiplier number. The interpretation is that this object is
+ requesting that the weight decay used to optimize its bias parameters be
+ multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
+ !*/
+
+ void set_bias_learning_rate_multiplier(
+ double val
+ );
+ /*!
+ requires
+ - val >= 0
+ ensures
+ - #get_bias_learning_rate_multiplier() == val
+ !*/
+
+ void set_bias_weight_decay_multiplier(
+ double val
+ );
+ /*!
+ requires
+ - val >= 0
+ ensures
+ - #get_bias_weight_decay_multiplier() == val
+ !*/
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
+ template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
+ dpoint map_input_to_output(dpoint p) const;
+ dpoint map_output_to_input(dpoint p) const;
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
+ !*/
+
+ };
+
+ template <
+ long num_filters,
+ long nr,
+ long nc,
+ int stride_y,
+ int stride_x,
+ typename SUBNET
+ >
+ using con = add_layer<con_<num_filters,nr,nc,stride_y,stride_x>, SUBNET>;
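+
+    // A worked example (an illustration, not from the original header): applying
+    // con<16,5,5,2,2,SUBNET> to a 64x64 input uses the default padding of 0 (because the
+    // stride is not 1), so by the formulas above the output has k()==16 and
+    // nr()==nc()==1+(64+0-5)/2==30 (using integer division).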
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ long _num_filters,
+ long _nr,
+ long _nc,
+ int _stride_y,
+ int _stride_x,
+ int _padding_y = _stride_y!=1? 0 : _nr/2,
+ int _padding_x = _stride_x!=1? 0 : _nc/2
+ >
+ class cont_
+ {
+ /*!
+ REQUIREMENTS ON TEMPLATE ARGUMENTS
+ All of them must be > 0.
+ Also, we require that:
+ - 0 <= _padding_y && _padding_y < _nr
+ - 0 <= _padding_x && _padding_x < _nc
+
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. In particular, it defines a transposed convolution layer
+ that takes an input tensor and transpose convolves (sometimes called
+ "deconvolution") it with a set of filters and then outputs the results.
+
+ This is essentially a convolutional layer that allows fractional strides.
+ Therefore, you can make output tensors that are larger than the input
+ tensors using this layer type.
+
+
+ The dimensions of the tensors output by this layer are as follows (letting
+ IN be the input tensor and OUT the output tensor):
+ - OUT.num_samples() == IN.num_samples()
+ - OUT.k() == num_filters()
+ - OUT.nr() == stride_y()*(IN.nr()-1) + nr() - 2*padding_y()
+ - OUT.nc() == stride_x()*(IN.nc()-1) + nc() - 2*padding_x()
+ !*/
+
+ public:
+ cont_(
+ );
+ /*!
+ ensures
+ - #num_filters() == _num_filters
+ - #nr() == _nr
+ - #nc() == _nc
+ - #stride_y() == _stride_y
+ - #stride_x() == _stride_x
+ - #padding_y() == _padding_y
+ - #padding_x() == _padding_x
+ - #get_learning_rate_multiplier() == 1
+ - #get_weight_decay_multiplier() == 1
+ - #get_bias_learning_rate_multiplier() == 1
+ - #get_bias_weight_decay_multiplier() == 0
+ !*/
+
+ cont_(
+ num_con_outputs o
+ );
+ /*!
+ ensures
+ - #num_filters() == o.num_outputs
+ - #nr() == _nr
+ - #nc() == _nc
+ - #stride_y() == _stride_y
+ - #stride_x() == _stride_x
+ - #padding_y() == _padding_y
+ - #padding_x() == _padding_x
+ - #get_learning_rate_multiplier() == 1
+ - #get_weight_decay_multiplier() == 1
+ - #get_bias_learning_rate_multiplier() == 1
+ - #get_bias_weight_decay_multiplier() == 0
+ !*/
+
+ long num_filters(
+ ) const;
+ /*!
+ ensures
+ - returns the number of filters contained in this layer. The k dimension
+ of the output tensors produced by this layer will be equal to the number
+ of filters.
+ !*/
+
+ void set_num_filters(
+ long num
+ );
+ /*!
+ requires
+ - num > 0
+ - get_layer_params().size() == 0 || num_filters() == num
+ (i.e. You can't change the number of filters in cont_ if the parameter
+ tensor has already been allocated.)
+ ensures
+ - #num_filters() == num
+ !*/
+
+ long nr(
+ ) const;
+ /*!
+ ensures
+ - returns the number of rows in the filters in this layer.
+ !*/
+
+ long nc(
+ ) const;
+ /*!
+ ensures
+ - returns the number of columns in the filters in this layer.
+ !*/
+
+ long stride_y(
+ ) const;
+ /*!
+ ensures
+ - returns the vertical stride used when convolving the filters over an
+ image. That is, each filter will be moved 1.0/stride_y() pixels down at
+ a time when it moves over the image.
+ !*/
+
+ long stride_x(
+ ) const;
+ /*!
+ ensures
+ - returns the horizontal stride used when convolving the filters over an
+ image. That is, each filter will be moved 1.0/stride_x() pixels right at
+ a time when it moves over the image.
+ !*/
+
+ long padding_y(
+ ) const;
+ /*!
+ ensures
+ - returns the number of pixels of zero padding added to the top and bottom
+ sides of the image.
+ !*/
+
+ long padding_x(
+ ) const;
+ /*!
+ ensures
+ - returns the number of pixels of zero padding added to the left and right
+ sides of the image.
+ !*/
+
+ double get_learning_rate_multiplier(
+ ) const;
+ /*!
+ ensures
+ - returns a multiplier number. The interpretation is that this object is
+ requesting that the learning rate used to optimize its parameters be
+ multiplied by get_learning_rate_multiplier().
+ !*/
+
+ double get_weight_decay_multiplier(
+ ) const;
+ /*!
+ ensures
+ - returns a multiplier number. The interpretation is that this object is
+ requesting that the weight decay used to optimize its parameters be
+ multiplied by get_weight_decay_multiplier().
+ !*/
+
+ void set_learning_rate_multiplier(
+ double val
+ );
+ /*!
+ requires
+ - val >= 0
+ ensures
+ - #get_learning_rate_multiplier() == val
+ !*/
+
+ void set_weight_decay_multiplier(
+ double val
+ );
+ /*!
+ requires
+ - val >= 0
+ ensures
+ - #get_weight_decay_multiplier() == val
+ !*/
+
+ double get_bias_learning_rate_multiplier(
+ ) const;
+ /*!
+ ensures
+ - returns a multiplier number. The interpretation is that this object is
+ requesting that the learning rate used to optimize its bias parameters be
+ multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
+ !*/
+
+ double get_bias_weight_decay_multiplier(
+ ) const;
+ /*!
+ ensures
+ - returns a multiplier number. The interpretation is that this object is
+ requesting that the weight decay used to optimize its bias parameters be
+ multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
+ !*/
+
+ void set_bias_learning_rate_multiplier(
+ double val
+ );
+ /*!
+ requires
+ - val >= 0
+ ensures
+ - #get_bias_learning_rate_multiplier() == val
+ !*/
+
+ void set_bias_weight_decay_multiplier(
+ double val
+ );
+ /*!
+ requires
+ - val >= 0
+ ensures
+ - #get_bias_weight_decay_multiplier() == val
+ !*/
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
+ template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
+ dpoint map_input_to_output(dpoint p) const;
+ dpoint map_output_to_input(dpoint p) const;
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
+ !*/
+
+ };
+
+ template <
+ long num_filters,
+ long nr,
+ long nc,
+ int stride_y,
+ int stride_x,
+ typename SUBNET
+ >
+ using cont = add_layer<cont_<num_filters,nr,nc,stride_y,stride_x>, SUBNET>;
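+
+    // A worked example (an illustration, not from the original header): with its default
+    // padding of 0, cont<16,2,2,2,2,SUBNET> maps a 30x30 input to an output with
+    // nr()==nc()==2*(30-1)+2==60, i.e. it doubles the spatial resolution.  This is why
+    // cont is commonly used as a learned upsampling layer.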
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ int scale_y,
+ int scale_x
+ >
+ class upsample_
+ {
+ /*!
+ REQUIREMENTS ON TEMPLATE ARGUMENTS
+ All of them must be >= 1.
+
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. In particular, it allows you to upsample a layer using
+ bilinear interpolation. To be very specific, it upsamples each of the
+ channels in an input tensor. Therefore, if IN is the input tensor to this
+ layer and OUT the output tensor, then we will have:
+ - OUT.num_samples() == IN.num_samples()
+ - OUT.k() == IN.k()
+ - OUT.nr() == IN.nr()*scale_y
+                - OUT.nc() == IN.nc()*scale_x
+ - for all valid i,k: image_plane(OUT,i,k) is a copy of
+ image_plane(IN,i,k) that has been bilinearly interpolated to fit into
+ the shape of image_plane(OUT,i,k).
+ !*/
+ public:
+
+ upsample_(
+ );
+ /*!
+ ensures
+ - This object has no state, so the constructor does nothing, aside from
+ providing default constructability.
+ !*/
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
+ template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
+ dpoint map_input_to_output(dpoint p) const;
+ dpoint map_output_to_input(dpoint p) const;
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
+ !*/
+ };
+
+ template <
+ int scale,
+ typename SUBNET
+ >
+ using upsample = add_layer<upsample_<scale,scale>, SUBNET>;
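+
+    // A usage sketch (an illustration, not from the original header; the name up2 is made up):
+    //
+    //     template <typename SUBNET> using up2 = upsample<2,SUBNET>;
+    //
+    // doubles nr() and nc() of every channel via bilinear interpolation, and is often
+    // combined with concat/tag layers to build encoder-decoder style networks.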
+
+// ----------------------------------------------------------------------------------------
+
+ class dropout_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. In particular, it defines a dropout layer. Therefore, it
+ passes its inputs through the stochastic function f(x) which outputs either
+ 0 or x. The probability of 0 being output is given by the drop_rate
+ argument to this object's constructor.
+
+ Note that, after you finish training a network with dropout, it is a good
+ idea to replace each dropout_ layer with a multiply_ layer because the
+ multiply_ layer is faster and deterministic.
+ !*/
+
+ public:
+
+ explicit dropout_(
+ float drop_rate = 0.5
+ );
+ /*!
+ requires
+ - 0 <= drop_rate <= 1
+ ensures
+ - #get_drop_rate() == drop_rate
+ !*/
+
+ float get_drop_rate (
+ ) const;
+ /*!
+ ensures
+ - returns the probability that an individual input value to this layer will
+ be replaced with 0.
+ !*/
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ void forward_inplace(const tensor& input, tensor& output);
+ void backward_inplace(const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
+ dpoint map_input_to_output(dpoint p) const;
+ dpoint map_output_to_input(dpoint p) const;
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
+ !*/
+ };
+
+ template <typename SUBNET>
+ using dropout = add_layer<dropout_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class multiply_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. In particular, it defines a basic layer that just
+ multiplies its input tensor with a constant value and returns the result.
+ It therefore has no learnable parameters.
+ !*/
+
+ public:
+ explicit multiply_(
+ float val = 0.5
+ );
+ /*!
+ ensures
+ - #get_multiply_value() == val
+ !*/
+
+ multiply_ (
+ const dropout_& item
+ );
+ /*!
+ ensures
+ - #get_multiply_value() == 1-item.get_drop_rate()
+ (i.e. We construct the multiply_ layer so that it is essentially a
+ deterministic version of the given dropout_ layer)
+ !*/
+
+ float get_multiply_value (
+ ) const;
+ /*!
+ ensures
+ - this layer simply multiplies its input tensor by get_multiply_value() and
+ produces the result as output.
+ !*/
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ void forward_inplace(const tensor& input, tensor& output);
+ void backward_inplace(const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
+ dpoint map_input_to_output(dpoint p) const;
+ dpoint map_output_to_input(dpoint p) const;
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
+ !*/
+ };
+
+ template <typename SUBNET>
+ using multiply = add_layer<multiply_, SUBNET>;
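+
+    // A usage sketch (an illustration, not from the original header; the names train_net and
+    // test_net are made up): because multiply_ is constructible from dropout_, a network
+    // declared with dropout for training,
+    //
+    //     using train_net = fc<10,dropout<relu<fc<84,input<matrix<float,0,1>>>>>>;
+    //
+    // can be converted for inference by declaring the analogous type that uses multiply
+    // instead of dropout and constructing it from the trained network:
+    //
+    //     using test_net = fc<10,multiply<relu<fc<84,input<matrix<float,0,1>>>>>>;
+    //     // train_net tnet;  /* ... train ... */  test_net deployed(tnet);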
+
+// ----------------------------------------------------------------------------------------
+
+ enum layer_mode
+ {
+ CONV_MODE = 0, // convolutional mode
+ FC_MODE = 1 // fully connected mode
+ };
+
+ const double DEFAULT_BATCH_NORM_EPS = 0.0001;
+
+ template <
+ layer_mode mode
+ >
+ class bn_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. In particular, it defines a batch normalization layer that
+ implements the method described in the paper:
+ Batch Normalization: Accelerating Deep Network Training by Reducing
+ Internal Covariate Shift by Sergey Ioffe and Christian Szegedy
+
+ In particular, this layer produces output tensors with the same
+ dimensionality as the input tensors, except that the mean and variances of
+ the elements have been standardized to 0 and 1 respectively.
+
+ It should also be noted that when tensors with a num_samples() dimension of
+ 1 are passed to this layer it doesn't perform batch normalization.
+ Instead, it runs in "inference mode" where the learned linear normalizing
+ transformation is used to transform the tensor.
+
+ Finally, after you finish training a batch normalized network, it is a good
+ idea to replace each bn_ layer with an affine_ layer because the affine_
+ layer is faster and will never surprise you by performing batch
+ normalization on tensors that have a num_samples() dimension > 1. This allows
+ you to run large mini-batches of samples through your final network without
+ batch normalization executing at all.
+ !*/
+
+ public:
+ bn_(
+ );
+ /*!
+ ensures
+ - #get_mode() == mode
+ - #get_running_stats_window_size() == 100
+ - #get_learning_rate_multiplier() == 1
+ - #get_weight_decay_multiplier() == 0
+ - #get_bias_learning_rate_multiplier() == 1
+ - #get_bias_weight_decay_multiplier() == 1
+                - #get_eps() == DEFAULT_BATCH_NORM_EPS
+ !*/
+
+ explicit bn_(
+ unsigned long window_size,
+            double eps = DEFAULT_BATCH_NORM_EPS
+ );
+ /*!
+ requires
+ - eps > 0
+ - window_size > 0
+ ensures
+ - #get_mode() == mode
+ - #get_running_stats_window_size() == window_size
+ - #get_learning_rate_multiplier() == 1
+ - #get_weight_decay_multiplier() == 0
+ - #get_bias_learning_rate_multiplier() == 1
+ - #get_bias_weight_decay_multiplier() == 1
+ - #get_eps() == eps
+ !*/
+
+ layer_mode get_mode(
+ ) const;
+ /*!
+ ensures
+ - returns the mode of this layer, either CONV_MODE or FC_MODE.
+ If the mode is FC_MODE then the normalization is applied across the
+ samples in a tensor (i.e. k()*nr()*nc() different things will be
+ normalized). Otherwise, normalization is applied across everything
+ except for the k() dimension, resulting in there being only k()
+ normalization equations that are applied spatially over the tensor.
+
+ Therefore, if you are putting batch normalization after a fully connected
+ layer you should use FC_MODE. Otherwise, if you are putting batch
+ normalization after a convolutional layer you should use CONV_MODE.
+ !*/
+
+ double get_eps(
+ ) const;
+ /*!
+ ensures
+ - When doing batch normalization, we are dividing by the standard
+ deviation. This epsilon value returned by this function is added to the
+ variance to prevent the division from dividing by zero.
+ !*/
+
+ unsigned long get_running_stats_window_size (
+ ) const;
+ /*!
+ ensures
+ - Just as recommended in the batch normalization paper, this object keeps a
+ running average of the mean and standard deviations of the features.
+ These averages are used during "inference mode" so you can run a single
+ object through a batch normalized network. They are also what is used to
+ initialize an affine_ layer that is constructed from a bn_ layer. This
+ function returns the effective number of recent samples used to compute
+ the running average.
+ !*/
+
+ void set_running_stats_window_size (
+ unsigned long new_window_size
+ );
+ /*!
+ requires
+ - new_window_size > 0
+ ensures
+ - #get_running_stats_window_size() == new_window_size
+ !*/
+
+ double get_learning_rate_multiplier(
+ ) const;
+ /*!
+ ensures
+ - returns a multiplier number. The interpretation is that this object is
+ requesting that the learning rate used to optimize its parameters be
+ multiplied by get_learning_rate_multiplier().
+ !*/
+
+ double get_weight_decay_multiplier(
+ ) const;
+ /*!
+ ensures
+ - returns a multiplier number. The interpretation is that this object is
+ requesting that the weight decay used to optimize its parameters be
+ multiplied by get_weight_decay_multiplier().
+ !*/
+
+ void set_learning_rate_multiplier(
+ double val
+ );
+ /*!
+ requires
+ - val >= 0
+ ensures
+ - #get_learning_rate_multiplier() == val
+ !*/
+
+ void set_weight_decay_multiplier(
+ double val
+ );
+ /*!
+ requires
+ - val >= 0
+ ensures
+ - #get_weight_decay_multiplier() == val
+ !*/
+
+ double get_bias_learning_rate_multiplier(
+ ) const;
+ /*!
+ ensures
+ - returns a multiplier number. The interpretation is that this object is
+ requesting that the learning rate used to optimize its bias parameters be
+ multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
+ !*/
+
+ double get_bias_weight_decay_multiplier(
+ ) const;
+ /*!
+ ensures
+ - returns a multiplier number. The interpretation is that this object is
+ requesting that the weight decay used to optimize its bias parameters be
+ multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
+ !*/
+
+ void set_bias_learning_rate_multiplier(
+ double val
+ );
+ /*!
+ requires
+ - val >= 0
+ ensures
+ - #get_bias_learning_rate_multiplier() == val
+ !*/
+
+ void set_bias_weight_decay_multiplier(
+ double val
+ );
+ /*!
+ requires
+ - val >= 0
+ ensures
+ - #get_bias_weight_decay_multiplier() == val
+ !*/
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
+ template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
+ dpoint map_input_to_output(dpoint p) const;
+ dpoint map_output_to_input(dpoint p) const;
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
+ !*/
+ };
+
+ template <typename SUBNET>
+ using bn_con = add_layer<bn_<CONV_MODE>, SUBNET>;
+ template <typename SUBNET>
+ using bn_fc = add_layer<bn_<FC_MODE>, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename net_type>
+ void set_all_bn_running_stats_window_sizes (
+ const net_type& net,
+ unsigned long new_window_size
+ );
+ /*!
+ requires
+ - new_window_size > 0
+ - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+ add_tag_layer.
+ ensures
+ - Sets the get_running_stats_window_size() field of all bn_ layers in net to
+ new_window_size.
+ !*/
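+
+    // As a minimal usage sketch (the con, relu, and input layers used below are defined
+    // elsewhere in dlib; the filter counts and the window size of 1000 are illustrative):
+    //
+    //    using net_type = relu<bn_con<con<16,5,5,2,2,
+    //                     input<matrix<unsigned char>>>>>;
+    //    net_type net;
+    //    set_all_bn_running_stats_window_sizes(net, 1000);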
+
+// ----------------------------------------------------------------------------------------
+
+ class affine_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. In particular, it applies a simple pointwise linear
+ transformation to an input tensor. You can think of it as having two
+ parameter tensors, A and B. If the input tensor is called INPUT then the
+ output of this layer is:
+ A*INPUT+B
+ where all operations are performed element wise and each sample in the
+ INPUT tensor is processed separately.
+
+            Moreover, this object has two modes that affect the dimensionalities of A
+ and B and how they are applied to compute A*INPUT+B. If
+ get_mode()==FC_MODE then A and B each have the same dimensionality as the
+ input tensor, except their num_samples() dimensions are 1. If
+ get_mode()==CONV_MODE then A and B have all their dimensions set to 1
+ except for k(), which is equal to INPUT.k().
+
+ In either case, the computation of A*INPUT+B is performed pointwise over all
+ the elements of INPUT using either:
+ OUTPUT(n,k,r,c) == A(1,k,r,c)*INPUT(n,k,r,c)+B(1,k,r,c)
+ or
+ OUTPUT(n,k,r,c) == A(1,k,1,1)*INPUT(n,k,r,c)+B(1,k,1,1)
+ as appropriate.
+
+
+ Finally, note that the parameters of this layer are not learnable and
+ therefore not modified during network updates. Instead, the layer will
+ perform the identity transformation unless it is initialized with a bn_
+ layer, in which case it will perform whatever transformation the bn_ layer
+ has learned.
+ !*/
+
+ public:
+
+ affine_(
+ );
+ /*!
+ ensures
+ - #get_mode() == FC_MODE
+ !*/
+
+ affine_(
+ layer_mode mode
+ );
+ /*!
+ ensures
+ - #get_mode() == mode
+ !*/
+
+ template <
+ layer_mode mode
+ >
+ affine_(
+ const bn_<mode>& layer
+ );
+ /*!
+ ensures
+ - Constructs affine_ so that it performs the same transformation as the
+ supplied batch normalization layer. You would want to do this after you
+ finish training a network with bn_ layers because the affine_ layer will
+ execute faster.
+ - #get_mode() == layer.get_mode()
+ !*/
+
+ layer_mode get_mode(
+ ) const;
+ /*!
+ ensures
+ - returns the mode of this layer, either CONV_MODE or FC_MODE.
+ !*/
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ void forward_inplace(const tensor& input, tensor& output);
+ void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
+ dpoint map_input_to_output(dpoint p) const;
+ dpoint map_output_to_input(dpoint p) const;
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the
+ EXAMPLE_COMPUTATIONAL_LAYER_ interface. Also note that get_layer_params()
+ always returns an empty tensor since there are no learnable parameters in this
+ object.
+ !*/
+
+ };
+
+ template <typename SUBNET>
+ using affine = add_layer<affine_, SUBNET>;
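+
+    // For illustration, an affine_ can be constructed directly from a trained bn_ layer
+    // (a sketch only; in practice the bn_ object would come from a trained network rather
+    // than being default constructed):
+    //
+    //    bn_<CONV_MODE> bn;   // imagine this is a bn_ layer taken from a trained net
+    //    affine_ a(bn);       // performs the same transformation as bn, but runs faster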
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ long _nr,
+ long _nc,
+ int _stride_y,
+ int _stride_x,
+ int _padding_y = _stride_y!=1? 0 : _nr/2,
+ int _padding_x = _stride_x!=1? 0 : _nc/2
+ >
+ class max_pool_
+ {
+ /*!
+ REQUIREMENTS ON TEMPLATE ARGUMENTS
+ - _nr >= 0
+ - _nc >= 0
+ - _stride_y > 0
+ - _stride_x > 0
+ - _padding_y >= 0
+ - _padding_x >= 0
+ - if (_nr != 0) then
+ - _padding_y < _nr
+ - else
+ - _padding_y == 0
+ - if (_nc != 0) then
+                - _padding_x < _nc
+ - else
+ - _padding_x == 0
+
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. In particular, it defines a max pooling layer that takes an
+ input tensor and downsamples it. It does this by sliding a window over the
+ images in an input tensor and outputting, for each channel, the maximum
+ element within the window.
+
+ If _nr == 0 then it means the filter size covers all the rows in the input
+ tensor, similarly for the _nc parameter. To be precise, if we call the
+ input tensor IN and the output tensor OUT, then OUT is defined as follows:
+ - let FILT_NR == (nr()==0) ? IN.nr() : nr()
+ - let FILT_NC == (nc()==0) ? IN.nc() : nc()
+ - OUT.num_samples() == IN.num_samples()
+ - OUT.k() == IN.k()
+ - OUT.nr() == 1+(IN.nr() + 2*padding_y() - FILT_NR)/stride_y()
+ - OUT.nc() == 1+(IN.nc() + 2*padding_x() - FILT_NC)/stride_x()
+ - for all valid s, k, r, and c:
+                - image_plane(OUT,s,k)(r,c) == max(subm_clipped(image_plane(IN,s,k),
+                                                     centered_rect(c*stride_x() + FILT_NC/2 - padding_x(),
+                                                                   r*stride_y() + FILT_NR/2 - padding_y(),
+                                                                   FILT_NC,
+                                                                   FILT_NR)))
+ !*/
+
+ public:
+
+ max_pool_ (
+ );
+ /*!
+ ensures
+ - #nr() == _nr
+ - #nc() == _nc
+ - #stride_y() == _stride_y
+ - #stride_x() == _stride_x
+ - #padding_y() == _padding_y
+ - #padding_x() == _padding_x
+ !*/
+
+ long nr(
+ ) const;
+ /*!
+ ensures
+ - returns the number of rows in the pooling window or 0 if the window size
+ is "the entire input tensor".
+ !*/
+
+ long nc(
+ ) const;
+ /*!
+ ensures
+                - returns the number of columns in the pooling window or 0 if the window size
+ is "the entire input tensor".
+ !*/
+
+ long stride_y(
+ ) const;
+ /*!
+ ensures
+ - returns the vertical stride used when scanning the max pooling window
+ over an image. That is, each window will be moved stride_y() pixels down
+ at a time when it moves over the image.
+ !*/
+
+ long stride_x(
+ ) const;
+ /*!
+ ensures
+ - returns the horizontal stride used when scanning the max pooling window
+                  over an image.  That is, each window will be moved stride_x() pixels to
+                  the right at a time when it moves over the image.
+ !*/
+
+ long padding_y(
+ ) const;
+ /*!
+ ensures
+ - returns the number of pixels of zero padding added to the top and bottom
+ sides of the image.
+ !*/
+
+ long padding_x(
+ ) const;
+ /*!
+ ensures
+ - returns the number of pixels of zero padding added to the left and right
+ sides of the image.
+ !*/
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
+ template <typename SUBNET> void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
+ dpoint map_input_to_output(dpoint p) const;
+ dpoint map_output_to_input(dpoint p) const;
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_
+ interface. Note that this layer doesn't have any parameters, so the tensor
+ returned by get_layer_params() is always empty.
+ !*/
+ };
+
+ template <
+ long nr,
+ long nc,
+ int stride_y,
+ int stride_x,
+ typename SUBNET
+ >
+ using max_pool = add_layer<max_pool_<nr,nc,stride_y,stride_x>, SUBNET>;
+
+ template <
+ typename SUBNET
+ >
+ using max_pool_everything = add_layer<max_pool_<0,0,1,1>, SUBNET>;
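+
+    // A quick worked example of the output size formula above: max_pool<3,3,2,2,SUBNET>
+    // uses a 3x3 window with stride 2 and, since the stride isn't 1, a default padding of
+    // 0.  A 28x28 input plane (an illustrative size) therefore produces an output with
+    //    1+(28 + 2*0 - 3)/2 == 13
+    // rows and columns.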
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ long _nr,
+ long _nc,
+ int _stride_y,
+ int _stride_x,
+ int _padding_y = _stride_y!=1? 0 : _nr/2,
+ int _padding_x = _stride_x!=1? 0 : _nc/2
+ >
+ class avg_pool_
+ {
+ /*!
+ REQUIREMENTS ON TEMPLATE ARGUMENTS
+ - _nr >= 0
+ - _nc >= 0
+ - _stride_y > 0
+ - _stride_x > 0
+ - _padding_y >= 0
+ - _padding_x >= 0
+ - if (_nr != 0) then
+ - _padding_y < _nr
+ - else
+ - _padding_y == 0
+ - if (_nc != 0) then
+                - _padding_x < _nc
+ - else
+ - _padding_x == 0
+
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. In particular, it defines an average pooling layer that
+ takes an input tensor and downsamples it. It does this by sliding a window
+ over the images in an input tensor and outputting, for each channel, the
+ average element within the window.
+
+ If _nr == 0 then it means the filter size covers all the rows in the input
+ tensor, similarly for the _nc parameter. To be precise, if we call the
+ input tensor IN and the output tensor OUT, then OUT is defined as follows:
+ - let FILT_NR == (nr()==0) ? IN.nr() : nr()
+ - let FILT_NC == (nc()==0) ? IN.nc() : nc()
+ - OUT.num_samples() == IN.num_samples()
+ - OUT.k() == IN.k()
+ - OUT.nr() == 1+(IN.nr() + 2*padding_y() - FILT_NR)/stride_y()
+ - OUT.nc() == 1+(IN.nc() + 2*padding_x() - FILT_NC)/stride_x()
+ - for all valid s, k, r, and c:
+                - image_plane(OUT,s,k)(r,c) == mean(subm_clipped(image_plane(IN,s,k),
+                                                     centered_rect(c*stride_x() + FILT_NC/2 - padding_x(),
+                                                                   r*stride_y() + FILT_NR/2 - padding_y(),
+                                                                   FILT_NC,
+                                                                   FILT_NR)))
+ !*/
+
+ public:
+
+ avg_pool_ (
+ );
+ /*!
+ ensures
+ - #nr() == _nr
+ - #nc() == _nc
+ - #stride_y() == _stride_y
+ - #stride_x() == _stride_x
+ - #padding_y() == _padding_y
+ - #padding_x() == _padding_x
+ !*/
+
+ long nr(
+ ) const;
+ /*!
+ ensures
+ - returns the number of rows in the pooling window or 0 if the window size
+ is "the entire input tensor".
+ !*/
+
+ long nc(
+ ) const;
+ /*!
+ ensures
+                - returns the number of columns in the pooling window or 0 if the window size
+ is "the entire input tensor".
+ !*/
+
+ long stride_y(
+ ) const;
+ /*!
+ ensures
+ - returns the vertical stride used when scanning the pooling window
+ over an image. That is, each window will be moved stride_y() pixels down
+ at a time when it moves over the image.
+ !*/
+
+ long stride_x(
+ ) const;
+ /*!
+ ensures
+ - returns the horizontal stride used when scanning the pooling window
+                  over an image.  That is, each window will be moved stride_x() pixels to
+                  the right at a time when it moves over the image.
+ !*/
+
+ long padding_y(
+ ) const;
+ /*!
+ ensures
+ - returns the number of pixels of zero padding added to the top and bottom
+ sides of the image.
+ !*/
+
+ long padding_x(
+ ) const;
+ /*!
+ ensures
+ - returns the number of pixels of zero padding added to the left and right
+ sides of the image.
+ !*/
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
+ template <typename SUBNET> void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
+ dpoint map_input_to_output(dpoint p) const;
+ dpoint map_output_to_input(dpoint p) const;
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_
+ interface. Note that this layer doesn't have any parameters, so the tensor
+ returned by get_layer_params() is always empty.
+ !*/
+
+ };
+
+ template <
+ long nr,
+ long nc,
+ int stride_y,
+ int stride_x,
+ typename SUBNET
+ >
+ using avg_pool = add_layer<avg_pool_<nr,nc,stride_y,stride_x>, SUBNET>;
+
+ template <
+ typename SUBNET
+ >
+ using avg_pool_everything = add_layer<avg_pool_<0,0,1,1>, SUBNET>;
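+
+    // avg_pool_everything is a convenient way to do global average pooling, e.g. to reduce
+    // each channel of a convolutional feature map to a single value before a fully connected
+    // layer.  A sketch (fc is defined elsewhere in dlib and the 10 output count is illustrative):
+    //
+    //    template <typename SUBNET>
+    //    using globally_pooled = fc<10, avg_pool_everything<SUBNET>>;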
+
+// ----------------------------------------------------------------------------------------
+
+ class relu_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. In particular, it defines a rectified linear layer.
+ Therefore, it passes its inputs through the function
+ f(x)=max(x,0)
+ where f() is applied pointwise across the input tensor.
+ !*/
+
+ public:
+
+ relu_(
+ );
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ void forward_inplace(const tensor& input, tensor& output);
+ void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
+ dpoint map_input_to_output(dpoint p) const;
+ dpoint map_output_to_input(dpoint p) const;
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_
+ interface. Note that this layer doesn't have any parameters, so the tensor
+ returned by get_layer_params() is always empty.
+ !*/
+ };
+
+ template <typename SUBNET>
+ using relu = add_layer<relu_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class prelu_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. In particular, it defines a parametric rectified linear
+ layer. Therefore, it passes its inputs through the function
+ f(x) = x>0 ? x : p*x
+ where f() is applied pointwise across the input tensor and p is a scalar
+ parameter learned by this layer.
+
+
+ This is the layer type introduced in the paper:
+ He, Kaiming, et al. "Delving deep into rectifiers: Surpassing
+ human-level performance on imagenet classification." Proceedings of the
+ IEEE International Conference on Computer Vision. 2015.
+ !*/
+
+ public:
+
+ explicit prelu_(
+ float initial_param_value = 0.25
+ );
+ /*!
+ ensures
+ - The p parameter will be initialized with initial_param_value.
+ - #get_initial_param_value() == initial_param_value.
+ !*/
+
+ float get_initial_param_value (
+ ) const;
+ /*!
+ ensures
+ - returns the initial value of the prelu parameter.
+ !*/
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ void forward_inplace(const tensor& input, tensor& output);
+ void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
+ dpoint map_input_to_output(dpoint p) const;
+ dpoint map_output_to_input(dpoint p) const;
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
+ !*/
+ };
+
+ template <typename SUBNET>
+ using prelu = add_layer<prelu_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class sig_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. In particular, it defines a sigmoid layer. Therefore, it
+ passes its inputs through the function
+ f(x)=1/(1+exp(-x))
+ where f() is applied pointwise across the input tensor.
+ !*/
+
+ public:
+
+ sig_(
+ );
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ void forward_inplace(const tensor& input, tensor& output);
+ void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
+ dpoint map_input_to_output(dpoint p) const;
+ dpoint map_output_to_input(dpoint p) const;
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_
+ interface. Note that this layer doesn't have any parameters, so the tensor
+ returned by get_layer_params() is always empty.
+ !*/
+ };
+
+ template <typename SUBNET>
+ using sig = add_layer<sig_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class htan_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. In particular, it defines a hyperbolic tangent layer.
+ Therefore, it passes its inputs through the function
+ f(x)=std::tanh(x)
+ where f() is applied pointwise across the input tensor.
+ !*/
+
+ public:
+
+ htan_(
+ );
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ void forward_inplace(const tensor& input, tensor& output);
+ void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
+ dpoint map_input_to_output(dpoint p) const;
+ dpoint map_output_to_input(dpoint p) const;
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_
+ interface. Note that this layer doesn't have any parameters, so the tensor
+ returned by get_layer_params() is always empty.
+ !*/
+ };
+
+ template <typename SUBNET>
+ using htan = add_layer<htan_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class softmax_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. In particular, it defines a softmax layer. To be precise,
+ we define the softmax function s(x) as:
+ s(x) == exp(x)/sum(exp(x))
+ where x is a vector. Then this layer treats its input tensor as a
+ collection of multi-channel images and applies s() to each spatial location
+ in each image. In each application, the tensor::k() channel elements at
+ each position are input to s() and then replaced by the outputs of s().
+
+ This means that, for example, if you collapsed each output image to a 1
+ channel image by adding the channels then you would end up with images
+ where each pixel value was 1. This is because the sum of the outputs of
+ s() will always be equal to 1.
+ !*/
+
+ public:
+
+ softmax_(
+ );
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ void forward_inplace(const tensor& input, tensor& output);
+ void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_
+ interface. Note that this layer doesn't have any parameters, so the tensor
+ returned by get_layer_params() is always empty.
+ !*/
+ };
+
+ template <typename SUBNET>
+ using softmax = add_layer<softmax_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class softmax_all_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. In particular, it defines a softmax layer. To be precise,
+ we define the softmax function s(x) as:
+ s(x) == exp(x)/sum(exp(x))
+ where x is a vector. Then this layer treats its input tensor as a
+ collection of tensor::num_samples() vectors and applies s() to each vector
+ in the tensor. Therefore, there are logically tensor::num_samples()
+ invocations of s().
+ !*/
+
+ public:
+
+ softmax_all_(
+ );
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ void forward_inplace(const tensor& input, tensor& output);
+ void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_
+ interface. Note that this layer doesn't have any parameters, so the tensor
+ returned by get_layer_params() is always empty.
+ !*/
+ };
+
+ template <typename SUBNET>
+ using softmax_all = add_layer<softmax_all_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ template<typename> class tag
+ >
+ class add_prev_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. This layer simply adds the output of two previous layers.
+ In particular, it adds the tensor from its immediate predecessor layer,
+ sub.get_output(), with the tensor from a deeper layer,
+ layer<tag>(sub).get_output().
+
+ Therefore, you supply a tag via add_prev_'s template argument that tells it
+ what layer to add to the output of the previous layer. The result of this
+ addition is output by add_prev_. Finally, the addition happens pointwise
+ according to 4D tensor arithmetic. If the dimensions don't match then
+ missing elements are presumed to be equal to 0. Moreover, each dimension
+ of the output tensor is equal to the maximum dimension of either of the
+ inputs. That is, if the tensors A and B are being added to produce C then:
+ - C.num_samples() == max(A.num_samples(), B.num_samples())
+ - C.k() == max(A.k(), B.k())
+ - C.nr() == max(A.nr(), B.nr())
+ - C.nc() == max(A.nc(), B.nc())
+ !*/
+
+ public:
+ add_prev_(
+ );
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
+ template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
+ dpoint map_input_to_output(dpoint p) const;
+ dpoint map_output_to_input(dpoint p) const;
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
+ !*/
+ };
+
+
+ template <
+ template<typename> class tag,
+ typename SUBNET
+ >
+ using add_prev = add_layer<add_prev_<tag>, SUBNET>;
+
+ // Here we add some convenient aliases for using add_prev_ with the tag layers.
+ template <typename SUBNET> using add_prev1 = add_prev<tag1, SUBNET>;
+ template <typename SUBNET> using add_prev2 = add_prev<tag2, SUBNET>;
+ template <typename SUBNET> using add_prev3 = add_prev<tag3, SUBNET>;
+ template <typename SUBNET> using add_prev4 = add_prev<tag4, SUBNET>;
+ template <typename SUBNET> using add_prev5 = add_prev<tag5, SUBNET>;
+ template <typename SUBNET> using add_prev6 = add_prev<tag6, SUBNET>;
+ template <typename SUBNET> using add_prev7 = add_prev<tag7, SUBNET>;
+ template <typename SUBNET> using add_prev8 = add_prev<tag8, SUBNET>;
+ template <typename SUBNET> using add_prev9 = add_prev<tag9, SUBNET>;
+ template <typename SUBNET> using add_prev10 = add_prev<tag10, SUBNET>;
+ using add_prev1_ = add_prev_<tag1>;
+ using add_prev2_ = add_prev_<tag2>;
+ using add_prev3_ = add_prev_<tag3>;
+ using add_prev4_ = add_prev_<tag4>;
+ using add_prev5_ = add_prev_<tag5>;
+ using add_prev6_ = add_prev_<tag6>;
+ using add_prev7_ = add_prev_<tag7>;
+ using add_prev8_ = add_prev_<tag8>;
+ using add_prev9_ = add_prev_<tag9>;
+ using add_prev10_ = add_prev_<tag10>;
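+
+    // The classic use of add_prev is a residual connection: tag the input, run it through a
+    // few layers, then add the tagged tensor back in.  A minimal sketch (the con, bn_con, and
+    // relu layers and the 8 filter count are illustrative choices):
+    //
+    //    template <typename SUBNET>
+    //    using residual_block = relu<add_prev1<bn_con<con<8,3,3,1,1,
+    //                           relu<bn_con<con<8,3,3,1,1,
+    //                           tag1<SUBNET>>>>>>>>;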
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ template<typename> class tag
+ >
+ class mult_prev_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. This layer simply multiplies the output of two previous
+ layers. In particular, it multiplies the tensor from its immediate
+ predecessor layer, sub.get_output(), with the tensor from a deeper layer,
+ layer<tag>(sub).get_output().
+
+ Therefore, you supply a tag via mult_prev_'s template argument that tells
+ it what layer to multiply with the output of the previous layer. The
+ result of this multiplication is output by mult_prev_. Finally, the
+ multiplication happens pointwise according to 4D tensor arithmetic. If the
+ dimensions don't match then missing elements are presumed to be equal to 0.
+ Moreover, each dimension of the output tensor is equal to the maximum
+ dimension of either of the inputs. That is, if the tensors A and B are
+ being multiplied to produce C then:
+ - C.num_samples() == max(A.num_samples(), B.num_samples())
+ - C.k() == max(A.k(), B.k())
+ - C.nr() == max(A.nr(), B.nr())
+ - C.nc() == max(A.nc(), B.nc())
+ !*/
+
+ public:
+ mult_prev_(
+ );
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
+ template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
+ !*/
+ };
+
+
+ template <
+ template<typename> class tag,
+ typename SUBNET
+ >
+ using mult_prev = add_layer<mult_prev_<tag>, SUBNET>;
+
+ // Here we add some convenient aliases for using mult_prev_ with the tag layers.
+ template <typename SUBNET> using mult_prev1 = mult_prev<tag1, SUBNET>;
+ template <typename SUBNET> using mult_prev2 = mult_prev<tag2, SUBNET>;
+ template <typename SUBNET> using mult_prev3 = mult_prev<tag3, SUBNET>;
+ template <typename SUBNET> using mult_prev4 = mult_prev<tag4, SUBNET>;
+ template <typename SUBNET> using mult_prev5 = mult_prev<tag5, SUBNET>;
+ template <typename SUBNET> using mult_prev6 = mult_prev<tag6, SUBNET>;
+ template <typename SUBNET> using mult_prev7 = mult_prev<tag7, SUBNET>;
+ template <typename SUBNET> using mult_prev8 = mult_prev<tag8, SUBNET>;
+ template <typename SUBNET> using mult_prev9 = mult_prev<tag9, SUBNET>;
+ template <typename SUBNET> using mult_prev10 = mult_prev<tag10, SUBNET>;
+ using mult_prev1_ = mult_prev_<tag1>;
+ using mult_prev2_ = mult_prev_<tag2>;
+ using mult_prev3_ = mult_prev_<tag3>;
+ using mult_prev4_ = mult_prev_<tag4>;
+ using mult_prev5_ = mult_prev_<tag5>;
+ using mult_prev6_ = mult_prev_<tag6>;
+ using mult_prev7_ = mult_prev_<tag7>;
+ using mult_prev8_ = mult_prev_<tag8>;
+ using mult_prev9_ = mult_prev_<tag9>;
+ using mult_prev10_ = mult_prev_<tag10>;
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ template<typename> class tag
+ >
+ class scale_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. This layer scales the output channels of the tagged layer
+            by multiplying them with the output of the previous layer.  To be specific:
+ - Let INPUT == layer<tag>(sub).get_output()
+ - Let SCALES == sub.get_output()
+ - This layer takes INPUT and SCALES as input.
+ - The output of this layer has the same dimensions as INPUT.
+ - This layer requires:
+ - SCALES.num_samples() == INPUT.num_samples()
+ - SCALES.k() == INPUT.k()
+ - SCALES.nr() == 1
+ - SCALES.nc() == 1
+ - The output tensor is produced by pointwise multiplying SCALES with
+ INPUT at each spatial location. Therefore, if OUT is the output of
+ this layer then we would have:
+ OUT(n,k,r,c) == INPUT(n,k,r,c)*SCALES(n,k)
+ !*/
+
+ public:
+ scale_(
+ );
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
+ template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
+ !*/
+ };
+
+
+ template <
+ template<typename> class tag,
+ typename SUBNET
+ >
+ using scale = add_layer<scale_<tag>, SUBNET>;
+
+ // Here we add some convenient aliases for using scale_ with the tag layers.
+ template <typename SUBNET> using scale1 = scale<tag1, SUBNET>;
+ template <typename SUBNET> using scale2 = scale<tag2, SUBNET>;
+ template <typename SUBNET> using scale3 = scale<tag3, SUBNET>;
+ template <typename SUBNET> using scale4 = scale<tag4, SUBNET>;
+ template <typename SUBNET> using scale5 = scale<tag5, SUBNET>;
+ template <typename SUBNET> using scale6 = scale<tag6, SUBNET>;
+ template <typename SUBNET> using scale7 = scale<tag7, SUBNET>;
+ template <typename SUBNET> using scale8 = scale<tag8, SUBNET>;
+ template <typename SUBNET> using scale9 = scale<tag9, SUBNET>;
+ template <typename SUBNET> using scale10 = scale<tag10, SUBNET>;
+ using scale1_ = scale_<tag1>;
+ using scale2_ = scale_<tag2>;
+ using scale3_ = scale_<tag3>;
+ using scale4_ = scale_<tag4>;
+ using scale5_ = scale_<tag5>;
+ using scale6_ = scale_<tag6>;
+ using scale7_ = scale_<tag7>;
+ using scale8_ = scale_<tag8>;
+ using scale9_ = scale_<tag9>;
+ using scale10_ = scale_<tag10>;
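+
+    // scale_ is the building block for channel-wise gating (squeeze-and-excitation style
+    // blocks).  A sketch, where the 64 and 16 channel counts are illustrative and 64 must
+    // equal the k() of the tagged tensor:
+    //
+    //    template <typename SUBNET>
+    //    using channel_gate = scale9<sig<fc<64, relu<fc<16,
+    //                         avg_pool_everything<tag9<SUBNET>>>>>>>;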
+
+// ----------------------------------------------------------------------------------------
+
+ template<
+ template<typename> class... TAG_TYPES
+ >
+ class concat_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. This layer simply concatenates the output of tagged layers.
+ Importantly, each input layer must have the same dimensions (i.e.
+ num_samples, nr, and nc) except for the k channel, which may vary. This is
+ because the concatenation happens along the k dimension. That is, the
+ output of this network is a tensor, OUT, that is the concatenation of the
+ tensors:
+ for each (tag in TAG_TYPES)
+ layer<tag>(subnet).get_output()
+            Therefore, OUT.num_samples(), OUT.nr(), and OUT.nc() match the dimensions
+            of the input tensors while OUT.k() is the sum of the input layers' k()
+            dimensions.
+ !*/
+
+ public:
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
+ template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
+ dpoint map_input_to_output(dpoint p) const;
+ dpoint map_output_to_input(dpoint p) const;
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
+ !*/
+ };
+
+
+ // concat layer definitions
+ template <template<typename> class TAG1,
+ template<typename> class TAG2,
+ typename SUBNET>
+ using concat2 = add_layer<concat_<TAG1, TAG2>, SUBNET>;
+
+ template <template<typename> class TAG1,
+ template<typename> class TAG2,
+ template<typename> class TAG3,
+ typename SUBNET>
+ using concat3 = add_layer<concat_<TAG1, TAG2, TAG3>, SUBNET>;
+
+ template <template<typename> class TAG1,
+ template<typename> class TAG2,
+ template<typename> class TAG3,
+ template<typename> class TAG4,
+ typename SUBNET>
+ using concat4 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4>, SUBNET>;
+
+ template <template<typename> class TAG1,
+ template<typename> class TAG2,
+ template<typename> class TAG3,
+ template<typename> class TAG4,
+ template<typename> class TAG5,
+ typename SUBNET>
+ using concat5 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4, TAG5>, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ /*!A inception layer definitions !*/
+
+ // Now define inception layer tag types. These layer aliases allow creating
+ // the networks described in the paper:
+ // Szegedy, Christian, et al. "Going deeper with convolutions." Proceedings of
+ // the IEEE Conference on Computer Vision and Pattern Recognition. 2015.
+ // See the dnn_inception_ex.cpp example for a complete example of their use. Note also
+ // that we use tag ID numbers >= 1000 to avoid conflict with user's tag layers.
+ template <typename SUBNET> using itag0 = add_tag_layer< 1000 + 0, SUBNET>;
+ template <typename SUBNET> using itag1 = add_tag_layer< 1000 + 1, SUBNET>;
+ template <typename SUBNET> using itag2 = add_tag_layer< 1000 + 2, SUBNET>;
+ template <typename SUBNET> using itag3 = add_tag_layer< 1000 + 3, SUBNET>;
+ template <typename SUBNET> using itag4 = add_tag_layer< 1000 + 4, SUBNET>;
+ template <typename SUBNET> using itag5 = add_tag_layer< 1000 + 5, SUBNET>;
+ // skip to inception input
+ template <typename SUBNET> using iskip = add_skip_layer< itag0, SUBNET>;
+
+ // here are some templates to be used for creating inception layer groups
+ template <template<typename>class B1,
+ template<typename>class B2,
+ typename SUBNET>
+ using inception2 = concat2<itag1, itag2, itag1<B1<iskip< itag2<B2< itag0<SUBNET>>>>>>>;
+
+ template <template<typename>class B1,
+ template<typename>class B2,
+ template<typename>class B3,
+ typename SUBNET>
+ using inception3 = concat3<itag1, itag2, itag3, itag1<B1<iskip< itag2<B2<iskip< itag3<B3< itag0<SUBNET>>>>>>>>>>;
+
+ template <template<typename>class B1,
+ template<typename>class B2,
+ template<typename>class B3,
+ template<typename>class B4,
+ typename SUBNET>
+ using inception4 = concat4<itag1, itag2, itag3, itag4,
+ itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip< itag4<B4< itag0<SUBNET>>>>>>>>>>>>>;
+
+ template <template<typename>class B1,
+ template<typename>class B2,
+ template<typename>class B3,
+ template<typename>class B4,
+ template<typename>class B5,
+ typename SUBNET>
+ using inception5 = concat5<itag1, itag2, itag3, itag4, itag5,
+ itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip< itag4<B4<iskip< itag5<B5< itag0<SUBNET>>>>>>>>>>>>>>>>;
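+
+    // For example, an inception block with four branches can be declared along the lines of
+    // the dnn_inception_ex.cpp example (the filter counts here are illustrative):
+    //
+    //    template <typename SUBNET> using block_a1 = relu<con<10,1,1,1,1,SUBNET>>;
+    //    template <typename SUBNET> using block_a2 = relu<con<10,3,3,1,1,relu<con<16,1,1,1,1,SUBNET>>>>;
+    //    template <typename SUBNET> using block_a3 = relu<con<10,5,5,1,1,relu<con<16,1,1,1,1,SUBNET>>>>;
+    //    template <typename SUBNET> using block_a4 = relu<con<10,1,1,1,1,max_pool<3,3,1,1,SUBNET>>>;
+    //    template <typename SUBNET> using incept_a = inception4<block_a1,block_a2,block_a3,block_a4, SUBNET>;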
+
+// ----------------------------------------------------------------------------------------
+
+ const double DEFAULT_L2_NORM_EPS = 1e-5;
+
+ class l2normalize_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. It takes tensors as input and L2 normalizes them. In particular,
+ it has the following properties:
+ - The output tensors from this layer have the same dimensions as the
+ input tensors.
+ - If you think of each input tensor as a set of tensor::num_samples()
+ vectors, then the output tensor contains the same vectors except they
+ have been length normalized so that their L2 norms are all 1. I.e.
+ for each vector v we will have ||v||==1.
+ !*/
+
+ public:
+
+ explicit l2normalize_(
+ double eps = tt::DEFAULT_L2_NORM_EPS
+ );
+ /*!
+ requires
+ - eps > 0
+ ensures
+ - #get_eps() == eps
+ !*/
+
+ double get_eps(
+ ) const;
+ /*!
+ ensures
+ - When we normalize a vector we divide it by its L2 norm. However, the
+ get_eps() value is added to the squared norm prior to division to avoid
+ ever dividing by zero.
+ !*/
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ void forward_inplace(const tensor& input, tensor& output);
+ void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
+ !*/
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ long _offset,
+ long _k,
+ long _nr,
+ long _nc
+ >
+ class extract_
+ {
+ /*!
+ REQUIREMENTS ON TEMPLATE ARGUMENTS
+ - 0 <= _offset
+ - 0 < _k
+ - 0 < _nr
+ - 0 < _nc
+
+ WHAT THIS OBJECT REPRESENTS
+ This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+ defined above. In particular, the output of this layer is simply a copy of
+ the input tensor. However, you can configure the extract layer to output
+ only some subset of the input tensor and also to reshape it. Therefore,
+ the dimensions of the tensor output by this layer are as follows (letting
+ IN be the input tensor and OUT the output tensor):
+ - OUT.num_samples() == IN.num_samples()
+ - OUT.k() == _k
+ - OUT.nr() == _nr
+ - OUT.nc() == _nc
+
+ So the output will always have the same number of samples as the input, but
+ within each sample (the k,nr,nc part) we will copy only a subset of the
+ values. Moreover, the _offset parameter controls which part of each sample
+ we take. To be very precise, we will have:
+ - let IN_SIZE = IN.k()*IN.nr()*IN.nc()
+ - let OUT_SIZE = _k*_nr*_nc
+ - for i in range[0,IN.num_samples()) and j in range[0,OUT_SIZE):
+ - OUT.host()[i*OUT_SIZE+j] == IN.host()[i*IN_SIZE+_offset+j]
+
+
+ Finally, all this means that the input tensor to this layer must have a big
+ enough size to accommodate taking a _k*_nr*_nc slice from each of its
+ samples.
+ !*/
+
+ public:
+
+ template <typename SUBNET> void setup (const SUBNET& sub);
+ template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
+ template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
+ const tensor& get_layer_params() const;
+ tensor& get_layer_params();
+ /*!
+ These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
+ !*/
+ };
+
+ template <
+ long offset,
+ long k,
+ long nr,
+ long nc,
+ typename SUBNET
+ >
+ using extract = add_layer<extract_<offset,k,nr,nc>, SUBNET>;
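+
+    // For instance, if the input tensor has shape (N, 3, 10, 10) then
+    // extract<0,300,1,1,SUBNET> flattens each sample into a 300 channel vector, while
+    // extract<0,1,10,10,SUBNET> keeps only the first of the three channels.  (These shapes
+    // are illustrative; the only requirement is that
+    // _offset + _k*_nr*_nc <= IN.k()*IN.nr()*IN.nc().)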
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_LAYERS_ABSTRACT_H_
+
diff --git a/ml/dlib/dlib/dnn/loss.h b/ml/dlib/dlib/dnn/loss.h
new file mode 100644
index 000000000..1b09b85c3
--- /dev/null
+++ b/ml/dlib/dlib/dnn/loss.h
@@ -0,0 +1,2870 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_LOSS_H_
+#define DLIB_DNn_LOSS_H_
+
+#include "loss_abstract.h"
+#include "core.h"
+#include "../matrix.h"
+#include "tensor_tools.h"
+#include "../geometry.h"
+#include "../image_processing/box_overlap_testing.h"
+#include "../image_processing/full_object_detection.h"
+#include "../svm/ranking_tools.h"
+#include <sstream>
+#include <map>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_binary_hinge_
+ {
+ public:
+
+ typedef float training_label_type;
+ typedef float output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const
+ {
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+
+ const tensor& output_tensor = sub.get_output();
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+ output_tensor.nc() == 1 &&
+ output_tensor.k() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+
+ const float* out_data = output_tensor.host();
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ *iter++ = out_data[i];
+ }
+ }
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ tensor& grad = sub.get_gradient_input();
+
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() != 0);
+ DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0);
+ DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+ output_tensor.nc() == 1 &&
+ output_tensor.k() == 1);
+
+ // The loss we output is the average loss over the mini-batch.
+ const double scale = 1.0/output_tensor.num_samples();
+ double loss = 0;
+ const float* out_data = output_tensor.host();
+ float* g = grad.host_write_only();
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ const float y = *truth++;
+ DLIB_CASSERT(y == +1 || y == -1, "y: " << y);
+ const float temp = 1-y*out_data[i];
+ if (temp > 0)
+ {
+ loss += scale*temp;
+ g[i] = -scale*y;
+ }
+ else
+ {
+ g[i] = 0;
+ }
+ }
+ return loss;
+ }
+
+ friend void serialize(const loss_binary_hinge_& , std::ostream& out)
+ {
+ serialize("loss_binary_hinge_", out);
+ }
+
+ friend void deserialize(loss_binary_hinge_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "loss_binary_hinge_")
+ throw serialization_error("Unexpected version found while deserializing dlib::loss_binary_hinge_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const loss_binary_hinge_& )
+ {
+ out << "loss_binary_hinge";
+ return out;
+ }
+
+ friend void to_xml(const loss_binary_hinge_& /*item*/, std::ostream& out)
+ {
+ out << "<loss_binary_hinge/>";
+ }
+
+ };
+
+ template <typename SUBNET>
+ using loss_binary_hinge = add_loss_layer<loss_binary_hinge_, SUBNET>;
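+
+    // A sketch of how this loss is typically attached to a network (the fc, relu, and input
+    // layers and their sizes are illustrative; training labels must be +1 or -1):
+    //
+    //    using net_type = loss_binary_hinge<fc<1, relu<fc<32, input<matrix<float>>>>>>;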
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_binary_log_
+ {
+ public:
+
+ typedef float training_label_type;
+ typedef float output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const
+ {
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+
+ const tensor& output_tensor = sub.get_output();
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+ output_tensor.nc() == 1 &&
+ output_tensor.k() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+
+ const float* out_data = output_tensor.host();
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ *iter++ = out_data[i];
+ }
+ }
+
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ tensor& grad = sub.get_gradient_input();
+
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() != 0);
+ DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0);
+ DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+ output_tensor.nc() == 1 &&
+ output_tensor.k() == 1);
+ DLIB_CASSERT(grad.nr() == 1 &&
+ grad.nc() == 1 &&
+ grad.k() == 1);
+
+ tt::sigmoid(grad, output_tensor);
+
+ // The loss we output is the average loss over the mini-batch.
+ const double scale = 1.0/output_tensor.num_samples();
+ double loss = 0;
+ float* g = grad.host();
+ const float* out_data = output_tensor.host();
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ const float y = *truth++;
+ DLIB_CASSERT(y == +1 || y == -1, "y: " << y);
+ float temp;
+ if (y > 0)
+ {
+ temp = log1pexp(-out_data[i]);
+ loss += scale*temp;
+ g[i] = scale*(g[i]-1);
+ }
+ else
+ {
+ temp = -(-out_data[i]-log1pexp(-out_data[i]));
+ loss += scale*temp;
+ g[i] = scale*g[i];
+ }
+ }
+ return loss;
+ }
+
+ friend void serialize(const loss_binary_log_& , std::ostream& out)
+ {
+ serialize("loss_binary_log_", out);
+ }
+
+ friend void deserialize(loss_binary_log_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "loss_binary_log_")
+ throw serialization_error("Unexpected version found while deserializing dlib::loss_binary_log_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const loss_binary_log_& )
+ {
+ out << "loss_binary_log";
+ return out;
+ }
+
+ friend void to_xml(const loss_binary_log_& /*item*/, std::ostream& out)
+ {
+ out << "<loss_binary_log/>";
+ }
+
+ };
+
+ template <typename T>
+ T safe_log(T input, T epsilon = 1e-10)
+ {
+ // Prevent trying to calculate the logarithm of a very small number (let alone zero)
+ return std::log(std::max(input, epsilon));
+ }
+
+ template <typename SUBNET>
+ using loss_binary_log = add_loss_layer<loss_binary_log_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_multiclass_log_
+ {
+ public:
+
+ typedef unsigned long training_label_type;
+ typedef unsigned long output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+ output_tensor.nc() == 1 );
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+
+
+ // Note that output_tensor.k() should match the number of labels.
+
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ // The index of the largest output for this sample is the label.
+ *iter++ = index_of_max(rowm(mat(output_tensor),i));
+ }
+ }
+
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ tensor& grad = sub.get_gradient_input();
+
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() != 0);
+ DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0);
+ DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+ output_tensor.nc() == 1);
+ DLIB_CASSERT(grad.nr() == 1 &&
+ grad.nc() == 1);
+
+ tt::softmax(grad, output_tensor);
+
+ // The loss we output is the average loss over the mini-batch.
+ const double scale = 1.0/output_tensor.num_samples();
+ double loss = 0;
+ float* g = grad.host();
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ const long y = (long)*truth++;
+ // The network must produce a number of outputs that is equal to the number
+ // of labels when using this type of loss.
+ DLIB_CASSERT(y < output_tensor.k(), "y: " << y << ", output_tensor.k(): " << output_tensor.k());
+ for (long k = 0; k < output_tensor.k(); ++k)
+ {
+ const unsigned long idx = i*output_tensor.k()+k;
+ if (k == y)
+ {
+ loss += scale*-safe_log(g[idx]);
+ g[idx] = scale*(g[idx]-1);
+ }
+ else
+ {
+ g[idx] = scale*g[idx];
+ }
+ }
+ }
+ return loss;
+ }
+
+ friend void serialize(const loss_multiclass_log_& , std::ostream& out)
+ {
+ serialize("loss_multiclass_log_", out);
+ }
+
+ friend void deserialize(loss_multiclass_log_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "loss_multiclass_log_")
+ throw serialization_error("Unexpected version found while deserializing dlib::loss_multiclass_log_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const loss_multiclass_log_& )
+ {
+ out << "loss_multiclass_log";
+ return out;
+ }
+
+ friend void to_xml(const loss_multiclass_log_& /*item*/, std::ostream& out)
+ {
+ out << "<loss_multiclass_log/>";
+ }
+
+ };
+
+ template <typename SUBNET>
+ using loss_multiclass_log = add_loss_layer<loss_multiclass_log_, SUBNET>;
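+
+    // A sketch of a small classifier using this loss (the layer sizes are illustrative; the
+    // final fc layer must have one output per class label, and training labels are unsigned
+    // longs in [0, number of classes)):
+    //
+    //    using net_type = loss_multiclass_log<fc<10, relu<fc<84,
+    //                     input<matrix<unsigned char>>>>>>;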
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_multimulticlass_log_
+ {
+
+ public:
+
+ loss_multimulticlass_log_ () = default;
+
+ loss_multimulticlass_log_ (
+ const std::map<std::string,std::vector<std::string>>& labels
+ )
+ {
+ for (auto& l : labels)
+ {
+ possible_labels[l.first] = std::make_shared<decltype(l.second)>(l.second);
+ DLIB_CASSERT(l.second.size() >= 2, "Each classifier must have at least two possible labels.");
+
+ for (size_t i = 0; i < l.second.size(); ++i)
+ {
+ label_idx_lookup[l.first][l.second[i]] = i;
+ ++total_num_labels;
+ }
+ }
+ }
+
+ unsigned long number_of_labels() const { return total_num_labels; }
+
+ unsigned long number_of_classifiers() const { return possible_labels.size(); }
+
+ std::map<std::string,std::vector<std::string>> get_labels (
+ ) const
+ {
+ std::map<std::string,std::vector<std::string>> info;
+ for (auto& i : possible_labels)
+ {
+ for (auto& label : *i.second)
+ info[i.first].emplace_back(label);
+ }
+ return info;
+ }
+
+ class classifier_output
+ {
+
+ public:
+ classifier_output() = default;
+
+ size_t num_classes() const { return class_probs.size(); }
+
+ double probability_of_class (
+ size_t i
+ ) const
+ {
+ DLIB_CASSERT(i < num_classes());
+ return class_probs(i);
+ }
+
+ const std::string& label(
+ size_t i
+ ) const
+ {
+ DLIB_CASSERT(i < num_classes());
+ return (*_labels)[i];
+ }
+
+ operator std::string(
+ ) const
+ {
+ DLIB_CASSERT(num_classes() != 0);
+ return (*_labels)[index_of_max(class_probs)];
+ }
+
+ friend std::ostream& operator<< (std::ostream& out, const classifier_output& item)
+ {
+ DLIB_ASSERT(item.num_classes() != 0);
+ out << static_cast<std::string>(item);
+ return out;
+ }
+
+ private:
+
+ friend class loss_multimulticlass_log_;
+
+ template <typename EXP>
+ classifier_output(
+ const matrix_exp<EXP>& class_probs,
+ const std::shared_ptr<std::vector<std::string>>& _labels
+ ) :
+ class_probs(class_probs),
+ _labels(_labels)
+ {
+ }
+
+ matrix<float,1,0> class_probs;
+ std::shared_ptr<std::vector<std::string>> _labels;
+ };
+
+ typedef std::map<std::string,std::string> training_label_type;
+ typedef std::map<std::string,classifier_output> output_label_type;
+
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter_begin
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+ output_tensor.nc() == 1 );
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+
+ DLIB_CASSERT(number_of_labels() != 0, "You must give the loss_multimulticlass_log_'s constructor label data before you can use it!");
+ DLIB_CASSERT(output_tensor.k() == (long)number_of_labels(), "The output tensor must have " << number_of_labels() << " channels.");
+
+
+ long k_offset = 0;
+ for (auto& l : possible_labels)
+ {
+ auto iter = iter_begin;
+ const std::string& classifier_name = l.first;
+ const auto& labels = (*l.second);
+ scratch.set_size(output_tensor.num_samples(), labels.size());
+ tt::copy_tensor(false, scratch, 0, output_tensor, k_offset, labels.size());
+
+ tt::softmax(scratch, scratch);
+
+ for (long i = 0; i < scratch.num_samples(); ++i)
+ (*iter++)[classifier_name] = classifier_output(rowm(mat(scratch),i), l.second);
+
+ k_offset += labels.size();
+ }
+ }
+
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth_begin,
+ SUBNET& sub
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ tensor& grad = sub.get_gradient_input();
+
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() != 0);
+ DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0);
+ DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+ output_tensor.nc() == 1);
+ DLIB_CASSERT(grad.nr() == 1 &&
+ grad.nc() == 1);
+ DLIB_CASSERT(number_of_labels() != 0, "You must give the loss_multimulticlass_log_'s constructor label data before you can use it!");
+ DLIB_CASSERT(output_tensor.k() == (long)number_of_labels(), "The output tensor must have " << number_of_labels() << " channels.");
+
+ // The loss we output is the average loss over the mini-batch.
+ const double scale = 1.0/output_tensor.num_samples();
+ double loss = 0;
+ long k_offset = 0;
+ for (auto& l : label_idx_lookup)
+ {
+ const std::string& classifier_name = l.first;
+ const auto& int_labels = l.second;
+ scratch.set_size(output_tensor.num_samples(), int_labels.size());
+ tt::copy_tensor(false, scratch, 0, output_tensor, k_offset, int_labels.size());
+
+ tt::softmax(scratch, scratch);
+
+
+ auto truth = truth_begin;
+ float* g = scratch.host();
+ for (long i = 0; i < scratch.num_samples(); ++i)
+ {
+ const long y = int_labels.at(truth->at(classifier_name));
+ ++truth;
+
+ for (long k = 0; k < scratch.k(); ++k)
+ {
+ const unsigned long idx = i*scratch.k()+k;
+ if (k == y)
+ {
+ loss += scale*-std::log(g[idx]);
+ g[idx] = scale*(g[idx]-1);
+ }
+ else
+ {
+ g[idx] = scale*g[idx];
+ }
+ }
+ }
+
+ tt::copy_tensor(false, grad, k_offset, scratch, 0, int_labels.size());
+
+ k_offset += int_labels.size();
+ }
+ return loss;
+ }
+
+
+ friend void serialize(const loss_multimulticlass_log_& item, std::ostream& out)
+ {
+ serialize("loss_multimulticlass_log_", out);
+ serialize(item.get_labels(), out);
+ }
+
+ friend void deserialize(loss_multimulticlass_log_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "loss_multimulticlass_log_")
+ throw serialization_error("Unexpected version found while deserializing dlib::loss_multimulticlass_log_.");
+
+ std::map<std::string,std::vector<std::string>> info;
+ deserialize(info, in);
+ item = loss_multimulticlass_log_(info);
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const loss_multimulticlass_log_& item)
+ {
+ out << "loss_multimulticlass_log, labels={";
+ for (auto i = item.possible_labels.begin(); i != item.possible_labels.end(); )
+ {
+ auto& category = i->first;
+ auto& labels = *(i->second);
+ out << category << ":(";
+ for (size_t j = 0; j < labels.size(); ++j)
+ {
+ out << labels[j];
+ if (j+1 < labels.size())
+ out << ",";
+ }
+
+ out << ")";
+ if (++i != item.possible_labels.end())
+ out << ", ";
+ }
+ out << "}";
+ return out;
+ }
+
+ friend void to_xml(const loss_multimulticlass_log_& item, std::ostream& out)
+ {
+ out << "<loss_multimulticlass_log>\n";
+ out << item;
+ out << "\n</loss_multimulticlass_log>";
+ }
+
+ private:
+
+ std::map<std::string,std::shared_ptr<std::vector<std::string>>> possible_labels;
+ unsigned long total_num_labels = 0;
+
+ // We make it true that: possible_labels[classifier][label_idx_lookup[classifier][label]] == label
+ std::map<std::string, std::map<std::string,long>> label_idx_lookup;
+
+
+ // Scratch doesn't logically contribute to the state of this object. It's just
+ // temporary scratch space used by this class.
+ mutable resizable_tensor scratch;
+
+
+ };
+
+ template <typename SUBNET>
+ using loss_multimulticlass_log = add_loss_layer<loss_multimulticlass_log_, SUBNET>;
+
+ inline bool operator== (const std::string& lhs, const loss_multimulticlass_log_::classifier_output& rhs)
+ { return lhs == static_cast<const std::string&>(rhs); }
+ inline bool operator== (const loss_multimulticlass_log_::classifier_output& lhs, const std::string& rhs)
+ { return rhs == static_cast<const std::string&>(lhs); }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ enum class use_image_pyramid : uint8_t
+ {
+ no,
+ yes
+ };
+
+ struct mmod_options
+ {
+ public:
+
+ struct detector_window_details
+ {
+ detector_window_details() = default;
+ detector_window_details(unsigned long w, unsigned long h) : width(w), height(h) {}
+ detector_window_details(unsigned long w, unsigned long h, const std::string& l) : width(w), height(h), label(l) {}
+
+ unsigned long width = 0;
+ unsigned long height = 0;
+ std::string label;
+
+ friend inline void serialize(const detector_window_details& item, std::ostream& out)
+ {
+ int version = 2;
+ serialize(version, out);
+ serialize(item.width, out);
+ serialize(item.height, out);
+ serialize(item.label, out);
+ }
+
+ friend inline void deserialize(detector_window_details& item, std::istream& in)
+ {
+ int version = 0;
+ deserialize(version, in);
+ if (version != 1 && version != 2)
+ throw serialization_error("Unexpected version found while deserializing dlib::mmod_options::detector_window_details");
+ deserialize(item.width, in);
+ deserialize(item.height, in);
+ if (version == 2)
+ deserialize(item.label, in);
+ }
+
+ };
+
+ mmod_options() = default;
+
+ std::vector<detector_window_details> detector_windows;
+ double loss_per_false_alarm = 1;
+ double loss_per_missed_target = 1;
+ double truth_match_iou_threshold = 0.5;
+ test_box_overlap overlaps_nms = test_box_overlap(0.4);
+ test_box_overlap overlaps_ignore;
+
+ use_image_pyramid assume_image_pyramid = use_image_pyramid::yes;
+
+ mmod_options (
+ const std::vector<std::vector<mmod_rect>>& boxes,
+ const unsigned long target_size, // We want the length of the longest dimension of the detector window to be this.
+ const unsigned long min_target_size, // But we require that the smallest dimension of the detector window be at least this big.
+ const double min_detector_window_overlap_iou = 0.75
+ )
+ {
+ DLIB_CASSERT(0 < min_target_size && min_target_size <= target_size);
+ DLIB_CASSERT(0.5 < min_detector_window_overlap_iou && min_detector_window_overlap_iou < 1);
+
+ // Figure out what detector windows we will need.
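+            // For each label we find a small set of aspect ratios that covers all of its
+            // training boxes (with respect to min_detector_window_overlap_iou) and create
+            // one detector window per ratio.  The longer side of each window is set to
+            // target_size, unless that would push the shorter side below min_target_size,
+            // in which case the shorter side is set to min_target_size and the longer
+            // side grows accordingly.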
+ for (auto& label : get_labels(boxes))
+ {
+ for (auto ratio : find_covering_aspect_ratios(boxes, test_box_overlap(min_detector_window_overlap_iou), label))
+ {
+ double detector_width;
+ double detector_height;
+ if (ratio < 1)
+ {
+ detector_height = target_size;
+ detector_width = ratio*target_size;
+ if (detector_width < min_target_size)
+ {
+ detector_height = min_target_size/ratio;
+ detector_width = min_target_size;
+ }
+ }
+ else
+ {
+ detector_width = target_size;
+ detector_height = target_size/ratio;
+ if (detector_height < min_target_size)
+ {
+ detector_width = min_target_size*ratio;
+ detector_height = min_target_size;
+ }
+ }
+
+ detector_window_details p((unsigned long)std::round(detector_width), (unsigned long)std::round(detector_height), label);
+ detector_windows.push_back(p);
+ }
+ }
+
+ DLIB_CASSERT(detector_windows.size() != 0, "You can't call mmod_options's constructor with a set of boxes that is empty (or only contains ignored boxes).");
+
+ set_overlap_nms(boxes);
+ }
+
+ mmod_options(
+ use_image_pyramid assume_image_pyramid,
+ const std::vector<std::vector<mmod_rect>>& boxes,
+ const double min_detector_window_overlap_iou = 0.75
+ )
+ : assume_image_pyramid(assume_image_pyramid)
+ {
+ DLIB_CASSERT(assume_image_pyramid == use_image_pyramid::no);
+ DLIB_CASSERT(0.5 < min_detector_window_overlap_iou && min_detector_window_overlap_iou < 1);
+
+ // Figure out what detector windows we will need.
+ for (auto& label : get_labels(boxes))
+ {
+ for (auto rectangle : find_covering_rectangles(boxes, test_box_overlap(min_detector_window_overlap_iou), label))
+ {
+ detector_windows.push_back(detector_window_details(rectangle.width(), rectangle.height(), label));
+ }
+ }
+
+ DLIB_CASSERT(detector_windows.size() != 0, "You can't call mmod_options's constructor with a set of boxes that is empty (or only contains ignored boxes).");
+
+ set_overlap_nms(boxes);
+ }
+
+ private:
+
+ void set_overlap_nms(const std::vector<std::vector<mmod_rect>>& boxes)
+ {
+ // Convert from mmod_rect to rectangle so we can call
+ // find_tight_overlap_tester().
+ std::vector<std::vector<rectangle>> temp;
+ for (auto&& bi : boxes)
+ {
+ std::vector<rectangle> rtemp;
+ for (auto&& b : bi)
+ {
+ if (b.ignore)
+ continue;
+ rtemp.push_back(b.rect);
+ }
+ temp.push_back(std::move(rtemp));
+ }
+ overlaps_nms = find_tight_overlap_tester(temp);
+ // Relax the non-max-suppression a little so that it doesn't accidentally make
+ // it impossible for the detector to output boxes matching the training data.
+ // This could be a problem with the tightest possible nms test since there is
+ // some small variability in how boxes get positioned between the training data
+ // and the coordinate system used by the detector when it runs. So relaxing it
+ // here takes care of that.
+ auto iou_thresh = advance_toward_1(overlaps_nms.get_iou_thresh());
+ auto percent_covered_thresh = advance_toward_1(overlaps_nms.get_percent_covered_thresh());
+ overlaps_nms = test_box_overlap(iou_thresh, percent_covered_thresh);
+ }
+
+ static double advance_toward_1 (
+ double val
+ )
+ {
+ if (val < 1)
+ val += (1-val)*0.1;
+ return val;
+ }
+
+ static size_t count_overlaps (
+ const std::vector<rectangle>& rects,
+ const test_box_overlap& overlaps,
+ const rectangle& ref_box
+ )
+ {
+ size_t cnt = 0;
+ for (auto& b : rects)
+ {
+ if (overlaps(b, ref_box))
+ ++cnt;
+ }
+ return cnt;
+ }
+
+ static std::vector<rectangle> find_rectangles_overlapping_all_others (
+ std::vector<rectangle> rects,
+ const test_box_overlap& overlaps
+ )
+ {
+ std::vector<rectangle> exemplars;
+ dlib::rand rnd;
+
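+            // Greedily build a covering set of exemplars: repeatedly use random sampling
+            // to find a box that overlaps as many of the remaining boxes as possible, add
+            // it to the exemplar set, and remove every box it covers, until no boxes are
+            // left.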
+ while(rects.size() > 0)
+ {
+ // Pick boxes at random and see if they overlap a lot of other boxes. We will try
+ // 500 different boxes each iteration and select whichever hits the most others to
+ // add to our exemplar set.
+ rectangle best_ref_box;
+ size_t best_cnt = 0;
+ for (int iter = 0; iter < 500; ++iter)
+ {
+ rectangle ref_box = rects[rnd.get_random_64bit_number()%rects.size()];
+ size_t cnt = count_overlaps(rects, overlaps, ref_box);
+ if (cnt >= best_cnt)
+ {
+ best_cnt = cnt;
+ best_ref_box = ref_box;
+ }
+ }
+
+                // Now remove all the boxes the new ref box hit so they aren't considered again.
+ for (size_t i = 0; i < rects.size(); ++i)
+ {
+ if (overlaps(rects[i], best_ref_box))
+ {
+ // remove box from rects so we don't hit it again later
+ swap(rects[i], rects.back());
+ rects.pop_back();
+ --i;
+ }
+ }
+
+ exemplars.push_back(best_ref_box);
+ }
+
+ return exemplars;
+ }
+
+ static std::set<std::string> get_labels (
+ const std::vector<std::vector<mmod_rect>>& rects
+ )
+ {
+ std::set<std::string> labels;
+ for (auto& rr : rects)
+ {
+ for (auto& r : rr)
+ labels.insert(r.label);
+ }
+ return labels;
+ }
+
+ static std::vector<double> find_covering_aspect_ratios (
+ const std::vector<std::vector<mmod_rect>>& rects,
+ const test_box_overlap& overlaps,
+ const std::string& label
+ )
+ {
+ std::vector<rectangle> boxes;
+            // Give all the boxes the same area and position, so that the only thing the
+            // overlap checks care about is aspect ratio (i.e. scale and x,y position are
+            // ignored).
+ for (auto& bb : rects)
+ {
+ for (auto&& b : bb)
+ {
+ if (!b.ignore && b.label == label)
+ boxes.push_back(move_rect(set_rect_area(b.rect,400*400), point(0,0)));
+ }
+ }
+
+ std::vector<double> ratios;
+ for (auto r : find_rectangles_overlapping_all_others(boxes, overlaps))
+ ratios.push_back(r.width()/(double)r.height());
+ return ratios;
+ }
+
+ static std::vector<dlib::rectangle> find_covering_rectangles (
+ const std::vector<std::vector<mmod_rect>>& rects,
+ const test_box_overlap& overlaps,
+ const std::string& label
+ )
+ {
+ std::vector<rectangle> boxes;
+            // Give all the boxes the same position, so that the overlap checks only
+            // compare their widths and heights.
+ for (auto& bb : rects)
+ {
+ for (auto&& b : bb)
+ {
+ if (!b.ignore && b.label == label)
+ boxes.push_back(rectangle(b.rect.width(), b.rect.height()));
+ }
+ }
+
+ return find_rectangles_overlapping_all_others(boxes, overlaps);
+ }
+ };
+
+ inline void serialize(const mmod_options& item, std::ostream& out)
+ {
+ int version = 3;
+
+ serialize(version, out);
+ serialize(item.detector_windows, out);
+ serialize(item.loss_per_false_alarm, out);
+ serialize(item.loss_per_missed_target, out);
+ serialize(item.truth_match_iou_threshold, out);
+ serialize(item.overlaps_nms, out);
+ serialize(item.overlaps_ignore, out);
+ serialize(static_cast<uint8_t>(item.assume_image_pyramid), out);
+ }
+
+ inline void deserialize(mmod_options& item, std::istream& in)
+ {
+ int version = 0;
+ deserialize(version, in);
+ if (version != 3 && version != 2 && version != 1)
+ throw serialization_error("Unexpected version found while deserializing dlib::mmod_options");
+ if (version == 1)
+ {
+ unsigned long width;
+ unsigned long height;
+ deserialize(width, in);
+ deserialize(height, in);
+ item.detector_windows = {mmod_options::detector_window_details(width, height)};
+ }
+ else
+ {
+ deserialize(item.detector_windows, in);
+ }
+ deserialize(item.loss_per_false_alarm, in);
+ deserialize(item.loss_per_missed_target, in);
+ deserialize(item.truth_match_iou_threshold, in);
+ deserialize(item.overlaps_nms, in);
+ deserialize(item.overlaps_ignore, in);
+ item.assume_image_pyramid = use_image_pyramid::yes;
+ if (version >= 3)
+ {
+ uint8_t assume_image_pyramid = 0;
+ deserialize(assume_image_pyramid, in);
+ item.assume_image_pyramid = static_cast<use_image_pyramid>(assume_image_pyramid);
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_mmod_
+ {
+ struct intermediate_detection
+ {
+ intermediate_detection() = default;
+
+ intermediate_detection(
+ rectangle rect_
+ ) : rect(rect_) {}
+
+ intermediate_detection(
+ rectangle rect_,
+ double detection_confidence_,
+ size_t tensor_offset_,
+ long channel
+ ) : rect(rect_), detection_confidence(detection_confidence_), tensor_offset(tensor_offset_), tensor_channel(channel) {}
+
+ rectangle rect;
+ double detection_confidence = 0;
+ size_t tensor_offset = 0;
+ long tensor_channel = 0;
+
+ bool operator<(const intermediate_detection& item) const { return detection_confidence < item.detection_confidence; }
+ };
+
+ public:
+
+ typedef std::vector<mmod_rect> training_label_type;
+ typedef std::vector<mmod_rect> output_label_type;
+
+ loss_mmod_() {}
+
+ loss_mmod_(mmod_options options_) : options(options_) {}
+
+ const mmod_options& get_options (
+ ) const { return options; }
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter,
+ double adjust_threshold = 0
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1, sub.sample_expansion_factor());
+
+ std::vector<intermediate_detection> dets_accum;
+ output_label_type final_dets;
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ tensor_to_dets(input_tensor, output_tensor, i, dets_accum, adjust_threshold, sub);
+
+ // Do non-max suppression
+ final_dets.clear();
+ for (unsigned long i = 0; i < dets_accum.size(); ++i)
+ {
+ if (overlaps_any_box_nms(final_dets, dets_accum[i].rect))
+ continue;
+
+ final_dets.push_back(mmod_rect(dets_accum[i].rect,
+ dets_accum[i].detection_confidence,
+ options.detector_windows[dets_accum[i].tensor_channel].label));
+ }
+
+ *iter++ = std::move(final_dets);
+ }
+ }
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ tensor& grad = sub.get_gradient_input();
+
+ DLIB_CASSERT(input_tensor.num_samples() != 0);
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+ DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
+
+ double det_thresh_speed_adjust = 0;
+
+
+ // we will scale the loss so that it doesn't get really huge
+ const double scale = 1.0/output_tensor.size();
+ double loss = 0;
+
+ float* g = grad.host_write_only();
+ for (size_t i = 0; i < grad.size(); ++i)
+ g[i] = 0;
+
+ const float* out_data = output_tensor.host();
+
+ std::vector<size_t> truth_idxs; truth_idxs.reserve(truth->size());
+ std::vector<intermediate_detection> dets;
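+            // The loss for each sample follows the max-margin object detection (MMOD)
+            // scheme: charge a miss penalty for every truth box and subtract each truth
+            // box's detector score, then run the detector with a threshold lowered by
+            // loss_per_false_alarm and greedily select the set of detections that
+            // maximizes the loss-augmented score.  Selected detections that match a truth
+            // box cancel its miss penalty, duplicates and spurious detections add false
+            // alarm penalties, and every selected detection adds its own score back into
+            // the loss.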
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ tensor_to_dets(input_tensor, output_tensor, i, dets, -options.loss_per_false_alarm + det_thresh_speed_adjust, sub);
+
+ const unsigned long max_num_dets = 50 + truth->size()*5;
+ // Prevent calls to tensor_to_dets() from running for a really long time
+ // due to the production of an obscene number of detections.
+ const unsigned long max_num_initial_dets = max_num_dets*100;
+                // (Note the strict inequality: dets[max_num_initial_dets] must exist below.)
+                if (dets.size() > max_num_initial_dets)
+ {
+ det_thresh_speed_adjust = std::max(det_thresh_speed_adjust,dets[max_num_initial_dets].detection_confidence + options.loss_per_false_alarm);
+ }
+
+
+ // The loss will measure the number of incorrect detections. A detection is
+ // incorrect if it doesn't hit a truth rectangle or if it is a duplicate detection
+ // on a truth rectangle.
+ loss += truth->size()*options.loss_per_missed_target;
+ for (auto&& x : *truth)
+ {
+ if (!x.ignore)
+ {
+ size_t k;
+ point p;
+ if(image_rect_to_feat_coord(p, input_tensor, x, x.label, sub, k, options.assume_image_pyramid))
+ {
+ // Ignore boxes that can't be detected by the CNN.
+ loss -= options.loss_per_missed_target;
+ continue;
+ }
+ const size_t idx = (k*output_tensor.nr() + p.y())*output_tensor.nc() + p.x();
+ loss -= out_data[idx];
+ // compute gradient
+ g[idx] = -scale;
+ truth_idxs.push_back(idx);
+ }
+ else
+ {
+                        // This box was ignored so it shouldn't have been counted in the loss.
+ loss -= options.loss_per_missed_target;
+ truth_idxs.push_back(0);
+ }
+ }
+
+ // Measure the loss augmented score for the detections which hit a truth rect.
+ std::vector<double> truth_score_hits(truth->size(), 0);
+
+ // keep track of which truth boxes we have hit so far.
+ std::vector<bool> hit_truth_table(truth->size(), false);
+
+ std::vector<intermediate_detection> final_dets;
+ // The point of this loop is to fill out the truth_score_hits array.
+ for (unsigned long i = 0; i < dets.size() && final_dets.size() < max_num_dets; ++i)
+ {
+ if (overlaps_any_box_nms(final_dets, dets[i].rect))
+ continue;
+
+ const auto& det_label = options.detector_windows[dets[i].tensor_channel].label;
+
+ const std::pair<double,unsigned int> hittruth = find_best_match(*truth, dets[i].rect, det_label);
+
+ final_dets.push_back(dets[i].rect);
+
+ const double truth_match = hittruth.first;
+ // if hit truth rect
+ if (truth_match > options.truth_match_iou_threshold)
+ {
+ // if this is the first time we have seen a detect which hit (*truth)[hittruth.second]
+ const double score = dets[i].detection_confidence;
+ if (hit_truth_table[hittruth.second] == false)
+ {
+ hit_truth_table[hittruth.second] = true;
+ truth_score_hits[hittruth.second] += score;
+ }
+ else
+ {
+ truth_score_hits[hittruth.second] += score + options.loss_per_false_alarm;
+ }
+ }
+ }
+
+ // Check if any of the truth boxes are unobtainable because the NMS is
+ // killing them. If so, automatically set those unobtainable boxes to
+ // ignore and print a warning message to the user.
+ for (size_t i = 0; i < hit_truth_table.size(); ++i)
+ {
+ if (!hit_truth_table[i] && !(*truth)[i].ignore)
+ {
+ // So we didn't hit this truth box. Is that because there is
+ // another, different truth box, that overlaps it according to NMS?
+ const std::pair<double,unsigned int> hittruth = find_best_match(*truth, (*truth)[i], i);
+ if (hittruth.second == i || (*truth)[hittruth.second].ignore)
+ continue;
+ rectangle best_matching_truth_box = (*truth)[hittruth.second];
+ if (options.overlaps_nms(best_matching_truth_box, (*truth)[i]))
+ {
+ const size_t idx = truth_idxs[i];
+ // We are ignoring this box so we shouldn't have counted it in the
+ // loss in the first place. So we subtract out the loss values we
+ // added for it in the code above.
+ loss -= options.loss_per_missed_target-out_data[idx];
+ g[idx] = 0;
+ std::cout << "Warning, ignoring object. We encountered a truth rectangle located at " << (*truth)[i].rect;
+ std::cout << " that is suppressed by non-max-suppression ";
+ std::cout << "because it is overlapped by another truth rectangle located at " << best_matching_truth_box
+ << " (IoU:"<< box_intersection_over_union(best_matching_truth_box,(*truth)[i]) <<", Percent covered:"
+ << box_percent_covered(best_matching_truth_box,(*truth)[i]) << ")." << std::endl;
+ }
+ }
+ }
+
+ hit_truth_table.assign(hit_truth_table.size(), false);
+ final_dets.clear();
+
+
+ // Now figure out which detections jointly maximize the loss and detection score sum. We
+ // need to take into account the fact that allowing a true detection in the output, while
+ // initially reducing the loss, may allow us to increase the loss later with many duplicate
+ // detections.
+ for (unsigned long i = 0; i < dets.size() && final_dets.size() < max_num_dets; ++i)
+ {
+ if (overlaps_any_box_nms(final_dets, dets[i].rect))
+ continue;
+
+ const auto& det_label = options.detector_windows[dets[i].tensor_channel].label;
+
+ const std::pair<double,unsigned int> hittruth = find_best_match(*truth, dets[i].rect, det_label);
+
+ const double truth_match = hittruth.first;
+ if (truth_match > options.truth_match_iou_threshold)
+ {
+ if (truth_score_hits[hittruth.second] > options.loss_per_missed_target)
+ {
+ if (!hit_truth_table[hittruth.second])
+ {
+ hit_truth_table[hittruth.second] = true;
+ final_dets.push_back(dets[i]);
+ loss -= options.loss_per_missed_target;
+ }
+ else
+ {
+ final_dets.push_back(dets[i]);
+ loss += options.loss_per_false_alarm;
+ }
+ }
+ }
+ else if (!overlaps_ignore_box(*truth, dets[i].rect))
+ {
+ // didn't hit anything
+ final_dets.push_back(dets[i]);
+ loss += options.loss_per_false_alarm;
+ }
+ }
+
+ for (auto&& x : final_dets)
+ {
+ loss += out_data[x.tensor_offset];
+ g[x.tensor_offset] += scale;
+ }
+
+ ++truth;
+ g += output_tensor.k()*output_tensor.nr()*output_tensor.nc();
+ out_data += output_tensor.k()*output_tensor.nr()*output_tensor.nc();
+ } // END for (long i = 0; i < output_tensor.num_samples(); ++i)
+
+
+            // Here we scale the loss so that it's roughly equal to the number of mistakes
+            // in an image.  Note that this scaling is different from the scaling applied
+            // to the gradient, but that's fine because the loss value isn't used to
+            // update parameters.  It's used only for display and to check for
+            // convergence, and scaling it this way makes the displayed loss readily
+            // interpretable to the user.
+ return loss/output_tensor.num_samples();
+ }
+
+
+ friend void serialize(const loss_mmod_& item, std::ostream& out)
+ {
+ serialize("loss_mmod_", out);
+ serialize(item.options, out);
+ }
+
+ friend void deserialize(loss_mmod_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "loss_mmod_")
+ throw serialization_error("Unexpected version found while deserializing dlib::loss_mmod_.");
+ deserialize(item.options, in);
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const loss_mmod_& item)
+ {
+ out << "loss_mmod\t (";
+
+ out << "detector_windows:(";
+ auto& opts = item.options;
+ for (size_t i = 0; i < opts.detector_windows.size(); ++i)
+ {
+ out << opts.detector_windows[i].width << "x" << opts.detector_windows[i].height;
+ if (i+1 < opts.detector_windows.size())
+ out << ",";
+ }
+ out << ")";
+ out << ", loss per FA:" << opts.loss_per_false_alarm;
+ out << ", loss per miss:" << opts.loss_per_missed_target;
+ out << ", truth match IOU thresh:" << opts.truth_match_iou_threshold;
+ out << ", overlaps_nms:("<<opts.overlaps_nms.get_iou_thresh()<<","<<opts.overlaps_nms.get_percent_covered_thresh()<<")";
+ out << ", overlaps_ignore:("<<opts.overlaps_ignore.get_iou_thresh()<<","<<opts.overlaps_ignore.get_percent_covered_thresh()<<")";
+
+ out << ")";
+ return out;
+ }
+
+ friend void to_xml(const loss_mmod_& /*item*/, std::ostream& out)
+ {
+ // TODO, add options fields
+ out << "<loss_mmod/>";
+ }
+
+ private:
+
+ template <typename net_type>
+ void tensor_to_dets (
+ const tensor& input_tensor,
+ const tensor& output_tensor,
+ long i,
+ std::vector<intermediate_detection>& dets_accum,
+ double adjust_threshold,
+ const net_type& net
+ ) const
+ {
+ DLIB_CASSERT(net.sample_expansion_factor() == 1,net.sample_expansion_factor());
+ DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
+ const float* out_data = output_tensor.host() + output_tensor.k()*output_tensor.nr()*output_tensor.nc()*i;
+ // scan the final layer and output the positive scoring locations
+ dets_accum.clear();
+ for (long k = 0; k < output_tensor.k(); ++k)
+ {
+ for (long r = 0; r < output_tensor.nr(); ++r)
+ {
+ for (long c = 0; c < output_tensor.nc(); ++c)
+ {
+ double score = out_data[(k*output_tensor.nr() + r)*output_tensor.nc() + c];
+ if (score > adjust_threshold)
+ {
+ dpoint p = output_tensor_to_input_tensor(net, point(c,r));
+ drectangle rect = centered_drect(p, options.detector_windows[k].width, options.detector_windows[k].height);
+ rect = input_layer(net).tensor_space_to_image_space(input_tensor,rect);
+
+ dets_accum.push_back(intermediate_detection(rect, score, (k*output_tensor.nr() + r)*output_tensor.nc() + c, k));
+ }
+ }
+ }
+ }
+ std::sort(dets_accum.rbegin(), dets_accum.rend());
+ }
+
+ size_t find_best_detection_window (
+ rectangle rect,
+ const std::string& label,
+ use_image_pyramid assume_image_pyramid
+ ) const
+ {
+ if (assume_image_pyramid == use_image_pyramid::yes)
+ {
+ rect = move_rect(set_rect_area(rect, 400*400), point(0,0));
+ }
+ else
+ {
+ rect = rectangle(rect.width(), rect.height());
+ }
+
+ // Figure out which detection window in options.detector_windows is most similar to rect
+ // (in terms of aspect ratio, if assume_image_pyramid == use_image_pyramid::yes).
+ size_t best_i = 0;
+            double best_iou = -std::numeric_limits<double>::infinity();
+ for (size_t i = 0; i < options.detector_windows.size(); ++i)
+ {
+ if (options.detector_windows[i].label != label)
+ continue;
+
+ rectangle det_window;
+
+ if (options.assume_image_pyramid == use_image_pyramid::yes)
+ {
+ det_window = centered_rect(point(0,0), options.detector_windows[i].width, options.detector_windows[i].height);
+ det_window = move_rect(set_rect_area(det_window, 400*400), point(0,0));
+ }
+ else
+ {
+ det_window = rectangle(options.detector_windows[i].width, options.detector_windows[i].height);
+ }
+
+ double iou = box_intersection_over_union(rect, det_window);
+                if (iou > best_iou)
+                {
+                    best_iou = iou;
+ best_i = i;
+ }
+ }
+ return best_i;
+ }
+
+ template <typename net_type>
+ bool image_rect_to_feat_coord (
+ point& tensor_p,
+ const tensor& input_tensor,
+ const rectangle& rect,
+ const std::string& label,
+ const net_type& net,
+ size_t& det_idx,
+ use_image_pyramid assume_image_pyramid
+ ) const
+ {
+ using namespace std;
+ if (!input_layer(net).image_contained_point(input_tensor,center(rect)))
+ {
+ std::ostringstream sout;
+ sout << "Encountered a truth rectangle located at " << rect << " that is outside the image." << endl;
+ sout << "The center of each truth rectangle must be within the image." << endl;
+ throw impossible_labeling_error(sout.str());
+ }
+
+ det_idx = find_best_detection_window(rect,label,assume_image_pyramid);
+
+ double scale = 1.0;
+ if (options.assume_image_pyramid == use_image_pyramid::yes)
+ {
+ // Compute the scale we need to be at to get from rect to our detection window.
+ // Note that we compute the scale as the max of two numbers. It doesn't
+ // actually matter which one we pick, because if they are very different then
+ // it means the box can't be matched by the sliding window. But picking the
+ // max causes the right error message to be selected in the logic below.
+ scale = std::max(options.detector_windows[det_idx].width/(double)rect.width(), options.detector_windows[det_idx].height/(double)rect.height());
+ }
+ else
+ {
+ // We don't want invariance to scale.
+ scale = 1.0;
+ }
+
+ const rectangle mapped_rect = input_layer(net).image_space_to_tensor_space(input_tensor, std::min(1.0,scale), rect);
+
+ // compute the detection window that we would use at this position.
+ tensor_p = center(mapped_rect);
+ rectangle det_window = centered_rect(tensor_p, options.detector_windows[det_idx].width,options.detector_windows[det_idx].height);
+ det_window = input_layer(net).tensor_space_to_image_space(input_tensor, det_window);
+
+ // make sure the rect can actually be represented by the image pyramid we are
+ // using.
+ if (box_intersection_over_union(rect, det_window) <= options.truth_match_iou_threshold)
+ {
+ std::cout << "Warning, ignoring object. We encountered a truth rectangle with a width and height of " << rect.width() << " and " << rect.height() << ". ";
+ std::cout << "The image pyramid and sliding windows can't output a rectangle of this shape. ";
+ const double detector_area = options.detector_windows[det_idx].width*options.detector_windows[det_idx].height;
+ if (mapped_rect.area()/detector_area <= options.truth_match_iou_threshold)
+ {
+ std::cout << "This is because the rectangle is smaller than the best matching detection window, which has a width ";
+ std::cout << "and height of " << options.detector_windows[det_idx].width << " and " << options.detector_windows[det_idx].height << "." << std::endl;
+ }
+ else
+ {
+ std::cout << "This is either because (1) the final layer's features have too large of a stride across the image, limiting the possible locations the sliding window can search ";
+ std::cout << "or (2) because the rectangle's aspect ratio is too different from the best matching detection window, ";
+ std::cout << "which has a width and height of " << options.detector_windows[det_idx].width << " and " << options.detector_windows[det_idx].height << "." << std::endl;
+ }
+ return true;
+ }
+
+ // now map through the CNN to the output layer.
+ tensor_p = input_tensor_to_output_tensor(net,tensor_p);
+
+ const tensor& output_tensor = net.get_output();
+ if (!get_rect(output_tensor).contains(tensor_p))
+ {
+ std::cout << "Warning, ignoring object. We encountered a truth rectangle located at " << rect << " that is too close to the edge ";
+ std::cout << "of the image to be captured by the CNN features." << std::endl;
+ return true;
+ }
+
+ return false;
+ }
+
+
+ bool overlaps_ignore_box (
+ const std::vector<mmod_rect>& boxes,
+ const rectangle& rect
+ ) const
+ {
+ for (auto&& b : boxes)
+ {
+ if (b.ignore && options.overlaps_ignore(b, rect))
+ return true;
+ }
+ return false;
+ }
+
+ std::pair<double,unsigned int> find_best_match(
+ const std::vector<mmod_rect>& boxes,
+ const rectangle& rect,
+ const std::string& label
+ ) const
+ {
+ double match = 0;
+ unsigned int best_idx = 0;
+ for (unsigned long i = 0; i < boxes.size(); ++i)
+ {
+ if (boxes[i].ignore || boxes[i].label != label)
+ continue;
+
+ const double new_match = box_intersection_over_union(rect, boxes[i]);
+ if (new_match > match)
+ {
+ match = new_match;
+ best_idx = i;
+ }
+ }
+
+ return std::make_pair(match,best_idx);
+ }
+
+ std::pair<double,unsigned int> find_best_match(
+ const std::vector<mmod_rect>& boxes,
+ const rectangle& rect,
+ const size_t excluded_idx
+ ) const
+ {
+ double match = 0;
+ unsigned int best_idx = 0;
+ for (unsigned long i = 0; i < boxes.size(); ++i)
+ {
+ if (boxes[i].ignore || excluded_idx == i)
+ continue;
+
+ const double new_match = box_intersection_over_union(rect, boxes[i]);
+ if (new_match > match)
+ {
+ match = new_match;
+ best_idx = i;
+ }
+ }
+
+ return std::make_pair(match,best_idx);
+ }
+
+ template <typename T>
+ inline bool overlaps_any_box_nms (
+ const std::vector<T>& rects,
+ const rectangle& rect
+ ) const
+ {
+ for (auto&& r : rects)
+ {
+ if (options.overlaps_nms(r.rect, rect))
+ return true;
+ }
+ return false;
+ }
+
+
+ mmod_options options;
+
+ };
+
+ template <typename SUBNET>
+ using loss_mmod = add_loss_layer<loss_mmod_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_metric_
+ {
+ public:
+
+ typedef unsigned long training_label_type;
+ typedef matrix<float,0,1> output_label_type;
+
+ loss_metric_() = default;
+
+ loss_metric_(
+ float margin_,
+ float dist_thresh_
+ ) : margin(margin_), dist_thresh(dist_thresh_)
+ {
+ DLIB_CASSERT(margin_ > 0);
+ DLIB_CASSERT(dist_thresh_ > 0);
+ }
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() != 0);
+ DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0);
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+ output_tensor.nc() == 1);
+
+ const float* p = output_tensor.host();
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ *iter = mat(p,output_tensor.k(),1);
+
+ ++iter;
+ p += output_tensor.k();
+ }
+ }
+
+
+ float get_margin() const { return margin; }
+ float get_distance_threshold() const { return dist_thresh; }
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ tensor& grad = sub.get_gradient_input();
+
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() != 0);
+ DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0);
+ DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+ output_tensor.nc() == 1);
+ DLIB_CASSERT(grad.nr() == 1 &&
+ grad.nc() == 1);
+
+
+
+ temp.set_size(output_tensor.num_samples(), output_tensor.num_samples());
+ grad_mul.copy_size(temp);
+
+ tt::gemm(0, temp, 1, output_tensor, false, output_tensor, true);
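+            // temp now holds the Gram matrix of the embeddings, i.e. temp(r,c) is the dot
+            // product of the r-th and c-th output vectors.  Squared pairwise distances
+            // are recovered below as xx + yy - 2*xy.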
+
+
+ std::vector<double> temp_threshs;
+ const float* d = temp.host();
+ double loss = 0;
+ double num_pos_samps = 0.0001;
+ double num_neg_samps = 0.0001;
+ for (long r = 0; r < temp.num_samples(); ++r)
+ {
+ auto xx = d[r*temp.num_samples() + r];
+ const auto x_label = *(truth + r);
+ for (long c = r+1; c < temp.num_samples(); ++c)
+ {
+ const auto y_label = *(truth + c);
+ if (x_label == y_label)
+ {
+ ++num_pos_samps;
+ }
+ else
+ {
+ ++num_neg_samps;
+
+ // Figure out what distance threshold, when applied to the negative pairs,
+ // causes there to be an equal number of positive and negative pairs.
+ auto yy = d[c*temp.num_samples() + c];
+ auto xy = d[r*temp.num_samples() + c];
+ // compute the distance between x and y samples.
+ auto d2 = xx + yy - 2*xy;
+ if (d2 < 0)
+ d2 = 0;
+ temp_threshs.push_back(d2);
+ }
+ }
+ }
+ // The whole objective function is multiplied by this to scale the loss
+ // relative to the number of things in the mini-batch.
+ const double scale = 0.5/num_pos_samps;
+ DLIB_CASSERT(num_pos_samps>=1, "Make sure each mini-batch contains both positive pairs and negative pairs");
+ DLIB_CASSERT(num_neg_samps>=1, "Make sure each mini-batch contains both positive pairs and negative pairs");
+
+ std::sort(temp_threshs.begin(), temp_threshs.end());
+ const float neg_thresh = std::sqrt(temp_threshs[std::min(num_pos_samps,num_neg_samps)-1]);
+
+ // loop over all the pairs of training samples and compute the loss and
+ // gradients. Note that we only use the hardest negative pairs and that in
+ // particular we pick the number of negative pairs equal to the number of
+ // positive pairs so everything is balanced.
+ float* gm = grad_mul.host();
+ for (long r = 0; r < temp.num_samples(); ++r)
+ {
+ gm[r*temp.num_samples() + r] = 0;
+ const auto x_label = *(truth + r);
+ auto xx = d[r*temp.num_samples() + r];
+ for (long c = 0; c < temp.num_samples(); ++c)
+ {
+ if (r==c)
+ continue;
+ const auto y_label = *(truth + c);
+ auto yy = d[c*temp.num_samples() + c];
+ auto xy = d[r*temp.num_samples() + c];
+
+ // compute the distance between x and y samples.
+ auto d2 = xx + yy - 2*xy;
+ if (d2 <= 0)
+ d2 = 0;
+ else
+ d2 = std::sqrt(d2);
+
+ // It should be noted that the derivative of length(x-y) with respect
+ // to the x vector is the unit vector (x-y)/length(x-y). If you stare
+ // at the code below long enough you will see that it's just an
+ // application of this formula.
+
+ if (x_label == y_label)
+ {
+ // Things with the same label should have distances < dist_thresh between
+ // them. If not then we experience non-zero loss.
+ if (d2 < dist_thresh-margin)
+ {
+ gm[r*temp.num_samples() + c] = 0;
+ }
+ else
+ {
+ loss += scale*(d2 - (dist_thresh-margin));
+ gm[r*temp.num_samples() + r] += scale/d2;
+ gm[r*temp.num_samples() + c] = -scale/d2;
+ }
+ }
+ else
+ {
+ // Things with different labels should have distances > dist_thresh between
+ // them. If not then we experience non-zero loss.
+ if (d2 > dist_thresh+margin || d2 > neg_thresh)
+ {
+ gm[r*temp.num_samples() + c] = 0;
+ }
+ else
+ {
+ loss += scale*((dist_thresh+margin) - d2);
+ // don't divide by zero (or a really small number)
+ d2 = std::max(d2, 0.001f);
+ gm[r*temp.num_samples() + r] -= scale/d2;
+ gm[r*temp.num_samples() + c] = scale/d2;
+ }
+ }
+ }
+ }
+
+
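+            // Each embedding's gradient is a weighted sum of the embeddings themselves
+            // (the (x-y)/length(x-y) terms expand that way), so all the gradients can be
+            // produced with a single matrix multiply: grad = grad_mul * output_tensor.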
+ tt::gemm(0, grad, 1, grad_mul, false, output_tensor, false);
+
+ return loss;
+ }
+
+ friend void serialize(const loss_metric_& item, std::ostream& out)
+ {
+ serialize("loss_metric_2", out);
+ serialize(item.margin, out);
+ serialize(item.dist_thresh, out);
+ }
+
+ friend void deserialize(loss_metric_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version == "loss_metric_")
+ {
+                // The margin and distance threshold used to be hard coded, so for this
+                // old serialization format we simply restore those original defaults.
+ item.margin = 0.1;
+ item.dist_thresh = 0.75;
+ return;
+ }
+ else if (version == "loss_metric_2")
+ {
+ deserialize(item.margin, in);
+ deserialize(item.dist_thresh, in);
+ }
+ else
+ {
+ throw serialization_error("Unexpected version found while deserializing dlib::loss_metric_. Instead found " + version);
+ }
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const loss_metric_& item )
+ {
+ out << "loss_metric (margin="<<item.margin<<", distance_threshold="<<item.dist_thresh<<")";
+ return out;
+ }
+
+ friend void to_xml(const loss_metric_& item, std::ostream& out)
+ {
+ out << "<loss_metric margin='"<<item.margin<<"' distance_threshold='"<<item.dist_thresh<<"'/>";
+ }
+
+ private:
+ float margin = 0.04;
+ float dist_thresh = 0.6;
+
+
+ // These variables are only here to avoid being reallocated over and over in
+ // compute_loss_value_and_gradient()
+ mutable resizable_tensor temp, grad_mul;
+
+ };
+
+ template <typename SUBNET>
+ using loss_metric = add_loss_layer<loss_metric_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_ranking_
+ {
+ public:
+
+ typedef float training_label_type; // nominally +1/-1
+ typedef float output_label_type; // ranking score
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const
+ {
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+
+ const tensor& output_tensor = sub.get_output();
+
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+ output_tensor.nc() == 1 &&
+ output_tensor.k() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+
+ const float* out_data = output_tensor.host();
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ *iter++ = out_data[i];
+ }
+ }
+
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ tensor& grad = sub.get_gradient_input();
+
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() != 0);
+ DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0);
+ DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+ output_tensor.nc() == 1 &&
+ output_tensor.k() == 1);
+ DLIB_CASSERT(grad.nr() == 1 &&
+ grad.nc() == 1 &&
+ grad.k() == 1);
+
+
+ std::vector<double> rel_scores;
+ std::vector<double> nonrel_scores;
+ std::vector<long> rel_idx, nonrel_idx;
+
+ const float* out_data = output_tensor.host();
+ float* g = grad.host_write_only();
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ const float y = *truth++;
+ if (y > 0)
+ {
+ rel_scores.push_back(out_data[i]-y);
+ rel_idx.push_back(i);
+ }
+ else if (y < 0)
+ {
+ nonrel_scores.push_back(out_data[i]-y);
+ nonrel_idx.push_back(i);
+ }
+ else
+ {
+ g[i] = 0;
+ }
+ }
+
+
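+            // count_ranking_inversions() tells us, for each relevant sample, how many
+            // non-relevant samples it is mis-ranked against (and vice versa).  Those
+            // counts are exactly the coefficients needed below for the pairwise ranking
+            // loss and its gradient.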
+ std::vector<unsigned long> rel_counts;
+ std::vector<unsigned long> nonrel_counts;
+ count_ranking_inversions(rel_scores, nonrel_scores, rel_counts, nonrel_counts);
+ const unsigned long total_pairs = rel_scores.size()*nonrel_scores.size();
+ DLIB_CASSERT(total_pairs > 0, "You can't give a ranking mini-batch that contains only one class. Both classes must be represented.");
+ const double scale = 1.0/total_pairs;
+
+
+ double loss = 0;
+ for (unsigned long k = 0; k < rel_counts.size(); ++k)
+ {
+ loss -= rel_counts[k]*rel_scores[k];
+ g[rel_idx[k]] = -1.0*rel_counts[k]*scale;
+ }
+
+ for (unsigned long k = 0; k < nonrel_counts.size(); ++k)
+ {
+ loss += nonrel_counts[k]*nonrel_scores[k];
+ g[nonrel_idx[k]] = nonrel_counts[k]*scale;
+ }
+
+ return loss*scale;
+ }
+
+ friend void serialize(const loss_ranking_& , std::ostream& out)
+ {
+ serialize("loss_ranking_", out);
+ }
+
+ friend void deserialize(loss_ranking_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "loss_ranking_")
+ throw serialization_error("Unexpected version found while deserializing dlib::loss_ranking_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const loss_ranking_& )
+ {
+ out << "loss_ranking";
+ return out;
+ }
+
+ friend void to_xml(const loss_ranking_& /*item*/, std::ostream& out)
+ {
+ out << "<loss_ranking/>";
+ }
+
+ };
+
+ template <typename SUBNET>
+ using loss_ranking = add_loss_layer<loss_ranking_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_mean_squared_
+ {
+ public:
+
+ typedef float training_label_type;
+ typedef float output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const
+ {
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+
+ const tensor& output_tensor = sub.get_output();
+
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+ output_tensor.nc() == 1 &&
+ output_tensor.k() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+
+ const float* out_data = output_tensor.host();
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ *iter++ = out_data[i];
+ }
+ }
+
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ tensor& grad = sub.get_gradient_input();
+
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() != 0);
+ DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0);
+ DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+ output_tensor.nc() == 1 &&
+ output_tensor.k() == 1);
+ DLIB_CASSERT(grad.nr() == 1 &&
+ grad.nc() == 1 &&
+ grad.k() == 1);
+
+ // The loss we output is the average loss over the mini-batch.
+ const double scale = 1.0/output_tensor.num_samples();
+ double loss = 0;
+ float* g = grad.host_write_only();
+ const float* out_data = output_tensor.host();
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ const float y = *truth++;
+ const float temp1 = y - out_data[i];
+ const float temp2 = scale*temp1;
+ loss += temp2*temp1;
+ g[i] = -temp2;
+
+ }
+ return loss;
+ }
+
+ friend void serialize(const loss_mean_squared_& , std::ostream& out)
+ {
+ serialize("loss_mean_squared_", out);
+ }
+
+ friend void deserialize(loss_mean_squared_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "loss_mean_squared_")
+ throw serialization_error("Unexpected version found while deserializing dlib::loss_mean_squared_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const loss_mean_squared_& )
+ {
+ out << "loss_mean_squared";
+ return out;
+ }
+
+ friend void to_xml(const loss_mean_squared_& /*item*/, std::ostream& out)
+ {
+ out << "<loss_mean_squared/>";
+ }
+
+ };
+
+ template <typename SUBNET>
+ using loss_mean_squared = add_loss_layer<loss_mean_squared_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_epsilon_insensitive_
+ {
+ public:
+
+ typedef float training_label_type;
+ typedef float output_label_type;
+
+ loss_epsilon_insensitive_() = default;
+ loss_epsilon_insensitive_(double eps) : eps(eps)
+ {
+ DLIB_CASSERT(eps >= 0, "You can't set a negative error epsilon.");
+ }
+
+ double get_epsilon () const { return eps; }
+ void set_epsilon(double e)
+ {
+ DLIB_CASSERT(e >= 0, "You can't set a negative error epsilon.");
+ eps = e;
+ }
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const
+ {
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+
+ const tensor& output_tensor = sub.get_output();
+
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+ output_tensor.nc() == 1 &&
+ output_tensor.k() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+
+ const float* out_data = output_tensor.host();
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ *iter++ = out_data[i];
+ }
+ }
+
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ tensor& grad = sub.get_gradient_input();
+
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() != 0);
+ DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0);
+ DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+ output_tensor.nc() == 1 &&
+ output_tensor.k() == 1);
+ DLIB_CASSERT(grad.nr() == 1 &&
+ grad.nc() == 1 &&
+ grad.k() == 1);
+
+ // The loss we output is the average loss over the mini-batch.
+ const double scale = 1.0/output_tensor.num_samples();
+ double loss = 0;
+ float* g = grad.host_write_only();
+ const float* out_data = output_tensor.host();
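+            // This is the epsilon-insensitive loss used in support vector regression:
+            // errors no larger than eps cost nothing, while larger errors are penalized
+            // linearly.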
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ const float y = *truth++;
+ const float err = out_data[i]-y;
+ if (err > eps)
+ {
+ loss += scale*(err-eps);
+ g[i] = scale;
+ }
+ else if (err < -eps)
+ {
+ loss += scale*(eps-err);
+ g[i] = -scale;
+                }
+                else
+                {
+                    // The error is inside the epsilon tube so it contributes no loss, but
+                    // the gradient still has to be written explicitly because grad came
+                    // from host_write_only(), which leaves its contents undefined.
+                    g[i] = 0;
+                }
+ }
+ return loss;
+ }
+
+ friend void serialize(const loss_epsilon_insensitive_& item, std::ostream& out)
+ {
+ serialize("loss_epsilon_insensitive_", out);
+ serialize(item.eps, out);
+ }
+
+ friend void deserialize(loss_epsilon_insensitive_& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "loss_epsilon_insensitive_")
+ throw serialization_error("Unexpected version found while deserializing dlib::loss_epsilon_insensitive_.");
+ deserialize(item.eps, in);
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const loss_epsilon_insensitive_& item)
+ {
+ out << "loss_epsilon_insensitive epsilon: " << item.eps;
+ return out;
+ }
+
+ friend void to_xml(const loss_epsilon_insensitive_& item, std::ostream& out)
+ {
+ out << "<loss_epsilon_insensitive_ epsilon='" << item.eps << "'/>";
+ }
+
+ private:
+ double eps = 1;
+
+ };
+
+ template <typename SUBNET>
+ using loss_epsilon_insensitive = add_loss_layer<loss_epsilon_insensitive_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_mean_squared_multioutput_
+ {
+ public:
+
+ typedef matrix<float> training_label_type;
+ typedef matrix<float> output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const
+ {
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+
+ const tensor& output_tensor = sub.get_output();
+
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+                         output_tensor.nc() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+
+ const float* out_data = output_tensor.host();
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ *iter++ = mat(out_data, output_tensor.k(), 1);
+ out_data += output_tensor.k();
+ }
+ }
+
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ tensor& grad = sub.get_gradient_input();
+
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() != 0);
+ DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0);
+ DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+ DLIB_CASSERT(output_tensor.nr() == 1 &&
+ output_tensor.nc() == 1);
+ DLIB_CASSERT(grad.nr() == 1 &&
+ grad.nc() == 1);
+ DLIB_CASSERT(grad.k() == output_tensor.k());
+ const long k = output_tensor.k();
+ for (long idx = 0; idx < output_tensor.num_samples(); ++idx)
+ {
+ const_label_iterator truth_matrix_ptr = (truth + idx);
+ DLIB_CASSERT((*truth_matrix_ptr).nr() == k &&
+ (*truth_matrix_ptr).nc() == 1);
+ }
+
+ // The loss we output is the average loss over the mini-batch.
+ const double scale = 1.0/output_tensor.num_samples();
+ double loss = 0;
+ float* g = grad.host_write_only();
+ const float* out_data = output_tensor.host();
+ matrix<float> ytrue;
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ ytrue = *truth++;
+ for (long j = 0; j < output_tensor.k(); ++j)
+ {
+ const float y = ytrue(j, 0);
+ const float temp1 = y - *out_data++;
+ const float temp2 = scale*temp1;
+ loss += temp2*temp1;
+ *g = -temp2;
+ ++g;
+ }
+
+ }
+ return loss;
+ }
+
+ friend void serialize(const loss_mean_squared_multioutput_& , std::ostream& out)
+ {
+ serialize("loss_mean_squared_multioutput_", out);
+ }
+
+ friend void deserialize(loss_mean_squared_multioutput_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "loss_mean_squared_multioutput_")
+                throw serialization_error("Unexpected version found while deserializing dlib::loss_mean_squared_multioutput_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const loss_mean_squared_multioutput_& )
+ {
+ out << "loss_mean_squared_multioutput";
+ return out;
+ }
+
+ friend void to_xml(const loss_mean_squared_multioutput_& /*item*/, std::ostream& out)
+ {
+ out << "<loss_mean_squared_multioutput/>";
+ }
+
+ };
+
+ template <typename SUBNET>
+ using loss_mean_squared_multioutput = add_loss_layer<loss_mean_squared_multioutput_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_multiclass_log_per_pixel_
+ {
+ public:
+
+ // In semantic segmentation, if you don't know the ground-truth of some pixel,
+ // set the label of that pixel to this value. When you do so, the pixel will be
+ // ignored when computing gradients.
+ static const uint16_t label_to_ignore = std::numeric_limits<uint16_t>::max();
+
+
+ // In semantic segmentation, 65535 classes ought to be enough for anybody.
+ typedef matrix<uint16_t> training_label_type;
+ typedef matrix<uint16_t> output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ static void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ )
+ {
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+
+ const tensor& output_tensor = sub.get_output();
+
+ DLIB_CASSERT(output_tensor.k() >= 1); // Note that output_tensor.k() should match the number of labels.
+ DLIB_CASSERT(output_tensor.k() < std::numeric_limits<uint16_t>::max());
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+
+ const float* const out_data = output_tensor.host();
+
+ // The index of the largest output for each element is the label.
+ const auto find_label = [&](long sample, long r, long c)
+ {
+ uint16_t label = 0;
+ float max_value = out_data[tensor_index(output_tensor, sample, 0, r, c)];
+ for (long k = 1; k < output_tensor.k(); ++k)
+ {
+ const float value = out_data[tensor_index(output_tensor, sample, k, r, c)];
+ if (value > max_value)
+ {
+ label = static_cast<uint16_t>(k);
+ max_value = value;
+ }
+ }
+ return label;
+ };
+
+ for (long i = 0; i < output_tensor.num_samples(); ++i, ++iter)
+ {
+ iter->set_size(output_tensor.nr(), output_tensor.nc());
+ for (long r = 0; r < output_tensor.nr(); ++r)
+ {
+ for (long c = 0; c < output_tensor.nc(); ++c)
+ {
+ // The index of the largest output for this element is the label.
+ iter->operator()(r, c) = find_label(i, r, c);
+ }
+ }
+ }
+ }
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ tensor& grad = sub.get_gradient_input();
+
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() != 0);
+ DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0);
+ DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+ DLIB_CASSERT(output_tensor.k() >= 1);
+ DLIB_CASSERT(output_tensor.k() < std::numeric_limits<uint16_t>::max());
+ DLIB_CASSERT(output_tensor.nr() == grad.nr() &&
+ output_tensor.nc() == grad.nc() &&
+ output_tensor.k() == grad.k());
+ for (long idx = 0; idx < output_tensor.num_samples(); ++idx)
+ {
+ const_label_iterator truth_matrix_ptr = (truth + idx);
+ DLIB_CASSERT(truth_matrix_ptr->nr() == output_tensor.nr() &&
+ truth_matrix_ptr->nc() == output_tensor.nc(),
+ "truth size = " << truth_matrix_ptr->nr() << " x " << truth_matrix_ptr->nc() << ", "
+ "output size = " << output_tensor.nr() << " x " << output_tensor.nc());
+ }
+
+ tt::softmax(grad, output_tensor);
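+            // grad now holds per-pixel softmax probabilities.  The loop below converts
+            // them, in place, into the cross-entropy gradient (p-1 for the true class, p
+            // otherwise) and zeroes the gradient of any pixel labeled label_to_ignore.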
+
+ // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
+ const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc());
+ double loss = 0;
+ float* const g = grad.host();
+ for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
+ {
+ for (long r = 0; r < output_tensor.nr(); ++r)
+ {
+ for (long c = 0; c < output_tensor.nc(); ++c)
+ {
+ const uint16_t y = truth->operator()(r, c);
+ // The network must produce a number of outputs that is equal to the number
+ // of labels when using this type of loss.
+ DLIB_CASSERT(static_cast<long>(y) < output_tensor.k() || y == label_to_ignore,
+ "y: " << y << ", output_tensor.k(): " << output_tensor.k());
+ for (long k = 0; k < output_tensor.k(); ++k)
+ {
+ const size_t idx = tensor_index(output_tensor, i, k, r, c);
+ if (k == y)
+ {
+ loss += scale*-safe_log(g[idx]);
+ g[idx] = scale*(g[idx] - 1);
+ }
+ else if (y == label_to_ignore)
+ {
+ g[idx] = 0.f;
+ }
+ else
+ {
+ g[idx] = scale*g[idx];
+ }
+ }
+ }
+ }
+ }
+ return loss;
+ }
+
+ friend void serialize(const loss_multiclass_log_per_pixel_& , std::ostream& out)
+ {
+ serialize("loss_multiclass_log_per_pixel_", out);
+ }
+
+ friend void deserialize(loss_multiclass_log_per_pixel_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "loss_multiclass_log_per_pixel_")
+ throw serialization_error("Unexpected version found while deserializing dlib::loss_multiclass_log_per_pixel_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const loss_multiclass_log_per_pixel_& )
+ {
+ out << "loss_multiclass_log_per_pixel";
+ return out;
+ }
+
+ friend void to_xml(const loss_multiclass_log_per_pixel_& /*item*/, std::ostream& out)
+ {
+ out << "<loss_multiclass_log_per_pixel/>";
+ }
+
+ private:
+ static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
+ {
+ // See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
+ return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
+ }
+
+ };
+
+ template <typename SUBNET>
+ using loss_multiclass_log_per_pixel = add_loss_layer<loss_multiclass_log_per_pixel_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_multiclass_log_per_pixel_weighted_
+ {
+ public:
+
+ struct weighted_label
+ {
+ weighted_label()
+ {}
+
+ weighted_label(uint16_t label, float weight = 1.f)
+ : label(label), weight(weight)
+ {}
+
+ // In semantic segmentation, 65536 classes ought to be enough for anybody.
+ uint16_t label = 0;
+ float weight = 1.f;
+ };
+
+ typedef matrix<weighted_label> training_label_type;
+ typedef matrix<uint16_t> output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ static void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ )
+ {
+ loss_multiclass_log_per_pixel_::to_label(input_tensor, sub, iter);
+ }
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ tensor& grad = sub.get_gradient_input();
+
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() != 0);
+ DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0);
+ DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+ DLIB_CASSERT(output_tensor.k() >= 1);
+ DLIB_CASSERT(output_tensor.k() < std::numeric_limits<uint16_t>::max());
+ DLIB_CASSERT(output_tensor.nr() == grad.nr() &&
+ output_tensor.nc() == grad.nc() &&
+ output_tensor.k() == grad.k());
+ for (long idx = 0; idx < output_tensor.num_samples(); ++idx)
+ {
+ const_label_iterator truth_matrix_ptr = (truth + idx);
+ DLIB_CASSERT(truth_matrix_ptr->nr() == output_tensor.nr() &&
+ truth_matrix_ptr->nc() == output_tensor.nc(),
+ "truth size = " << truth_matrix_ptr->nr() << " x " << truth_matrix_ptr->nc() << ", "
+ "output size = " << output_tensor.nr() << " x " << output_tensor.nc());
+ }
+
+ tt::softmax(grad, output_tensor);
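+            // This works just like loss_multiclass_log_per_pixel_, except each pixel's
+            // loss and gradient are multiplied by that pixel's weight, so a weight of 0
+            // makes the pixel be ignored.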
+
+ // The loss we output is the weighted average loss over the mini-batch, and also over each element of the matrix output.
+ const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc());
+ double loss = 0;
+ float* const g = grad.host();
+ for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
+ {
+ for (long r = 0; r < output_tensor.nr(); ++r)
+ {
+ for (long c = 0; c < output_tensor.nc(); ++c)
+ {
+ const weighted_label& weighted_label = truth->operator()(r, c);
+ const uint16_t y = weighted_label.label;
+ const float weight = weighted_label.weight;
+ // The network must produce a number of outputs that is equal to the number
+ // of labels when using this type of loss.
+ DLIB_CASSERT(static_cast<long>(y) < output_tensor.k() || weight == 0.f,
+ "y: " << y << ", output_tensor.k(): " << output_tensor.k());
+ for (long k = 0; k < output_tensor.k(); ++k)
+ {
+ const size_t idx = tensor_index(output_tensor, i, k, r, c);
+ if (k == y)
+ {
+ loss += weight*scale*-safe_log(g[idx]);
+ g[idx] = weight*scale*(g[idx] - 1);
+ }
+ else
+ {
+ g[idx] = weight*scale*g[idx];
+ }
+ }
+ }
+ }
+ }
+ return loss;
+ }
+
+ friend void serialize(const loss_multiclass_log_per_pixel_weighted_& , std::ostream& out)
+ {
+ serialize("loss_multiclass_log_per_pixel_weighted_", out);
+ }
+
+ friend void deserialize(loss_multiclass_log_per_pixel_weighted_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "loss_multiclass_log_per_pixel_weighted_")
+ throw serialization_error("Unexpected version found while deserializing dlib::loss_multiclass_log_per_pixel_weighted_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const loss_multiclass_log_per_pixel_weighted_& )
+ {
+ out << "loss_multiclass_log_per_pixel_weighted";
+ return out;
+ }
+
+ friend void to_xml(const loss_multiclass_log_per_pixel_weighted_& /*item*/, std::ostream& out)
+ {
+ out << "<loss_multiclass_log_per_pixel_weighted/>";
+ }
+
+ private:
+ static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
+ {
+ // See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
+ return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
+ }
+
+ };
+
+ template <typename SUBNET>
+ using loss_multiclass_log_per_pixel_weighted = add_loss_layer<loss_multiclass_log_per_pixel_weighted_, SUBNET>;
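+
+    // A minimal usage sketch (illustrative only, not part of the original header), assuming a
+    // hypothetical ground-truth image `truth_image` of type matrix<uint16_t> and a rare class
+    // `rare_label` that should be emphasized with a larger weight:
+    //
+    //     matrix<loss_multiclass_log_per_pixel_weighted_::weighted_label> wlabels(
+    //         truth_image.nr(), truth_image.nc());
+    //     for (long r = 0; r < truth_image.nr(); ++r)
+    //         for (long c = 0; c < truth_image.nc(); ++c)
+    //             wlabels(r,c) = loss_multiclass_log_per_pixel_weighted_::weighted_label(
+    //                 truth_image(r,c), truth_image(r,c) == rare_label ? 10.f : 1.f);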
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_mean_squared_per_pixel_
+ {
+ public:
+
+ typedef matrix<float> training_label_type;
+ typedef matrix<float> output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const
+ {
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+
+ const tensor& output_tensor = sub.get_output();
+
+ DLIB_CASSERT(output_tensor.k() == 1, "output k = " << output_tensor.k());
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+
+ const float* out_data = output_tensor.host();
+ for (long i = 0; i < output_tensor.num_samples(); ++i, ++iter)
+ {
+ iter->set_size(output_tensor.nr(), output_tensor.nc());
+ for (long r = 0; r < output_tensor.nr(); ++r)
+ {
+ for (long c = 0; c < output_tensor.nc(); ++c)
+ {
+ iter->operator()(r, c) = out_data[tensor_index(output_tensor, i, 0, r, c)];
+ }
+ }
+ }
+ }
+
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ tensor& grad = sub.get_gradient_input();
+
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() != 0);
+ DLIB_CASSERT(input_tensor.num_samples() % sub.sample_expansion_factor() == 0);
+ DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+ DLIB_CASSERT(output_tensor.k() >= 1);
+ DLIB_CASSERT(output_tensor.k() < std::numeric_limits<uint16_t>::max());
+ DLIB_CASSERT(output_tensor.nr() == grad.nr() &&
+ output_tensor.nc() == grad.nc() &&
+ output_tensor.k() == grad.k());
+ for (long idx = 0; idx < output_tensor.num_samples(); ++idx)
+ {
+ const_label_iterator truth_matrix_ptr = (truth + idx);
+ DLIB_CASSERT(truth_matrix_ptr->nr() == output_tensor.nr() &&
+ truth_matrix_ptr->nc() == output_tensor.nc(),
+ "truth size = " << truth_matrix_ptr->nr() << " x " << truth_matrix_ptr->nc() << ", "
+ "output size = " << output_tensor.nr() << " x " << output_tensor.nc());
+ }
+
+ // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
+ const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc());
+ double loss = 0;
+ float* const g = grad.host();
+ const float* out_data = output_tensor.host();
+ for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
+ {
+ for (long r = 0; r < output_tensor.nr(); ++r)
+ {
+ for (long c = 0; c < output_tensor.nc(); ++c)
+ {
+ const float y = truth->operator()(r, c);
+ const size_t idx = tensor_index(output_tensor, i, 0, r, c);
+ const float temp1 = y - out_data[idx];
+ const float temp2 = scale*temp1;
+ loss += temp2*temp1;
+ g[idx] = -temp2;
+ }
+ }
+ }
+ return loss;
+ }
+
+ friend void serialize(const loss_mean_squared_per_pixel_& , std::ostream& out)
+ {
+ serialize("loss_mean_squared_per_pixel_", out);
+ }
+
+ friend void deserialize(loss_mean_squared_per_pixel_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "loss_mean_squared_per_pixel_")
+ throw serialization_error("Unexpected version found while deserializing dlib::loss_mean_squared_per_pixel_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const loss_mean_squared_per_pixel_& )
+ {
+ out << "loss_mean_squared_per_pixel";
+ return out;
+ }
+
+ friend void to_xml(const loss_mean_squared_per_pixel_& /*item*/, std::ostream& out)
+ {
+ out << "<loss_mean_squared_per_pixel/>";
+ }
+
+ private:
+ static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
+ {
+ // See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
+ return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
+ }
+ };
+
+ template <typename SUBNET>
+ using loss_mean_squared_per_pixel = add_loss_layer<loss_mean_squared_per_pixel_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_dot_
+ {
+ public:
+
+ typedef matrix<float,0,1> training_label_type;
+ typedef matrix<float,0,1> output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() != 0);
+ DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0);
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ *iter++ = trans(rowm(mat(output_tensor),i));
+ }
+
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const
+ {
+ const tensor& output_tensor = sub.get_output();
+ tensor& grad = sub.get_gradient_input();
+
+ DLIB_CASSERT(sub.sample_expansion_factor() == 1);
+ DLIB_CASSERT(input_tensor.num_samples() != 0);
+ DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0);
+ DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
+ DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+
+ const long network_output_dims = output_tensor.size()/output_tensor.num_samples();
+
+
+ // The loss we output is the average loss over the mini-batch.
+ const double scale = 1.0/output_tensor.num_samples();
+ double loss = 0;
+ float* g = grad.host();
+ const float* out_data = output_tensor.host();
+ for (long i = 0; i < output_tensor.num_samples(); ++i)
+ {
+ DLIB_CASSERT(truth->size() == network_output_dims, "The network must output a vector with the same dimensionality as the training labels. "
+ << "\ntruth->size(): " << truth->size()
+ << "\nnetwork_output_dims: " << network_output_dims);
+
+ const float* t = &(*truth++)(0);
+
+ for (long j = 0; j < network_output_dims; ++j)
+ {
+ g[j] = -t[j]*scale;
+ loss -= out_data[j]*t[j];
+ }
+
+ g += network_output_dims;
+ out_data += network_output_dims;
+ }
+ return loss*scale;
+ }
+
+ friend void serialize(const loss_dot_& , std::ostream& out)
+ {
+ serialize("loss_dot_", out);
+ }
+
+ friend void deserialize(loss_dot_& , std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "loss_dot_")
+ throw serialization_error("Unexpected version found while deserializing dlib::loss_dot_.");
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const loss_dot_& )
+ {
+ out << "loss_dot";
+ return out;
+ }
+
+ friend void to_xml(const loss_dot_& /*item*/, std::ostream& out)
+ {
+ out << "<loss_dot/>";
+ }
+
+ };
+
+ template <typename SUBNET>
+ using loss_dot = add_loss_layer<loss_dot_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_LOSS_H_
+
diff --git a/ml/dlib/dlib/dnn/loss_abstract.h b/ml/dlib/dlib/dnn/loss_abstract.h
new file mode 100644
index 000000000..0dd043677
--- /dev/null
+++ b/ml/dlib/dlib/dnn/loss_abstract.h
@@ -0,0 +1,1542 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_DNn_LOSS_ABSTRACT_H_
+#ifdef DLIB_DNn_LOSS_ABSTRACT_H_
+
+#include "core_abstract.h"
+#include "../image_processing/full_object_detection_abstract.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class EXAMPLE_LOSS_LAYER_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ A loss layer is the final layer in a deep neural network. It computes the
+ task loss. That is, it computes a number that tells us how well the
+ network is performing on some task, such as predicting a binary label.
+
+ You can use one of the loss layers that comes with dlib (defined below).
+ But importantly, you are able to define your own loss layers to suit your
+ needs. You do this by creating a class that defines an interface matching
+ the one described by this EXAMPLE_LOSS_LAYER_ class. Note that there is no
+ dlib::EXAMPLE_LOSS_LAYER_ type. It is shown here purely to document the
+ interface that a loss layer must implement.
+
+ A loss layer can optionally provide a to_label() method that converts the
+ output of a network into a user defined type. If to_label() is not
+ provided then the operator() methods of add_loss_layer will not be
+ available, but otherwise everything will function as normal.
+
+ Finally, note that there are two broad flavors of loss layer, supervised
+ and unsupervised. The EXAMPLE_LOSS_LAYER_ as shown here is a supervised
+ layer. To make an unsupervised loss you simply leave out the
+ training_label_type typedef and the truth iterator argument to
+ compute_loss_value_and_gradient().
+ !*/
+
+ public:
+
+ // In most cases training_label_type and output_label_type will be the same type.
+ typedef whatever_type_you_use_for_training_labels training_label_type;
+        typedef whatever_type_you_use_for_output_labels output_label_type;
+
+ EXAMPLE_LOSS_LAYER_ (
+ );
+ /*!
+ ensures
+ - EXAMPLE_LOSS_LAYER_ objects are default constructable.
+ !*/
+
+ EXAMPLE_LOSS_LAYER_ (
+ const EXAMPLE_LOSS_LAYER_& item
+ );
+ /*!
+ ensures
+ - EXAMPLE_LOSS_LAYER_ objects are copy constructable.
+ !*/
+
+ // Implementing to_label() is optional.
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const;
+ /*!
+ requires
+ - SUBNET implements the SUBNET interface defined at the top of
+ layers_abstract.h.
+ - input_tensor was given as input to the network sub and the outputs are
+ now visible in layer<i>(sub).get_output(), for all valid i.
+ - input_tensor.num_samples() > 0
+ - input_tensor.num_samples()%sub.sample_expansion_factor() == 0.
+ - iter == an iterator pointing to the beginning of a range of
+ input_tensor.num_samples()/sub.sample_expansion_factor() elements. Moreover,
+ they must be output_label_type elements.
+ ensures
+ - Converts the output of the provided network to output_label_type objects and
+ stores the results into the range indicated by iter. In particular, for
+ all valid i, it will be the case that:
+ *(iter+i/sub.sample_expansion_factor()) is populated based on the output of
+ sub and corresponds to the ith sample in input_tensor.
+ !*/
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const;
+ /*!
+ requires
+ - SUBNET implements the SUBNET interface defined at the top of
+ layers_abstract.h.
+ - input_tensor was given as input to the network sub and the outputs are
+ now visible in layer<i>(sub).get_output(), for all valid i.
+ - input_tensor.num_samples() > 0
+ - input_tensor.num_samples()%sub.sample_expansion_factor() == 0.
+ - for all valid i:
+ - layer<i>(sub).get_gradient_input() has the same dimensions as
+ layer<i>(sub).get_output().
+ - layer<i>(sub).get_gradient_input() contains all zeros (i.e.
+ initially, all input gradients are 0).
+ - truth == an iterator pointing to the beginning of a range of
+ input_tensor.num_samples()/sub.sample_expansion_factor() elements. Moreover,
+ they must be training_label_type elements.
+ - for all valid i:
+ - *(truth+i/sub.sample_expansion_factor()) is the label of the ith sample in
+ input_tensor.
+ ensures
+ - This function computes a loss function that describes how well the output
+ of sub matches the expected labels given by truth. Let's write the loss
+ function as L(input_tensor, truth, sub).
+ - Then compute_loss_value_and_gradient() computes the gradient of L() with
+ respect to the outputs in sub. Specifically, compute_loss_value_and_gradient()
+ assigns the gradients into sub by performing the following tensor
+ assignments, for all valid i:
+ - layer<i>(sub).get_gradient_input() = the gradient of
+ L(input_tensor,truth,sub) with respect to layer<i>(sub).get_output().
+ Note that, since get_gradient_input() is zero initialized, you don't
+ have to write gradient information to layers that have a zero
+ loss gradient.
+ - returns L(input_tensor,truth,sub)
+ !*/
+ };
+
+ std::ostream& operator<<(std::ostream& out, const EXAMPLE_LOSS_LAYER_& item);
+ /*!
+        prints a string describing this layer.
+ !*/
+
+ void to_xml(const EXAMPLE_LOSS_LAYER_& item, std::ostream& out);
+ /*!
+ This function is optional, but required if you want to print your networks with
+ net_to_xml(). Therefore, to_xml() prints a layer as XML.
+ !*/
+
+ void serialize(const EXAMPLE_LOSS_LAYER_& item, std::ostream& out);
+ void deserialize(EXAMPLE_LOSS_LAYER_& item, std::istream& in);
+ /*!
+ provides serialization support
+ !*/
+
+ // For each loss layer you define, always define an add_loss_layer template so that
+ // layers can be easily composed. Moreover, the convention is that the layer class
+ // ends with an _ while the add_loss_layer template has the same name but without the
+ // trailing _.
+ template <typename SUBNET>
+ using EXAMPLE_LOSS_LAYER = add_loss_layer<EXAMPLE_LOSS_LAYER_, SUBNET>;
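+
+    // To make the interface above concrete, the following is a minimal sketch of a
+    // hypothetical user-defined loss (a per-sample mean absolute error over a single scalar
+    // output).  It is purely illustrative, not a loss shipped with dlib, and it omits
+    // to_label(), serialization, operator<<, and to_xml(), which would follow the same
+    // pattern as the losses documented below.
+    //
+    //     class loss_mean_abs_
+    //     {
+    //     public:
+    //         typedef float training_label_type;
+    //         typedef float output_label_type;
+    //
+    //         template <typename const_label_iterator, typename SUBNET>
+    //         double compute_loss_value_and_gradient (
+    //             const tensor& input_tensor,
+    //             const_label_iterator truth,
+    //             SUBNET& sub
+    //         ) const
+    //         {
+    //             const tensor& output_tensor = sub.get_output();
+    //             tensor& grad = sub.get_gradient_input();
+    //             DLIB_CASSERT(output_tensor.nr() == 1 && output_tensor.nc() == 1 && output_tensor.k() == 1);
+    //             DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
+    //
+    //             // The loss is averaged over the mini-batch.
+    //             const double scale = 1.0/output_tensor.num_samples();
+    //             double loss = 0;
+    //             float* g = grad.host();
+    //             const float* out = output_tensor.host();
+    //             for (long i = 0; i < output_tensor.num_samples(); ++i)
+    //             {
+    //                 const float err = out[i] - *truth++;
+    //                 loss += scale*std::abs(err);
+    //                 // Assign (don't add) the subgradient of abs(): +1, -1, or 0 at zero error.
+    //                 g[i] = scale*((err > 0) - (err < 0));
+    //             }
+    //             return loss;
+    //         }
+    //     };
+    //
+    //     template <typename SUBNET>
+    //     using loss_mean_abs = add_loss_layer<loss_mean_abs_, SUBNET>;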
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ class loss_binary_hinge_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object implements the loss layer interface defined above by
+ EXAMPLE_LOSS_LAYER_. In particular, it implements the hinge loss, which is
+ appropriate for binary classification problems. Therefore, the possible
+ labels when using this loss are +1 and -1. Moreover, it will cause the
+ network to produce outputs > 0 when predicting a member of the +1 class and
+ values < 0 otherwise.
+ !*/
+ public:
+
+ typedef float training_label_type;
+ typedef float output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+ it has the additional calling requirements that:
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().k() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ and the output label is the raw score for each classified object. If the score
+ is > 0 then the classifier is predicting the +1 class, otherwise it is
+ predicting the -1 class.
+ !*/
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+ except it has the additional calling requirements that:
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().k() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ - all values pointed to by truth are +1 or -1.
+ !*/
+
+ };
+
+ template <typename SUBNET>
+ using loss_binary_hinge = add_loss_layer<loss_binary_hinge_, SUBNET>;
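+
+    // For reference, the hinge loss in question has the usual form max(0, 1 - y*s) for a
+    // truth label y in {+1,-1} and a network output score s, so samples that are correctly
+    // classified with a margin y*s >= 1 contribute nothing to the loss.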
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_binary_log_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object implements the loss layer interface defined above by
+ EXAMPLE_LOSS_LAYER_. In particular, it implements the log loss, which is
+ appropriate for binary classification problems. Therefore, the possible
+ labels when using this loss are +1 and -1. Moreover, it will cause the
+ network to produce outputs > 0 when predicting a member of the +1 class and
+ values < 0 otherwise.
+
+ To be more specific, this object contains a sigmoid layer followed by a
+ cross-entropy layer.
+ !*/
+ public:
+
+ typedef float training_label_type;
+ typedef float output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+ it has the additional calling requirements that:
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().k() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ and the output label is the raw score for each classified object. If the score
+ is > 0 then the classifier is predicting the +1 class, otherwise it is
+ predicting the -1 class.
+ !*/
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+ except it has the additional calling requirements that:
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().k() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ - all values pointed to by truth are +1 or -1.
+ !*/
+
+ };
+
+ template <typename SUBNET>
+ using loss_binary_log = add_loss_layer<loss_binary_log_, SUBNET>;
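+
+    // Since this loss is described above as a sigmoid followed by a cross-entropy, the raw
+    // output score s can be read as a log-odds value; if a probability of the +1 class is
+    // needed, it can be recovered as 1/(1 + std::exp(-s)).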
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_multiclass_log_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object implements the loss layer interface defined above by
+ EXAMPLE_LOSS_LAYER_. In particular, it implements the multiclass logistic
+ regression loss (e.g. negative log-likelihood loss), which is appropriate
+ for multiclass classification problems. This means that the possible
+ labels when using this loss are integers >= 0.
+
+ Moreover, if after training you were to replace the loss layer of the
+ network with a softmax layer, the network outputs would give the
+ probabilities of each class assignment. That is, if you have K classes
+ then the network should output tensors with the tensor::k()'th dimension
+ equal to K. Applying softmax to these K values gives the probabilities of
+ each class. The index into that K dimensional vector with the highest
+ probability is the predicted class label.
+ !*/
+
+ public:
+
+ typedef unsigned long training_label_type;
+ typedef unsigned long output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+ it has the additional calling requirements that:
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ and the output label is the predicted class for each classified object. The number
+ of possible output classes is sub.get_output().k().
+ !*/
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+ except it has the additional calling requirements that:
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ - all values pointed to by truth are < sub.get_output().k()
+ !*/
+
+ };
+
+ template <typename SUBNET>
+ using loss_multiclass_log = add_loss_layer<loss_multiclass_log_, SUBNET>;
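+
+    // A short sketch of the relationship described above, assuming z is the K-dimensional
+    // column vector (e.g. a matrix<float,0,1>) of raw network outputs for one sample:
+    //
+    //     matrix<float,0,1> p = exp(z)/sum(exp(z));        // softmax: class probabilities
+    //     const unsigned long predicted_label = index_of_max(p);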
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_multimulticlass_log_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object implements the loss layer interface defined above by
+ EXAMPLE_LOSS_LAYER_. In particular, it implements a collection of
+ multiclass classifiers. An example will make its use clear. So suppose,
+ for example, that you want to make something that takes a picture of a
+ vehicle and answers the following questions:
+ - What type of vehicle is it? A sedan or a truck?
+ - What color is it? red, green, blue, gray, or black?
+ You need two separate multi-class classifiers to do this. One to decide
+ the type of vehicle, and another to decide the color. The
+ loss_multimulticlass_log_ allows you to pack these two classifiers into one
+ neural network. This means that when you use the network to process an
+ image it will output 2 labels for each image, the type label and the color
+ label.
+
+ To create a loss_multimulticlass_log_ for the above case you would
+ construct it as follows:
+ std::map<std::string,std::vector<std::string>> labels;
+ labels["type"] = {"sedan", "truck"};
+ labels["color"] = {"red", "green", "blue", "gray", "black"};
+ loss_multimulticlass_log_ myloss(labels);
+ Then you could use myloss with a network object and train it to do this
+ task. More generally, you can use any number of classifiers and labels
+ when using this object. Finally, each of the classifiers uses a standard
+ multi-class logistic regression loss.
+ !*/
+
+ public:
+
+ loss_multimulticlass_log_(
+ );
+ /*!
+ ensures
+ - #number_of_labels() == 0
+ - #get_labels().size() == 0
+ !*/
+
+ loss_multimulticlass_log_ (
+ const std::map<std::string,std::vector<std::string>>& labels
+ );
+ /*!
+ requires
+ - Each vector in labels must contain at least 2 strings. I.e. each
+ classifier must have at least two possible labels.
+ ensures
+ - #number_of_labels() == the total number of strings in all the
+ std::vectors in labels.
+ - #number_of_classifiers() == labels.size()
+ - #get_labels() == labels
+ !*/
+
+ unsigned long number_of_labels(
+ ) const;
+ /*!
+ ensures
+ - returns the total number of labels known to this loss. This is the count of
+ all the labels in each classifier.
+ !*/
+
+ unsigned long number_of_classifiers(
+ ) const;
+ /*!
+ ensures
+ - returns the number of classifiers defined by this loss.
+ !*/
+
+ std::map<std::string,std::vector<std::string>> get_labels (
+ ) const;
+ /*!
+ ensures
+ - returns the names of the classifiers and labels used by this loss. In
+ particular, if the returned object is L then:
+ - L[CLASS] == the set of labels used by the classifier CLASS.
+ - L.size() == number_of_classifiers()
+ - The count of strings in the vectors in L == number_of_labels()
+ !*/
+
+ class classifier_output
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object stores the predictions from one of the classifiers in
+ loss_multimulticlass_log_. It allows you to find out the most likely
+ string label predicted by that classifier, as well as get the class
+ conditional probability of any of the classes in the classifier.
+ !*/
+
+ public:
+
+ classifier_output(
+ );
+ /*!
+ ensures
+ - #num_classes() == 0
+ !*/
+
+ size_t num_classes(
+ ) const;
+ /*!
+ ensures
+ - returns the number of possible classes output by this classifier.
+ !*/
+
+ double probability_of_class (
+ size_t i
+ ) const;
+ /*!
+ requires
+ - i < num_classes()
+ ensures
+ - returns the probability that the true class has a label of label(i).
+ - The sum of probability_of_class(j) for j in the range [0, num_classes()) is always 1.
+ !*/
+
+ const std::string& label(
+ size_t i
+ ) const;
+ /*!
+ requires
+ - i < num_classes()
+ ensures
+ - returns the string label for the ith class.
+ !*/
+
+ operator std::string(
+ ) const;
+ /*!
+ requires
+ - num_classes() != 0
+ ensures
+ - returns the string label for the most probable class.
+ !*/
+
+ friend std::ostream& operator<< (std::ostream& out, const classifier_output& item);
+ /*!
+ requires
+ - num_classes() != 0
+ ensures
+ - prints the most probable class label to out.
+ !*/
+
+ };
+
+ // Both training_label_type and output_label_type should always have sizes equal to
+ // number_of_classifiers(). That is, the std::map should have an entry for every
+ // classifier known to this loss.
+ typedef std::map<std::string,std::string> training_label_type;
+ typedef std::map<std::string,classifier_output> output_label_type;
+
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+ it has the additional calling requirements that:
+ - number_of_labels() != 0
+ - sub.get_output().k() == number_of_labels()
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ !*/
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+ except it has the additional calling requirements that:
+ - number_of_labels() != 0
+ - sub.get_output().k() == number_of_labels()
+ It should be noted that the last layer in your network should usually
+ be an fc layer. If so, you can satisfy this requirement of k() being
+ number_of_labels() by calling set_num_outputs() prior to training your
+ network like so:
+ your_network.subnet().layer_details().set_num_outputs(your_network.loss_details().number_of_labels());
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ - All the std::maps pointed to by truth contain entries for all the
+ classifiers known to this loss. That is, it must be valid to call
+ truth[i][classifier] for any of the classifiers known to this loss. To
+ say this another way, all the training samples must contain labels for
+ each of the classifiers defined by this loss.
+
+ To really belabor this, this also means that truth[i].size() ==
+ get_labels().size() and that both truth[i] and get_labels() have the same
+ set of key strings. It also means that the value strings in truth[i]
+ must be strings known to the loss, i.e. they are valid labels according
+ to get_labels().
+ !*/
+ };
+
+ template <typename SUBNET>
+ using loss_multimulticlass_log = add_loss_layer<loss_multimulticlass_log_, SUBNET>;
+
+ // Allow comparison between classifier_outputs and std::string to check if the
+ // predicted class is a particular string.
+ inline bool operator== (const std::string& lhs, const loss_multimulticlass_log_::classifier_output& rhs)
+ { return lhs == static_cast<const std::string&>(rhs); }
+ inline bool operator== (const loss_multimulticlass_log_::classifier_output& lhs, const std::string& rhs)
+ { return rhs == static_cast<const std::string&>(lhs); }
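+
+    // Continuing the vehicle example in loss_multimulticlass_log_ above, a single training
+    // label supplies one string per classifier, and a single network prediction yields one
+    // classifier_output per classifier (a sketch, assuming `out` is such a prediction):
+    //
+    //     loss_multimulticlass_log_::training_label_type label;
+    //     label["type"]  = "truck";
+    //     label["color"] = "red";
+    //
+    //     std::string predicted_color = out["color"];  // via classifier_output's string conversion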
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ enum class use_image_pyramid : uint8_t
+ {
+ no,
+ yes
+ };
+
+ struct mmod_options
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object contains all the parameters that control the behavior of loss_mmod_.
+ !*/
+
+ public:
+
+ struct detector_window_details
+ {
+ detector_window_details() = default;
+ detector_window_details(unsigned long w, unsigned long h) : width(w), height(h) {}
+ detector_window_details(unsigned long w, unsigned long h, const std::string& l) : width(w), height(h), label(l) {}
+
+ unsigned long width = 0;
+ unsigned long height = 0;
+ std::string label;
+
+ friend inline void serialize(const detector_window_details& item, std::ostream& out);
+ friend inline void deserialize(detector_window_details& item, std::istream& in);
+ };
+
+ mmod_options() = default;
+
+ // This kind of object detector is a sliding window detector. The detector_windows
+ // field determines how many sliding windows we will use and what the shape of each
+ // window is. It also determines the output label applied to each detection
+ // identified by each window. Since you will usually use the MMOD loss with an
+ // image pyramid, the detector sizes also determine the size of the smallest object
+ // you can detect.
+ std::vector<detector_window_details> detector_windows;
+
+ // These parameters control how we penalize different kinds of mistakes. See
+ // Max-Margin Object Detection by Davis E. King (http://arxiv.org/abs/1502.00046)
+ // for further details.
+ double loss_per_false_alarm = 1;
+ double loss_per_missed_target = 1;
+
+ // A detection must have an intersection-over-union value greater than this for us
+ // to consider it a match against a ground truth box.
+ double truth_match_iou_threshold = 0.5;
+
+ // When doing non-max suppression, we use overlaps_nms to decide if a box overlaps
+ // an already output detection and should therefore be thrown out.
+ test_box_overlap overlaps_nms = test_box_overlap(0.4);
+
+ // Any mmod_rect in the training data that has its ignore field set to true defines
+ // an "ignore zone" in an image. Any detection from that area is totally ignored
+ // by the optimizer. Therefore, this overlaps_ignore field defines how we decide
+ // if a box falls into an ignore zone. You use these ignore zones if there are
+ // objects in your dataset that you are unsure if you want to detect or otherwise
+ // don't care if the detector gets them or not.
+ test_box_overlap overlaps_ignore;
+
+ // Usually the detector would be scale-invariant, and used with an image pyramid.
+ // However, sometimes scale-invariance may not be desired.
+ use_image_pyramid assume_image_pyramid = use_image_pyramid::yes;
+
+ mmod_options (
+ const std::vector<std::vector<mmod_rect>>& boxes,
+ const unsigned long target_size,
+ const unsigned long min_target_size,
+ const double min_detector_window_overlap_iou = 0.75
+ );
+ /*!
+ requires
+ - 0 < min_target_size <= target_size
+ - 0.5 < min_detector_window_overlap_iou < 1
+ ensures
+                - #assume_image_pyramid == use_image_pyramid::yes
+ - This function should be used when scale-invariance is desired, and
+ input_rgb_image_pyramid is therefore used as the input layer.
+ - This function tries to automatically set the MMOD options to reasonable
+ values, assuming you have a training dataset of boxes.size() images, where
+ the ith image contains objects boxes[i] you want to detect.
+ - The most important thing this function does is decide what detector
+ windows should be used. This is done by finding a set of detector
+ windows that are sized such that:
+ - When slid over an image pyramid, each box in boxes will have an
+ intersection-over-union with one of the detector windows of at least
+ min_detector_window_overlap_iou. That is, we will make sure that
+ each box in boxes could potentially be detected by one of the
+ detector windows. This essentially comes down to picking detector
+ windows with aspect ratios similar to the aspect ratios in boxes.
+ Note that we also make sure that each box can be detected by a window
+ with the same label. For example, if all the boxes had the same
+ aspect ratio but there were 4 different labels used in boxes then
+ there would be 4 resulting detector windows, one for each label.
+ - The longest edge of each detector window is target_size pixels in
+ length, unless the window's shortest side would be less than
+ min_target_size pixels in length. In this case the shortest side
+ will be set to min_target_size length, and the other side sized to
+ preserve the aspect ratio of the window.
+ This means that target_size and min_target_size control the size of the
+ detector windows, while the aspect ratios of the detector windows are
+ automatically determined by the contents of boxes. It should also be
+ emphasized that the detector isn't going to be able to detect objects
+ smaller than any of the detector windows. So consider that when setting
+ these sizes.
+ - This function will also set the overlaps_nms tester to the most
+ restrictive tester that doesn't reject anything in boxes.
+ !*/
+
+ mmod_options (
+ use_image_pyramid use_image_pyramid,
+ const std::vector<std::vector<mmod_rect>>& boxes,
+ const double min_detector_window_overlap_iou = 0.75
+ );
+ /*!
+ requires
+ - use_image_pyramid == use_image_pyramid::no
+ - 0.5 < min_detector_window_overlap_iou < 1
+ ensures
+ - This function should be used when scale-invariance is not desired, and
+ there is no intention to apply an image pyramid.
+ - This function tries to automatically set the MMOD options to reasonable
+ values, assuming you have a training dataset of boxes.size() images, where
+ the ith image contains objects boxes[i] you want to detect.
+ - The most important thing this function does is decide what detector
+ windows should be used. This is done by finding a set of detector
+ windows that are sized such that:
+ - When slid over an image, each box in boxes will have an
+ intersection-over-union with one of the detector windows of at least
+ min_detector_window_overlap_iou. That is, we will make sure that
+ each box in boxes could potentially be detected by one of the
+ detector windows.
+ - This function will also set the overlaps_nms tester to the most
+ restrictive tester that doesn't reject anything in boxes.
+ !*/
+ };
+
+ void serialize(const mmod_options& item, std::ostream& out);
+ void deserialize(mmod_options& item, std::istream& in);
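+
+    // A brief sketch of typical construction, assuming `training_boxes` is a
+    // std::vector<std::vector<mmod_rect>> holding the ground-truth boxes for each training
+    // image (the window sizes 70 and 30 below are placeholders, not recommendations):
+    //
+    //     mmod_options options(training_boxes, 70 /*target_size*/, 30 /*min_target_size*/);
+    //     options.overlaps_ignore = test_box_overlap(0.5, 0.95);  // optionally loosen the ignore test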
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_mmod_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object implements the loss layer interface defined above by
+ EXAMPLE_LOSS_LAYER_. In particular, it implements the Max Margin Object
+ Detection loss defined in the paper:
+ Max-Margin Object Detection by Davis E. King (http://arxiv.org/abs/1502.00046).
+
+ This means you use this loss if you want to detect the locations of objects
+ in images.
+
+ It should also be noted that this loss layer requires an input layer that
+ defines the following functions:
+ - image_contained_point()
+ - tensor_space_to_image_space()
+ - image_space_to_tensor_space()
+ A reference implementation of them and their definitions can be found in
+ the input_rgb_image_pyramid object, which is the recommended input layer to
+ be used with loss_mmod_.
+ !*/
+
+ public:
+
+ typedef std::vector<mmod_rect> training_label_type;
+ typedef std::vector<mmod_rect> output_label_type;
+
+ loss_mmod_(
+ );
+ /*!
+ ensures
+ - #get_options() == mmod_options()
+ !*/
+
+ loss_mmod_(
+ mmod_options options_
+ );
+ /*!
+ ensures
+ - #get_options() == options_
+ !*/
+
+ const mmod_options& get_options (
+ ) const;
+ /*!
+ ensures
+ - returns the options object that defines the general behavior of this loss layer.
+ !*/
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter,
+ double adjust_threshold = 0
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+ it has the additional calling requirements that:
+ - sub.get_output().k() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ Also, the output labels are std::vectors of mmod_rects where, for each mmod_rect R,
+ we have the following interpretations:
+ - R.rect == the location of an object in the image.
+                - R.detection_confidence == the score for the object, the bigger the score the
+ more confident the detector is that an object is really there. Only
+ objects with a detection_confidence > adjust_threshold are output. So if
+ you want to output more objects (that are also of less confidence) you
+ can call to_label() with a smaller value of adjust_threshold.
+ - R.ignore == false (this value is unused by to_label()).
+ !*/
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+ except it has the additional calling requirements that:
+ - sub.get_output().k() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ Also, the loss value returned is roughly equal to the average number of
+ mistakes made per image. This is the sum of false alarms and missed
+ detections, weighted by the loss weights for these types of mistakes specified
+ in the mmod_options.
+ !*/
+ };
+
+ template <typename SUBNET>
+ using loss_mmod = add_loss_layer<loss_mmod_, SUBNET>;
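+
+    // A short usage sketch, assuming `net` is a trained network whose loss layer is loss_mmod
+    // and `img` is an image acceptable to its input layer:
+    //
+    //     std::vector<mmod_rect> dets = net(img);
+    //     for (const auto& d : dets)
+    //         std::cout << d.rect << " detection_confidence: " << d.detection_confidence << std::endl;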
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_metric_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object implements the loss layer interface defined above by
+ EXAMPLE_LOSS_LAYER_. In particular, it allows you to learn to map objects
+ into a vector space where objects sharing the same class label are close to
+ each other, while objects with different labels are far apart.
+
+ To be specific, it optimizes the following loss function which considers
+ all pairs of objects in a mini-batch and computes a different loss depending
+ on their respective class labels. So if objects A1 and A2 in a mini-batch
+ share the same class label then their contribution to the loss is:
+ max(0, length(A1-A2)-get_distance_threshold() + get_margin())
+
+ While if A1 and B1 have different class labels then their contribution to
+ the loss function is:
+ max(0, get_distance_threshold()-length(A1-B1) + get_margin())
+
+ Therefore, this loss layer optimizes a version of the hinge loss.
+ Moreover, the loss is trying to make sure that all objects with the same
+ label are within get_distance_threshold() distance of each other.
+ Conversely, if two objects have different labels then they should be more
+ than get_distance_threshold() distance from each other in the learned
+ embedding. So this loss function gives you a natural decision boundary for
+ deciding if two objects are from the same class.
+
+ Finally, the loss balances the number of negative pairs relative to the
+ number of positive pairs. Therefore, if there are N pairs that share the
+ same identity in a mini-batch then the algorithm will only include the N
+ worst non-matching pairs in the loss. That is, the algorithm performs hard
+ negative mining on the non-matching pairs. This is important since there
+ are in general way more non-matching pairs than matching pairs. So to
+ avoid imbalance in the loss this kind of hard negative mining is useful.
+ !*/
+ public:
+
+ typedef unsigned long training_label_type;
+ typedef matrix<float,0,1> output_label_type;
+
+ loss_metric_(
+ );
+ /*!
+ ensures
+ - #get_margin() == 0.04
+ - #get_distance_threshold() == 0.6
+ !*/
+
+ loss_metric_(
+ float margin,
+ float dist_thresh
+ );
+ /*!
+ requires
+ - margin > 0
+ - dist_thresh > 0
+ ensures
+ - #get_margin() == margin
+ - #get_distance_threshold() == dist_thresh
+ !*/
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+ it has the additional calling requirements that:
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ This loss expects the network to produce a single vector (per sample) as
+ output. This vector is the learned embedding. Therefore, to_label() just
+ copies these output vectors from the network into the output label_iterators
+ given to this function, one for each sample in the input_tensor.
+ !*/
+
+ float get_margin() const;
+ /*!
+ ensures
+ - returns the margin value used by the loss function. See the discussion
+ in WHAT THIS OBJECT REPRESENTS for details.
+ !*/
+
+ float get_distance_threshold() const;
+ /*!
+ ensures
+ - returns the distance threshold value used by the loss function. See the discussion
+ in WHAT THIS OBJECT REPRESENTS for details.
+ !*/
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+ except it has the additional calling requirements that:
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ !*/
+
+ };
+
+ template <typename SUBNET>
+ using loss_metric = add_loss_layer<loss_metric_, SUBNET>;
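+
+    // A sketch of the decision rule implied by the loss above, assuming `net` is a trained
+    // network ending in loss_metric and s1, s2 are two input samples:
+    //
+    //     matrix<float,0,1> v1 = net(s1);
+    //     matrix<float,0,1> v2 = net(s2);
+    //     const bool same_class = length(v1 - v2) < net.loss_details().get_distance_threshold();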
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_ranking_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object implements the loss layer interface defined above by
+ EXAMPLE_LOSS_LAYER_. In particular, it implements the pairwise ranking
+ loss described in the paper:
+ Optimizing Search Engines using Clickthrough Data by Thorsten Joachims
+
+ This is the same loss function used by the dlib::svm_rank_trainer object.
+ Therefore, it is generally appropriate when you have a two class problem
+ and you want to learn a function that ranks one class before the other.
+
+ So for example, suppose you have two classes of data. Objects of type A
+ and objects of type B. Moreover, suppose that you want to sort the objects
+ so that A objects always come before B objects. This loss will help you
+ learn a function that assigns a real number to each object such that A
+ objects get a larger number assigned to them than B objects. This lets you
+ then sort the objects according to the output of the neural network and
+ obtain the desired result of having A objects come before B objects.
+
+ The training labels should be positive values for objects you want to get
+ high scores and negative for objects that should get small scores. So
+ relative to our A/B example, you would give A objects labels of +1 and B
+ objects labels of -1. This should cause the learned network to give A
+ objects large positive values and B objects negative values.
+
+
+ Finally, the specific loss function is:
+ For all pairs of positive vs negative training examples A_i and B_j respectively:
+                    sum_ij: max(0, B_j - A_i + margin_ij)
+                where margin_ij = the label for A_i minus the label for B_j.  If you
+ always use +1 and -1 labels then the margin is always 2. However, this
+ formulation allows you to give certain training samples different weight by
+ adjusting the training labels appropriately.
+ !*/
+
+ public:
+
+ typedef float training_label_type;
+ typedef float output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+ it has the additional calling requirements that:
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().k() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ and the output label is the predicted ranking score.
+ !*/
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+ except it has the additional calling requirements that:
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().k() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ !*/
+
+ };
+
+ template <typename SUBNET>
+ using loss_ranking = add_loss_layer<loss_ranking_, SUBNET>;
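+
+    // For example, with the usual +1/-1 labels the margin above is always 2, so a positive
+    // example scoring A and a negative example scoring B contribute max(0, B - A + 2) to the
+    // loss; the pair becomes loss-free only once the positive example outscores the negative
+    // one by at least 2.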
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_epsilon_insensitive_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object implements the loss layer interface defined above by
+ EXAMPLE_LOSS_LAYER_. In particular, it implements the epsilon insensitive
+ loss, which is appropriate for regression problems. In particular, this
+            loss function is:
+ loss(y1,y2) = abs(y1-y2)<epsilon ? 0 : abs(y1-y2)-epsilon
+
+ Therefore, the loss is basically just the abs() loss except there is a dead
+ zone around zero, causing the loss to not care about mistakes of magnitude
+ smaller than epsilon.
+ !*/
+ public:
+
+ typedef float training_label_type;
+ typedef float output_label_type;
+
+ loss_epsilon_insensitive_(
+ ) = default;
+ /*!
+ ensures
+ - #get_epsilon() == 1
+ !*/
+
+ loss_epsilon_insensitive_(
+ double eps
+ );
+ /*!
+ requires
+ - eps >= 0
+ ensures
+ - #get_epsilon() == eps
+ !*/
+
+ double get_epsilon (
+ ) const;
+ /*!
+ ensures
+ - returns the epsilon value used in the loss function. Mistakes in the
+ regressor smaller than get_epsilon() are ignored by the loss function.
+ !*/
+
+ void set_epsilon(
+ double eps
+ );
+ /*!
+ requires
+ - eps >= 0
+ ensures
+ - #get_epsilon() == eps
+ !*/
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+ it has the additional calling requirements that:
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().k() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ and the output label is the predicted continuous variable.
+ !*/
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+ except it has the additional calling requirements that:
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().k() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ !*/
+
+ };
+
+ template <typename SUBNET>
+ using loss_epsilon_insensitive = add_loss_layer<loss_epsilon_insensitive_, SUBNET>;
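+
+    // For example, with get_epsilon() == 0.5, predicting 2.1 when the truth is 2.3 incurs no
+    // loss (the error of 0.2 lies inside the dead zone), while predicting 3.0 incurs
+    // abs(3.0-2.3) - 0.5 = 0.2.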
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_mean_squared_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object implements the loss layer interface defined above by
+ EXAMPLE_LOSS_LAYER_. In particular, it implements the mean squared loss, which is
+ appropriate for regression problems.
+ !*/
+ public:
+
+ typedef float training_label_type;
+ typedef float output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+ it has the additional calling requirements that:
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().k() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ and the output label is the predicted continuous variable.
+ !*/
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+ except it has the additional calling requirements that:
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().k() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ !*/
+
+ };
+
+ template <typename SUBNET>
+ using loss_mean_squared = add_loss_layer<loss_mean_squared_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_mean_squared_multioutput_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object implements the loss layer interface defined above by
+ EXAMPLE_LOSS_LAYER_. In particular, it implements the mean squared loss,
+ which is appropriate for regression problems. It is basically just like
+ loss_mean_squared_ except that it lets you define multiple outputs instead
+ of just 1.
+ !*/
+ public:
+
+ typedef matrix<float> training_label_type;
+ typedef matrix<float> output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+ it has the additional calling requirements that:
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ and the output label is the predicted continuous variable.
+ !*/
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+ except it has the additional calling requirements that:
+ - sub.get_output().nr() == 1
+ - sub.get_output().nc() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ - (*(truth + idx)).nc() == 1 for all idx such that 0 <= idx < sub.get_output().num_samples()
+ - (*(truth + idx)).nr() == sub.get_output().k() for all idx such that 0 <= idx < sub.get_output().num_samples()
+ !*/
+
+ };
+
+ template <typename SUBNET>
+ using loss_mean_squared_multioutput = add_loss_layer<loss_mean_squared_multioutput_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_multiclass_log_per_pixel_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object implements the loss layer interface defined above by
+ EXAMPLE_LOSS_LAYER_. In particular, it implements the multiclass logistic
+ regression loss (e.g. negative log-likelihood loss), which is appropriate
+ for multiclass classification problems. It is basically just like
+ loss_multiclass_log_ except that it lets you define matrix outputs instead
+ of scalar outputs. It should be useful, for example, in semantic
+ segmentation where we want to classify each pixel of an image.
+ !*/
+ public:
+
+ // In semantic segmentation, if you don't know the ground-truth of some pixel,
+ // set the label of that pixel to this value. When you do so, the pixel will be
+ // ignored when computing gradients.
+ static const uint16_t label_to_ignore = std::numeric_limits<uint16_t>::max();
+
+ // In semantic segmentation, 65535 classes ought to be enough for anybody.
+ typedef matrix<uint16_t> training_label_type;
+ typedef matrix<uint16_t> output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+ it has the additional calling requirements that:
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ and the output label is the predicted class for each classified element. The number
+ of possible output classes is sub.get_output().k().
+ !*/
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+ except it has the additional calling requirements that:
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ - all values pointed to by truth are < sub.get_output().k() or are equal to label_to_ignore.
+ !*/
+
+ };
+
+ template <typename SUBNET>
+ using loss_multiclass_log_per_pixel = add_loss_layer<loss_multiclass_log_per_pixel_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_multiclass_log_per_pixel_weighted_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object implements the loss layer interface defined above by
+ EXAMPLE_LOSS_LAYER_. In particular, it implements the multiclass logistic
+ regression loss (e.g. negative log-likelihood loss), which is appropriate
+ for multiclass classification problems. It is basically just like
+ loss_multiclass_log_per_pixel_ except that it lets you define per-pixel
+ weights, which may be useful e.g. if you want to emphasize rare classes
+ while training. (If the classification problem is difficult, a flat weight
+ structure may lead the network to always predict the most common label, in
+ particular if the degree of imbalance is high. To emphasize a certain
+ class or classes, simply increase the weights of the corresponding pixels,
+ relative to the weights of the other pixels.)
+
+ Note that if you set the weight to 0 whenever a pixel's label is equal to
+ loss_multiclass_log_per_pixel_::label_to_ignore, and to 1 otherwise, then
+ you essentially get loss_multiclass_log_per_pixel_ as a special case.
+ !*/
+ public:
+
+ struct weighted_label
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object represents the truth label of a single pixel, together with
+ an associated weight (the higher the weight, the more emphasis the
+ corresponding pixel is given during the training).
+ !*/
+
+ weighted_label();
+ weighted_label(uint16_t label, float weight = 1.f);
+
+ // The ground-truth label. In semantic segmentation, 65536 classes ought to be
+ // enough for anybody.
+ uint16_t label = 0;
+
+ // The weight of the corresponding pixel.
+ float weight = 1.f;
+ };
+
+ typedef matrix<weighted_label> training_label_type;
+ typedef matrix<uint16_t> output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+ it has the additional calling requirements that:
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ and the output label is the predicted class for each classified element. The number
+ of possible output classes is sub.get_output().k().
+ !*/
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+ except it has the additional calling requirements that:
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ - all labels pointed to by truth are < sub.get_output().k(), or the corresponding weight
+ is zero.
+ !*/
+
+ };
+
+ template <typename SUBNET>
+ using loss_multiclass_log_per_pixel_weighted = add_loss_layer<loss_multiclass_log_per_pixel_weighted_, SUBNET>;
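+
+    // An illustrative, editorial sketch (not part of the original header): converting a
+    // plain per-pixel label into a weighted one that emphasizes a rare class. The class
+    // id 2 and the weight 10 are arbitrary assumptions, not recommendations.
+    inline matrix<loss_multiclass_log_per_pixel_weighted_::weighted_label>
+    example_make_weighted_label (
+        const matrix<uint16_t>& plain_label
+    )
+    {
+        typedef loss_multiclass_log_per_pixel_weighted_::weighted_label wlabel;
+        matrix<wlabel> label(plain_label.nr(), plain_label.nc());
+        for (long r = 0; r < label.nr(); ++r)
+        {
+            for (long c = 0; c < label.nc(); ++c)
+            {
+                const uint16_t cls = plain_label(r,c);
+                // give the rare class 10 times the emphasis of everything else
+                label(r,c) = wlabel(cls, cls==2 ? 10.f : 1.f);
+            }
+        }
+        return label;
+    }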
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_mean_squared_per_pixel_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object implements the loss layer interface defined above by
+ EXAMPLE_LOSS_LAYER_. In particular, it implements the mean squared loss,
+ which is appropriate for regression problems. It is basically just like
+ loss_mean_squared_multioutput_ except that it lets you define matrix or
+            image outputs instead of vectors.
+ !*/
+ public:
+
+ typedef matrix<float> training_label_type;
+ typedef matrix<float> output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+ it has the additional calling requirements that:
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ and the output labels are the predicted continuous variables.
+ !*/
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+ except it has the additional calling requirements that:
+ - sub.get_output().k() == 1
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ - for all idx such that 0 <= idx < sub.get_output().num_samples():
+ - sub.get_output().nr() == (*(truth + idx)).nr()
+ - sub.get_output().nc() == (*(truth + idx)).nc()
+ !*/
+ };
+
+ template <typename SUBNET>
+ using loss_mean_squared_per_pixel = add_loss_layer<loss_mean_squared_per_pixel_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+ class loss_dot_
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object implements the loss layer interface defined above by
+            EXAMPLE_LOSS_LAYER_. In particular, selecting this loss means you want to
+ maximize the dot product between the output of a network and a set of
+ training vectors. The loss is therefore the negative dot product. To be
+ very specific, if X is the output vector of a network and Y is a training
+ label (also a vector), then the loss for this training sample is: -dot(X,Y)
+ !*/
+
+ public:
+
+ typedef matrix<float,0,1> training_label_type;
+ typedef matrix<float,0,1> output_label_type;
+
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+ it has the additional calling requirements that:
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ and the output labels are simply the final network outputs stuffed into a
+ vector. To be very specific, the output is the following for all valid i:
+ *(iter+i) == trans(rowm(mat(sub.get_output()),i))
+ !*/
+
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const;
+ /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+ except it has the additional calling requirements that:
+ - sub.get_output().num_samples() == input_tensor.num_samples()
+ - sub.sample_expansion_factor() == 1
+ - Let NETWORK_OUTPUT_DIMS == sub.get_output().size()/sub.get_output().num_samples()
+ - for all idx such that 0 <= idx < sub.get_output().num_samples():
+ - NETWORK_OUTPUT_DIMS == (*(truth + idx)).size()
+ !*/
+ };
+
+ template <typename SUBNET>
+ using loss_dot = add_loss_layer<loss_dot_, SUBNET>;
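+
+    // An illustrative, editorial sketch (not part of the original header): the loss that
+    // loss_dot_ assigns to a single sample is just the negative dot product of the
+    // network output and the training vector.
+    inline double example_loss_dot_value (
+        const matrix<float,0,1>& network_output,
+        const matrix<float,0,1>& training_vector
+    )
+    {
+        // both vectors are assumed to have the same dimension
+        return -dot(network_output, training_vector);
+    }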
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_LOSS_ABSTRACT_H_
+
diff --git a/ml/dlib/dlib/dnn/solvers.h b/ml/dlib/dlib/dnn/solvers.h
new file mode 100644
index 000000000..204541a7e
--- /dev/null
+++ b/ml/dlib/dlib/dnn/solvers.h
@@ -0,0 +1,405 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_SOLVERS_H_
+#define DLIB_DNn_SOLVERS_H_
+
+#include "solvers_abstract.h"
+#include "tensor.h"
+#include <iostream>
+#include "layers.h"
+
+namespace dlib
+{
+ class sgd
+ {
+ public:
+
+ explicit sgd(
+ float weight_decay_,
+ float momentum_ = 0.9
+ )
+ {
+ weight_decay = weight_decay_;
+ momentum = momentum_;
+ }
+
+ sgd(
+ ) : sgd(0.0005, 0.9)
+ {
+ }
+
+ float get_momentum (
+ ) const { return momentum; }
+
+ float get_weight_decay (
+ ) const { return weight_decay; }
+
+ template <typename layer_type>
+ const tensor& operator() (
+ const float learning_rate,
+ const layer_type& l,
+ const tensor& params_grad
+ )
+ {
+ const tensor& params = l.get_layer_params();
+
+ DLIB_CASSERT(params.size() != 0);
+ if (v.size() == 0)
+ {
+ v.copy_size(params_grad);
+ v = 0;
+ }
+
+ const double lr = learning_rate*get_learning_rate_multiplier(l);
+ const double wd = weight_decay*get_weight_decay_multiplier(l);
+
+ //perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad);
+ tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr);
+
+ return v;
+ }
+
+ template <unsigned long N>
+ const tensor& operator() (
+ const float learning_rate,
+ const fc_<N,FC_HAS_BIAS>& l,
+ const tensor& params_grad
+ )
+ {
+ update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.get_num_outputs());
+ return v;
+ }
+
+ template <
+ long _num_filters,
+ long _nr,
+ long _nc,
+ int _stride_y,
+ int _stride_x,
+ int _padding_y,
+ int _padding_x
+ >
+ const tensor& operator() (
+ const float learning_rate,
+ const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
+ const tensor& params_grad
+ )
+ {
+ update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters());
+ return v;
+ }
+
+ template <
+ long _num_filters,
+ long _nr,
+ long _nc,
+ int _stride_y,
+ int _stride_x,
+ int _padding_y,
+ int _padding_x
+ >
+ const tensor& operator() (
+ const float learning_rate,
+ const cont_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
+ const tensor& params_grad
+ )
+ {
+ update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters());
+ return v;
+ }
+
+ template < layer_mode mode >
+ const tensor& operator() (
+ const float learning_rate,
+ const bn_<mode>& l,
+ const tensor& params_grad
+ )
+ {
+ update_considering_bias(learning_rate, l, params_grad, params_grad.size()/2);
+ return v;
+ }
+
+ friend void serialize(const sgd& item, std::ostream& out)
+ {
+ serialize("sgd2", out);
+ serialize(item.v, out);
+ serialize(item.weight_decay, out);
+ serialize(item.momentum, out);
+ }
+
+ friend void deserialize(sgd& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "sgd2")
+ throw serialization_error("Unexpected version found while deserializing dlib::sgd.");
+ deserialize(item.v, in);
+ deserialize(item.weight_decay, in);
+ deserialize(item.momentum, in);
+ }
+
+ friend std::ostream& operator<< (std::ostream& out, const sgd& item)
+ {
+ out << "sgd: weight_decay="<<item.get_weight_decay() << ", momentum="<<item.get_momentum();
+ return out;
+ }
+
+ private:
+
+ template <typename layer_type>
+ void update_considering_bias(
+ const float learning_rate,
+ const layer_type& l,
+ const tensor& params_grad,
+ unsigned long bias_offset
+ )
+ {
+ const tensor& params = l.get_layer_params();
+
+ DLIB_CASSERT(params.size() != 0);
+ if (v.size() == 0)
+ {
+ v.copy_size(params_grad);
+ v = 0;
+ }
+
+ double lr = learning_rate*get_learning_rate_multiplier(l);
+ double wd = weight_decay*get_weight_decay_multiplier(l);
+
+ //perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad);
+
+ if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1)
+ {
+ tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr);
+ }
+ else
+ {
+
+ tt::affine_transform_range(0, bias_offset, v, v, params, params_grad, momentum, -wd*lr, -lr);
+
+ // now update the biases but apply their multipliers
+ lr *= l.get_bias_learning_rate_multiplier();
+ wd *= l.get_bias_weight_decay_multiplier();
+ tt::affine_transform_range(bias_offset, v.size(), v, v, params, params_grad, momentum, -wd*lr, -lr);
+ }
+ }
+
+ resizable_tensor v;
+ float weight_decay;
+ float momentum;
+
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ class adam
+ {
+ public:
+
+ adam(
+ float weight_decay_,
+ float momentum1_,
+ float momentum2_
+ )
+ {
+ weight_decay = weight_decay_;
+ momentum1 = momentum1_;
+ momentum2 = momentum2_;
+ t = 0;
+ }
+
+ adam(
+ ) : adam(0.0005, 0.9, 0.999)
+ {}
+
+ float get_momentum1 (
+ ) const { return momentum1; }
+
+ float get_momentum2 (
+ ) const { return momentum2; }
+
+ float get_weight_decay (
+ ) const { return weight_decay; }
+
+ template <typename layer_type>
+ const tensor& operator() (
+ const float learning_rate,
+ const layer_type& l,
+ const tensor& params_grad
+ )
+ {
+ const tensor& params = l.get_layer_params();
+ DLIB_CASSERT(params.size() != 0);
+ if (v.size() == 0)
+ {
+ m.copy_size(params_grad);
+ m = 0;
+ v.copy_size(params_grad);
+ v = 0;
+ s.copy_size(params_grad);
+ }
+
+ ++t;
+
+
+ tt::compute_adam_update(0, params.size(), s, m, v, t,
+ learning_rate*get_learning_rate_multiplier(l),
+ weight_decay*get_weight_decay_multiplier(l),
+ momentum1, momentum2, params, params_grad);
+
+ return s;
+ }
+
+ template <unsigned long N>
+ const tensor& operator() (
+ const float learning_rate,
+ const fc_<N,FC_HAS_BIAS>& l,
+ const tensor& params_grad
+ )
+ {
+ update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.get_num_outputs());
+ return s;
+ }
+
+ template <
+ long _num_filters,
+ long _nr,
+ long _nc,
+ int _stride_y,
+ int _stride_x,
+ int _padding_y,
+ int _padding_x
+ >
+ const tensor& operator() (
+ const float learning_rate,
+ const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
+ const tensor& params_grad
+ )
+ {
+ update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters());
+ return s;
+ }
+
+ template <
+ long _num_filters,
+ long _nr,
+ long _nc,
+ int _stride_y,
+ int _stride_x,
+ int _padding_y,
+ int _padding_x
+ >
+ const tensor& operator() (
+ const float learning_rate,
+ const cont_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
+ const tensor& params_grad
+ )
+ {
+ update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters());
+ return s;
+ }
+
+ template < layer_mode mode >
+ const tensor& operator() (
+ const float learning_rate,
+ const bn_<mode>& l,
+ const tensor& params_grad
+ )
+ {
+ update_considering_bias(learning_rate, l, params_grad, params_grad.size()/2);
+ return s;
+ }
+
+
+ friend void serialize(const adam& item, std::ostream& out)
+ {
+ serialize("adam2", out);
+ serialize(item.m, out);
+ serialize(item.v, out);
+ serialize(item.s, out);
+ serialize(item.weight_decay, out);
+ serialize(item.momentum1, out);
+ serialize(item.momentum2, out);
+ serialize(item.t, out);
+ }
+
+ friend void deserialize(adam& item, std::istream& in)
+ {
+ std::string version;
+ deserialize(version, in);
+ if (version != "adam2")
+ throw serialization_error("Unexpected version found while deserializing dlib::adam.");
+ deserialize(item.m, in);
+ deserialize(item.v, in);
+ deserialize(item.s, in);
+ deserialize(item.weight_decay, in);
+ deserialize(item.momentum1, in);
+ deserialize(item.momentum2, in);
+ deserialize(item.t, in);
+ }
+
+ friend std::ostream& operator<< (std::ostream& out, const adam& item)
+ {
+ out << "adam: weight_decay="<<item.get_weight_decay() << ", momentum1="<<item.get_momentum1() << ", momentum2="<<item.get_momentum2();
+ return out;
+ }
+
+ private:
+
+ template <typename layer_type>
+ void update_considering_bias(
+ const float learning_rate,
+ const layer_type& l,
+ const tensor& params_grad,
+ unsigned long bias_offset
+ )
+ {
+ const tensor& params = l.get_layer_params();
+ DLIB_CASSERT(params.size() != 0);
+ if (v.size() == 0)
+ {
+ m.copy_size(params_grad);
+ m = 0;
+ v.copy_size(params_grad);
+ v = 0;
+ s.copy_size(params_grad);
+ }
+
+
+ ++t;
+
+ if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1)
+ {
+ tt::compute_adam_update(0, params.size(), s, m, v, t,
+ learning_rate*get_learning_rate_multiplier(l),
+ weight_decay*get_weight_decay_multiplier(l),
+ momentum1, momentum2, params, params_grad);
+ }
+ else
+ {
+ tt::compute_adam_update(0, bias_offset, s, m, v, t,
+ learning_rate*get_learning_rate_multiplier(l),
+ weight_decay*get_weight_decay_multiplier(l),
+ momentum1, momentum2, params, params_grad);
+
+ tt::compute_adam_update(bias_offset, params.size(), s, m, v, t,
+ learning_rate*get_learning_rate_multiplier(l)*l.get_bias_learning_rate_multiplier(),
+ weight_decay*get_weight_decay_multiplier(l)*l.get_bias_weight_decay_multiplier(),
+ momentum1, momentum2, params, params_grad);
+ }
+ }
+ resizable_tensor m;
+ resizable_tensor v;
+ resizable_tensor s;
+ float weight_decay;
+ float momentum1;
+ float momentum2;
+ float t;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_SOLVERS_H_
+
diff --git a/ml/dlib/dlib/dnn/solvers_abstract.h b/ml/dlib/dlib/dnn/solvers_abstract.h
new file mode 100644
index 000000000..d10ef163a
--- /dev/null
+++ b/ml/dlib/dlib/dnn/solvers_abstract.h
@@ -0,0 +1,204 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_DNn_SOLVERS_ABSTRACT_H_
+#ifdef DLIB_DNn_SOLVERS_ABSTRACT_H_
+
+#include "tensor_abstract.h"
+#include <iostream>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ class EXAMPLE_SOLVER
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ A solver defines the parameter update rule for a single layer in a deep
+ neural network. It takes a parameter gradient vector and the layer's
+ parameters and tells you how the parameters should be updated.
+ Importantly, each solver instance is used with only one layer in a network.
+ This allows us to define solvers that have per layer state, for example, a
+ solver may keep a momentum term and apply it to its update rule.
+
+ Note that there is no dlib::EXAMPLE_SOLVER type. It is shown here purely
+ to document the interface a solver object must implement.
+ !*/
+
+ public:
+
+ EXAMPLE_SOLVER(
+ );
+
+ template <typename layer_type>
+ const tensor& operator() (
+ const float learning_rate,
+ const layer_type& l,
+ const tensor& params_grad
+ )
+ /*!
+ requires
+ - l.get_layer_params().size() != 0
+ - have_same_dimensions(l.get_layer_params(), params_grad) == true.
+ - When this function is invoked on a particular solver instance, it is
+ always supplied with the same layer instance, l. That is, the solver is
+ allowed to remember things from one invocation to another and to assume
+ that it is being serially applied to optimize the same layer's
+ parameters.
+ ensures
+ - Returns a step vector V that is intended to be used to update the
+ parameters by adding V to l.get_layer_params().
+ - This function will use the given "learning rate" to compute V. How the
+ learning rate is used is solver dependent. But in general the learning
+ rate should be used to select the step size, i.e. to somehow determine
+ the magnitude of V.
+ !*/
+ };
+
+ void serialize(const EXAMPLE_SOLVER& item, std::ostream& out);
+ void deserialize(EXAMPLE_SOLVER& item, std::istream& in);
+ /*!
+ provides serialization support
+ !*/
+
+ std::ostream& operator<< (std::ostream& out, const EXAMPLE_SOLVER& item);
+ /*!
+ Prints the solver's name and parameters to out.
+ !*/
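+
+    // An illustrative, editorial sketch (not part of the original header): a minimal
+    // object satisfying the EXAMPLE_SOLVER interface. It performs plain gradient
+    // descent, i.e. the returned step is just -learning_rate*params_grad. A complete
+    // solver would also provide serialize(), deserialize(), and operator<< overloads.
+    class example_plain_gd_solver
+    {
+    public:
+        template <typename layer_type>
+        const tensor& operator() (
+            const float learning_rate,
+            const layer_type& /*l*/,
+            const tensor& params_grad
+        )
+        {
+            v.copy_size(params_grad);
+            v = -learning_rate*mat(params_grad);
+            return v;
+        }
+    private:
+        resizable_tensor v;
+    };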
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ class sgd
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object implements the EXAMPLE_SOLVER interface defined above. It is a
+ basic stochastic gradient descent solver which uses momentum and weight
+ decay. In particular, it computes the update vector V according to:
+ V = momentum*V - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad;
+ Here V is a momentum term that is remembered by the solver from one
+ invocation of operator() to the next.
+
+
+ Note that the actual learning rate and weight decay used by the solver are
+ multiplied by the per layer multipliers. That is, the solver will call
+ get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
+ multiply these values with the nominal learning rate and weight decay,
+ respectively, to determine the values it will use during each step. It is
+            also overloaded so that the bias learning rate and weight decay multipliers
+            of fc_, con_, cont_, and bn_ layers are applied to their bias parameters.
+ !*/
+ public:
+
+ sgd(
+ );
+ /*!
+ ensures
+ - #get_weight_decay() == 0.0005
+ - #get_momentum() == 0.9
+ !*/
+
+ explicit sgd(
+ float weight_decay,
+ float momentum = 0.9
+ );
+ /*!
+ requires
+ - weight_decay >= 0
+ - momentum >= 0
+ ensures
+ - #get_weight_decay() == weight_decay
+ - #get_momentum() == momentum
+ !*/
+
+ float get_weight_decay () const;
+ float get_momentum () const;
+ };
+
+ void serialize(const sgd& item, std::ostream& out);
+ void deserialize(sgd& item, std::istream& in);
+ /*!
+ provides serialization support
+ !*/
+
+ std::ostream& operator<< (std::ostream& out, const sgd& item);
+ /*!
+ Prints the solver's name and parameters to out.
+ !*/
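+
+    // An illustrative, editorial usage sketch (not part of the original header). A
+    // solver is normally not invoked directly. Instead it is handed to a dnn_trainer
+    // (declared in trainer_abstract.h), which keeps one solver instance per layer.
+    // net_type, samples, and labels are assumed to be defined elsewhere.
+    template <typename net_type, typename sample_type, typename label_type>
+    void example_train_with_sgd (
+        net_type& net,
+        const std::vector<sample_type>& samples,
+        const std::vector<label_type>& labels
+    )
+    {
+        dnn_trainer<net_type> trainer(net, sgd(0.0005, 0.9));
+        trainer.set_learning_rate(0.1);
+        trainer.set_min_learning_rate(1e-5);
+        trainer.train(samples, labels);
+    }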
+
+// ----------------------------------------------------------------------------------------
+
+ class adam
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object implements the EXAMPLE_SOLVER interface defined above. In
+ particular, it implements the ADAM parameter update method described in the
+ paper:
+                Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic
+                optimization." International Conference on Learning Representations. 2015.
+
+
+ Note that the actual learning rate and weight decay used by the solver are
+ multiplied by the per layer multipliers. That is, the solver will call
+ get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
+ multiply these values with the nominal learning rate and weight decay,
+ respectively, to determine the values it will use during each step. It is
+            also overloaded so that the bias learning rate and weight decay multipliers
+            of fc_, con_, cont_, and bn_ layers are applied to their bias parameters.
+ !*/
+
+ public:
+
+ adam(
+ );
+ /*!
+ ensures
+ - #get_weight_decay() == 0.0005
+ - #get_momentum1() == 0.9
+ - #get_momentum2() == 0.999
+ !*/
+
+ adam(
+ float weight_decay,
+ float momentum1,
+ float momentum2
+ );
+ /*!
+ requires
+ - weight_decay >= 0
+ - 0 <= momentum1 < 1
+ - 0 <= momentum2 < 1
+ ensures
+ - #get_weight_decay() == weight_decay
+ - #get_momentum1() == momentum1
+ - #get_momentum2() == momentum2
+ !*/
+
+ float get_weight_decay () const;
+ float get_momentum1 () const;
+ float get_momentum2 () const;
+ };
+
+ void serialize(const adam& item, std::ostream& out);
+ void deserialize(adam& item, std::istream& in);
+ /*!
+ provides serialization support
+ !*/
+
+ std::ostream& operator<< (std::ostream& out, const adam& item);
+ /*!
+ Prints the solver's name and parameters to out.
+ !*/
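+
+    // An illustrative, editorial sketch (not part of the original header): the textbook
+    // ADAM update for a single scalar parameter, following the cited paper. This is a
+    // simplified reference only and is not claimed to match dlib's exact implementation
+    // (which, for instance, also folds weight decay into the gradient). Assumes <cmath>
+    // is available for std::pow() and std::sqrt().
+    inline float example_adam_step (
+        float& m,                // running estimate of the gradient's first moment
+        float& v,                // running estimate of the gradient's second moment
+        const float g,           // current gradient
+        const float t,           // iteration number, starting at 1
+        const float learning_rate,
+        const float momentum1 = 0.9,
+        const float momentum2 = 0.999,
+        const float eps = 1e-8
+    )
+    {
+        m = momentum1*m + (1-momentum1)*g;
+        v = momentum2*v + (1-momentum2)*g*g;
+        const float mhat = m/(1-std::pow(momentum1,t));
+        const float vhat = v/(1-std::pow(momentum2,t));
+        // the returned value is the step that gets added to the parameter
+        return -learning_rate*mhat/(std::sqrt(vhat)+eps);
+    }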
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_SOLVERS_ABSTRACT_H_
+
diff --git a/ml/dlib/dlib/dnn/tensor.h b/ml/dlib/dlib/dnn/tensor.h
new file mode 100644
index 000000000..8039fe666
--- /dev/null
+++ b/ml/dlib/dlib/dnn/tensor.h
@@ -0,0 +1,686 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_TENSOR_H_
+#define DLIB_DNn_TENSOR_H_
+
+#include "tensor_abstract.h"
+#include <cstring>
+#include "../matrix.h"
+#include "cudnn_dlibapi.h"
+#include "gpu_data.h"
+#include "../byte_orderer.h"
+#include <memory>
+#include "../any.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ class tensor;
+ namespace cuda
+ {
+ void set_tensor (
+ tensor& t,
+ float value
+ );
+
+ void scale_tensor (
+ tensor& t,
+ float value
+ );
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ class tensor
+ {
+ public:
+
+ tensor (
+ ) :
+ m_n(0), m_k(0), m_nr(0), m_nc(0), m_size(0)
+ {
+ }
+
+ virtual ~tensor() {}
+
+ long long num_samples() const { return m_n; }
+ long long k() const { return m_k; }
+ long long nr() const { return m_nr; }
+ long long nc() const { return m_nc; }
+ size_t size() const { return m_size; }
+
+ typedef float* iterator;
+ typedef const float* const_iterator;
+ iterator begin() { return host(); }
+ const_iterator begin() const { return host(); }
+ iterator end() { return host()+size(); }
+ const_iterator end() const { return host()+size(); }
+
+ void async_copy_to_device() const
+ {
+ data().async_copy_to_device();
+ }
+
+ virtual const float* host() const = 0;
+ virtual float* host() = 0;
+ virtual float* host_write_only() = 0;
+ virtual const float* device() const = 0;
+ virtual float* device() = 0;
+ virtual float* device_write_only() = 0;
+
+ virtual const any& annotation() const = 0;
+ virtual any& annotation() = 0;
+
+ int device_id() const { return data().device_id(); }
+
+ tensor& operator= (float val)
+ {
+#ifdef DLIB_USE_CUDA
+ // If you are using CUDA then presumably you will be mostly using tensors on
+            // the GPU. So unless you appear to be actively working with the host side's
+            // data, we do this initialization on the device side since it avoids a host
+            // to device transfer that would likely immediately follow.
+ if (data().device_ready())
+ {
+ cuda::set_tensor(*this, val);
+ return *this;
+ }
+#endif
+ auto d = host_write_only();
+ for (size_t i = 0; i < size(); ++i)
+ d[i] = val;
+
+ return *this;
+ }
+
+ tensor& operator*= (float val)
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::scale_tensor(*this, val);
+ return *this;
+#else
+ for (auto& d : *this)
+ d *= val;
+
+ return *this;
+#endif
+ }
+
+ tensor& operator/= (float val)
+ {
+ *this *= 1.0/val;
+ return *this;
+ }
+
+ template <typename EXP>
+ tensor& operator= (const matrix_exp<EXP>& item)
+ {
+ DLIB_CASSERT(num_samples() == item.nr() &&
+ nr()*nc()*k() == item.nc());
+ static_assert((is_same_type<float, typename EXP::type>::value == true),
+ "To assign a matrix to a tensor the matrix must contain float values");
+
+ set_ptrm(host_write_only(), m_n, m_nr*m_nc*m_k) = item;
+ return *this;
+ }
+
+ template <typename EXP>
+ tensor& operator+= (const matrix_exp<EXP>& item)
+ {
+ DLIB_CASSERT(num_samples() == item.nr() &&
+ nr()*nc()*k() == item.nc());
+ static_assert((is_same_type<float, typename EXP::type>::value == true),
+ "To assign a matrix to a tensor the matrix must contain float values");
+ set_ptrm(host(), m_n, m_nr*m_nc*m_k) += item;
+ return *this;
+ }
+
+ template <typename EXP>
+ tensor& operator-= (const matrix_exp<EXP>& item)
+ {
+ DLIB_CASSERT(num_samples() == item.nr() &&
+ nr()*nc()*k() == item.nc());
+ static_assert((is_same_type<float, typename EXP::type>::value == true),
+ "To assign a matrix to a tensor the matrix must contain float values");
+ set_ptrm(host(), m_n, m_nr*m_nc*m_k) -= item;
+ return *this;
+ }
+
+ template <typename EXP>
+ void set_sample (
+ unsigned long long idx,
+ const matrix_exp<EXP>& item
+ )
+ {
+ DLIB_CASSERT(idx < (unsigned long long)num_samples());
+ DLIB_CASSERT(item.size() == nr()*nc()*k());
+ static_assert((is_same_type<float, typename EXP::type>::value == true),
+ "To assign a matrix to a tensor the matrix must contain float values");
+ set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) = item;
+ }
+
+
+ template <typename EXP>
+ void add_to_sample (
+ unsigned long long idx,
+ const matrix_exp<EXP>& item
+ )
+ {
+ DLIB_CASSERT(idx < (unsigned long long)num_samples());
+ DLIB_CASSERT(item.size() == nr()*nc()*k());
+ static_assert((is_same_type<float, typename EXP::type>::value == true),
+ "To assign a matrix to a tensor the matrix must contain float values");
+ set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) += item;
+ }
+
+
+#ifdef DLIB_USE_CUDA
+ virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor (
+ ) const = 0;
+#endif
+
+ friend void memcpy (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(dest.size() == src.size());
+ memcpy(dest.data(), dest.get_alias_offset(),
+ src.data(), src.get_alias_offset(),
+ src.size());
+ }
+
+
+ protected:
+
+ friend class alias_tensor;
+
+ virtual gpu_data& data() = 0;
+ virtual const gpu_data& data() const = 0;
+ virtual size_t get_alias_offset() const { return 0; } // needed by alias_tensor.
+
+ long long m_n;
+ long long m_k;
+ long long m_nr;
+ long long m_nc;
+ long long m_size; // always equal to m_n*m_k*m_nr*m_nc
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ inline bool is_vector (
+ const tensor& t
+ )
+ {
+ return t.size() == (size_t)t.num_samples() ||
+ t.size() == (size_t)t.k() ||
+ t.size() == (size_t)t.nr() ||
+ t.size() == (size_t)t.nc();
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ inline const matrix_op<op_pointer_to_mat<float> > mat (
+ const tensor& t,
+ long long nr,
+ long long nc
+ )
+ {
+ DLIB_ASSERT(nr >= 0 && nc >= 0 ,
+ "\tconst matrix_exp mat(tensor, nr, nc)"
+ << "\n\t nr and nc must be >= 0"
+ << "\n\t nr: " << nr
+ << "\n\t nc: " << nc
+ );
+ DLIB_ASSERT(nr*nc == (long long)t.size() ,
+ "\tconst matrix_exp mat(tensor, nr, nc)"
+ << "\n\t The sizes don't match up."
+ << "\n\t nr*nc: " << nr*nc
+ << "\n\t t.size(): " << t.size()
+ );
+ typedef op_pointer_to_mat<float> op;
+ return matrix_op<op>(op(t.host(),nr,nc));
+ }
+
+ inline const matrix_op<op_pointer_to_mat<float> > mat (
+ const tensor& t
+ )
+ {
+ if (t.size() != 0)
+ return mat(t, t.num_samples(), t.size()/t.num_samples());
+ else
+ return mat((float*)0,0,0);
+ }
+
+ inline const matrix_op<op_pointer_to_mat<float> > image_plane (
+ const tensor& t,
+ long long sample = 0,
+ long long k = 0
+ )
+ {
+ DLIB_ASSERT(0 <= sample && sample < t.num_samples() &&
+ 0 <= k && k < t.k() &&
+ t.size() != 0,
+ "\tconst matrix_exp image_plane(tensor,sample,k)"
+ << "\n\t Invalid arguments were given to this function."
+ << "\n\t sample: " << sample
+ << "\n\t k: " << k
+ << "\n\t t.num_samples(): " << t.num_samples()
+ << "\n\t t.k(): " << t.k()
+ << "\n\t t.size(): " << t.size()
+ );
+
+
+ typedef op_pointer_to_mat<float> op;
+ return matrix_op<op>(op(t.host() + ((sample*t.k() + k)*t.nr())*t.nc(),
+ t.nr(),
+ t.nc()));
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ inline bool have_same_dimensions (
+ const tensor& a,
+ const tensor& b
+ )
+ {
+ return a.num_samples() == b.num_samples() &&
+ a.k() == b.k() &&
+ a.nr() == b.nr() &&
+ a.nc() == b.nc();
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ class resizable_tensor : public tensor
+ {
+ public:
+ resizable_tensor(
+ )
+ {}
+
+ template <typename EXP>
+ resizable_tensor(
+ const matrix_exp<EXP>& item
+ )
+ {
+ set_size(item.nr(), item.nc());
+ *this = item;
+ }
+
+ explicit resizable_tensor(
+ long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+ )
+ {
+ DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0);
+
+ set_size(n_,k_,nr_,nc_);
+ }
+
+ resizable_tensor(const resizable_tensor& item) : _annotation(item.annotation())
+ {
+ copy_size(item);
+ memcpy(*this, item);
+ }
+ resizable_tensor(const tensor& item) : _annotation(item.annotation())
+ {
+ copy_size(item);
+ memcpy(*this, item);
+ }
+
+ resizable_tensor(resizable_tensor&& item) { swap(item); }
+ resizable_tensor& operator=(resizable_tensor&& item) { swap(item); return *this; }
+
+ virtual const float* host() const { return data_instance.host(); }
+ virtual float* host() { return data_instance.host(); }
+ virtual float* host_write_only() { return data_instance.host_write_only(); }
+ virtual const float* device() const { return data_instance.device(); }
+ virtual float* device() { return data_instance.device(); }
+ virtual float* device_write_only() { return data_instance.device_write_only(); }
+
+ virtual const any& annotation() const { return _annotation; }
+ virtual any& annotation() { return _annotation; }
+
+ void clear(
+ )
+ {
+ set_size(0,0,0,0);
+ _annotation.clear();
+ // free underlying memory
+ data_instance.set_size(0);
+ }
+
+ void copy_size (
+ const tensor& item
+ )
+ {
+ set_size(item.num_samples(), item.k(), item.nr(), item.nc());
+ }
+
+ resizable_tensor& operator= (float val)
+ {
+ tensor::operator=(val);
+ return *this;
+ }
+
+ template <typename EXP>
+ resizable_tensor& operator= (
+ const matrix_exp<EXP>& item
+ )
+ {
+ if (!(num_samples() == item.nr() && k()*nr()*nc() == item.nc()))
+ set_size(item.nr(), item.nc());
+ tensor::operator=(item);
+ return *this;
+ }
+
+ void set_size(
+ long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+ )
+ {
+ DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0);
+
+ m_n = n_;
+ m_k = k_;
+ m_nr = nr_;
+ m_nc = nc_;
+ m_size = n_*k_*nr_*nc_;
+ if ((long long)data_instance.size() < m_size)
+ data_instance.set_size(m_size);
+#ifdef DLIB_USE_CUDA
+ cudnn_descriptor.set_size(m_n,m_k,m_nr,m_nc);
+#endif
+ }
+
+
+ resizable_tensor& operator= (const resizable_tensor& item)
+ {
+ resizable_tensor temp(item);
+ temp.swap(*this);
+ return *this;
+ }
+
+ resizable_tensor& operator= (const tensor& item)
+ {
+ resizable_tensor temp(item);
+ temp.swap(*this);
+ return *this;
+ }
+
+
+ void swap(resizable_tensor& item)
+ {
+ std::swap(m_n, item.m_n);
+ std::swap(m_k, item.m_k);
+ std::swap(m_nr, item.m_nr);
+ std::swap(m_nc, item.m_nc);
+ std::swap(m_size, item.m_size);
+ std::swap(data_instance, item.data_instance);
+ std::swap(_annotation, item._annotation);
+#ifdef DLIB_USE_CUDA
+ std::swap(cudnn_descriptor, item.cudnn_descriptor);
+#endif
+ }
+
+#ifdef DLIB_USE_CUDA
+ virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor (
+ ) const { return cudnn_descriptor; }
+#endif
+
+ private:
+
+#ifdef DLIB_USE_CUDA
+ cuda::tensor_descriptor cudnn_descriptor;
+#endif
+
+ gpu_data data_instance;
+ any _annotation;
+ virtual gpu_data& data() { return data_instance; }
+ virtual const gpu_data& data() const { return data_instance; }
+ };
+
+ inline void serialize(const tensor& item, std::ostream& out)
+ {
+ int version = 2;
+ serialize(version, out);
+ serialize(item.num_samples(), out);
+ serialize(item.k(), out);
+ serialize(item.nr(), out);
+ serialize(item.nc(), out);
+ byte_orderer bo;
+ auto sbuf = out.rdbuf();
+ for (auto d : item)
+ {
+            // Write out our data as 4 byte little endian IEEE floats rather than using
+            // dlib's default float serialization. We do this because it results in more
+            // compact outputs. It's slightly less portable, but it seems doubtful that
+            // any CUDA enabled platform isn't going to use IEEE floats. If such a
+            // platform is ever encountered we can update the serialization code here to
+            // handle it.
+ bo.host_to_little(d);
+ static_assert(sizeof(d)==4, "This serialization code assumes we are writing 4 byte floats");
+ sbuf->sputn((char*)&d, sizeof(d));
+ }
+ }
+
+ inline void deserialize(resizable_tensor& item, std::istream& in)
+ {
+ int version;
+ deserialize(version, in);
+ if (version != 2)
+ throw serialization_error("Unexpected version found while deserializing dlib::resizable_tensor.");
+
+ long long num_samples=0, k=0, nr=0, nc=0;
+ deserialize(num_samples, in);
+ deserialize(k, in);
+ deserialize(nr, in);
+ deserialize(nc, in);
+ item.set_size(num_samples, k, nr, nc);
+ byte_orderer bo;
+ auto sbuf = in.rdbuf();
+ for (auto& d : item)
+ {
+            static_assert(sizeof(d)==4, "This serialization code assumes we are reading 4 byte floats");
+ if (sbuf->sgetn((char*)&d,sizeof(d)) != sizeof(d))
+ {
+ in.setstate(std::ios::badbit);
+ throw serialization_error("Error reading data while deserializing dlib::resizable_tensor.");
+ }
+ bo.little_to_host(d);
+ }
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ inline double dot(
+ const tensor& a,
+ const tensor& b
+ )
+ {
+ DLIB_CASSERT(a.size() == b.size());
+ const float* da = a.host();
+ const float* db = b.host();
+ double sum = 0;
+ for (size_t i = 0; i < a.size(); ++i)
+ sum += da[i]*db[i];
+ return sum;
+ }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ class alias_tensor_instance : public tensor
+ {
+ alias_tensor_instance(
+ ) : data_instance(0), _annotation(0), data_offset(0) {}
+
+ public:
+ friend class alias_tensor;
+ friend class alias_tensor_const_instance;
+
+ alias_tensor_instance& operator= (float val)
+ {
+ tensor::operator=(val);
+ return *this;
+ }
+
+ template <typename EXP>
+ alias_tensor_instance& operator= (const matrix_exp<EXP>& item)
+ {
+ tensor::operator=(item);
+ return *this;
+ }
+
+ virtual const float* host() const { return data_instance->host()+data_offset; }
+ virtual float* host() { return data_instance->host()+data_offset; }
+ virtual float* host_write_only() { return data_instance->host()+data_offset; }
+ virtual const float* device() const { return data_instance->device()+data_offset; }
+ virtual float* device() { return data_instance->device()+data_offset; }
+ virtual float* device_write_only() { return data_instance->device()+data_offset; }
+
+ virtual const any& annotation() const { return *_annotation; }
+ virtual any& annotation() { return *_annotation; }
+
+#ifdef DLIB_USE_CUDA
+ virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor (
+ ) const { return *cudnn_descriptor; }
+#endif
+ private:
+
+ virtual size_t get_alias_offset() const { return data_offset; }
+
+#ifdef DLIB_USE_CUDA
+ std::shared_ptr<cuda::tensor_descriptor> cudnn_descriptor;
+#endif
+ gpu_data* data_instance;
+ any* _annotation;
+ size_t data_offset;
+ virtual gpu_data& data() { return *data_instance; }
+ virtual const gpu_data& data() const { return *data_instance; }
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ class alias_tensor_const_instance
+ {
+ public:
+ const tensor& get() const { return inst; }
+ operator const tensor& () { return inst; }
+
+ alias_tensor_const_instance(const alias_tensor_instance& item) : inst(item) {}
+
+ private:
+ alias_tensor_instance inst;
+
+ friend class alias_tensor;
+ alias_tensor_const_instance() {}
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ class alias_tensor
+ {
+ public:
+
+ alias_tensor (
+ ) {}
+
+ alias_tensor (
+ long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+ )
+ {
+ DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0);
+
+ inst.m_n = n_;
+ inst.m_k = k_;
+ inst.m_nr = nr_;
+ inst.m_nc = nc_;
+ inst.m_size = n_*k_*nr_*nc_;
+ }
+
+ long long num_samples(
+ ) const { return inst.m_n; }
+
+ long long k(
+ ) const { return inst.m_k; }
+
+ long long nr(
+ ) const { return inst.m_nr; }
+
+ long long nc(
+ ) const { return inst.m_nc; }
+
+ size_t size(
+ ) const { return inst.m_size; }
+
+ alias_tensor_instance operator() (
+ tensor& t,
+ size_t offset = 0
+ ) const
+ {
+ DLIB_CASSERT(offset+size() <= t.size(),
+ "offset: "<<offset <<"\n"<<
+ "size(): "<<size() <<"\n"<<
+ "t.size(): "<<t.size() <<"\n");
+
+#ifdef DLIB_USE_CUDA
+ if (!inst.cudnn_descriptor)
+ {
+ inst.cudnn_descriptor = std::make_shared<cuda::tensor_descriptor>();
+ inst.cudnn_descriptor->set_size(inst.m_n, inst.m_k, inst.m_nr, inst.m_nc);
+ }
+#endif
+ inst.data_instance = &t.data();
+ inst._annotation = &t.annotation();
+ // Note that t might already be an aliasing tensor so we need to take that into
+ // account.
+ inst.data_offset = t.get_alias_offset()+offset;
+ return inst;
+ }
+
+ alias_tensor_const_instance operator() (
+ const tensor& t,
+ size_t offset = 0
+ ) const
+ {
+ alias_tensor_const_instance temp;
+ temp.inst = (*this)(const_cast<tensor&>(t),offset);
+ return temp;
+ }
+
+ private:
+ mutable alias_tensor_instance inst;
+ };
+
+ inline void serialize(const alias_tensor& item, std::ostream& out)
+ {
+ int version = 1;
+ serialize(version, out);
+ serialize(item.num_samples(), out);
+ serialize(item.k(), out);
+ serialize(item.nr(), out);
+ serialize(item.nc(), out);
+ }
+
+ inline void deserialize(alias_tensor& item, std::istream& in)
+ {
+ int version = 0;
+ deserialize(version, in);
+ if (version != 1)
+ throw serialization_error("Unexpected version found while deserializing dlib::alias_tensor.");
+ long long num_samples, k, nr, nc;
+ deserialize(num_samples, in);
+ deserialize(k, in);
+ deserialize(nr, in);
+ deserialize(nc, in);
+ item = alias_tensor(num_samples, k, nr, nc);
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_TENSOR_H_
+
diff --git a/ml/dlib/dlib/dnn/tensor_abstract.h b/ml/dlib/dlib/dnn/tensor_abstract.h
new file mode 100644
index 000000000..73a9fff77
--- /dev/null
+++ b/ml/dlib/dlib/dnn/tensor_abstract.h
@@ -0,0 +1,727 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_DNn_TENSOR_ABSTRACT_H_
+#ifdef DLIB_DNn_TENSOR_ABSTRACT_H_
+
+#include "../matrix.h"
+#include "../any/any_abstract.h"
+
+namespace dlib
+{
+// ----------------------------------------------------------------------------------------
+
+ class tensor
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object represents a 4D array of float values, all stored contiguously
+ in memory. Importantly, it keeps two copies of the floats, one on the host
+ CPU side and another on the GPU device side. It automatically performs the
+ necessary host/device transfers to keep these two copies of the data in
+ sync.
+
+ All transfers to the device happen asynchronously with respect to the
+ default CUDA stream so that CUDA kernel computations can overlap with data
+ transfers. However, any transfers from the device to the host happen
+ synchronously in the default CUDA stream. Therefore, you should perform
+ all your CUDA kernel launches on the default stream so that transfers back
+ to the host do not happen before the relevant computations have completed.
+
+ If DLIB_USE_CUDA is not #defined then this object will not use CUDA at all.
+ Instead, it will simply store one host side memory block of floats.
+
+ Finally, the convention in dlib code is to interpret the tensor as a set of
+ num_samples() 3D arrays, each of dimension k() by nr() by nc(). Also,
+ while this class does not specify a memory layout, the convention is to
+ assume that indexing into an element at coordinates (sample,k,r,c) can be
+ accomplished via:
+ host()[((sample*t.k() + k)*t.nr() + r)*t.nc() + c]
+
+ THREAD SAFETY
+ Instances of this object are not thread-safe. So don't touch one from
+ multiple threads at the same time.
+ !*/
+
+ public:
+
+ virtual ~tensor();
+
+ long long num_samples(
+ ) const;
+ /*!
+ ensures
+ - returns the number of 3D arrays of dimension k() by nr() by nc() there
+ are in this object.
+ !*/
+
+ long long k(
+ ) const;
+ /*!
+ ensures
+ - returns the k dimension of this tensor. Generally, we think of a tensor
+ as containing num_samples() images of nr() by nc() rows and columns, each
+ with k() channels.
+ !*/
+
+ long long nr(
+ ) const;
+ /*!
+ ensures
+ - returns the number of rows in this tensor.
+ !*/
+
+ long long nc(
+ ) const;
+ /*!
+ ensures
+ - returns the number of columns in this tensor.
+ !*/
+
+ size_t size(
+ ) const;
+ /*!
+ ensures
+ - returns num_samples()*k()*nr()*nc()
+ (i.e. the total number of floats in this tensor)
+ !*/
+
+ void async_copy_to_device(
+ ) const;
+ /*!
+ ensures
+ - This function does not block.
+ - if (the host version of the data is newer than the device's copy) then
+ - Begins asynchronously copying host data to the device.
+ - A call to device() that happens before the transfer completes will
+ block until the transfer is complete. That is, it is safe to call
+ async_copy_to_device() and then immediately call device().
+ !*/
+
+ typedef float* iterator;
+ typedef const float* const_iterator;
+ iterator begin() { return host(); }
+ const_iterator begin() const { return host(); }
+ iterator end() { return host()+size(); }
+ const_iterator end() const { return host()+size(); }
+ /*!
+ ensures
+ - makes a tensor iterable just like the STL containers.
+ !*/
+
+ virtual const float* host(
+ ) const = 0;
+ /*!
+ ensures
+ - returns a pointer to the host memory block of size() contiguous float
+ values or nullptr if size()==0.
+ - if (the host's copy of the data is out of date) then
+ - copies the data from the device to the host, while this is happening
+ the call to host() blocks.
+ !*/
+
+ virtual float* host(
+ ) = 0;
+ /*!
+ ensures
+ - returns a pointer to the host memory block of size() contiguous float
+ values or nullptr if size()==0.
+ - if (the host's copy of the data is out of date) then
+ - copies the data from the device to the host, while this is happening
+ the call to host() blocks.
+ - Marks the device side data as out of date so that the next call to
+ device() will perform a host to device transfer. If you want to begin
+ the transfer immediately then you can call async_copy_to_device() after
+ calling host().
+ !*/
+
+ virtual float* host_write_only(
+ ) = 0;
+ /*!
+ ensures
+ - This function returns the same pointer as host(), except that it never
+ performs a device to host memory copy. Instead, it immediately marks the
+ device side data as out of date, effectively discarding it. Therefore,
+ the values in the data pointed to by host_write_only() are undefined and
+ you should only call host_write_only() if you are going to assign to
+ every memory location in the returned memory block.
+ !*/
+
+ virtual const float* device(
+ ) const = 0;
+ /*!
+ requires
+ - DLIB_USE_CUDA is #defined
+ ensures
+ - returns a pointer to the device memory block of size() contiguous float
+ values or nullptr if size()==0.
+ - if (the device's copy of the data is out of date) then
+ - copies the data from the host to the device, while this is happening
+ the call to device() blocks.
+ !*/
+
+ virtual float* device(
+ ) = 0;
+ /*!
+ requires
+ - DLIB_USE_CUDA is #defined
+ ensures
+ - returns a pointer to the device memory block of size() contiguous float
+ values or nullptr if size()==0.
+ - if (the device's copy of the data is out of date) then
+ - copies the data from the host to the device, while this is happening
+ the call to device() blocks.
+ - Marks the host side data as out of date so that the next call to
+ host() will perform a device to host transfer.
+ !*/
+
+ virtual float* device_write_only(
+ ) = 0;
+ /*!
+ requires
+ - DLIB_USE_CUDA is #defined
+ ensures
+ - This function returns the same pointer as device(), except that it never
+ performs a host to device memory copy. Instead, it immediately marks the
+ host side data as out of date, effectively discarding it. Therefore, the
+ values in the data pointed to by device_write_only() are undefined and
+ you should only call device_write_only() if you are going to assign to
+ every memory location in the returned memory block.
+ !*/
+
+ virtual const any& annotation(
+ ) const = 0;
+ /*!
+ ensures
+ - returns a const reference to the any object in this tensor. The any
+ object can be used to store any additional annotation you like in a
+ tensor. However, it should be noted that the annotation() is ignored by
+ serialize() and therefore not saved when a tensor is serialized.
+ !*/
+
+ virtual any& annotation(
+ ) = 0;
+ /*!
+ ensures
+ - returns a non-const reference to the any object in this tensor. The any
+ object can be used to store any additional annotation you like in a
+ tensor. However, it should be noted that the annotation() is ignored by
+ serialize() and therefore not saved when a tensor is serialized.
+ !*/
+
+ int device_id(
+ ) const;
+ /*!
+ ensures
+ - returns the ID of the CUDA device that allocated this memory. I.e. the
+ number returned by cudaGetDevice() when the memory was allocated.
+ - If CUDA is not being used then this function always returns 0.
+ !*/
+
+ tensor& operator= (
+ float val
+ );
+ /*!
+ ensures
+ - sets all elements of this tensor equal to val.
+ - returns *this
+ !*/
+
+ tensor& operator*= (
+ float val
+ );
+ /*!
+ ensures
+ - pointwise multiplies all elements of *this tensor with val.
+ - returns *this
+ !*/
+
+ tensor& operator/= (
+ float val
+ );
+ /*!
+            ensures
+                - pointwise divides all elements of *this tensor by val.
+                - returns *this
+ !*/
+
+ template <typename EXP>
+ tensor& operator= (
+ const matrix_exp<EXP>& item
+ );
+ /*!
+ requires
+ - num_samples() == item.nr()
+ - k()*nr()*nc() == item.nc()
+ - item contains float values
+ ensures
+ - Assigns item to *this tensor by performing:
+ set_ptrm(host(), num_samples(), k()*nr()*nc()) = item;
+ !*/
+
+ template <typename EXP>
+ tensor& operator+= (
+ const matrix_exp<EXP>& item
+ );
+ /*!
+ requires
+ - num_samples() == item.nr()
+ - k()*nr()*nc() == item.nc()
+ - item contains float values
+ ensures
+ - Adds item to *this tensor by performing:
+ set_ptrm(host(), num_samples(), k()*nr()*nc()) += item;
+ !*/
+
+ template <typename EXP>
+ tensor& operator-= (
+ const matrix_exp<EXP>& item
+ );
+ /*!
+ requires
+ - num_samples() == item.nr()
+ - k()*nr()*nc() == item.nc()
+ - item contains float values
+ ensures
+ - Subtracts item from *this tensor by performing:
+ set_ptrm(host(), num_samples(), k()*nr()*nc()) -= item;
+ !*/
+
+ template <typename EXP>
+ void set_sample (
+ unsigned long long idx,
+ const matrix_exp<EXP>& item
+ );
+ /*!
+ requires
+ - idx < num_samples()
+ - k()*nr()*nc() == item.size()
+ - item contains float values
+ ensures
+ - Assigns item to the idx'th sample in *this by performing:
+ set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) = item;
+ !*/
+
+
+ template <typename EXP>
+ void add_to_sample (
+ unsigned long long idx,
+ const matrix_exp<EXP>& item
+ );
+ /*!
+ requires
+ - idx < num_samples()
+ - k()*nr()*nc() == item.size()
+ - item contains float values
+ ensures
+ - Adds item to the idx'th sample in *this by performing:
+ set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) += item;
+ !*/
+
+ protected:
+
+ // You can't move or copy another tensor into *this since that might modify the
+ // tensor's dimensions. If you want to do that sort of thing then use a
+ // resizable_tensor.
+ tensor(const tensor& item);
+ tensor& operator= (const tensor& item);
+ tensor(tensor&& item);
+ tensor& operator=(tensor&& item);
+ };
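+
+    // An illustrative, editorial sketch (not part of the original header): reading the
+    // element at coordinates (sample,k,r,c) using the memory layout convention
+    // described in the class documentation above.
+    inline float example_tensor_element (
+        const tensor& t,
+        long long sample,
+        long long k,
+        long long r,
+        long long c
+    )
+    {
+        return t.host()[((sample*t.k() + k)*t.nr() + r)*t.nc() + c];
+    }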
+
+// ----------------------------------------------------------------------------------------
+
+ void memcpy (
+ tensor& dest,
+ const tensor& src
+ );
+ /*!
+ requires
+ - dest.size() == src.size()
+ ensures
+ - Copies the data in src to dest. If the device data is current on both src
+ and dest then the copy will happen entirely on the device side.
+ - It doesn't matter what GPU device is selected by cudaSetDevice(). You can
+ always copy tensor objects to and from each other regardless.
+ - This function blocks until the copy has completed.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ bool is_vector (
+ const tensor& t
+ );
+ /*!
+ ensures
+ - returns true if and only if one of the following is true:
+ - t.size() == t.num_samples()
+ - t.size() == t.k()
+ - t.size() == t.nr()
+ - t.size() == t.nc()
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ const matrix_exp mat (
+ const tensor& t,
+ long long nr,
+ long long nc
+ );
+ /*!
+ requires
+ - nr >= 0
+ - nc >= 0
+ - nr*nc == t.size()
+        ensures
+            - returns a matrix M such that:
+                - M.nr() == nr
+                - M.nc() == nc
+ - for all valid r and c:
+ M(r,c) == t.host()[r*nc + c]
+ (i.e. the tensor is interpreted as a matrix laid out in memory
+ in row major order)
+ !*/
+
+ const matrix_exp mat (
+ const tensor& t
+ );
+ /*!
+ ensures
+ - if (t.size() != 0) then
+ - returns mat(t, t.num_samples(), t.size()/t.num_samples())
+ - else
+ - returns an empty matrix.
+ !*/
+
+ const matrix_exp image_plane (
+ const tensor& t,
+ long long sample = 0,
+ long long k = 0
+ );
+ /*!
+ requires
+ - t.size() != 0
+ - 0 <= sample < t.num_samples()
+ - 0 <= k < t.k()
+ ensures
+ - returns the k-th image plane from the sample-th image in t. That is,
+ returns a matrix M such that:
+ - M contains float valued elements.
+ - M.nr() == t.nr()
+ - M.nc() == t.nc()
+ - for all valid r and c:
+ - M(r,c) == t.host()[((sample*t.k() + k)*t.nr() + r)*t.nc() + c]
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ bool have_same_dimensions (
+ const tensor& a,
+ const tensor& b
+ );
+ /*!
+        ensures
+            - returns true if and only if all of the following are satisfied:
+ - a.num_samples() == b.num_samples()
+ - a.k() == b.k()
+ - a.nr() == b.nr()
+ - a.nc() == b.nc()
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ class resizable_tensor : public tensor
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object is just a tensor with the additional ability to be resized.
+ !*/
+
+ public:
+ resizable_tensor(
+ );
+ /*!
+ ensures
+ - #size() == 0
+ - #num_samples() == 0
+ - #k() == 0
+ - #nr() == 0
+ - #nc() == 0
+ - #capacity() == 0
+ !*/
+
+ template <typename EXP>
+ resizable_tensor(
+ const matrix_exp<EXP>& item
+ );
+ /*!
+ requires
+ - item contains float values
+ ensures
+ - #num_samples() == item.nr()
+ - #k() == item.nc()
+ - #nr() == 1
+ - #nc() == 1
+ - Assigns item to *this tensor by performing:
+ set_ptrm(host(), num_samples(), k()*nr()*nc()) = item;
+ - #capacity() == size()
+ !*/
+
+ explicit resizable_tensor(
+ long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+ );
+ /*!
+ requires
+ - n_ >= 0
+ - k_ >= 0
+ - nr_ >= 0
+ - nc_ >= 0
+ ensures
+ - #size() == n_*k_*nr_*nc_
+ - #num_samples() == n_
+ - #k() == k_
+ - #nr() == nr_
+ - #nc() == nc_
+ - #capacity() == size()
+ !*/
+
+ // This object is copyable and movable
+ resizable_tensor(const resizable_tensor&) = default;
+ resizable_tensor(resizable_tensor&&) = default;
+ resizable_tensor& operator= (const resizable_tensor&) = default;
+ resizable_tensor& operator= (resizable_tensor&&) = default;
+
+ size_t capacity (
+ ) const;
+ /*!
+ ensures
+ - returns the total number of floats allocated. This might be different
+ from the size() since calls to set_size() that make a tensor smaller
+ don't trigger reallocations. They simply adjust the nominal dimensions
+ while keeping the same allocated memory block. This makes calls to
+ set_size() very fast. If you need to deallocate a tensor then use
+ clear().
+ !*/
+
+ void clear(
+ );
+ /*!
+ ensures
+ - #size() == 0
+ - #num_samples() == 0
+ - #k() == 0
+ - #nr() == 0
+ - #nc() == 0
+ - #annotation().is_empty() == true
+ - #capacity() == 0
+ !*/
+
+ void copy_size (
+ const tensor& item
+ );
+ /*!
+ ensures
+ - resizes *this so that: have_same_dimensions(#*this, item)==true
+ !*/
+
+ void set_size(
+ long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+ );
+ /*!
+ requires
+ - n_ >= 0
+ - k_ >= 0
+ - nr_ >= 0
+ - nc_ >= 0
+ ensures
+ - #size() == n_*k_*nr_*nc_
+ - #num_samples() == n_
+ - #k() == k_
+ - #nr() == nr_
+ - #nc() == nc_
+ - #capacity() == max(#size(), capacity())
+ (i.e. capacity() never goes down when calling set_size().)
+ !*/
+
+ template <typename EXP>
+ resizable_tensor& operator= (
+ const matrix_exp<EXP>& item
+ );
+ /*!
+ requires
+ - item contains float values
+ ensures
+ - if (num_samples() == item.nr() && k()*nr()*nc() == item.nc()) then
+ - the dimensions of this tensor are not changed
+ - else
+ - #num_samples() == item.nr()
+ - #k() == item.nc()
+ - #nr() == 1
+ - #nc() == 1
+ - Assigns item to *this tensor by performing:
+ set_ptrm(host(), num_samples(), k()*nr()*nc()) = item;
+ !*/
+ };
+
+ void serialize(const tensor& item, std::ostream& out);
+ void deserialize(resizable_tensor& item, std::istream& in);
+ /*!
+ provides serialization support for tensor and resizable_tensor. Note that you can
+        serialize to/from any combination of tensor and resizable_tensor objects.
+ !*/
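+
+    // An illustrative, editorial sketch (not part of the original header): set_size()
+    // never shrinks the allocation, so shrinking and then re-growing within the
+    // original capacity does not reallocate. The dimensions below are arbitrary.
+    inline void example_capacity_behavior ()
+    {
+        resizable_tensor t(4,3,2,2);   // size() == 48 and capacity() == 48
+        t.set_size(2,3,2,2);           // size() == 24 but capacity() is still 48
+        t.set_size(4,3,2,2);           // back to size() == 48, no reallocation
+        t.clear();                     // size() == 0 and capacity() == 0
+    }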
+
+// ----------------------------------------------------------------------------------------
+
+ double dot(
+ const tensor& a,
+ const tensor& b
+ );
+ /*!
+ requires
+ - a.size() == b.size()
+ ensures
+ - returns the dot product between a and b when they are both treated as
+ a.size() dimensional vectors. That is, this function pointwise multiplies
+ the vectors together, then sums the result and returns it.
+
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ class alias_tensor_instance : public tensor
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object is a tensor that aliases another tensor. That is, it doesn't
+ have its own block of memory but instead simply holds pointers to the
+ memory of another tensor object. It therefore allows you to efficiently
+ break a tensor into pieces and pass those pieces into functions.
+
+ An alias_tensor_instance doesn't own the resources it points to in any sense.
+ So it is important to make sure that the underlying owning tensor doesn't get
+ destructed before any alias tensors which point to it are destructed.
+ !*/
+
+ // You can't default initialize this object. You can only get instances of it from
+ // alias_tensor::operator().
+ alias_tensor_instance(
+ );
+ };
+
+ class alias_tensor_const_instance
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is essentially a const version of alias_tensor_instance and therefore
+ represents a tensor. However, due to the mechanics of C++, this object
+ can't inherit from tensor. So instead it provides a get() and an implicit
+ conversion to const tensor.
+ !*/
+
+ public:
+
+ // non-const alias tensors are convertible to const ones.
+ alias_tensor_const_instance(const alias_tensor_instance& item);
+
+ // Methods that cast the alias to a tensor.
+ const tensor& get() const;
+ operator const tensor& ();
+
+ private:
+ // You can't default initialize this object. You can only get instances of it from
+ // alias_tensor::operator().
+ alias_tensor_const_instance();
+ };
+
+ class alias_tensor
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is a tool for creating tensor objects that alias other tensor objects.
+ That is, it allows you to make a tensor that references the memory space of
+ another tensor object rather than owning its own memory. This allows you
+ to do things like interpret a single tensor in different ways or even as a
+ group of multiple tensors.
+ !*/
+ public:
+
+ alias_tensor (
+ );
+ /*!
+ ensures
+ - #size() == 0
+ - #num_samples() == 0
+ - #k() == 0
+ - #nr() == 0
+ - #nc() == 0
+ !*/
+
+ alias_tensor (
+ long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+ );
+ /*!
+ requires
+ - n_ >= 0
+ - k_ >= 0
+ - nr_ >= 0
+ - nc_ >= 0
+ ensures
+ - #size() == n_*k_*nr_*nc_
+ - #num_samples() == n_
+ - #k() == k_
+ - #nr() == nr_
+ - #nc() == nc_
+ !*/
+
+ long long num_samples() const;
+ long long k() const;
+ long long nr() const;
+ long long nc() const;
+ size_t size() const;
+
+ alias_tensor_instance operator() (
+ tensor& t,
+ size_t offset = 0
+ ) const;
+ /*!
+ requires
+ - offset+size() <= t.size()
+ ensures
+ - Returns a tensor that simply aliases the elements of t beginning with t's
+ offset'th element. Specifically, this function returns an aliasing
+ tensor T such that:
+ - T.size() == size()
+ - T.num_samples() == num_samples()
+ - T.k() == k()
+ - T.nr() == nr()
+ - T.nc() == nc()
+ - T.host() == t.host()+offset
+ - T.device() == t.device()+offset
+ - &T.annotation() == &t.annotation()
+ !*/
+
+ alias_tensor_const_instance operator() (
+ const tensor& t,
+ size_t offset = 0
+ ) const;
+ /*!
+ requires
+ - offset+size() <= t.size()
+ ensures
+ - This function is identical to the above version of operator() except that
+ it takes and returns const tensors instead of non-const tensors.
+ !*/
+ };
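+
+    // Usage sketch (illustrative only): splitting a tensor into per-sample views
+    // without copying any memory.  Given some tensor t, an alias_tensor shaped
+    // like a single sample of t can be stepped through t's memory block:
+    //
+    //     alias_tensor a(1, t.k(), t.nr(), t.nc());
+    //     for (long long i = 0; i < t.num_samples(); ++i)
+    //     {
+    //         auto sample = a(t, a.size()*i);  // aliases the i-th sample of t
+    //         // ... use sample like any other tensor ...
+    //     }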
+
+ void serialize(const alias_tensor& item, std::ostream& out);
+ void deserialize(alias_tensor& item, std::istream& in);
+ /*!
+ provides serialization support for alias_tensor.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_TENSOR_ABSTRACT_H_
+
+
diff --git a/ml/dlib/dlib/dnn/tensor_tools.cpp b/ml/dlib/dlib/dnn/tensor_tools.cpp
new file mode 100644
index 000000000..c0f7fd69d
--- /dev/null
+++ b/ml/dlib/dlib/dnn/tensor_tools.cpp
@@ -0,0 +1,985 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_TeNSOR_TOOLS_CPP_
+#define DLIB_TeNSOR_TOOLS_CPP_
+
+#include "tensor_tools.h"
+#include "../string.h"
+#include <atomic>
+
+namespace dlib
+{
+ namespace
+ {
+ std::atomic<bool>& dnn_prefer_fastest_algo (
+ )
+ {
+ static std::atomic<bool> var(true);
+ return var;
+ }
+ }
+
+ bool dnn_prefer_fastest_algorithms (
+ )
+ {
+ return dnn_prefer_fastest_algo();
+ }
+
+ void set_dnn_prefer_fastest_algorithms(
+ )
+ {
+ dnn_prefer_fastest_algo() = true;
+ }
+
+ void set_dnn_prefer_smallest_algorithms(
+ )
+ {
+ dnn_prefer_fastest_algo() = false;
+ }
+}
+
+namespace dlib { namespace tt
+{
+
+// ----------------------------------------------------------------------------------------
+
+ void inverse_norms (
+ resizable_tensor& invnorms,
+ const tensor& data,
+ const double eps
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::inverse_norms(invnorms, data, eps);
+#else
+ invnorms = reciprocal(sqrt(sum_cols(squared(mat(data))) + eps));
+#endif
+ }
+
+ void dot_prods (
+ resizable_tensor& out,
+ const tensor& lhs,
+ const tensor& rhs
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::dot_prods(out, lhs, rhs);
+#else
+ out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+#endif
+ }
+
+ void dot_prods (
+ bool add_to,
+ tensor& out,
+ const tensor& lhs,
+ const tensor& rhs
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::dot_prods(add_to, out, lhs, rhs);
+#else
+ if (add_to)
+ out += sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+ else
+ out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+#endif
+ }
+
+ void scale_columns (
+ tensor& out,
+ const tensor& m,
+ const tensor& v
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(out,m));
+ DLIB_CASSERT(is_vector(v));
+ if (m.size() == 0 && v.size() == 0)
+ return;
+ DLIB_CASSERT(m.size() != 0);
+ DLIB_CASSERT(m.size()/m.num_samples() == v.size());
+
+#ifdef DLIB_USE_CUDA
+ cuda::scale_columns(out, m, v);
+#else
+ DLIB_CASSERT(false, "shouldn't be called right now");
+ out = scale_columns(mat(m), mat(v));
+#endif
+ }
+
+ void scale_rows (
+ tensor& out,
+ const tensor& m,
+ const tensor& v
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(out,m));
+ DLIB_CASSERT(is_vector(v));
+ if (m.size() == 0 && v.size() == 0)
+ return;
+ DLIB_CASSERT(m.size() != 0);
+ DLIB_CASSERT(m.num_samples() == v.size());
+
+#ifdef DLIB_USE_CUDA
+ cuda::scale_rows(out, m, v);
+#else
+ out = scale_rows(mat(m), mat(v));
+#endif
+ }
+
+ void scale_rows2 (
+ float beta,
+ tensor& out,
+ const tensor& m1,
+ const tensor& m2,
+ const tensor& v1,
+ const tensor& v2
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(out,m1));
+ DLIB_CASSERT(have_same_dimensions(out,m2));
+ DLIB_CASSERT(have_same_dimensions(v1,v2));
+ DLIB_CASSERT(is_vector(mat(v1)));
+ DLIB_CASSERT(v1.size() == m1.num_samples());
+
+#ifdef DLIB_USE_CUDA
+ cuda::scale_rows2(beta, out, m1, m2, v1, v2);
+#else
+ if (beta == 0)
+ out = scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2));
+ else
+ out = beta*mat(out) + scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2));
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void exp (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(dest.size() == src.size());
+
+#ifdef DLIB_USE_CUDA
+ cuda::exp(dest,src);
+#else
+ dest = exp(mat(src));
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void log (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(dest.size() == src.size());
+
+#ifdef DLIB_USE_CUDA
+ cuda::log(dest,src);
+#else
+ dest = log(mat(src));
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void log10 (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(dest.size() == src.size());
+
+#ifdef DLIB_USE_CUDA
+ cuda::log10(dest,src);
+#else
+ dest = log10(mat(src));
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void gemm (
+ float beta,
+ tensor& dest,
+ float alpha,
+ const tensor& lhs,
+ bool trans_lhs,
+ const tensor& rhs,
+ bool trans_rhs
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::gemm(beta, dest, alpha, lhs, trans_lhs, rhs, trans_rhs);
+#else
+ if (beta != 0)
+ {
+ if (trans_lhs && trans_rhs)
+ dest = alpha*trans(mat(lhs))*trans(mat(rhs)) + beta*mat(dest);
+ else if (!trans_lhs && trans_rhs)
+ dest = alpha*mat(lhs)*trans(mat(rhs)) + beta*mat(dest);
+ else if (trans_lhs && !trans_rhs)
+ dest = alpha*trans(mat(lhs))*mat(rhs) + beta*mat(dest);
+ else
+ dest = alpha*mat(lhs)*mat(rhs) + beta*mat(dest);
+ }
+ else
+ {
+ if (trans_lhs && trans_rhs)
+ dest = alpha*trans(mat(lhs))*trans(mat(rhs));
+ else if (!trans_lhs && trans_rhs)
+ dest = alpha*mat(lhs)*trans(mat(rhs));
+ else if (trans_lhs && !trans_rhs)
+ dest = alpha*trans(mat(lhs))*mat(rhs);
+ else
+ dest = alpha*mat(lhs)*mat(rhs);
+ }
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ tensor_rand::
+ tensor_rand(
+ unsigned long long seed
+ )
+#ifdef DLIB_USE_CUDA
+ :rnd(seed){}
+#else
+ {rnd.set_seed(cast_to_string(seed)); }
+#endif
+
+ void tensor_rand::
+ fill_gaussian (
+ tensor& data,
+ float mean,
+ float stddev
+ )
+ {
+ DLIB_CASSERT(data.size()%2 == 0);
+#ifdef DLIB_USE_CUDA
+ rnd.fill_gaussian(data, mean, stddev);
+#else
+ for (auto& x : data)
+ x = rnd.get_random_gaussian()*stddev + mean;
+#endif
+ }
+
+ void tensor_rand::
+ fill_uniform (
+ tensor& data
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ rnd.fill_uniform(data);
+#else
+ for (auto& x : data)
+ x = rnd.get_random_float();
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ void multiply (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ )
+ {
+ DLIB_CASSERT(dest.k() == src1.k() && src1.k() == src2.k() &&
+ dest.nr() == src1.nr() && src1.nr() == src2.nr() &&
+ dest.nc() == src1.nc() && src1.nc() == src2.nc() );
+ const long MD = std::max(std::max(dest.num_samples(),src1.num_samples()),src2.num_samples());
+ DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) &&
+ (src1.num_samples()==1 || src1.num_samples()==MD) &&
+ (src2.num_samples()==1 || src2.num_samples()==MD) );
+#ifdef DLIB_USE_CUDA
+ cuda::multiply(add_to, dest, src1, src2);
+#else
+ cpu::multiply(add_to, dest, src1, src2);
+#endif
+
+ }
+
+ void scale_channels (
+ bool add_to,
+ tensor& dest,
+ const tensor& src,
+ const tensor& scales
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::scale_channels(add_to, dest, src, scales);
+#else
+ cpu::scale_channels(add_to, dest, src, scales);
+#endif
+ }
+
+ void multiply_conv (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::multiply_conv(add_to, dest, src1, src2);
+#else
+ cpu::multiply_conv(add_to, dest, src1, src2);
+#endif
+ }
+
+ void multiply_zero_padded (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::multiply_zero_padded(add_to, dest, src1, src2);
+#else
+ cpu::multiply_zero_padded(add_to, dest, src1, src2);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const float A,
+ const float B
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::affine_transform(dest,src,A,B);
+#else
+ cpu::affine_transform(dest,src,A,B);
+#endif
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const float A
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::affine_transform(dest,src,A);
+#else
+ cpu::affine_transform(dest,src,A,0);
+#endif
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const float A,
+ const float B,
+ const float C
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::affine_transform(dest,src1,src2,A,B,C);
+#else
+ cpu::affine_transform(dest,src1,src2,A,B,C);
+#endif
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const float A,
+ const float B
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::affine_transform(dest,src1,src2,A,B);
+#else
+ cpu::affine_transform(dest,src1,src2,A,B,0);
+#endif
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ const float A,
+ const float B,
+ const float C,
+ const float D
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::affine_transform(dest,src1,src2,src3,A,B,C,D);
+#else
+ cpu::affine_transform(dest,src1,src2,src3,A,B,C,D);
+#endif
+ }
+
+ void affine_transform_range(
+ size_t begin,
+ size_t end,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ const float A,
+ const float B,
+ const float C
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
+#else
+ cpu::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
+#endif
+ }
+
+ void affine_transform(
+ const rectangle& rect,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ float A,
+ float B,
+ float C
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::affine_transform(rect, dest,src1,src2,src3,A,B,C);
+#else
+ cpu::affine_transform(rect, dest,src1,src2,src3,A,B,C);
+#endif
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ const float A,
+ const float B,
+ const float C
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
+#else
+ cpu::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const tensor& A,
+ const tensor& B
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::affine_transform(dest,src,A,B);
+#else
+ cpu::affine_transform(dest,src,A,B);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void affine_transform_conv(
+ tensor& dest,
+ const tensor& src,
+ const tensor& A,
+ const tensor& B
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::affine_transform_conv(dest,src,A,B);
+#else
+ cpu::affine_transform_conv(dest,src,A,B);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void compute_adam_update (
+ size_t begin,
+ size_t end,
+ tensor& s,
+ tensor& m,
+ tensor& v,
+ const float t,
+ const float learning_rate,
+ const float weight_decay,
+ const float momentum1,
+ const float momentum2,
+ const tensor& params,
+ const tensor& params_grad
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
+ momentum2, params, params_grad);
+#else
+ cpu::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
+ momentum2, params, params_grad);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void batch_normalize_inference (
+ const double eps,
+ resizable_tensor& dest,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta,
+ const tensor& running_means,
+ const tensor& running_variances
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
+#else
+ cpu::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
+#endif
+ }
+
+ void batch_normalize (
+ const double eps,
+ resizable_tensor& dest,
+ resizable_tensor& means,
+ resizable_tensor& vars,
+ const double averaging_factor,
+ resizable_tensor& running_means,
+ resizable_tensor& running_variances,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+#else
+ cpu::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+#endif
+ }
+
+ void batch_normalize_gradient (
+ const double eps,
+ const tensor& gradient_input,
+ const tensor& means,
+ const tensor& invstds,
+ const tensor& src,
+ const tensor& gamma,
+ tensor& src_grad,
+ tensor& gamma_grad,
+ tensor& beta_grad
+ )
+ {
+
+#ifdef DLIB_USE_CUDA
+ cuda::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+#else
+ cpu::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void batch_normalize_conv_inference (
+ const double eps,
+ resizable_tensor& dest,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta,
+ const tensor& running_means,
+ const tensor& running_variances
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
+#else
+ cpu::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
+#endif
+ }
+
+ void batch_normalize_conv (
+ const double eps,
+ resizable_tensor& dest,
+ resizable_tensor& means,
+ resizable_tensor& vars,
+ const double averaging_factor,
+ resizable_tensor& running_means,
+ resizable_tensor& running_variances,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+#else
+ cpu::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+#endif
+ }
+
+ void batch_normalize_conv_gradient (
+ const double eps,
+ const tensor& gradient_input,
+ const tensor& means,
+ const tensor& invstds,
+ const tensor& src,
+ const tensor& gamma,
+ tensor& src_grad,
+ tensor& gamma_grad,
+ tensor& beta_grad
+ )
+ {
+
+#ifdef DLIB_USE_CUDA
+ cuda::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+#else
+ cpu::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void threshold (
+ tensor& data,
+ float thresh
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::threshold(data,thresh);
+#else
+ cpu::threshold(data,thresh);
+#endif
+ }
+
+ void dot (
+ const tensor& a,
+ const tensor& b,
+ tensor& result,
+ size_t idx
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::dot(a,b,result,idx);
+#else
+ cpu::dot(a,b,result,idx);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void add(
+ float beta,
+ tensor& dest,
+ float alpha,
+ const tensor& src
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::add(beta,dest,alpha,src);
+#else
+ cpu::add(beta,dest,alpha,src);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void add (
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::add(dest, src1, src2);
+#else
+ cpu::add(dest, src1, src2);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void assign_conv_bias_gradient (
+ tensor& grad,
+ const tensor& gradient_input
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::assign_conv_bias_gradient(grad,gradient_input);
+#else
+ cpu::assign_conv_bias_gradient(grad,gradient_input);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void assign_bias_gradient (
+ tensor& grad,
+ const tensor& gradient_input
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::assign_bias_gradient(grad,gradient_input);
+#else
+ cpu::assign_bias_gradient(grad,gradient_input);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+ void softmax (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::softmax(dest,src);
+#else
+ cpu::softmax(dest,src);
+#endif
+ }
+
+ void softmax_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::softmax_gradient(grad, dest, gradient_input);
+#else
+ cpu::softmax_gradient(grad, dest, gradient_input);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void softmax_all (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::softmax_all(dest,src);
+#else
+ cpu::softmax_all(dest,src);
+#endif
+ }
+
+ void softmax_all_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::softmax_all_gradient(grad, dest, gradient_input);
+#else
+ cpu::softmax_all_gradient(grad, dest, gradient_input);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void sigmoid (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::sigmoid(dest,src);
+#else
+ cpu::sigmoid(dest,src);
+#endif
+ }
+
+ void sigmoid_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::sigmoid_gradient(grad, dest, gradient_input);
+#else
+ cpu::sigmoid_gradient(grad, dest, gradient_input);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void relu (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::relu(dest,src);
+#else
+ cpu::relu(dest,src);
+#endif
+ }
+
+ void relu_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::relu_gradient(grad, dest, gradient_input);
+#else
+ cpu::relu_gradient(grad, dest, gradient_input);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void prelu (
+ tensor& dest,
+ const tensor& src,
+ const tensor& param
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::prelu(dest, src, param);
+#else
+ cpu::prelu(dest, src, param);
+#endif
+ }
+
+ void prelu_gradient (
+ tensor& grad,
+ const tensor& src,
+ const tensor& gradient_input,
+ const tensor& param,
+ tensor& params_grad
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::prelu_gradient(grad, src, gradient_input, param, params_grad);
+#else
+ cpu::prelu_gradient(grad, src, gradient_input, param, params_grad);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void tanh (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::tanh(dest,src);
+#else
+ cpu::tanh(dest,src);
+#endif
+ }
+
+ void tanh_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::tanh_gradient(grad, dest, gradient_input);
+#else
+ cpu::tanh_gradient(grad, dest, gradient_input);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void resize_bilinear (
+ tensor& dest,
+ long dest_row_stride,
+ long dest_channel_stride,
+ const tensor& src,
+ long src_row_stride,
+ long src_channel_stride
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::resize_bilinear(dest,dest_row_stride,dest_channel_stride, src,src_row_stride,src_channel_stride);
+#else
+ cpu::resize_bilinear(dest,dest_row_stride,dest_channel_stride, src,src_row_stride,src_channel_stride);
+#endif
+ }
+
+ void resize_bilinear_gradient (
+ tensor& grad,
+ long grad_row_stride,
+ long grad_channel_stride,
+ const tensor& gradient_input,
+ long gradient_input_row_stride,
+ long gradient_input_channel_stride
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::resize_bilinear_gradient(grad,grad_row_stride,grad_channel_stride, gradient_input,gradient_input_row_stride,gradient_input_channel_stride);
+#else
+ cpu::resize_bilinear_gradient(grad,grad_row_stride,grad_channel_stride, gradient_input,gradient_input_row_stride,gradient_input_channel_stride);
+#endif
+ }
+
+// ------------------------------------------------------------------------------------
+
+ void copy_tensor(
+ bool add_to,
+ tensor& dest,
+ size_t dest_k_offset,
+ const tensor& src,
+ size_t src_k_offset,
+ size_t count_k
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ cuda::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
+#else
+ cpu::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+ void inv::
+ operator() (
+ const tensor& m,
+ resizable_tensor& out
+ )
+ {
+#ifdef DLIB_USE_CUDA
+ finv(m,out);
+#else
+ out = dlib::inv(mat(m));
+#endif
+ }
+
+// ----------------------------------------------------------------------------------------
+
+}}
+
+#endif // DLIB_TeNSOR_TOOLS_CPP_
+
diff --git a/ml/dlib/dlib/dnn/tensor_tools.h b/ml/dlib/dlib/dnn/tensor_tools.h
new file mode 100644
index 000000000..9ba3154e5
--- /dev/null
+++ b/ml/dlib/dlib/dnn/tensor_tools.h
@@ -0,0 +1,1711 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_TeNSOR_TOOLS_H_
+#define DLIB_TeNSOR_TOOLS_H_
+
+#include "tensor.h"
+#include "cudnn_dlibapi.h"
+#include "cublas_dlibapi.h"
+#include "cusolver_dlibapi.h"
+#include "curand_dlibapi.h"
+#include "cpu_dlib.h"
+#include "cuda_dlib.h"
+#include "../rand.h"
+#include <memory>
+#include "../geometry/rectangle.h"
+#include "../test_for_odr_violations.h"
+
+namespace dlib
+{
+ bool dnn_prefer_fastest_algorithms();
+ void set_dnn_prefer_fastest_algorithms();
+ void set_dnn_prefer_smallest_algorithms();
+}
+
+namespace dlib { namespace tt
+{
+
+// ----------------------------------------------------------------------------------------
+
+ void inverse_norms (
+ resizable_tensor& invnorms,
+ const tensor& data,
+ const double eps
+ );
+ /*!
+ ensures
+ - #invnorms == reciprocal(sqrt(sum_cols(squared(mat(data))) + eps))
+ !*/
+
+ void dot_prods (
+ resizable_tensor& out,
+ const tensor& lhs,
+ const tensor& rhs
+ );
+ /*!
+ requires
+ - have_same_dimensions(lhs,rhs) == true
+ ensures
+ - #out.num_samples() == lhs.num_samples()
+ - #out.k() == #out.nr() == #out.nc() == 1
+ - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+ !*/
+
+ void dot_prods (
+ bool add_to,
+ tensor& out,
+ const tensor& lhs,
+ const tensor& rhs
+ );
+ /*!
+ requires
+ - have_same_dimensions(lhs,rhs) == true
+ - out.size() == lhs.num_samples()
+ - out.k() == out.nr() == out.nc() == 1
+ ensures
+ - if (add_to) then
+ - #out == mat(out) + sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+ - else
+ - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+ !*/
+
+ void scale_columns (
+ tensor& out,
+ const tensor& m,
+ const tensor& v
+ );
+ /*!
+ requires
+ - have_same_dimensions(out,m) == true
+ - is_vector(v) == true
+ - v.size() == mat(m).nc()
+ ensures
+ - performs: out = scale_columns(mat(m),mat(v));
+ !*/
+
+ void scale_rows (
+ tensor& out,
+ const tensor& m,
+ const tensor& v
+ );
+ /*!
+ requires
+ - have_same_dimensions(out,m) == true
+ - is_vector(v) == true
+ - v.size() == m.num_samples()
+ ensures
+ - performs: out = scale_rows(mat(m),mat(v));
+ !*/
+
+ void scale_rows2 (
+ float beta,
+ tensor& out,
+ const tensor& m1,
+ const tensor& m2,
+ const tensor& v1,
+ const tensor& v2
+ );
+ /*!
+ requires
+ - have_same_dimensions(out,m1) == true
+ - have_same_dimensions(out,m2) == true
+ - have_same_dimensions(v1,v2) == true
+ - is_vector(v1) == true
+ - v1.size() == m1.num_samples()
+ ensures
+ - performs:
+ out = beta*out + scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2));
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void exp (
+ tensor& dest,
+ const tensor& src
+ );
+ /*!
+ requires
+ - dest.size() == src.size()
+ ensures
+ - performs: dest = exp(mat(src))
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void log (
+ tensor& dest,
+ const tensor& src
+ );
+ /*!
+ requires
+ - dest.size() == src.size()
+ ensures
+ - performs: dest = log(mat(src))
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void log10 (
+ tensor& dest,
+ const tensor& src
+ );
+ /*!
+ requires
+ - dest.size() == src.size()
+ ensures
+ - performs: dest = log10(mat(src))
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void gemm (
+ float beta,
+ tensor& dest,
+ float alpha,
+ const tensor& lhs,
+ bool trans_lhs,
+ const tensor& rhs,
+ bool trans_rhs
+ );
+ /*!
+ requires
+ - dest does not alias the memory of lhs or rhs
+ - The dimensions of lhs and rhs must be compatible for matrix multiplication.
+ In particular:
+ - Let L == trans_lhs ? trans(mat(lhs)) : mat(lhs)
+ - Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs)
+ - Let D == mat(dest)
+ - D.nr() == L.nr() && D.nc() == R.nc()
+ (i.e. dest must be preallocated and have the correct output dimensions)
+ - L.nc() == R.nr()
+ ensures
+ - performs: dest = alpha*L*R + beta*mat(dest)
+ !*/
+
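+    // The sketch below (an illustrative addition, not part of the original header)
+    // shows one way to call gemm().  mat(lhs) is interpreted as a
+    // num_samples() x (k()*nr()*nc()) matrix, so A is 2x3, B is 3x4, and C must be
+    // preallocated as 2x4.
+    inline void gemm_usage_sketch()
+    {
+        resizable_tensor A(2,3), B(3,4), C(2,4);
+        A = 1;   // every element of A is 1
+        B = 2;   // every element of B is 2
+        // C = 1*mat(A)*mat(B) + 0*mat(C), so every element of C becomes 1*2*3 == 6.
+        gemm(0, C, 1, A, false, B, false);
+    }
+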
+// ----------------------------------------------------------------------------------------
+
+ class inv
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is a functor for doing matrix inversion on the GPU. The only
+ reason it's an object is to avoid the reallocation of some GPU memory
+ blocks if you want to do a bunch of matrix inversions in a row.
+ !*/
+ public:
+
+ void operator() (
+ const tensor& m,
+ resizable_tensor& out
+ );
+ /*!
+ requires
+ - m.size() == m.num_samples()*m.num_samples()
+ (i.e. mat(m) must be a square matrix)
+ ensures
+ - out == inv(mat(m));
+ !*/
+
+ private:
+#ifdef DLIB_USE_CUDA
+ cuda::inv finv;
+#endif
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ class tensor_rand
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is a tool for filling a tensor with random numbers.
+
+ Note that the sequence of random numbers output by this object is different
+ when dlib is compiled with DLIB_USE_CUDA. So you should not write code
+ that depends on any specific sequence of numbers coming out of a
+ tensor_rand.
+
+ !*/
+
+ public:
+ // not copyable
+ tensor_rand(const tensor_rand&) = delete;
+ tensor_rand& operator=(const tensor_rand&) = delete;
+
+ tensor_rand() : tensor_rand(0) {}
+ tensor_rand(unsigned long long seed);
+
+ void fill_gaussian (
+ tensor& data,
+ float mean = 0,
+ float stddev = 1
+ );
+ /*!
+ requires
+ - data.size()%2 == 0
+ ensures
+ - Fills data with random numbers drawn from a Gaussian distribution
+ with the given mean and standard deviation.
+ !*/
+
+ void fill_uniform (
+ tensor& data
+ );
+ /*!
+ ensures
+ - Fills data with uniform random numbers in the range (0.0, 1.0].
+ !*/
+
+#ifdef DLIB_USE_CUDA
+ cuda::curand_generator rnd;
+#else
+ dlib::rand rnd;
+#endif
+ };
+
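+    // Usage sketch (illustrative only; the function name and values are made up):
+    // filling a parameter tensor with small Gaussian noise, as one might do when
+    // initializing weights.
+    inline void tensor_rand_usage_sketch()
+    {
+        resizable_tensor weights(4, 10);      // 4 samples of 10 values each (40 elements, an even count)
+        tensor_rand rnd(1234);                // fixed seed, but the exact sequence is not guaranteed
+        rnd.fill_gaussian(weights, 0, 0.01);  // mean 0, standard deviation 0.01
+        rnd.fill_uniform(weights);            // or: uniform values in (0.0, 1.0]
+    }
+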
+// ----------------------------------------------------------------------------------------
+
+ void multiply (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ );
+ /*!
+ requires
+ - dest.k() == src1.k() == src2.k()
+ - dest.nr() == src1.nr() == src2.nr()
+ - dest.nc() == src1.nc() == src2.nc()
+            - dest.num_samples(), src1.num_samples(), and src2.num_samples() must each
+              be either 1 or equal to a single common value (i.e. all the ones that
+              aren't 1 must be equal to each other).
+ ensures
+            - let MD = max(dest.num_samples(), src1.num_samples(), src2.num_samples())
+ - This function pointwise multiplies src1 with src2 and stores the result into
+ #dest. However, how the multiplication happens depends on the dimensions of
+ the tensors. First, when src1 and src2 are multiplied together, if either
+ has a num_samples() dimension that is != MD, then it is first replicated to
+ produce a tensor with num_samples()==MD dimensions and then they are
+ pointwise multiplied together.
+
+ Second, if dest.num_samples()==1, then after the pointwise multiplication of
+ src1 with src2, the result has its samples summed to produce an output tensor
+ with num_samples()==1 which is then assigned to #dest.
+ - if (add_to) then
+ - Instead of assigning the result to dest, this function adds the result to dest.
+ !*/
+
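+    // A small sketch (an illustrative addition, not part of the original header) of
+    // the broadcasting behaviour described above: when dest.num_samples()==1 the
+    // pointwise products are summed over the sample dimension.
+    inline void multiply_usage_sketch()
+    {
+        resizable_tensor a(3, 2, 4, 4), b(3, 2, 4, 4), dest(1, 2, 4, 4);
+        a = 2;
+        b = 3;
+        // Each element of dest becomes the sum over the 3 samples of 2*3, i.e. 18.
+        multiply(false, dest, a, b);
+    }
+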
+ void scale_channels (
+ bool add_to,
+ tensor& dest,
+ const tensor& src,
+ const tensor& scales
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest, src) == true
+ - scales.num_samples() == src.num_samples()
+ - scales.k() == src.k()
+ - scales.nr() == 1
+ - scales.nc() == 1
+ ensures
+ - Scales each channel of src by the corresponding value in scales. To be
+ precise, we will have:
+ - #dest(n,k,r,c) == src(n,k,r,c)*scales(n,k,1,1)
+ - if (add_to) then
+ - Instead of assigning the result to dest, this function adds the result to dest.
+ !*/
+
+ void multiply_conv (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ );
+ /*!
+ requires
+ - if (have_same_dimensions(dest, src1) == true) then
+ - src2.num_samples() == 1
+ - src2.nr() == 1
+ - src2.nc() == 1
+ - src2.k() == src1.k()
+ - else
+                - have_same_dimensions(src1, src2) == true
+ - dest.num_samples() == 1
+ - dest.nr() == 1
+ - dest.nc() == 1
+ - dest.k() == src1.k()
+ ensures
+ - Performs #dest == src1*src2
+ In particular, if the elements of dest, src1, and src2 were indexed by (n,k,r,c) then
+ we would have:
+ - if (have_same_dimensions(dest,src1)) then
+ #dest(n,k,r,c) == src1(n,k,r,c)*src2(k)
+ - else
+ #dest(k) == sum over {n,r,c} of src1(n,k,r,c)*src2(n,k,r,c)
+ - if (add_to) then
+ - Instead of assigning the result to dest, this function adds the result to dest.
+ !*/
+
+ void multiply_zero_padded (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ );
+ /*!
+ ensures
+ - if (add_to) then
+ - performs: dest += src1 * src2
+ - else
+ - performs: dest = src1 * src2
+ - In either case, the multiplication happens pointwise according to 4D tensor
+ arithmetic. If the dimensions don't match then missing elements are presumed
+ to be equal to 0.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const float A,
+ const float B
+ );
+ /*!
+ requires
+ - dest.size()==src.size()
+ ensures
+ - #dest == A*src + B
+ !*/
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const float A
+ );
+ /*!
+ requires
+ - dest.size()==src.size()
+ ensures
+ - #dest == A*src
+ !*/
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const float A,
+ const float B,
+ const float C
+ );
+ /*!
+ requires
+ - dest.size()==src1.size()
+ - dest.size()==src2.size()
+ ensures
+ - #dest == A*src1 + B*src2 + C
+ !*/
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const float A,
+ const float B
+ );
+ /*!
+ requires
+ - dest.size()==src1.size()
+ - dest.size()==src2.size()
+ ensures
+ - #dest == A*src1 + B*src2
+ !*/
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ const float A,
+ const float B,
+ const float C,
+ const float D
+ );
+ /*!
+ requires
+ - dest.size()==src1.size()
+ - dest.size()==src2.size()
+ - dest.size()==src3.size()
+ ensures
+ - #dest == A*src1 + B*src2 + C*src3 + D
+ !*/
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ const float A,
+ const float B,
+ const float C
+ );
+ /*!
+ requires
+ - dest.size()==src1.size()
+ - dest.size()==src2.size()
+ - dest.size()==src3.size()
+ ensures
+ - #dest == A*src1 + B*src2 + C*src3
+ !*/
+
+ void affine_transform_range(
+ size_t begin,
+ size_t end,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ const float A,
+ const float B,
+ const float C
+ );
+ /*!
+ requires
+ - dest.size()==src1.size()
+ - dest.size()==src2.size()
+ - dest.size()==src3.size()
+ - begin <= end <= dest.size()
+ ensures
+ - This function operates much like
+ affine_transform(dest,src1,src2,src3,A,B,C,0), except that it runs over only
+ the half open range [begin,end) rather than processing the entire tensor.
+ Specifically, it does this:
+ - for i in the range [begin, end):
+ - #dest.host()[i] == A*src1.host()[i] + B*src2.host()[i] + C*src3.host()[i]
+ !*/
+
+ void affine_transform(
+ const rectangle& rect,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ float A,
+ float B,
+ float C
+ );
+ /*!
+ requires
+ - dest.size()==src1.size()
+ - dest.size()==src2.size()
+ - dest.size()==src3.size()
+ - dest.num_samples()==src1.num_samples()
+ - dest.num_samples()==src2.num_samples()
+ - dest.num_samples()==src3.num_samples()
+ - get_rect(mat(dest)).contains(rect) == true
+ (i.e. rect must be entirely contained within dest)
+ ensures
+ - This function operates much like
+ affine_transform(dest,src1,src2,src3,A,B,C,0), except that it runs over only
+ the sub-rectangle indicated by rect. In particular, this function is equivalent
+ to:
+ set_subm(dest,rect) = A*subm(mat(src1),rect) + B*subm(mat(src2),rect) + C*subm(mat(src3),rect)
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const tensor& A,
+ const tensor& B
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest,src) == true
+ - if (A.num_samples() == 1) then
+ - B.num_samples() == 1
+ - else
+ - A.num_samples() == src.num_samples()
+ - B.num_samples() == src.num_samples()
+ - A.nr() == B.nr() == src.nr()
+ - A.nc() == B.nc() == src.nc()
+ - A.k() == B.k() == src.k()
+ ensures
+ - if (A.num_samples() == 1) then
+ - #dest == A*src + B
+ (done for each sample in src)
+ - else
+ - for all valid i:
+ - #dest.host()[i] == A.host()[i]*src.host()[i] + B.host()[i]
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void affine_transform_conv(
+ tensor& dest,
+ const tensor& src,
+ const tensor& A,
+ const tensor& B
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest,src) == true
+ - have_same_dimensions(A, B) == true
+ - A.num_samples() == 1
+ - A.nr() == 1
+ - A.nc() == 1
+ - A.k() == src.k()
+ ensures
+ - Performs #dest == A*src + B
+ In particular, if the elements of dest and src were indexed by (n,k,r,c) then
+ we would have:
+ #dest(n,k,r,c) == A(k)*src(n,k,r,c) + B(k).
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void compute_adam_update (
+ size_t begin,
+ size_t end,
+ tensor& s,
+ tensor& m,
+ tensor& v,
+ const float t,
+ const float learning_rate,
+ const float weight_decay,
+ const float momentum1,
+ const float momentum2,
+ const tensor& params,
+ const tensor& params_grad
+ );
+ /*!
+ requires
+            - s.size() == m.size() == v.size() == params.size() == params_grad.size()
+ - t > 0
+ - learning_rate > 0
+ - weight_decay >= 0
+ - 0 <= momentum1 < 1
+ - 0 <= momentum2 < 1
+ - begin <= end <= params.size()
+ ensures
+ - This function implements the ADAM parameter update method described in the paper:
+ Kingma, Diederik P., and Jimmy Ba Adam. "A method for stochastic
+                Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic
+                optimization." International Conference on Learning Representations. 2015.
+ - #s is the update vector that should be added to the parameters.
+ - The function only operates in the half open range [begin,end) of the memory
+ blocks of each tensor. E.g. to make this function run on the entire tensor
+ set begin to 0 and end to params.size().
+ !*/
+
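+    // Illustrative sketch (hypothetical sizes and hyperparameters, not part of the
+    // original header): a single Adam step over an entire parameter tensor.  s
+    // receives the update vector; a caller would then add s to params (e.g. with
+    // tt::add(), declared further down in this header).
+    inline void adam_update_usage_sketch()
+    {
+        resizable_tensor params(10, 5), params_grad(10, 5);
+        resizable_tensor s, m, v;
+        s.copy_size(params);
+        m.copy_size(params);  m = 0;   // first moment estimate starts at zero
+        v.copy_size(params);  v = 0;   // second moment estimate starts at zero
+        params = 1;
+        params_grad = 0.1f;
+        compute_adam_update(0, params.size(), s, m, v,
+                            /*t*/1, /*learning_rate*/0.001f, /*weight_decay*/0.0005f,
+                            /*momentum1*/0.9f, /*momentum2*/0.999f,
+                            params, params_grad);
+    }
+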
+// ----------------------------------------------------------------------------------------
+
+ void batch_normalize_inference (
+ const double eps,
+ resizable_tensor& dest,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta,
+ const tensor& running_means,
+ const tensor& running_variances
+ );
+ /*!
+ requires
+ - eps > 0
+ - gamma.num_samples() == 1
+ - gamma.nr() == src.nr()
+ - gamma.nc() == src.nc()
+ - gamma.k() == src.k()
+ - have_same_dimensions(gamma, beta)
+ - have_same_dimensions(gamma, running_means)
+ - have_same_dimensions(gamma, running_variances)
+ ensures
+ - Linearly transforms src as a call to batch_normalize() would if src had means
+ and variances as given by running_means and running_variances. That is, this
+ function performs:
+ dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
+ Note that it does it in a pointwise fashion over the samples in src.
+ !*/
+
+ void batch_normalize (
+ const double eps,
+ resizable_tensor& dest,
+ resizable_tensor& means,
+ resizable_tensor& invstds,
+ const double averaging_factor,
+ resizable_tensor& running_means,
+ resizable_tensor& running_variances,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta
+ );
+ /*!
+ requires
+ - eps > 0
+ - src.num_samples() > 1
+ - gamma.num_samples() == 1
+ - beta.num_samples() == 1
+ - gamma.nr() == beta.nr() == src.nr()
+ - gamma.nc() == beta.nc() == src.nc()
+ - gamma.k() == beta.k() == src.k()
+ - 0 <= averaging_factor <= 1
+ - if (averaging_factor != 1)
+ - have_same_dimensions(running_means, means) == true
+ - have_same_dimensions(running_variances, invstds) == true
+ ensures
+ - have_same_dimensions(#dest, src) == true
+ - #means.num_samples() == 1
+ - #invstds.num_samples() == 1
+ - means.nr() == invstds.nr() == src.nr()
+ - means.nc() == invstds.nc() == src.nc()
+ - means.k() == invstds.k() == src.k()
+            - #dest == the batch normalized version of src.
+ - #means == the mean values of the contents of src.
+ - #invstds == 1/(the standard deviation values of the contents of src).
+            - #running_means = (1-averaging_factor)*mat(running_means) + averaging_factor*mat(#means);
+            - #running_variances = (1-averaging_factor)*mat(running_variances) + averaging_factor*(variance of contents of src);
+ !*/
+
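+    // Usage sketch (illustrative only; the tensor shapes and eps are made-up values,
+    // and it assumes running_means/running_variances are resized by the call when
+    // averaging_factor==1): one training-time call followed by an inference-time
+    // call that reuses the stored running statistics.  With averaging_factor==1 the
+    // running statistics are simply replaced by the statistics of this batch.
+    inline void batch_normalize_usage_sketch()
+    {
+        resizable_tensor src(5, 3, 4, 4);                 // a mini-batch of 5 samples
+        resizable_tensor gamma(1, 3, 4, 4), beta(1, 3, 4, 4);
+        resizable_tensor dest, means, invstds;
+        resizable_tensor running_means, running_variances;
+        tensor_rand rnd;
+        rnd.fill_gaussian(src);
+        gamma = 1;   // identity scale
+        beta = 0;    // no shift
+        batch_normalize(1e-5, dest, means, invstds, 1, running_means, running_variances,
+                        src, gamma, beta);
+        // At test time the per-batch statistics are replaced by the running ones:
+        batch_normalize_inference(1e-5, dest, src, gamma, beta,
+                                  running_means, running_variances);
+    }
+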
+ void batch_normalize_gradient (
+ const double eps,
+ const tensor& gradient_input,
+ const tensor& means,
+ const tensor& invstds,
+ const tensor& src,
+ const tensor& gamma,
+ tensor& src_grad,
+ tensor& gamma_grad,
+ tensor& beta_grad
+ );
+ /*!
+ requires
+ - eps > 0
+ - invstds and means should be the output of a call to
+ batch_normalize(eps,dest,means,invstds,src,gamma,beta)
+ - have_same_dimensions(gradient_input, src) == true
+ - have_same_dimensions(src, src_grad) == true
+ - src.num_samples() > 1
+ - gamma.num_samples() == 1
+ - have_same_dimensions(gamma, gamma_grad) == true
+ - have_same_dimensions(gamma, beta_grad) == true
+ - gamma.nr() == src.nr()
+ - gamma.nc() == src.nc()
+ - gamma.k() == src.k()
+ - have_same_dimensions(means, gamma) == true
+ - have_same_dimensions(invstds, gamma) == true
+ ensures
+ - Let f(src,gamma,beta) == dot(gradient_input, dest output of
+ batch_normalize(eps,dest,means,invstds,src,gamma,beta))
+ - Adds the gradient of f() with respect to src to #src_grad.
+ - Assigns the gradient of f() with respect to gamma to #gamma_grad.
+ - Assigns the gradient of f() with respect to beta to #beta_grad.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void batch_normalize_conv_inference (
+ const double eps,
+ resizable_tensor& dest,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta,
+ const tensor& running_means,
+ const tensor& running_variances
+ );
+ /*!
+ requires
+ - eps > 0
+ - gamma.num_samples() == 1
+ - gamma.nr() == 1
+ - gamma.nc() == 1
+ - gamma.k() == src.k()
+ - have_same_dimensions(gamma, beta)
+ - have_same_dimensions(gamma, running_means)
+ - have_same_dimensions(gamma, running_variances)
+ ensures
+ - Linearly transforms src as a call to batch_normalize_conv() would if src had
+ means and variances as given by running_means and running_variances. That
+ is, this function performs:
+ dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
+ Note that it does this in a pointwise fashion over the samples, rows, and
+ columns in src.
+ !*/
+
+ void batch_normalize_conv (
+ const double eps,
+ resizable_tensor& dest,
+ resizable_tensor& means,
+ resizable_tensor& invstds,
+ const double averaging_factor,
+ resizable_tensor& running_means,
+ resizable_tensor& running_variances,
+ const tensor& src,
+ const tensor& gamma,
+ const tensor& beta
+ );
+ /*!
+ requires
+ - eps > 0
+ - src.num_samples() > 1
+            - gamma.num_samples() == gamma.nr() == gamma.nc() == 1
+            - beta.num_samples() == beta.nr() == beta.nc() == 1
+ - gamma.k() == beta.k() == src.k()
+ - 0 <= averaging_factor <= 1
+ - if (averaging_factor != 1)
+ - have_same_dimensions(running_means, means) == true
+ - have_same_dimensions(running_variances, invstds) == true
+ ensures
+ - have_same_dimensions(#dest, src) == true
+ - #means.num_samples()==means.nr()==means.nc() == 1
+ - #invstds.num_samples() ==invstds.nr() ==invstds.nc() == 1
+ - means.k() == invstds.k() == src.k()
+            - #dest == the batch normalized version of src.
+ - #means == the mean values of the contents of src.
+ - #invstds == 1/(the standard deviation values of the contents of src).
+            - #running_means = (1-averaging_factor)*mat(running_means) + averaging_factor*mat(#means);
+            - #running_variances = (1-averaging_factor)*mat(running_variances) + averaging_factor*(variance of contents of src);
+ !*/
+
+ void batch_normalize_conv_gradient (
+ const double eps,
+ const tensor& gradient_input,
+ const tensor& means,
+ const tensor& invstds,
+ const tensor& src,
+ const tensor& gamma,
+ tensor& src_grad,
+ tensor& gamma_grad,
+ tensor& beta_grad
+ );
+ /*!
+ requires
+ - eps > 0
+ - invstds and means should be the output of a call to
+ batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta)
+ - have_same_dimensions(gradient_input, src) == true
+ - have_same_dimensions(src, src_grad) == true
+ - src.num_samples() > 1
+ - gamma.num_samples()==gamma.nr()==gamma.nc() == 1
+ - have_same_dimensions(gamma, gamma_grad) == true
+ - have_same_dimensions(gamma, beta_grad) == true
+ - gamma.k() == src.k()
+ - have_same_dimensions(means, gamma) == true
+ - have_same_dimensions(invstds, gamma) == true
+ ensures
+ - Let f(src,gamma,beta) == dot(gradient_input, dest output of
+ batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta))
+ - Adds the gradient of f() with respect to src to #src_grad.
+ - Assigns the gradient of f() with respect to gamma to #gamma_grad.
+ - Assigns the gradient of f() with respect to beta to #beta_grad.
+ !*/
+
+// -----------------------------------------------------------------------------------
+
+ void threshold (
+ tensor& data,
+ float thresh
+ );
+ /*!
+ ensures
+ - Sets all elements of data to 1 or 0 depending on if they are above or below
+ the given threshold. Specifically, for all valid i:
+ - #data.host()[i] == data.host()[i]>thresh ? 1 : 0
+ !*/
+
+ void dot (
+ const tensor& a,
+ const tensor& b,
+ tensor& result,
+ size_t idx
+ );
+ /*!
+ requires
+ - a.size() == b.size()
+ - idx < result.size()
+ ensures
+ - #result.host()[idx] == result.host()[idx] + dot(a,b);
+ I.e. Adds the dot product between a and b into the idx-th element of result.
+ The reason you might want to use this more complex version of dot() is
+ because, when using CUDA, it runs by generating asynchronous kernel launches
+ whereas the version of dot() that returns the result immediately as a scalar
+ must block the host while we wait for the result to be computed and then
+                transferred from the GPU to the host for return by dot(). So this version of
+ dot() might be much faster in some cases.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void add(
+ float beta,
+ tensor& dest,
+ float alpha,
+ const tensor& src
+ );
+ /*!
+ requires
+ - One of the following is true:
+ - have_same_dimensions(src, dest)
+ - src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1
+ - src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc()
+ - src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc()
+ - src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1
+ - is_same_object(src,dest) == false
+ ensures
+ - performs: dest = beta*dest + alpha*src
+ However, how the addition happens depends on the dimensions of src. In
+ particular, this function adds the scaled values of one src tensor to dest.
+ Each dimension of the src tensor must match the corresponding dimension of
+ the dest tensor or must be equal to 1. In the latter case, the same value
+ from the src tensor, for those dimensions, will be used to add into the dest
+ tensor.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void add (
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ );
+ /*!
+ ensures
+ - performs: dest = src1 + src2
+ The addition happens pointwise according to 4D tensor arithmetic. If the
+ dimensions don't match then missing elements are presumed to be equal to 0.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void assign_conv_bias_gradient (
+ tensor& grad,
+ const tensor& gradient_input
+ );
+ /*!
+ requires
+ - grad.num_samples() == 1
+ - grad.k() >= 1
+ - grad.nr() == 1
+ - grad.nc() == 1
+ - gradient_input.k() == grad.k()
+ - gradient_input.size() > 0
+ - is_same_object(grad,gradient_input) == false
+ ensures
+ - let BIAS be a tensor with the same dimensions as grad.
+ - let OUT be the output of add(1,OUT,1,BIAS)
+ - let f(gradient_input,BIAS) == dot(gradient_input,OUT)
+ - Then this function computes the gradient of f() with respect to BIAS and
+ assigns it to grad.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void assign_bias_gradient (
+ tensor& grad,
+ const tensor& gradient_input
+ );
+ /*!
+ requires
+ - grad.num_samples() == 1
+ - gradient_input.k() == grad.k()
+ - gradient_input.nr() == grad.nr()
+ - gradient_input.nc() == grad.nc()
+ - gradient_input.size() > 0
+ - is_same_object(grad,gradient_input) == false
+ ensures
+ - let BIAS be a tensor with the same dimensions as grad.
+ - let OUT be the output of add(1,OUT,1,BIAS)
+ - let f(gradient_input,BIAS) == dot(gradient_input,OUT)
+ - Then this function computes the gradient of f() with respect to BIAS and
+ assigns it to grad.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ class tensor_conv
+ {
+ public:
+ tensor_conv(const tensor_conv&) = delete;
+ tensor_conv& operator=(const tensor_conv&) = delete;
+
+ tensor_conv() {}
+
+ void clear(
+ ) { impl.clear(); }
+
+ void operator() (
+ const bool add_to_output,
+ tensor& output,
+ const tensor& data,
+ const tensor& filters
+ ) { impl(add_to_output,output,data,filters); }
+ /*!
+ requires
+ - setup() has been called. Specifically, setup() has been called like this:
+ this->setup(data, filters, stride_y, stride_x, padding_y, padding_x);
+ - is_same_object(output,data) == false
+ - is_same_object(output,filters) == false
+ - filters.k() == data.k()
+                - filters.nr() <= data.nr() + 2*padding_y
+                - filters.nc() <= data.nc() + 2*padding_x
+ - #output.num_samples() == data.num_samples()
+ - #output.k() == filters.num_samples()
+ - #output.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
+ - #output.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
+ ensures
+ - Convolves filters over data. If add_to_output==true then we add the
+ results to output, otherwise we assign to output, overwriting the
+ previous values in output.
+ - filters contains filters.num_samples() filters.
+ !*/
+
+ void operator() (
+ const bool add_to_output,
+ resizable_tensor& output,
+ const tensor& data,
+ const tensor& filters
+ ) { impl(add_to_output,output,data,filters); }
+ /*!
+ requires
+ - setup() has been called. Specifically, setup() has been called like this:
+ this->setup(data, filters, stride_y, stride_x, padding_y, padding_x);
+ - is_same_object(output,data) == false
+ - is_same_object(output,filters) == false
+ - filters.k() == data.k()
+                - filters.nr() <= data.nr() + 2*padding_y
+                - filters.nc() <= data.nc() + 2*padding_x
+ ensures
+ - Convolves filters over data. If add_to_output==true then we add the
+ results to output, otherwise we assign to output, overwriting the
+ previous values in output.
+ - filters contains filters.num_samples() filters.
+ - #output.num_samples() == data.num_samples()
+ - #output.k() == filters.num_samples()
+ - #output.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
+ - #output.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
+ !*/
+
+ void get_gradient_for_data (
+ const bool add_to_output,
+ const tensor& gradient_input,
+ const tensor& filters,
+ tensor& data_gradient
+ ) { impl.get_gradient_for_data(add_to_output,gradient_input,filters,data_gradient); }
+ /*!
+ requires
+ - One of the following must be true:
+ - filters has the same dimensions as the filters object given to the
+ last call to operator(). Also, data_gradient has the same dimensions
+ as the data object given to the last call to operator().
+ - setup() has been called. Specifically, setup() has been called like this:
+ this->setup(data_gradient, filters, stride_y, stride_x, padding_y, padding_x);
+ - gradient_input has the following dimensions:
+ - gradient_input.num_samples() == data_gradient.num_samples()
+ - gradient_input.k() == filters.num_samples()
+ - gradient_input.nr() == 1+(data_gradient.nr() + 2*padding_y - filters.nr())/stride_y
+ - gradient_input.nc() == 1+(data_gradient.nc() + 2*padding_x - filters.nc())/stride_x
+ - NOTE, these dimensions are what you would obtain if gradient_input
+ has the same dimensions as the last output of operator().
+ - is_same_object(data_gradient,filters) == false
+ - is_same_object(data_gradient,gradient_input) == false
+ ensures
+                - let OUT be the output of (*this)(false,OUT,data,filters).
+ - let f(data,filters) == dot(OUT, gradient_input)
+ - if (add_to_output) then
+ - This function finds the gradient of f() with respect to data and adds
+ this gradient to data_gradient.
+ - else
+ - This function finds the gradient of f() with respect to data and
+ assigns this gradient to data_gradient, overwriting the previous
+ values in data_gradient.
+ !*/
+
+ void get_gradient_for_filters (
+ const bool add_to_output,
+ const tensor& gradient_input,
+ const tensor& data,
+ tensor& filters_gradient
+ ) { impl.get_gradient_for_filters(add_to_output,gradient_input,data,filters_gradient); }
+ /*!
+ requires
+ - One of the following must be true:
+ - filters_gradient has the same dimensions as the filters object given
+ to the last call to operator(). Also, data has the same dimensions
+ as the data object given to the last call to operator().
+ - setup() has been called. Specifically, setup() has been called like this:
+ this->setup(data, filters_gradient, stride_y, stride_x, padding_y, padding_x);
+ - gradient_input has the following dimensions:
+ - gradient_input.num_samples() == data.num_samples()
+ - gradient_input.k() == filters.num_samples()
+ - gradient_input.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
+ - gradient_input.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
+ - NOTE, these dimensions are what you would obtain if gradient_input
+ has the same dimensions as the last output of operator().
+ - is_same_object(filters_gradient,data) == false
+ - is_same_object(filters_gradient,gradient_input) == false
+ ensures
+                - let OUT be the output of (*this)(false,OUT,data,filters).
+ - let f(data,filters) == dot(OUT, gradient_input)
+ - if (add_to_output) then
+ - This function finds the gradient of f() with respect to filters and
+ adds this gradient to filters_gradient.
+ - else
+ - This function finds the gradient of f() with respect to filters and
+ assigns this gradient to filters_gradient, overwriting the previous
+ values in filters_gradient.
+ !*/
+
+
+ void setup(
+ const tensor& data,
+ const tensor& filters,
+ int stride_y,
+ int stride_x,
+ int padding_y,
+ int padding_x
+ ) {impl.setup(data,filters,stride_y,stride_x,padding_y,padding_x); }
+ /*!
+ requires
+ - filters.k() == data.k()
+ - stride_y > 0
+ - stride_x > 0
+ - 0 <= padding_y < filters.nr()
+ - 0 <= padding_x < filters.nc()
+ ensures
+ - When operator() is called, the output tensor will have these dimensions:
+ - output.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
+ - output.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
+ - output.num_samples() == data.num_samples()
+ - output.k() == filters.num_samples()
+ - The point of setup() is to allow this object to gather information about
+ all the tensor sizes and filter layouts involved in the computation. In
+ particular, the reason the tensors are input into setup() is just to
+ observe their sizes. setup() doesn't do anything with the contents of
+ the tensors, or store any kind of references to the data or filter
+ tensors.
+ !*/
+
+ private:
+#ifdef DLIB_USE_CUDA
+ cuda::tensor_conv impl;
+#else
+ cpu::tensor_conv impl;
+#endif
+
+ };
+
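+    // Usage sketch (illustrative only; the sizes, strides, and padding are made-up
+    // values): convolving a batch of 3-channel inputs with 16 5x5 filters using
+    // enough padding to preserve the spatial size.
+    inline void tensor_conv_usage_sketch()
+    {
+        resizable_tensor data(2, 3, 32, 32);    // 2 samples, 3 channels, 32x32
+        resizable_tensor filters(16, 3, 5, 5);  // 16 filters over 3 channels, 5x5 each
+        resizable_tensor output;
+        tensor_rand rnd;
+        rnd.fill_gaussian(data);
+        rnd.fill_gaussian(filters);
+        tensor_conv conv;
+        conv.setup(data, filters, /*stride_y*/1, /*stride_x*/1, /*padding_y*/2, /*padding_x*/2);
+        conv(false, output, data, filters);     // output becomes 2 x 16 x 32 x 32
+    }
+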
+// ----------------------------------------------------------------------------------------
+
+ class pooling
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ The pooling object is a tool for performing spatial pooling over a tensor.
+ It can be configured to do either max or average pooling.
+ !*/
+ public:
+
+ pooling(const pooling&) = delete;
+ pooling& operator=(const pooling&) = delete;
+
+ pooling (
+ ) = default;
+
+ void clear(
+ ) { impl.clear(); }
+
+ void setup_max_pooling(
+ int window_height,
+ int window_width,
+ int stride_y,
+ int stride_x,
+ int padding_y,
+ int padding_x
+ ) { impl.setup_max_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x); }
+ /*!
+ requires
+ - window_height > 0
+ - window_width > 0
+ - stride_y > 0
+ - stride_x > 0
+ - 0 <= padding_y < window_height
+ - 0 <= padding_x < window_width
+ ensures
+ - When you call operator() it will do max pooling with the given
+ parameters.
+ !*/
+
+ void setup_avg_pooling(
+ int window_height,
+ int window_width,
+ int stride_y,
+ int stride_x,
+ int padding_y,
+ int padding_x
+ ) { impl.setup_avg_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x); }
+ /*!
+ requires
+ - window_height > 0
+ - window_width > 0
+ - stride_y > 0
+ - stride_x > 0
+ - 0 <= padding_y < window_height
+ - 0 <= padding_x < window_width
+ ensures
+ - When you call operator() it will do average pooling with the given
+ parameters.
+ !*/
+
+ bool does_max_pooling(
+ ) const { return impl.does_max_pooling(); }
+
+ void operator() (
+ resizable_tensor& dest,
+ const tensor& src
+ ) { impl(dest, src); }
+ /*!
+ requires
+ - is_same_object(dest,src) == false
+ - either setup_max_pooling() or setup_avg_pooling() has been called.
+ - window_width <= src.nc() + 2*padding_x
+ - window_height <= src.nr() + 2*padding_y
+ ensures
+ - #dest.num_samples() == src.num_samples()
+ - #dest.k() == src.k()
+ - #dest.nr() == 1 + (src.nr() + 2*padding_y - window_height)/stride_y
+ - #dest.nc() == 1 + (src.nc() + 2*padding_x - window_width)/stride_x
+ - WINDOW == centered_rect(x*stride_x + window_width/2 - padding_x,
+ y*stride_y + window_height/2 - padding_y,
+ window_width,
+ window_height)
+ - for all valid s, k, r, and c:
+ - if (does_max_pooling()) then
+ - image_plane(#dest,s,k)(r,c) == max(subm_clipped(image_plane(src,s,k),WINDOW(c,r)))
+ - else
+ - image_plane(#dest,s,k)(r,c) == mean(subm_clipped(image_plane(src,s,k),WINDOW(c,r)))
+ !*/
+
+ void get_gradient(
+ const tensor& gradient_input,
+ const tensor& dest,
+ const tensor& src,
+ tensor& grad
+ ) { impl.get_gradient(gradient_input, dest, src, grad); }
+ /*!
+ requires
+ - have_same_dimensions(gradient_input,dest) == true
+ - have_same_dimensions(src,grad) == true
+ - dest contains the result of calling (*this)(dest,src)
+ - is_same_object(grad,gradient_input) == false
+ - is_same_object(grad,dest) == false
+ - is_same_object(grad,src) == false
+ ensures
+ - Recalling that dest is the output of (*this)(dest,src),
+ let f(src) == dot(gradient_input,dest)
+ - Then this function computes the gradient of f() with respect to src and
+ adds it to grad.
+ !*/
+
+ private:
+#ifdef DLIB_USE_CUDA
+ cuda::pooling impl;
+#else
+ cpu::pooling impl;
+#endif
+ };
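+
+    // A brief usage sketch for the pooling object (illustrative only; "src" stands
+    // for any tensor the caller has already filled in):
+    //
+    //     dlib::tt::pooling p;
+    //     p.setup_max_pooling(3,3, 2,2, 1,1);   // 3x3 window, stride 2, padding 1
+    //     dlib::resizable_tensor out;
+    //     p(out, src);                          // out.nr() == 1 + (src.nr()+2-3)/2
+    //
+    // Average pooling works the same way, just call setup_avg_pooling() instead.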
+
+// ----------------------------------------------------------------------------------------
+
+ void softmax (
+ tensor& dest,
+ const tensor& src
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest, src) == true
+ ensures
+ - Note that the softmax function is a vector valued function:
+ s(x) == exp(x)/sum(exp(x))
+ - Computes the softmax function on src and writes the results to dest. The
+ softmax is computed per spatial location across the different channels at
+ each location. That is, softmax() outputs a new tensor, #dest, where each of
+ the spatial locations in dest (i.e. image idx, row idx, and column idx)
+ contains the output of s() evaluated over the channel values at each
+ location.
+ - This function supports in-place operation, i.e. having
+ is_same_object(dest, src)==true
+ !*/
+
+ void softmax_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest,gradient_input) == true
+ - have_same_dimensions(dest,grad) == true
+ ensures
+ - We interpret dest as the output of softmax(dest,SRC) for some SRC tensor.
+ Then let f(SRC) == dot(gradient_input,dest). Then this function computes the
+ gradient of f() with respect to SRC and stores it to grad. Moreover, if
+ is_same_object(grad,gradient_input)==true then the output is assigned to
+ grad, replacing its previous contents. Otherwise the output is added to
+ grad.
+ - This function supports in-place operation, i.e. having
+ is_same_object(grad, gradient_input)==true
+ !*/
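+
+    // Hedged usage sketch for the two functions above ("logits" and "gradient_input"
+    // are placeholder tensors owned by the caller):
+    //
+    //     dlib::resizable_tensor probs, grad;
+    //     probs.copy_size(logits);
+    //     grad.copy_size(logits);
+    //     grad = 0;                                   // softmax_gradient() adds into grad here
+    //     dlib::tt::softmax(probs, logits);           // per-location softmax over channels
+    //     dlib::tt::softmax_gradient(grad, probs, gradient_input);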
+
+// ----------------------------------------------------------------------------------------
+
+ void softmax_all (
+ tensor& dest,
+ const tensor& src
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest, src) == true
+ ensures
+ - Note that the softmax function is a vector valued function:
+ s(x) == exp(x)/sum(exp(x))
+ - Computes the softmax function on src and writes the results to dest. The
+ softmax is computed over the entire tensor with one invocation of s(). So
+ unlike softmax() which computes many s() evaluations, one for each spatial
+ location, softmax_all() calls s() once for the entire tensor.
+ - This function supports in-place operation, i.e. having
+ is_same_object(dest, src)==true
+ !*/
+
+ void softmax_all_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest,gradient_input) == true
+ - have_same_dimensions(dest,grad) == true
+ - is_same_object(grad, dest)==false
+ ensures
+ - We interpret dest as the output of softmax_all(dest,SRC) for some SRC tensor.
+              Then let f(SRC) == dot(gradient_input,dest).  Then this function computes the
+ gradient of f() with respect to SRC and assigns it to grad.
+ - This function supports in-place operation, i.e. having
+ is_same_object(grad, gradient_input)==true
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void sigmoid (
+ tensor& dest,
+ const tensor& src
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest, src) == true
+ ensures
+ - for all valid i:
+ - #dest.host()[i] == 1/(1+std::exp(-src.host()[i]))
+ - This function supports in-place operation, i.e. having
+ is_same_object(dest, src)==true
+ !*/
+
+ void sigmoid_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest,gradient_input) == true
+ - have_same_dimensions(dest,grad) == true
+ ensures
+ - Recalling that dest is the output of sigmoid(dest,SRC) for some SRC tensor,
+ let f(SRC) == dot(gradient_input,dest). Then this function computes the
+ gradient of f() with respect to SRC and stores it to grad. Moreover, if
+ is_same_object(grad,gradient_input)==true then the output is assigned to
+ grad, replacing its previous contents. Otherwise the output is added to
+ grad.
+ - This function supports in-place operation, i.e. having
+ is_same_object(grad, gradient_input)==true
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void relu (
+ tensor& dest,
+ const tensor& src
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest, src) == true
+ ensures
+ - for all valid i:
+ - #dest.host()[i] == std::max(0,src.host()[i])
+ - This function supports in-place operation, i.e. having
+ is_same_object(dest, src)==true
+ !*/
+
+ void relu_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest,gradient_input) == true
+ - have_same_dimensions(dest,grad) == true
+ ensures
+ - Recalling that dest is the output of relu(dest,SRC) for some SRC tensor,
+ let f(SRC) == dot(gradient_input,dest). Then this function computes the
+ gradient of f() with respect to SRC and stores it to grad. Moreover, if
+ is_same_object(grad,gradient_input)==true then the output is assigned to
+ grad, replacing its previous contents. Otherwise the output is added to
+ grad.
+ - This function supports in-place operation, i.e. having
+ is_same_object(grad, gradient_input)==true
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void prelu (
+ tensor& dest,
+ const tensor& src,
+ const tensor& param
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest, src) == true
+ - param.size() == 1
+ ensures
+ - for all valid i:
+ - if (src.host()[i] > 0) then
+ - #dest.host()[i] == src.host()[i]
+ - else
+ - #dest.host()[i] == src.host()[i] * param.host()[0]
+ - This function supports in-place operation, i.e. having
+ is_same_object(dest, src)==true
+ !*/
+
+ void prelu_gradient (
+ tensor& grad,
+ const tensor& src,
+ const tensor& gradient_input,
+ const tensor& param,
+ tensor& params_grad
+ );
+ /*!
+ requires
+ - have_same_dimensions(grad,src) == true
+ - have_same_dimensions(grad,gradient_input) == true
+ - param.size() == 1
+ - params_grad.size() == 1
+ - is_same_object(grad, gradient_input) == false
+ ensures
+ - Recalling that dest is the output of prelu(dest,src,param) let
+ f(src,param) == dot(gradient_input,dest)
+ - Then this function computes the gradient of f() with respect to src and
+ param. It assigns the gradient with respect to param to #params_grad and
+ adds the gradient with respect to src to #grad.
+ !*/
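+
+    // Illustrative forward/backward sketch for prelu (tensor names are placeholders):
+    //
+    //     dlib::resizable_tensor out, src_grad, param_grad;
+    //     out.copy_size(src);
+    //     src_grad.copy_size(src);
+    //     src_grad = 0;                          // prelu_gradient() adds into src_grad
+    //     param_grad.set_size(1);
+    //     dlib::tt::prelu(out, src, param);
+    //     dlib::tt::prelu_gradient(src_grad, src, gradient_input, param, param_grad);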
+
+// ----------------------------------------------------------------------------------------
+
+ void tanh (
+ tensor& dest,
+ const tensor& src
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest, src) == true
+ ensures
+ - for all valid i:
+ - #dest.host()[i] == std::tanh(src.host()[i])
+ - This function supports in-place operation, i.e. having
+ is_same_object(dest, src)==true
+ !*/
+
+ void tanh_gradient (
+ tensor& grad,
+ const tensor& dest,
+ const tensor& gradient_input
+ );
+ /*!
+ requires
+ - have_same_dimensions(dest,gradient_input) == true
+ - have_same_dimensions(dest,grad) == true
+ ensures
+ - Recalling that dest is the output of tanh(dest,SRC) for some SRC tensor,
+ let f(SRC) == dot(gradient_input,dest). Then this function computes the
+ gradient of f() with respect to SRC and stores it to grad. Moreover, if
+ is_same_object(grad,gradient_input)==true then the output is assigned to
+ grad, replacing its previous contents. Otherwise the output is added to
+ grad.
+ - This function supports in-place operation, i.e. having
+ is_same_object(grad, gradient_input)==true
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void resize_bilinear (
+ tensor& dest,
+ long dest_row_stride,
+ long dest_channel_stride,
+ const tensor& src,
+ long src_row_stride,
+ long src_channel_stride
+ );
+ /*!
+ requires
+ - is_same_object(dest, src)==false
+ - dest.num_samples() == src.num_samples()
+ - dest.k() == src.k()
+ ensures
+ - for all valid i,k: image_plane(dest,i,k) is a copy of image_plane(src,i,k)
+ that has been bilinearly interpolated to fit into the shape of
+ image_plane(dest,i,k).
+            - Instead of supposing the row stride and channel stride in the tensors are
+ given by tensor::nc() and tensor::nr()*tensor::nc() respectively, we use the
+ provided stride values to transition from one row and channel to the next.
+ This is useful in combination with alias_tensor objects since it allows you
+ to operate on subwindows in an image.
+ !*/
+
+ void resize_bilinear_gradient (
+ tensor& grad,
+ long grad_row_stride,
+ long grad_channel_stride,
+ const tensor& gradient_input,
+ long gradient_input_row_stride,
+ long gradient_input_channel_stride
+ );
+ /*!
+ requires
+ - is_same_object(grad, gradient_input)==false
+ - gradient_input.num_samples() == grad.num_samples()
+ - gradient_input.k() == grad.k()
+ ensures
+ - Suppose that DEST is the output of resize_bilinear(DEST,SRC) for some SRC
+ tensor, let f(SRC) == dot(gradient_input,DEST). Then this function computes
+ the gradient of f() with respect to SRC and adds it to grad. It should be
+ noted that we don't need to know the contents of DEST to compute this
+ gradient. All that matters is that gradient_input have the same dimensions
+ as DEST.
+            - Instead of supposing the row stride and channel stride in the tensors are
+ given by tensor::nc() and tensor::nr()*tensor::nc() respectively, we use the
+ provided stride values to transition from one row and channel to the next.
+ This is useful in combination with alias_tensor objects since it allows you
+ to operate on subwindows in an image.
+ !*/
+
+ inline void resize_bilinear (
+ tensor& dest,
+ const tensor& src
+ ) { resize_bilinear(dest, dest.nc(), dest.nr()*dest.nc(), src, src.nc(), src.nr()*src.nc()); }
+ /*!
+ requires
+ - is_same_object(dest, src)==false
+ - dest.num_samples() == src.num_samples()
+ - dest.k() == src.k()
+ ensures
+ - for all valid i,k: image_plane(dest,i,k) is a copy of image_plane(src,i,k)
+ that has been bilinearly interpolated to fit into the shape of
+ image_plane(dest,i,k).
+ !*/
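+
+    // Minimal sketch (illustrative): bilinearly upscale every image plane in src by 2x.
+    //
+    //     dlib::resizable_tensor big;
+    //     big.set_size(src.num_samples(), src.k(), 2*src.nr(), 2*src.nc());
+    //     dlib::tt::resize_bilinear(big, src);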
+
+ inline void resize_bilinear_gradient (
+ tensor& grad,
+ const tensor& gradient_input
+ ) { resize_bilinear_gradient(grad, grad.nc(), grad.nr()*grad.nc(), gradient_input, gradient_input.nc(), gradient_input.nr()*gradient_input.nc()); }
+ /*!
+ requires
+ - is_same_object(grad, gradient_input)==false
+ - gradient_input.num_samples() == grad.num_samples()
+ - gradient_input.k() == grad.k()
+ ensures
+ - Suppose that DEST is the output of resize_bilinear(DEST,SRC) for some SRC
+ tensor, let f(SRC) == dot(gradient_input,DEST). Then this function computes
+ the gradient of f() with respect to SRC and adds it to grad. It should be
+ noted that we don't need to know the contents of DEST to compute this
+ gradient. All that matters is that gradient_input have the same dimensions
+ as DEST.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ class multi_device_tensor_averager
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object is a tool for very quickly averaging a bunch of tensors
+ together.
+ !*/
+ public:
+
+ multi_device_tensor_averager(const multi_device_tensor_averager&) = delete;
+ multi_device_tensor_averager& operator=(const multi_device_tensor_averager&) = delete;
+
+ multi_device_tensor_averager() = default;
+
+ void set(
+ std::vector<tensor*> items
+ )
+ /*!
+ requires
+ - All the tensors in items are the same size
+ ensures
+ - When you call average() we will average the tensors in items.
+ - It's important that the tensors already be allocated to their devices
+                  before you call set().  This is because set() will set up the necessary
+                  between-device transfers now and use them when you call average().
+ !*/
+ {
+ using namespace ::dlib::cuda;
+ accessible_groups.clear();
+ epa.clear();
+ if (items.size() < 1)
+ return;
+
+ scale = 1.0/items.size();
+
+            // split items into groups of devices that can directly access each other
+ std::vector<tensor*> group, unused;
+ while(items.size() > 0)
+ {
+ group.push_back(items[0]);
+ for(size_t i = 1; i < items.size(); ++i)
+ {
+ if (can_access_peer(*items[0], *items[i]))
+ group.push_back(items[i]);
+ else
+ unused.push_back(items[i]);
+ }
+ accessible_groups.push_back(group);
+ unused.swap(items);
+ unused.clear();
+ group.clear();
+ }
+ for (auto&& g : accessible_groups)
+ {
+ for (size_t i = 1; i < g.size(); ++i)
+ {
+ epa.emplace_back(new enable_peer_access(*g[0], *g[i]));
+ }
+ }
+ }
+
+ size_t num_device_groups(
+ ) const { return accessible_groups.size(); }
+ /*!
+ ensures
+ - The devices given to set() are grouped together when they can directly
+ access each other using GPUDirect. This function returns the number of
+ such groups. For example, if all devices can directly access each other
+ then the number of groups is 1.
+ !*/
+
+ void average()
+ /*!
+ requires
+ - All the devices have stopped writing to the tensors given to set(). So
+ you should probably call cudaDeviceSynchronize() on each of the relevant
+ devices before calling average().
+ ensures
+ - Computes the average of all the tensors given to set() and then sets them
+ all equal to the average.
+ !*/
+ {
+ using namespace ::dlib::cuda;
+
+
+ // First we average things within each group
+ for (auto&& g : accessible_groups)
+ {
+ raii_set_device set_dev(*g[0]);
+ if (g.size() == 1)
+ tt::affine_transform(*g[0], *g[0], scale);
+ else
+ tt::affine_transform(*g[0], *g[0], *g[1], scale, scale);
+
+ for (size_t i = 2; i < g.size(); ++i)
+ tt::affine_transform(*g[0], *g[0], *g[i], 1, scale);
+ }
+
+ if (accessible_groups.size() > 1)
+ {
+ tensor& total_avg = *accessible_groups[0][0];
+ raii_set_device set_dev(total_avg);
+ accum_buffer.copy_size(total_avg);
+ // now we need to average things across groups
+ for (size_t i = 1; i < accessible_groups.size(); ++i)
+ {
+ memcpy(accum_buffer, *accessible_groups[i][0]);
+ tt::add(total_avg, total_avg, accum_buffer);
+ }
+
+ // Now total_avg has the final average in it. So we need to send
+ // copies of it back to each of the groups.
+ for (size_t i = 1; i < accessible_groups.size(); ++i)
+ {
+ memcpy(*accessible_groups[i][0], total_avg);
+ }
+ }
+
+
+ // Now propagate averages back out to each element using point to point
+ // communication inside a group.
+ for (auto&& g : accessible_groups)
+ {
+ raii_set_device set_dev(*g[0]);
+ for (size_t i = 1; i < g.size(); ++i)
+ memcpy(*g[i], *g[0]);
+ }
+ }
+
+ private:
+ std::vector<std::unique_ptr<::dlib::cuda::enable_peer_access>> epa;
+ std::vector<std::vector<tensor*>> accessible_groups;
+ float scale;
+
+ resizable_tensor accum_buffer;
+ };
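+
+    // Hedged usage sketch: average one parameter tensor that is replicated across
+    // several GPUs.  The t_gpu* tensors are hypothetical; inside dlib this object is
+    // used by dnn_trainer to keep per-device gradients in sync.
+    //
+    //     std::vector<dlib::tensor*> params = {&t_gpu0, &t_gpu1, &t_gpu2};
+    //     dlib::tt::multi_device_tensor_averager avg;
+    //     avg.set(params);       // tensors must already live on their devices
+    //     // ... run work on each device, then synchronize the devices ...
+    //     avg.average();         // every tensor now holds the element-wise mean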
+
+// ----------------------------------------------------------------------------------------
+
+ void copy_tensor(
+ bool add_to,
+ tensor& dest,
+ size_t dest_k_offset,
+ const tensor& src,
+ size_t src_k_offset,
+ size_t count_k
+ );
+ /*!
+ requires
+ - dest.nc() == src.nc()
+ - dest.nr() == src.nr()
+ - dest.num_samples() == src.num_samples()
+ - dest.k() - dest_k_offset >= count_k
+ - src.k() - src_k_offset >= count_k
+ - is_same_object(dest,src) == false
+ - The memory areas of src and dest do not overlap.
+ ensures
+ - if (add_to) then
+ - performs: dest[i, k + dest_k_offset, r, c] += src[i, k + src_k_offset, r, c], where k in [0..count_k]
+                  i.e., adds the selected channels of each sample in src to the corresponding channels in dest.
+ - else
+ - performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k in [0..count_k]
+                  i.e., copies the selected channels of each sample in src to the corresponding channels in dest.
+ !*/
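+
+    // Illustrative sketch: concatenate the channels of tensors a and b into dest
+    // (assumes matching num_samples(), nr(), and nc(); names are placeholders).
+    //
+    //     dlib::resizable_tensor dest;
+    //     dest.set_size(a.num_samples(), a.k()+b.k(), a.nr(), a.nc());
+    //     dlib::tt::copy_tensor(false, dest, 0,     a, 0, a.k());
+    //     dlib::tt::copy_tensor(false, dest, a.k(), b, 0, b.k());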
+
+// ----------------------------------------------------------------------------------------
+
+}}
+
+#ifdef NO_MAKEFILE
+#include "tensor_tools.cpp"
+#endif
+
+#endif // DLIB_TeNSOR_TOOLS_H_
+
+
diff --git a/ml/dlib/dlib/dnn/trainer.h b/ml/dlib/dlib/dnn/trainer.h
new file mode 100644
index 000000000..7cb2bf5e5
--- /dev/null
+++ b/ml/dlib/dlib/dnn/trainer.h
@@ -0,0 +1,1333 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_TRAINER_H_
+#define DLIB_DNn_TRAINER_H_
+
+#include "trainer_abstract.h"
+#include "core.h"
+#include "solvers.h"
+#include "../statistics.h"
+#include <chrono>
+#include <fstream>
+#include <sstream>
+#include "../serialize.h"
+
+#include "../pipe.h"
+#include "../threads.h"
+#include "cuda_dlib.h"
+#include "../statistics/running_gradient.h"
+#include <atomic>
+#include <cstdio>
+#include <set>
+#include <future>
+#include <exception>
+#include <mutex>
+#include "../dir_nav.h"
+#include "../md5.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ template <typename training_label_type>
+ struct dnn_job_t
+ {
+ dnn_job_t() = default;
+ dnn_job_t(const dnn_job_t&) = delete;
+ dnn_job_t& operator=(const dnn_job_t&) = delete;
+
+ std::vector<std::vector<training_label_type>> labels;
+ std::vector<resizable_tensor> t;
+ std::vector<int> have_data; // have_data[i] is true if there is data in labels[i] and t[i].
+ bool test_only = false;
+ };
+
+ template <typename training_label_type>
+ void swap(dnn_job_t<training_label_type>& a, dnn_job_t<training_label_type>& b)
+ {
+ a.labels.swap(b.labels);
+ a.t.swap(b.t);
+ a.have_data.swap(b.have_data);
+ std::swap(a.test_only,b.test_only);
+ }
+ }
+
+ enum class force_flush_to_disk {
+ no = 0,
+ yes = 1
+ };
+
+ template <
+ typename net_type,
+ typename solver_type = sgd
+ >
+ class dnn_trainer : private threaded_object
+ {
+ public:
+
+ static_assert(is_loss_layer_type<net_type>::value,
+ "The last layer in a network must be a loss layer.");
+
+ typedef typename net_type::training_label_type training_label_type;
+ typedef typename net_type::input_type input_type;
+ const static size_t num_computational_layers = net_type::num_computational_layers;
+ const static size_t num_layers = net_type::num_layers;
+ private:
+ typedef impl::dnn_job_t<training_label_type> job_t;
+ public:
+
+ dnn_trainer() = delete;
+ dnn_trainer(const dnn_trainer&) = delete;
+ dnn_trainer& operator=(const dnn_trainer&) = delete;
+
+ explicit dnn_trainer(net_type& net_) : job_pipe(0), net(net_)
+ {
+ solver_type default_solver;
+ devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, default_solver));
+
+ init();
+ }
+
+ dnn_trainer(
+ net_type& net_,
+ const solver_type& solver_
+ ) : job_pipe(0), net(net_)
+ {
+ devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, solver_));
+
+ init();
+ }
+
+ dnn_trainer(
+ net_type& net_,
+ const solver_type& solver_,
+ const std::vector<int>& cuda_extra_devices
+ ) : job_pipe(0), net(net_)
+ {
+ devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, solver_));
+
+ const int total_devices = dlib::cuda::get_num_devices();
+
+ // Make device contexts for the extra device ids but be careful to avoid any
+ // duplicate ids.
+ std::set<int> temp(cuda_extra_devices.begin(), cuda_extra_devices.end());
+ temp.erase(devices[0]->device_id);
+ for (auto id : temp)
+ {
+ DLIB_CASSERT(0 <= id && id < total_devices, "Invalid CUDA device id given to dnn_trainer.");
+ // Switch to this device so that any tensor objects that get allocated when
+ // we create the device context happen on this device.
+ dlib::cuda::set_device(id);
+ devices.push_back(std::make_shared<device_data>(id, net, solver_, clone_net()));
+ }
+ // Set the current device back to what it was before this constructor was
+ // called.
+ dlib::cuda::set_device(devices[0]->device_id);
+
+ init();
+ }
+
+ ~dnn_trainer(
+ )
+ {
+ job_pipe.disable();
+ stop();
+ wait();
+ }
+
+ net_type& get_net (
+ force_flush_to_disk force_flush = force_flush_to_disk::yes
+ )
+ {
+ wait_for_thread_to_pause();
+ sync_to_disk(force_flush == force_flush_to_disk::yes);
+ propagate_exception();
+ return net;
+ }
+
+
+ unsigned long get_mini_batch_size (
+ ) const { return mini_batch_size; }
+
+ void set_mini_batch_size (
+ unsigned long batch_size
+ )
+ {
+ DLIB_CASSERT(batch_size > 0);
+ mini_batch_size = batch_size;
+ }
+
+ unsigned long get_max_num_epochs (
+ ) const { return max_num_epochs; }
+
+ void set_max_num_epochs (
+ unsigned long num
+ )
+ {
+ DLIB_CASSERT(num > 0);
+ max_num_epochs = num;
+ }
+
+ void be_verbose (
+ )
+ {
+ verbose = true;
+ }
+
+ void be_quiet (
+ )
+ {
+ verbose = false;
+ }
+
+
+ const std::vector<solver_type>& get_solvers (
+ ) const
+ {
+ wait_for_thread_to_pause();
+ propagate_exception();
+ return devices[0]->solvers;
+ }
+
+ void train_one_step (
+ const std::vector<input_type>& data,
+ const std::vector<training_label_type>& labels
+ )
+ {
+ DLIB_CASSERT(data.size() == labels.size());
+
+ train_one_step(data.begin(), data.end(), labels.begin());
+ }
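+
+        // A hedged sketch of a hand written training loop built on train_one_step()
+        // (make_mini_batch() is user code, not part of dlib):
+        //
+        //     while (trainer.get_learning_rate() >= 1e-6)
+        //     {
+        //         make_mini_batch(mini_batch_samples, mini_batch_labels);
+        //         trainer.train_one_step(mini_batch_samples, mini_batch_labels);
+        //     }
+        //     trainer.get_net();   // block until training finishes and sync the result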
+
+ template <
+ typename data_iterator,
+ typename label_iterator
+ >
+ void train_one_step (
+ data_iterator dbegin,
+ data_iterator dend,
+ label_iterator lbegin
+ )
+ {
+ DLIB_CASSERT(std::distance(dbegin, dend) > 0);
+
+ print_periodic_verbose_status();
+ sync_to_disk();
+ send_job(false, dbegin, dend, lbegin);
+
+ ++train_one_step_calls;
+ }
+
+ void train_one_step (
+ const std::vector<input_type>& data
+ )
+ {
+ train_one_step(data.begin(), data.end());
+ }
+
+ template <
+ typename data_iterator
+ >
+ void train_one_step (
+ data_iterator dbegin,
+ data_iterator dend
+ )
+ {
+ DLIB_CASSERT(std::distance(dbegin, dend) > 0);
+ print_periodic_verbose_status();
+ sync_to_disk();
+ send_job(false, dbegin, dend);
+ ++train_one_step_calls;
+ }
+
+ void test_one_step (
+ const std::vector<input_type>& data,
+ const std::vector<training_label_type>& labels
+ )
+ {
+ DLIB_CASSERT(data.size() == labels.size());
+
+ test_one_step(data.begin(), data.end(), labels.begin());
+ }
+
+ template <
+ typename data_iterator,
+ typename label_iterator
+ >
+ void test_one_step (
+ data_iterator dbegin,
+ data_iterator dend,
+ label_iterator lbegin
+ )
+ {
+ DLIB_CASSERT(std::distance(dbegin, dend) > 0);
+
+ print_periodic_verbose_status();
+ sync_to_disk();
+ send_job(true, dbegin, dend, lbegin);
+
+ ++test_one_step_calls;
+ }
+
+ void test_one_step (
+ const std::vector<input_type>& data
+ )
+ {
+ test_one_step(data.begin(), data.end());
+ }
+
+ template <
+ typename data_iterator
+ >
+ void test_one_step (
+ data_iterator dbegin,
+ data_iterator dend
+ )
+ {
+ DLIB_CASSERT(std::distance(dbegin, dend) > 0);
+ print_periodic_verbose_status();
+ sync_to_disk();
+ send_job(true, dbegin, dend);
+ ++test_one_step_calls;
+ }
+
+ void train (
+ const std::vector<input_type>& data,
+ const std::vector<training_label_type>& labels
+ )
+ {
+ DLIB_CASSERT(data.size() == labels.size() && data.size() > 0);
+
+ // The reason these two loops don't initialize their counter variables but
+ // instead use class members is so we can include the state of the loops in the
+ // stuff written by sync_to_disk()
+ for (;
+ epoch_iteration < max_num_epochs && learning_rate >= min_learning_rate;
+ ++epoch_iteration)
+ {
+ using namespace std::chrono;
+ last_time = system_clock::now();
+ clear_average_loss();
+ for (; epoch_pos < data.size() && learning_rate >= min_learning_rate; epoch_pos += mini_batch_size)
+ {
+ if (verbose)
+ {
+ auto now_time = system_clock::now();
+ if (now_time-last_time > seconds(20))
+ {
+ last_time = now_time;
+ auto iter = epoch_iteration + epoch_pos/(double)data.size();
+ std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
+ << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " "
+ << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
+ print_progress();
+ }
+ }
+
+ sync_to_disk();
+ send_job(false, data.begin()+epoch_pos,
+ data.begin()+std::min(epoch_pos+mini_batch_size,data.size()),
+ labels.begin()+epoch_pos);
+ }
+ epoch_pos = 0;
+
+ if (verbose)
+ {
+ // Capitalize the E in Epoch so it's easy to grep out the lines that
+ // are for full epoch status statements.
+ std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " "
+ << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " "
+ << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
+ print_progress();
+ }
+ }
+ wait_for_thread_to_pause();
+ // if we modified the network at all then be sure to sync the final result.
+ sync_to_disk(true);
+ }
+
+ void train (
+ const std::vector<input_type>& data
+ )
+ {
+ DLIB_CASSERT(data.size() > 0);
+
+ const bool has_unsupervised_loss = std::is_same<no_label_type, training_label_type>::value;
+ static_assert(has_unsupervised_loss,
+ "You can only call this version of train() when using an unsupervised loss.");
+
+ // The reason these two loops don't initialize their counter variables but
+ // instead use class members is so we can include the state of the loops in the
+ // stuff written by sync_to_disk()
+ for (;
+ epoch_iteration < max_num_epochs && learning_rate >= min_learning_rate;
+ ++epoch_iteration)
+ {
+ using namespace std::chrono;
+ last_time = system_clock::now();
+ clear_average_loss();
+ for (; epoch_pos < data.size() && learning_rate >= min_learning_rate; epoch_pos += mini_batch_size)
+ {
+ if (verbose)
+ {
+ auto now_time = system_clock::now();
+ if (now_time-last_time > seconds(20))
+ {
+ last_time = now_time;
+ auto iter = epoch_iteration + epoch_pos/(double)data.size();
+ std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
+ << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " "
+ << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
+ print_progress();
+ }
+ }
+
+ sync_to_disk();
+ send_job(false, data.begin()+epoch_pos,
+ data.begin()+std::min(epoch_pos+mini_batch_size,data.size()));
+ }
+ epoch_pos = 0;
+
+ if (verbose)
+ {
+ // Capitalize the E in Epoch so it's easy to grep out the lines that
+ // are for full epoch status statements.
+ std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " "
+ << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " "
+ << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
+ print_progress();
+ }
+ }
+ wait_for_thread_to_pause();
+ // if we modified the network at all then be sure to sync the final result.
+ sync_to_disk(true);
+ }
+
+ void set_synchronization_file (
+ const std::string& filename,
+ std::chrono::seconds time_between_syncs_ = std::chrono::minutes(15)
+ )
+ {
+ last_sync_time = std::chrono::system_clock::now();
+ sync_filename = filename;
+ time_between_syncs = time_between_syncs_;
+
+ // check if the sync file already exists, if it does we should load it.
+ std::ifstream fin(newest_syncfile(), std::ios::binary);
+ if (fin)
+ deserialize(*this, fin);
+ }
+
+ const std::string& get_synchronization_file (
+ )
+ {
+ return sync_filename;
+ }
+
+ double get_average_loss (
+ ) const
+ {
+ wait_for_thread_to_pause();
+ return rs.mean();
+ }
+
+ double get_average_test_loss (
+ ) const
+ {
+ wait_for_thread_to_pause();
+ return rs_test.mean();
+ }
+
+ void clear_average_loss (
+ )
+ {
+ wait_for_thread_to_pause();
+ rs.clear();
+ }
+
+ void set_learning_rate (
+ double lr
+ )
+ {
+ DLIB_CASSERT(lr > 0);
+ wait_for_thread_to_pause();
+ if (learning_rate != lr)
+ {
+ steps_without_progress = 0;
+ test_steps_without_progress = 0;
+ previous_loss_values.clear();
+ test_previous_loss_values.clear();
+ }
+ learning_rate = lr;
+ lr_schedule.set_size(0);
+ }
+
+ double get_learning_rate(
+ ) const
+ {
+ return learning_rate;
+ }
+
+ void set_min_learning_rate (
+ double lr
+ )
+ {
+ DLIB_CASSERT(lr > 0);
+ wait_for_thread_to_pause();
+ lr_schedule.set_size(0);
+ min_learning_rate = lr;
+ }
+
+ double get_min_learning_rate (
+ ) const
+ {
+ return min_learning_rate;
+ }
+
+ template <typename EXP>
+ void set_learning_rate_schedule (
+ const matrix_exp<EXP>& schedule
+ )
+ {
+ DLIB_CASSERT(schedule.size() > 0);
+ DLIB_CASSERT(min(schedule) > 0);
+ set_learning_rate(schedule(0,0));
+ set_min_learning_rate(min(schedule));
+ set_learning_rate_shrink_factor(1);
+ lr_schedule = matrix_cast<double>(reshape_to_column_vector(schedule));
+ lr_schedule_pos = 0;
+ }
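+
+        // For example, the entire optimization can be driven by a fixed, logarithmically
+        // decaying schedule (logspace() is the dlib matrix utility; the particular
+        // endpoints and length here are only an illustration):
+        //
+        //     trainer.set_learning_rate_schedule(logspace(log10(0.1), log10(1e-4), 100000));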
+
+ const matrix<double,0,1>& get_learning_rate_schedule (
+ ) const
+ {
+ return lr_schedule;
+ }
+
+ void set_iterations_without_progress_threshold (
+ unsigned long thresh
+ )
+ {
+ wait_for_thread_to_pause();
+ lr_schedule.set_size(0);
+ iter_without_progress_thresh = thresh;
+ }
+
+ unsigned long get_iterations_without_progress_threshold (
+ ) const
+ {
+ return iter_without_progress_thresh;
+ }
+
+ unsigned long get_steps_without_progress (
+ ) const
+ {
+ return steps_without_progress;
+ }
+
+ void set_test_iterations_without_progress_threshold (
+ unsigned long thresh
+ )
+ {
+ wait_for_thread_to_pause();
+ lr_schedule.set_size(0);
+ test_iter_without_progress_thresh = thresh;
+ }
+
+ unsigned long get_test_iterations_without_progress_threshold (
+ ) const
+ {
+ return test_iter_without_progress_thresh;
+ }
+
+ unsigned long get_test_steps_without_progress (
+ ) const
+ {
+ return test_steps_without_progress;
+ }
+
+ void set_learning_rate_shrink_factor (
+ double shrink
+ )
+ {
+ DLIB_CASSERT(0 < shrink && shrink <= 1);
+ wait_for_thread_to_pause();
+ lr_schedule.set_size(0);
+ learning_rate_shrink = shrink;
+ steps_without_progress = 0;
+ test_steps_without_progress = 0;
+ }
+
+ double get_learning_rate_shrink_factor (
+ ) const
+ {
+ return learning_rate_shrink;
+ }
+
+ unsigned long long get_train_one_step_calls (
+ ) const
+ {
+ return train_one_step_calls;
+ }
+
+ unsigned long long get_test_one_step_calls (
+ ) const
+ {
+ return test_one_step_calls;
+ }
+
+ private:
+
+ void record_test_loss(double loss)
+ {
+ test_previous_loss_values.push_back(loss);
+ if (is_finite(loss))
+ rs_test.add(loss);
+ // discard really old loss values.
+ while (test_previous_loss_values.size() > test_iter_without_progress_thresh)
+ test_previous_loss_values.pop_front();
+ }
+
+ void record_loss(double loss)
+ {
+ // This kind of budgeting causes our gradient checking to use a fixed amount of
+ // computational resources, regardless of the size of iter_without_progress_thresh.
+ gradient_check_budget += 200;
+
+ rs.add(loss);
+ previous_loss_values.push_back(loss);
+ // discard really old loss values.
+ while (previous_loss_values.size() > iter_without_progress_thresh)
+ previous_loss_values.pop_front();
+ }
+
+ template <typename T>
+ double compute_parameter_gradients(size_t device, job_t& next_job, const T&)
+ {
+ if (next_job.have_data[device])
+ {
+ auto&& dev = *devices[device];
+ dlib::cuda::set_device(dev.device_id);
+ if (next_job.test_only)
+ return dev.net.compute_loss(next_job.t[device], next_job.labels[device].begin());
+ else
+ return dev.net.compute_parameter_gradients(next_job.t[device], next_job.labels[device].begin());
+ }
+ else
+ {
+ return 0;
+ }
+ }
+
+ double compute_parameter_gradients(size_t device, job_t& next_job, const no_label_type&)
+ {
+ if (next_job.have_data[device])
+ {
+ auto&& dev = *devices[device];
+ dlib::cuda::set_device(dev.device_id);
+ no_label_type pick_which_run_update;
+ if (next_job.test_only)
+ return dev.net.compute_loss(next_job.t[device]);
+ else
+ return dev.net.compute_parameter_gradients(next_job.t[device]);
+ }
+ else
+ {
+ return 0;
+ }
+ }
+
+ void update_parameters(size_t device)
+ {
+ auto&& dev = *devices[device];
+ dlib::cuda::set_device(dev.device_id);
+ dev.net.update_parameters(make_sstack(dev.solvers), learning_rate);
+ }
+
+ void thread() try
+ {
+ training_label_type pick_which_run_update;
+ job_t next_job;
+
+ std::vector<dlib::future<double>> losses(devices.size());
+
+ std::vector<tt::multi_device_tensor_averager> averagers;
+ // An array of all the parameter tensors in the first network. We will
+ // periodically copy these tensors to all the other devices to make sure the
+ // different GPUs don't go out of sync.
+ std::vector<tensor*> reference_params;
+ visit_layer_parameters(devices[0]->net, [&](size_t, tensor& t) { reference_params.push_back(&t); });
+
+ // We make separate thread pools with just one thread in them because we want
+ // to make sure each device is always executed on the same thread. We care
+ // about this because there are thread_local context variables for some cuda
+ // components and they get allocated for each combination of thread and device.
+ // So if we make sure the same device always uses the same thread this will
+ // reduce the number of contexts we allocate from num_devices*num_devices to
+ // just num_devices.
+ std::vector<std::shared_ptr<thread_pool>> tp;
+ for (size_t i = 0; i < devices.size(); ++i)
+ tp.push_back(std::make_shared<thread_pool>(1));
+
+
+ main_iteration_counter = 0;
+ while(job_pipe.dequeue(next_job))
+ {
+ if (next_job.test_only)
+ {
+ // compute the testing loss
+ for (size_t i = 0; i < devices.size(); ++i)
+ tp[i]->add_task_by_value([&,i](double& loss){ loss = compute_parameter_gradients(i, next_job, pick_which_run_update); }, losses[i]);
+ // aggregate loss values from all the network computations.
+ double theloss = 0;
+ for (auto&& loss : losses)
+ theloss += loss.get();
+ record_test_loss(theloss/losses.size());
+
+ // Check if we should shrink the learning rate based on how the test
+ // error has been doing lately.
+ if (learning_rate_shrink != 1)
+ {
+ test_steps_without_progress = count_steps_without_decrease(test_previous_loss_values);
+ if (test_steps_without_progress >= test_iter_without_progress_thresh)
+ {
+ test_steps_without_progress = count_steps_without_decrease_robust(test_previous_loss_values);
+ if (test_steps_without_progress >= test_iter_without_progress_thresh)
+ {
+ // optimization has flattened out, so drop the learning rate.
+ learning_rate = learning_rate_shrink*learning_rate;
+ test_steps_without_progress = 0;
+ // Empty out some of the previous loss values so that test_steps_without_progress
+ // will decrease below test_iter_without_progress_thresh.
+ for (unsigned long cnt = 0; cnt < test_previous_loss_values_dump_amount+test_iter_without_progress_thresh/10 && test_previous_loss_values.size() > 0; ++cnt)
+ test_previous_loss_values.pop_front();
+ }
+ }
+ }
+ continue;
+ }
+
+ updated_net_since_last_sync = true;
+ ++main_iteration_counter;
+ // Call compute_parameter_gradients() and update_parameters() but pick the
+ // right version for unsupervised or supervised training based on the type
+ // of training_label_type.
+ for (size_t i = 0; i < devices.size(); ++i)
+ tp[i]->add_task_by_value([&,i](double& loss){ loss = compute_parameter_gradients(i, next_job, pick_which_run_update); }, losses[i]);
+ // aggregate loss values from all the network computations.
+ double theloss = 0;
+ for (auto&& loss : losses)
+ theloss += loss.get();
+ record_loss(theloss/losses.size());
+
+ // Now, if there is more than one active device we need to synchronize the
+ // gradient updates between devices. So we do that now.
+ if (devices.size() > 1)
+ {
+ // if this is the first iteration then we need to setup the averagers.
+ // We can't do this outside the loop because the tensors that get
+ // averaged need to be allocated to their devices before we call set()
+ // so that the averagers can determine how best to average them.
+ if (averagers.size() == 0 || sync_file_reloaded)
+ {
+ averagers = std::vector<tt::multi_device_tensor_averager>(net_type::num_computational_layers);
+ // setup the averagers to point to the tensors in the networks.
+ std::vector<std::vector<tensor*>> all_tensors(devices.size());
+ for (size_t i = 0; i < all_tensors.size(); ++i)
+ {
+ all_tensors[i].resize(net_type::num_computational_layers);
+ visit_layer_parameter_gradients(devices[i]->net, [&](size_t j, tensor& t){
+ all_tensors[i][j] = &t;
+ });
+ }
+ // Now set each averager to average the tensors at the same layer in each
+ // network.
+ for (size_t i = 0; i < net_type::num_computational_layers; ++i)
+ {
+ std::vector<tensor*> temp(all_tensors.size());
+ for (size_t j = 0; j < all_tensors.size(); ++j)
+ temp[j] = all_tensors[j][i];
+ // ignore layers that don't have parameters
+ if (temp[0]->size() != 0)
+ averagers[i].set(temp);
+ }
+
+ sync_file_reloaded = false;
+ }
+
+
+ for (auto&& d : devices)
+ cuda::device_synchronize(d->device_id);
+
+ for (auto&& avg : averagers)
+ avg.average();
+ }
+
+
+ // Now apply all the updates to each device.
+ for (size_t i = 0; i < devices.size(); ++i)
+ tp[i]->add_task_by_value([&,i](){ if (next_job.have_data[i]) update_parameters(i); });
+ // and wait for the updates to all happen.
+ for (size_t i = 0; i < devices.size(); ++i)
+ tp[i]->wait_for_all_tasks();
+
+
+ // Every now and then force all the parameters to be the same just to make
+ // sure they aren't drifting apart due to any non-deterministic behavior on
+ // the GPU. It's also important to do this on the first iteration because
+ // the different networks may be initialized differently when tensor data
+ // is first passed through them. So this code block deals with these
+ // issues.
+ if (devices.size() > 1 && main_iteration_counter%2000 == 1)
+ {
+ for (size_t i = 1; i < devices.size(); ++i)
+ {
+ visit_layer_parameters(devices[i]->net, [&](size_t j, tensor& t)
+ {
+ memcpy(t, *reference_params[j]);
+ });
+ }
+ }
+
+ // If we have been running for a while then check if the loss is still
+ // dropping. If it isn't then we will reduce the learning rate. Note that we
+ // have a "budget" that prevents us from calling
+ // count_steps_without_decrease() every iteration. We do this because
+ // it can be expensive to compute when previous_loss_values is large.
+ if (gradient_check_budget > iter_without_progress_thresh && learning_rate_shrink != 1)
+ {
+ gradient_check_budget = 0;
+ steps_without_progress = count_steps_without_decrease(previous_loss_values);
+ if (steps_without_progress >= iter_without_progress_thresh)
+ {
+ // Double check that we aren't seeing decrease. This second check
+ // discards the top 10% largest values and checks again. We do
+ // this because sometimes a mini-batch might be bad and cause the
+ // loss to suddenly jump up, making count_steps_without_decrease()
+ // return a large number. But if we discard the top 10% of the
+ // values in previous_loss_values then we are robust to that kind
+ // of noise. Another way of looking at it, if the reason
+ // count_steps_without_decrease() returns a large value is only
+ // because the most recent loss values have suddenly been large,
+ // then we shouldn't stop or lower the learning rate. We should
+ // keep going until whatever disturbance we hit is damped down.
+ steps_without_progress = count_steps_without_decrease_robust(previous_loss_values);
+ if (steps_without_progress >= iter_without_progress_thresh)
+ {
+ // optimization has flattened out, so drop the learning rate.
+ learning_rate = learning_rate_shrink*learning_rate;
+ steps_without_progress = 0;
+ // Empty out some of the previous loss values so that steps_without_progress
+ // will decrease below iter_without_progress_thresh.
+ for (unsigned long cnt = 0; cnt < previous_loss_values_dump_amount+iter_without_progress_thresh/10 && previous_loss_values.size() > 0; ++cnt)
+ previous_loss_values.pop_front();
+ }
+ }
+ }
+ else if (lr_schedule.size() != 0) // or use the learning rate schedule if we have one.
+ {
+ if (lr_schedule_pos < lr_schedule.size())
+ learning_rate = lr_schedule(lr_schedule_pos++);
+ else
+ learning_rate = lr_schedule(lr_schedule.size()-1)*0.99;
+ }
+ }
+ }
+ catch(...)
+ {
+ // If an exception happens then permanently disable the trainer object.
+ job_pipe.disable();
+ std::lock_guard<std::mutex> lock(eptr_mutex);
+ eptr = std::current_exception();
+ }
+
+ void wait_for_thread_to_pause() const
+ {
+ job_pipe.wait_for_num_blocked_dequeues(1);
+ }
+
+ const static long string_pad = 11;
+ const static long epoch_string_pad = 4;
+ const static long lr_string_pad = 4;
+
+ void init()
+ {
+ max_num_epochs = 10000;
+ mini_batch_size = 128;
+ verbose = false;
+ learning_rate = 1e-2;
+ min_learning_rate = 1e-5;
+ iter_without_progress_thresh = 2000;
+ steps_without_progress = 0;
+ test_iter_without_progress_thresh = 500;
+ test_steps_without_progress = 0;
+
+ learning_rate_shrink = 0.1;
+ epoch_iteration = 0;
+ epoch_pos = 0;
+ train_one_step_calls = 0;
+ test_one_step_calls = 0;
+ gradient_check_budget = 0;
+ lr_schedule_pos = 0;
+
+ main_iteration_counter = 0;
+ main_iteration_counter_at_last_disk_sync = 0;
+ prob_loss_increasing_thresh_default_value = 0.99;
+ prob_loss_increasing_thresh_max_value = 0.99999;
+ prob_loss_increasing_thresh = prob_loss_increasing_thresh_default_value;
+ updated_net_since_last_sync = false;
+ sync_file_reloaded = false;
+ previous_loss_values_dump_amount = 400;
+ test_previous_loss_values_dump_amount = 100;
+
+ rs_test = running_stats_decayed<double>(200);
+
+ start();
+ }
+
+ // serialize and deserialize are private because we hold net by reference so
+ // allowing someone to serialize this training object is weird and will likely
+ // result in user errors. However, we use these functions as part of the automatic
+ // sync code in this object.
+ friend void serialize(const dnn_trainer& item, std::ostream& out)
+ {
+ item.wait_for_thread_to_pause();
+ int version = 12;
+ serialize(version, out);
+
+ size_t nl = dnn_trainer::num_layers;
+ serialize(nl, out);
+ serialize(item.rs, out);
+ serialize(item.rs_test, out);
+ serialize(item.previous_loss_values, out);
+ serialize(item.max_num_epochs, out);
+ serialize(item.mini_batch_size, out);
+ serialize(item.verbose, out);
+ serialize(item.net, out);
+ serialize(item.devices[0]->solvers, out);
+ serialize(item.learning_rate.load(), out);
+ serialize(item.min_learning_rate, out);
+ serialize(item.iter_without_progress_thresh.load(), out);
+ serialize(item.steps_without_progress.load(), out);
+ serialize(item.learning_rate_shrink.load(), out);
+ serialize(item.epoch_iteration, out);
+ serialize(item.epoch_pos, out);
+ serialize(item.train_one_step_calls, out);
+ serialize(item.test_one_step_calls, out);
+ serialize(item.lr_schedule, out);
+ serialize(item.lr_schedule_pos, out);
+ serialize(item.test_iter_without_progress_thresh.load(), out);
+ serialize(item.test_steps_without_progress.load(), out);
+ serialize(item.test_previous_loss_values, out);
+ serialize(item.previous_loss_values_dump_amount, out);
+ serialize(item.test_previous_loss_values_dump_amount, out);
+
+ }
+ friend void deserialize(dnn_trainer& item, std::istream& in)
+ {
+ item.wait_for_thread_to_pause();
+ int version = 0;
+ deserialize(version, in);
+ if (version != 12)
+ throw serialization_error("Unexpected version found while deserializing dlib::dnn_trainer.");
+
+ size_t num_layers = 0;
+ deserialize(num_layers, in);
+ if (num_layers != dnn_trainer::num_layers)
+ {
+ std::ostringstream sout;
+ sout << "Error deserializing dlib::dnn_trainer. The saved sync file is for a network with " << std::endl;
+ sout << "a different number of layers. We expected the number of layers to be " << dnn_trainer::num_layers << " but" << std::endl;
+ sout << "instead the file contains " << num_layers << " layers." << std::endl;
+ throw serialization_error(sout.str());
+ }
+
+ double dtemp; long ltemp;
+ deserialize(item.rs, in);
+ deserialize(item.rs_test, in);
+ deserialize(item.previous_loss_values, in);
+ deserialize(item.max_num_epochs, in);
+ deserialize(item.mini_batch_size, in);
+ deserialize(item.verbose, in);
+ deserialize(item.net, in);
+ deserialize(item.devices[0]->solvers, in);
+ deserialize(dtemp, in); item.learning_rate = dtemp;
+ deserialize(item.min_learning_rate, in);
+ deserialize(ltemp, in); item.iter_without_progress_thresh = ltemp;
+ deserialize(ltemp, in); item.steps_without_progress = ltemp;
+ deserialize(dtemp, in); item.learning_rate_shrink = dtemp;
+ deserialize(item.epoch_iteration, in);
+ deserialize(item.epoch_pos, in);
+ deserialize(item.train_one_step_calls, in);
+ deserialize(item.test_one_step_calls, in);
+ deserialize(item.lr_schedule, in);
+ deserialize(item.lr_schedule_pos, in);
+ deserialize(ltemp, in); item.test_iter_without_progress_thresh = ltemp;
+ deserialize(ltemp, in); item.test_steps_without_progress = ltemp;
+ deserialize(item.test_previous_loss_values, in);
+ deserialize(item.previous_loss_values_dump_amount, in);
+ deserialize(item.test_previous_loss_values_dump_amount, in);
+
+ if (item.devices.size() > 1)
+ {
+ const auto prev_dev = dlib::cuda::get_device();
+ // initialize all the other device networks and solver objects
+ for (size_t i = 1; i < item.devices.size(); ++i)
+ {
+ // Switch to this device so that any tensor objects that get allocated when
+ // we copy this stuff happen on this device.
+ dlib::cuda::set_device(item.devices[i]->device_id);
+ item.devices[i]->solvers = item.devices[0]->solvers;
+ item.devices[i]->net = item.devices[0]->net;
+ }
+ dlib::cuda::set_device(prev_dev);
+ }
+ }
+
+ void sync_to_disk (
+ bool do_it_now = false
+ )
+ {
+ // don't sync anything if we haven't updated the network since the last sync
+ if (!updated_net_since_last_sync)
+ return;
+
+ // If the sync file isn't set then don't do anything.
+ if (sync_filename.size() == 0)
+ return;
+
+ // Only sync if it has been long enough since the last sync or we are being
+ // explicitly forced to do it.
+ if (std::chrono::system_clock::now() - last_sync_time > time_between_syncs ||
+ do_it_now)
+ {
+ wait_for_thread_to_pause();
+
+ // compact network before saving to disk.
+ this->net.clean();
+
+ // if the loss has actually been going up since the last time we saved our
+ // state to disk then something has probably gone wrong in the
+ // optimization. So in this case we do the opposite and recall the
+ // previously saved state in the hopes that the problem won't reoccur.
+ if (loss_increased_since_last_disk_sync())
+ {
+ std::ifstream fin(newest_syncfile(), std::ios::binary);
+ deserialize(*this, fin);
+ sync_file_reloaded = true;
+ if (verbose)
+ std::cout << "Loss has been increasing, reloading saved state from " << newest_syncfile() << std::endl;
+ }
+ else
+ {
+
+ const std::string filename = oldest_syncfile();
+ serialize(filename) << *this;
+
+ if (verbose)
+ std::cout << "Saved state to " << filename << std::endl;
+ }
+
+ last_sync_time = std::chrono::system_clock::now();
+ main_iteration_counter_at_last_disk_sync = main_iteration_counter;
+ updated_net_since_last_sync = false;
+ }
+ }
+
+ std::string newest_syncfile (
+ )
+ {
+ return select_newest_file(sync_filename, sync_filename + "_");
+ }
+
+ std::string oldest_syncfile (
+ )
+ {
+ return select_oldest_file(sync_filename, sync_filename + "_");
+ }
+
+ bool loss_increased_since_last_disk_sync()
+ {
+ size_t gradient_updates_since_last_sync = main_iteration_counter - main_iteration_counter_at_last_disk_sync;
+
+ // if we haven't synced anything to disk yet then return false.
+ if (!std::ifstream(newest_syncfile(), std::ios::binary))
+ return false;
+
+ for (auto x : previous_loss_values)
+ {
+ // If we get a NaN value of loss assume things have gone horribly wrong and
+ // we should reload the state of the trainer.
+ if (std::isnan(x))
+ return true;
+ }
+
+ // if we haven't seen much data yet then just say false. Or, alternatively, if
+ // it's been too long since the last sync then don't reload either.
+ if (gradient_updates_since_last_sync < 30 || previous_loss_values.size() < 2*gradient_updates_since_last_sync)
+ return false;
+
+ // Now look at the data since a little before the last disk sync. We will
+            // check if the loss is getting better or worse.
+ running_gradient g;
+ for (size_t i = previous_loss_values.size() - 2*gradient_updates_since_last_sync; i < previous_loss_values.size(); ++i)
+ g.add(previous_loss_values[i]);
+
+ // if the loss is very likely to be increasing then return true
+ const double prob = g.probability_gradient_greater_than(0);
+ if (prob > prob_loss_increasing_thresh && prob_loss_increasing_thresh <= prob_loss_increasing_thresh_max_value)
+ {
+ // Exponentially decay the threshold towards 1 so that if we keep finding
+ // the loss to be increasing over and over we will make the test
+ // progressively harder and harder until it fails, therefore ensuring we
+ // can't get stuck reloading from a previous state over and over.
+ prob_loss_increasing_thresh = 0.1*prob_loss_increasing_thresh + 0.9*1;
+ return true;
+ }
+ else
+ {
+ // decay back to the default threshold
+ prob_loss_increasing_thresh = std::pow(prob_loss_increasing_thresh, 10.0);
+ // but don't decay below the default value
+ prob_loss_increasing_thresh = std::max(prob_loss_increasing_thresh, prob_loss_increasing_thresh_default_value);
+
+ return false;
+ }
+ }
+
+
+ struct clone_net{};
+
+ // per device state. All the containers have the same number of objects in them.
+ struct device_data
+ {
+ device_data(
+ int device_id_,
+ net_type& net_,
+ const solver_type& solver_
+ ) : device_id(device_id_), net(net_), solvers(num_computational_layers, solver_) {}
+
+ device_data(
+ int device_id_,
+ net_type& net_,
+ const solver_type& solver_,
+ clone_net
+ ) : device_id(device_id_), net_copy(std::make_shared<net_type>(net_)), net(*net_copy), solvers(num_computational_layers, solver_) {}
+
+ int device_id;
+ std::shared_ptr<net_type> net_copy;
+ net_type& net;
+ std::vector<solver_type> solvers;
+ };
+
+ template <
+ typename data_iterator,
+ typename label_iterator
+ >
+ void send_job (
+ bool test_only,
+ data_iterator dbegin,
+ data_iterator dend,
+ label_iterator lbegin
+ )
+ {
+ propagate_exception();
+ size_t num = std::distance(dbegin, dend);
+ size_t devs = devices.size();
+ job.t.resize(devs);
+ job.labels.resize(devs);
+ job.have_data.resize(devs);
+ job.test_only = test_only;
+
+ // chop the data into devs blocks, each of about block_size elements.
+ size_t block_size = (num+devs-1)/devs;
+
+ const auto prev_dev = dlib::cuda::get_device();
+ for (size_t i = 0; i < devs; ++i)
+ {
+ dlib::cuda::set_device(devices[i]->device_id);
+
+ size_t start = i*block_size;
+ size_t stop = std::min(num, start+block_size);
+
+ if (start < stop)
+ {
+ devices[i]->net.to_tensor(dbegin+start, dbegin+stop, job.t[i]);
+ job.labels[i].assign(lbegin+start, lbegin+stop);
+ job.have_data[i] = true;
+ }
+ else
+ {
+ job.have_data[i] = false;
+ }
+ }
+
+ dlib::cuda::set_device(prev_dev);
+ job_pipe.enqueue(job);
+ }
+
+ template <
+ typename data_iterator
+ >
+ void send_job (
+ bool test_only,
+ data_iterator dbegin,
+ data_iterator dend
+ )
+ {
+ typename std::vector<training_label_type>::iterator nothing;
+ send_job(test_only, dbegin, dend, nothing);
+ }
+
+ void print_progress()
+ {
+ if (lr_schedule.size() == 0)
+ {
+ if (test_previous_loss_values.size() == 0)
+ std::cout << "steps without apparent progress: " << steps_without_progress;
+ else
+ std::cout << "steps without apparent progress: train=" << steps_without_progress << ", test=" << test_steps_without_progress;
+ }
+ else
+ {
+ std::ostringstream sout;
+ sout << "percent complete: " << std::fixed << std::setprecision(2) << 100.0*lr_schedule_pos/(double)lr_schedule.size() << "%";
+ std::cout << sout.str();
+ }
+ std::cout << std::endl;
+ }
+
+ void print_periodic_verbose_status()
+ {
+ if (verbose)
+ {
+ using namespace std::chrono;
+ auto now_time = system_clock::now();
+ if (now_time-last_time > seconds(40))
+ {
+ last_time = now_time;
+ std::cout << "step#: " << rpad(cast_to_string(train_one_step_calls),epoch_string_pad) << " "
+ << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " ";
+ if (test_previous_loss_values.size() == 0)
+ {
+ std::cout << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
+ }
+ else
+ {
+ std::cout << "train loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
+ std::cout << "test loss: " << rpad(cast_to_string(get_average_test_loss()),string_pad) << " ";
+ }
+ print_progress();
+ clear_average_loss();
+ }
+ }
+ }
+
+ std::vector<std::shared_ptr<device_data>> devices;
+ dlib::pipe<job_t> job_pipe;
+ job_t job;
+
+
+ running_stats<double> rs;
+ running_stats_decayed<double> rs_test;
+ std::deque<double> previous_loss_values;
+ unsigned long max_num_epochs;
+ size_t mini_batch_size;
+ bool verbose;
+ net_type& net;
+ std::atomic<double> learning_rate;
+ double min_learning_rate;
+ std::atomic<unsigned long> iter_without_progress_thresh;
+ std::atomic<unsigned long> steps_without_progress;
+
+ std::atomic<unsigned long> test_iter_without_progress_thresh;
+ std::atomic<unsigned long> test_steps_without_progress;
+ std::deque<double> test_previous_loss_values;
+
+ std::atomic<double> learning_rate_shrink;
+ std::chrono::time_point<std::chrono::system_clock> last_sync_time;
+ std::string sync_filename;
+ std::chrono::seconds time_between_syncs;
+ unsigned long epoch_iteration;
+ size_t epoch_pos;
+ std::chrono::time_point<std::chrono::system_clock> last_time;
+ unsigned long long train_one_step_calls;
+ unsigned long long test_one_step_calls;
+ matrix<double,0,1> lr_schedule;
+ long lr_schedule_pos;
+ unsigned long gradient_check_budget;
+
+ std::exception_ptr eptr = nullptr;
+ mutable std::mutex eptr_mutex;
+ void propagate_exception() const
+ {
+ std::lock_guard<std::mutex> lock(eptr_mutex);
+ if (eptr)
+ std::rethrow_exception(eptr);
+ }
+
+ // These 5 variables are not serialized
+ size_t main_iteration_counter;
+ size_t main_iteration_counter_at_last_disk_sync;
+ double prob_loss_increasing_thresh_default_value;
+ double prob_loss_increasing_thresh_max_value;
+ double prob_loss_increasing_thresh;
+ std::atomic<bool> updated_net_since_last_sync;
+
+ bool sync_file_reloaded;
+ unsigned long previous_loss_values_dump_amount;
+ unsigned long test_previous_loss_values_dump_amount;
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename net_type,
+ typename solver_type
+ >
+ std::ostream& operator<< (
+ std::ostream& out,
+ dnn_trainer<net_type,solver_type>& trainer
+ )
+ {
+ using std::endl;
+ out << "dnn_trainer details: \n";
+ out << " net_type::num_layers: " << net_type::num_layers << endl;
+ // figure out how big the net is in MB.
+ std::ostringstream sout;
+ net_type temp = trainer.get_net(); // make a copy so that we can clean it without mutating the trainer's net.
+ temp.clean();
+ serialize(temp, sout);
+ out << " net size: " << sout.str().size()/1024.0/1024.0 << "MB" << endl;
+ // Don't include the loss params in the hash since we print them on the next line.
+ // They also aren't really part of the "architecture" of the network.
+ out << " net architecture hash: " << md5(cast_to_string(trainer.get_net().subnet())) << endl;
+ out << " loss: " << trainer.get_net().loss_details() << endl;
+
+ out << " synchronization file: " << trainer.get_synchronization_file() << endl;
+ out << " trainer.get_solvers()[0]: " << trainer.get_solvers()[0] << endl;
+ auto sched = trainer.get_learning_rate_schedule();
+ if (sched.size() != 0)
+ {
+ out << " using explicit user-supplied learning rate schedule" << endl;
+ }
+ else
+ {
+ out << " learning rate: "<< trainer.get_learning_rate() << endl;
+ out << " learning rate shrink factor: "<< trainer.get_learning_rate_shrink_factor() << endl;
+ out << " min learning rate: "<< trainer.get_min_learning_rate() << endl;
+ out << " iterations without progress threshold: "<< trainer.get_iterations_without_progress_threshold() << endl;
+ out << " test iterations without progress threshold: "<< trainer.get_test_iterations_without_progress_threshold() << endl;
+ }
+ return out;
+ }
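+    // Usage sketch (illustrative only, not part of the trainer API): once a
+    // dnn_trainer has been constructed you can dump its configuration with
+    // this operator, e.g.
+    //
+    //   std::cout << trainer << std::endl;   // prints net size, learning rate, solver, etc.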
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_TRAINER_H_
+
diff --git a/ml/dlib/dlib/dnn/trainer_abstract.h b/ml/dlib/dlib/dnn/trainer_abstract.h
new file mode 100644
index 000000000..3bfb6dc99
--- /dev/null
+++ b/ml/dlib/dlib/dnn/trainer_abstract.h
@@ -0,0 +1,765 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_DNn_TRAINER_ABSTRACT_H_
+#ifdef DLIB_DNn_TRAINER_ABSTRACT_H_
+
+#include "core_abstract.h"
+#include "solvers_abstract.h"
+#include <vector>
+#include <chrono>
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ enum class force_flush_to_disk {
+ no = 0,
+ yes = 1
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename net_type,
+ typename solver_type = sgd
+ >
+ class dnn_trainer
+ {
+ /*!
+ REQUIREMENTS ON net_type
+ - net_type is an add_loss_layer object.
+
+ REQUIREMENTS ON solver_type
+ - solver_type is an implementation of the EXAMPLE_SOLVER interface defined
+ in solvers_abstract.h
+
+ WHAT THIS OBJECT REPRESENTS
+                This object is a tool for training a deep neural network. To use it you
+                supply a neural network type and a solver, then you call train() with your
+                training data and it will update the network so that it has hopefully
+                learned something useful from your training data.
+
+ If you are compiling with CUDA then this object will use the GPU that is
+ currently selected (i.e. the one indicated by cudaGetDevice()) when
+ dnn_trainer is constructed. It will continue to use that device even if
+ you later change it by a call to cudaSetDevice().
+
+ EXCEPTIONS
+ If an exception is thrown by any part of the neural network during training
+ then the exception will be propagated out of the trainer to the user.
+ Moreover, the trainer instance will be unusable and should be destroyed.
+ !*/
+
+ public:
+
+ typedef typename net_type::training_label_type training_label_type;
+ typedef typename net_type::input_type input_type;
+ const static size_t num_computational_layers = net_type::num_computational_layers;
+
+ dnn_trainer() = delete;
+ dnn_trainer(const dnn_trainer&) = delete;
+ dnn_trainer& operator=(const dnn_trainer&) = delete;
+
+ dnn_trainer(
+ net_type& net,
+ const solver_type& solver = solver_type(),
+ const std::vector<int>& cuda_extra_devices = {}
+ );
+ /*!
+ requires
+ - for all valid i:
+ - 0 <= cuda_extra_devices[i] < dlib::cuda::get_num_devices()
+ ensures
+ - &#get_net() == &net
+ (i.e. The dnn_trainer holds a reference to net, it does not copy it.
+ Therefore, you must ensure net has a lifetime at least as long as the
+ dnn_trainer).
+ - #get_solvers() == a set of solvers that are all initialized with the
+ provided solver instance.
+ - #get_max_num_epochs() == 10000
+ - #get_mini_batch_size() == 128
+ - #get_learning_rate() == 1e-2
+ - #get_min_learning_rate() == 1e-5
+ - #get_iterations_without_progress_threshold() == 2000
+ - #get_test_iterations_without_progress_threshold() == 500
+ - #get_learning_rate_shrink_factor() == 0.1
+ - #get_learning_rate_schedule().size() == 0
+ - #get_train_one_step_calls() == 0
+ - #get_test_one_step_calls() == 0
+ - #get_synchronization_file() == ""
+ - if (cuda_extra_devices.size() > 0) then
+ - This object will use multiple graphics cards to run the learning
+ algorithms. In particular, it will always use whatever device is
+ currently selected on the calling thread (the device indicated by
+ cudaGetDevice()). In addition, you can ask to use additional
+ devices, which you do by putting their device numbers into
+ cuda_extra_devices.
+ !*/
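+        // A minimal construction sketch (illustrative, not part of this interface;
+        // net_type stands for whatever add_loss_layer network you have defined, as
+        // in dlib's DNN example programs):
+        //
+        //   net_type net;
+        //   dnn_trainer<net_type> trainer(net, sgd(0.0005, 0.9));
+        //   trainer.set_learning_rate(0.01);
+        //   trainer.set_min_learning_rate(0.00001);
+        //   trainer.set_mini_batch_size(128);
+        //   trainer.be_verbose();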
+
+ net_type& get_net (
+ force_flush_to_disk force_flush = force_flush_to_disk::yes
+ );
+ /*!
+ ensures
+ - returns the neural network object used by this trainer. This is the
+ network that is optimized when you call train() or train_one_step().
+ Recall that the dnn_trainer doesn't contain the net_type object but
+ simply holds a reference to an external network which was provided to the
+ dnn_trainer's constructor.
+ - This function blocks until all threads inside the dnn_trainer have
+ stopped touching the net.
+ - If force_flush is yes, then this function will sync the trainer state to
+ disk if the current state hasn't already been synced to disk since the
+ last network modification.
+ !*/
+
+ const std::vector<solver_type>& get_solvers (
+ ) const;
+ /*!
+ ensures
+ - returns the solvers used to optimize each layer of the neural network
+ get_net(). In particular, the first layer's solver is
+ get_solvers()[0], the second layer's solver is
+ get_solvers()[1], and so on.
+ - This function blocks until all threads inside the dnn_trainer have
+ stopped touching the net.
+ !*/
+
+ unsigned long get_mini_batch_size (
+ ) const;
+ /*!
+ ensures
+ - During training, we call the network's update() routine over and over
+ with training data. The number of training samples we give to each call
+ to update is the "mini-batch size", which is defined by
+ get_mini_batch_size().
+ !*/
+
+ void set_mini_batch_size (
+ unsigned long batch_size
+ );
+ /*!
+ requires
+ - batch_size > 0
+ ensures
+ - #get_mini_batch_size() == batch_size
+ !*/
+
+ unsigned long get_max_num_epochs (
+ ) const;
+ /*!
+ ensures
+ - train() will execute at most get_max_num_epochs() iterations over the
+ training data before returning.
+ !*/
+
+ void set_max_num_epochs (
+ unsigned long num
+ );
+ /*!
+ requires
+ - num > 0
+ ensures
+ - #get_max_num_epochs() == num
+ !*/
+
+ void set_learning_rate (
+ double lr
+ );
+ /*!
+ requires
+ - lr > 0
+ ensures
+ - #get_learning_rate() == lr
+ - #get_learning_rate_schedule().size() == 0
+ - This function blocks until all threads inside the dnn_trainer have
+ stopped touching the net.
+ !*/
+
+ double get_learning_rate(
+ ) const;
+ /*!
+ ensures
+ - During each training step, a solver tells us how to modify the parameters
+ of each layer in the network. It does this by outputting a step vector
+ that, when added to the parameters, will hopefully result in improved
+ network performance. The learning rate is one of the inputs to the
+ solver and influences the size of this step vector. This function
+ returns the current learning rate, that is, the learning rate that will
+ be used during the next training step.
+ !*/
+
+ void set_min_learning_rate (
+ double lr
+ );
+ /*!
+ requires
+ - lr > 0
+ ensures
+ - #get_min_learning_rate() == lr
+ - #get_learning_rate_schedule().size() == 0
+ - This function blocks until all threads inside the dnn_trainer have
+ stopped touching the net.
+ !*/
+
+ double get_min_learning_rate (
+ ) const;
+ /*!
+ ensures
+ - During training via this->train(), this object will test if progress is
+ still being made and if it isn't then it will reduce get_learning_rate()
+ by setting it to get_learning_rate()*get_learning_rate_shrink_factor().
+ However, it will not reduce it below get_min_learning_rate(). Once this
+ minimum learning rate is crossed the training will terminate.
+ - get_min_learning_rate() doesn't apply if you are using train_one_step().
+ You can keep calling train_one_step() as many times as you want and the
+                  learning rate can get arbitrarily close to 0 if you run long enough.
+ !*/
+
+ template <typename EXP>
+ void set_learning_rate_schedule (
+ const matrix_exp<EXP>& schedule
+ );
+ /*!
+ requires
+ - schedule.size() > 0
+ - min(schedule) > 0
+ ensures
+ - #get_learning_rate_schedule() == reshape_to_column_vector(schedule)
+ - #get_learning_rate() == schedule(0,0)
+ - #get_min_learning_rate() == min(schedule)
+                - #get_learning_rate_shrink_factor() == 1
+ !*/
+
+ const matrix<double,0,1>& get_learning_rate_schedule (
+ ) const;
+ /*!
+ ensures
+ - if (this function returns a non-empty matrix) then
+ - This trainer will use an explicit learning rate schedule defined by
+ the learning rate values in get_learning_rate_schedule(). For
+ example, if get_learning_rate_schedule() returned {0.1, 0.09, 0.08,
+ 0.07, 0.06} then the first training mini-batch would use a learning
+ rate of 0.1, then the next training mini-batch uses 0.09, and then
+                      0.08, and so on until the end of the schedule is reached.
+
+ If you continue to run training after the end of the schedule has
+ been reached then the learning rate will be fixed to 0.99 times the
+ final value. So in our example, eventually the learning rate would
+                      be fixed to 0.99*0.06.  This lets you detect that the end of the
+                      schedule has been reached by checking if get_learning_rate() < 0.06.
+ !*/
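+        // Sketch of supplying an explicit schedule (illustrative; assumes dlib's
+        // logspace(), which, like Matlab's, returns values log-spaced between the
+        // given powers of 10):
+        //
+        //   // 100000 steps, decaying log-linearly from 1e-1 down to 1e-4
+        //   trainer.set_learning_rate_schedule(logspace(-1, -4, 100000));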
+
+ unsigned long get_steps_without_progress (
+ ) const;
+ /*!
+ ensures
+ - if (get_learning_rate_shrink_factor() != 1) then
+ - returns an estimate of how many mini-batches have executed without us
+ observing a statistically significant decrease in the training error.
+ - else
+ - returns 0
+ !*/
+
+ void set_iterations_without_progress_threshold (
+ unsigned long thresh
+ );
+ /*!
+ ensures
+ - #get_iterations_without_progress_threshold() == thresh
+ - #get_learning_rate_schedule().size() == 0
+ - This function blocks until all threads inside the dnn_trainer have
+ stopped touching the net.
+ !*/
+
+ unsigned long get_iterations_without_progress_threshold (
+ ) const;
+ /*!
+ ensures
+ - This object monitors the progress of training and estimates if the
+ training error is being reduced. It does this by looking at the previous
+ get_iterations_without_progress_threshold() mini-batch results and
+ applying the statistical test defined by the running_gradient object to
+ see if the training error is getting smaller. If it isn't being reduced
+ then get_learning_rate() is made smaller by a factor of get_learning_rate_shrink_factor().
+
+ Therefore, get_iterations_without_progress_threshold() should always be
+ set to something sensibly large so that this test can be done with
+ reasonably high confidence. Think of this test as saying "if the loss
+ hasn't decreased for the previous get_iterations_without_progress_threshold()
+                  mini-batches then shrink the learning rate".
+ !*/
+
+ void set_learning_rate_shrink_factor (
+ double shrink
+ );
+ /*!
+ requires
+ - 0 < shrink && shrink <= 1
+ ensures
+ - #get_learning_rate_shrink_factor() == shrink
+ - #get_learning_rate_schedule().size() == 0
+ - This function blocks until all threads inside the dnn_trainer have
+ stopped touching the net.
+ !*/
+
+ double get_learning_rate_shrink_factor (
+ ) const;
+ /*!
+ ensures
+ - Whenever the training routine thinks it isn't making progress anymore it
+ will reduce get_learning_rate() by multiplying it by get_learning_rate_shrink_factor().
+ - You can disable the automatic learning rate reduction by setting
+ get_learning_rate_shrink_factor() to 1.
+ !*/
+
+ unsigned long long get_train_one_step_calls (
+ ) const;
+ /*!
+ ensures
+ - returns the number of times train_one_step() has been called.
+ !*/
+
+ unsigned long long get_test_one_step_calls (
+ ) const;
+ /*!
+ ensures
+ - returns the number of times test_one_step() has been called.
+ !*/
+
+ void be_verbose (
+ );
+ /*!
+ ensures
+ - This object will print status messages to standard out so that a
+ user can observe the progress of the algorithm.
+ !*/
+
+ void be_quiet (
+ );
+ /*!
+ ensures
+ - This object will not print anything to standard out
+ !*/
+
+ void set_synchronization_file (
+ const std::string& filename,
+ std::chrono::seconds time_between_syncs = std::chrono::minutes(15)
+ );
+ /*!
+ ensures
+ - #get_synchronization_file() == filename
+ - While training is running, either via train() or repeated calls to
+ train_one_step(), this object will save its entire state, including the
+ state of get_net(), to disk in the file named filename every
+ time_between_syncs seconds.
+ - If the filename file already exists then the state of this trainer will
+ be loaded from that file by this call to set_synchronization_file().
+ This allows you to resume a training session which was previously
+ interrupted.
+ - It should be noted that when saving, the trainer will alternate between
+ saving to a file called filename and another file called filename+"_".
+ We do this because it's possible that your computer might crash (not
+ because of dlib, just in general) before the data is safely saved to
+ disk. This way, you will always have a backup file if the write to disk
+ gets corrupted or is incomplete. Moreover, when loading, we will always
+ load from the newest of the two possible files.
+ !*/
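+        // Sketch (the file name is illustrative):
+        //
+        //   trainer.set_synchronization_file("trainer_state.dat", std::chrono::minutes(5));
+        //   // If "trainer_state.dat" (or its "_" backup) already exists, the trainer
+        //   // state is reloaded from it, so an interrupted run resumes where it left off.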
+
+ const std::string& get_synchronization_file (
+ );
+ /*!
+ ensures
+                - Returns the name of the file the dnn_trainer will periodically save its
+ state to. If the return value is "" then synchronization is disabled.
+ !*/
+
+ void train (
+ const std::vector<input_type>& data,
+ const std::vector<training_label_type>& labels
+ );
+ /*!
+ requires
+ - data.size() == labels.size()
+ - data.size() > 0
+ - net_type uses a supervised loss.
+ i.e. net_type::training_label_type != no_label_type.
+ ensures
+ - Trains a supervised neural network based on the given training data.
+ The goal of training is to find the network parameters that minimize
+ get_net().compute_loss(data.begin(), data.end(), labels.begin()).
+ - The optimizer will run until get_learning_rate() < get_min_learning_rate()
+ or get_max_num_epochs() training epochs have been executed.
+ - Each layer in the network will be optimized by its corresponding solver
+ in get_solvers().
+ - Each call to train DOES NOT reinitialize the state of get_net() or
+ get_solvers(). That is, the existing state of the solvers and network is
+ the starting point for the optimization each time train() is called. In
+ particular, if you use the set_synchronization_file() method you can
+ resume an interrupted train() call by simply calling train() again and it
+ will pick up from the last synchronization point.
+ - You can obtain the average loss value during the final training epoch by
+ calling get_average_loss().
+ - This function blocks until all threads inside the dnn_trainer have
+ stopped touching the net.
+ !*/
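+        // Sketch of a complete supervised run (training_images/training_labels are
+        // assumed to be std::vectors of input_type/training_label_type):
+        //
+        //   trainer.train(training_images, training_labels);
+        //   net.clean();                          // drop transient training state
+        //   serialize("trained_net.dat") << net;  // save the trained parameters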
+
+ void train (
+ const std::vector<input_type>& data
+ );
+ /*!
+ requires
+ - data.size() > 0
+ - net_type uses an unsupervised loss.
+ i.e. net_type::training_label_type == no_label_type.
+ ensures
+ - Trains an unsupervised neural network based on the given training data.
+ The goal of training is to find the network parameters that minimize
+ get_net().compute_loss(data.begin(), data.end()).
+ - The optimizer will run until get_learning_rate() < get_min_learning_rate()
+ or get_max_num_epochs() training epochs have been executed.
+ - Each layer in the network will be optimized by its corresponding solver
+ in get_solvers().
+ - Each call to train DOES NOT reinitialize the state of get_net() or
+ get_solvers(). That is, the existing state of the solvers and network is
+ the starting point for the optimization each time train() is called. In
+ particular, if you use the set_synchronization_file() method you can
+ resume an interrupted train() call by simply calling train() again and it
+ will pick up from the last synchronization point.
+ - You can obtain the average loss value during the final training epoch by
+ calling get_average_loss().
+ - This function blocks until all threads inside the dnn_trainer have
+ stopped touching the net.
+ !*/
+
+ void train_one_step (
+ const std::vector<input_type>& data,
+ const std::vector<training_label_type>& labels
+ );
+ /*!
+ requires
+ - data.size() == labels.size()
+ - data.size() > 0
+ - net_type uses a supervised loss.
+ i.e. net_type::training_label_type != no_label_type.
+ ensures
+ - Performs one stochastic gradient update step based on the mini-batch of
+ data and labels supplied to this function. In particular, calling
+ train_one_step() in a loop is equivalent to calling the train() method
+ defined above. However, train_one_step() allows you to stream data from
+ disk into the training process while train() requires you to first load
+ all the training data into RAM. Otherwise, these training methods are
+ equivalent.
+ - You can observe the current average loss value by calling get_average_loss().
+ - The network training will happen in another thread. Therefore, after
+ calling this function you should call get_net() before you touch the net
+ object from the calling thread to ensure no other threads are still
+ accessing the network.
+ - #get_train_one_step_calls() == get_train_one_step_calls() + 1.
+ !*/
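+        // Sketch of a streaming loop (load_mini_batch() is a user-supplied loader,
+        // not a dlib function):
+        //
+        //   while (trainer.get_learning_rate() >= trainer.get_min_learning_rate())
+        //   {
+        //       load_mini_batch(mini_batch_samples, mini_batch_labels);
+        //       trainer.train_one_step(mini_batch_samples, mini_batch_labels);
+        //   }
+        //   trainer.get_net();   // block until the training thread is done with the net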
+
+ template <
+ typename data_iterator,
+ typename label_iterator
+ >
+ void train_one_step (
+ data_iterator dbegin,
+ data_iterator dend,
+ label_iterator lbegin
+ );
+ /*!
+ requires
+ - std::advance(lbegin, std::distance(dbegin, dend) - 1) is dereferencable
+ - std::distance(dbegin, dend) > 0
+ - net_type uses a supervised loss.
+ i.e. net_type::training_label_type != no_label_type.
+ ensures
+ - Performs one stochastic gradient update step based on the mini-batch of
+ data and labels supplied to this function. In particular, calling
+ train_one_step() in a loop is equivalent to calling the train() method
+ defined above. However, train_one_step() allows you to stream data from
+ disk into the training process while train() requires you to first load
+ all the training data into RAM. Otherwise, these training methods are
+ equivalent.
+ - You can observe the current average loss value by calling get_average_loss().
+ - The network training will happen in another thread. Therefore, after
+ calling this function you should call get_net() before you touch the net
+ object from the calling thread to ensure no other threads are still
+ accessing the network.
+ - #get_train_one_step_calls() == get_train_one_step_calls() + 1.
+ !*/
+
+ void train_one_step (
+ const std::vector<input_type>& data
+ );
+ /*!
+ requires
+ - data.size() > 0
+ - net_type uses an unsupervised loss.
+ i.e. net_type::training_label_type == no_label_type.
+ ensures
+ - Performs one stochastic gradient update step based on the mini-batch of
+ data supplied to this function. In particular, calling train_one_step()
+ in a loop is equivalent to calling the train() method defined above.
+ However, train_one_step() allows you to stream data from disk into the
+ training process while train() requires you to first load all the
+ training data into RAM. Otherwise, these training methods are
+ equivalent.
+ - You can observe the current average loss value by calling get_average_loss().
+ - The network training will happen in another thread. Therefore, after
+ calling this function you should call get_net() before you touch the net
+ object from the calling thread to ensure no other threads are still
+ accessing the network.
+ - #get_train_one_step_calls() == get_train_one_step_calls() + 1.
+ !*/
+
+ template <
+ typename data_iterator
+ >
+ void train_one_step (
+ data_iterator dbegin,
+ data_iterator dend
+ );
+ /*!
+ requires
+ - std::distance(dbegin, dend) > 0
+ - net_type uses an unsupervised loss.
+ i.e. net_type::training_label_type == no_label_type.
+ ensures
+ - Performs one stochastic gradient update step based on the mini-batch of
+ data supplied to this function. In particular, calling train_one_step()
+ in a loop is equivalent to calling the train() method defined above.
+ However, train_one_step() allows you to stream data from disk into the
+ training process while train() requires you to first load all the
+ training data into RAM. Otherwise, these training methods are
+ equivalent.
+ - You can observe the current average loss value by calling get_average_loss().
+ - The network training will happen in another thread. Therefore, after
+ calling this function you should call get_net() before you touch the net
+ object from the calling thread to ensure no other threads are still
+ accessing the network.
+ - #get_train_one_step_calls() == get_train_one_step_calls() + 1.
+ !*/
+
+ double get_average_loss (
+ ) const;
+ /*!
+ ensures
+ - returns the average loss value observed during previous calls to
+ train_one_step() or train(). That is, the average output of
+ net_type::update() during the previous mini-batch updates.
+ - Note that, if be_verbose() has been called, then this object will
+ automatically call clear_average_loss() periodically when it logs the
+ loss to the console.
+ - This function blocks until all threads inside the dnn_trainer have
+ stopped touching the net.
+ !*/
+
+ void clear_average_loss (
+ );
+ /*!
+ ensures
+ - #get_average_loss() == 0
+ - get_average_loss() uses a dlib::running_stats object to keep a running
+ average of the loss values seen during the previous mini-batch updates
+ applied during training. Calling clear_average_loss() resets the
+ running_stats object so it forgets about all previous loss values
+ observed.
+ - This function blocks until all threads inside the dnn_trainer have
+ stopped touching the net.
+ !*/
+
+ // ----------------------
+
+ double get_average_test_loss (
+ ) const;
+ /*!
+ ensures
+ - returns the average loss value observed during previous calls to
+ test_one_step().
+ - This function blocks until all threads inside the dnn_trainer have
+ stopped touching the net.
+ !*/
+
+ void test_one_step (
+ const std::vector<input_type>& data,
+ const std::vector<training_label_type>& labels
+ );
+ /*!
+ requires
+ - data.size() == labels.size()
+ - data.size() > 0
+ - net_type uses a supervised loss.
+ i.e. net_type::training_label_type != no_label_type.
+ ensures
+ - Runs the given data through the network and computes and records the loss.
+ - This call does not modify network parameters. The point of
+                  test_one_step() is twofold: to allow you to observe the accuracy of the
+                  network on held-out data during training, and to allow the trainer to
+ automatically adjust the learning rate when the test loss stops
+ improving. It should be noted that you are not required to use
+ test_one_step() at all, but if you want to do this kind of thing it is
+ available.
+ - You can observe the current average loss value by calling get_average_test_loss().
+ - The computation will happen in another thread. Therefore, after calling
+ this function you should call get_net() before you touch the net object
+ from the calling thread to ensure no other threads are still accessing
+ the network.
+ - #get_test_one_step_calls() == get_test_one_step_calls() + 1.
+ !*/
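+        // Sketch of interleaving held-out evaluation with training (the containers
+        // and the "every 30th step" cadence are illustrative):
+        //
+        //   trainer.set_test_iterations_without_progress_threshold(500);
+        //   while (trainer.get_learning_rate() >= 1e-5)
+        //   {
+        //       trainer.train_one_step(mini_batch_samples, mini_batch_labels);
+        //       if (trainer.get_train_one_step_calls() % 30 == 0)
+        //           trainer.test_one_step(test_samples, test_labels);
+        //   }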
+
+ template <
+ typename data_iterator,
+ typename label_iterator
+ >
+ void test_one_step (
+ data_iterator dbegin,
+ data_iterator dend,
+ label_iterator lbegin
+ );
+ /*!
+ requires
+ - std::advance(lbegin, std::distance(dbegin, dend) - 1) is dereferencable
+ - std::distance(dbegin, dend) > 0
+ - net_type uses a supervised loss.
+ i.e. net_type::training_label_type != no_label_type.
+ ensures
+ - Runs the given data through the network and computes and records the loss.
+ - This call does not modify network parameters. The point of
+                  test_one_step() is twofold: to allow you to observe the accuracy of the
+                  network on held-out data during training, and to allow the trainer to
+ automatically adjust the learning rate when the test loss stops
+ improving. It should be noted that you are not required to use
+ test_one_step() at all, but if you want to do this kind of thing it is
+ available.
+ - You can observe the current average loss value by calling get_average_test_loss().
+ - The computation will happen in another thread. Therefore, after calling
+ this function you should call get_net() before you touch the net object
+ from the calling thread to ensure no other threads are still accessing
+ the network.
+ - #get_test_one_step_calls() == get_test_one_step_calls() + 1.
+ !*/
+
+ void test_one_step (
+ const std::vector<input_type>& data
+ );
+ /*!
+ requires
+ - data.size() > 0
+ - net_type uses an unsupervised loss.
+ i.e. net_type::training_label_type == no_label_type.
+ ensures
+ - Runs the given data through the network and computes and records the loss.
+ - This call does not modify network parameters. The point of
+                  test_one_step() is twofold: to allow you to observe the accuracy of the
+                  network on held-out data during training, and to allow the trainer to
+ automatically adjust the learning rate when the test loss stops
+ improving. It should be noted that you are not required to use
+ test_one_step() at all, but if you want to do this kind of thing it is
+ available.
+ - You can observe the current average loss value by calling get_average_test_loss().
+ - The computation will happen in another thread. Therefore, after calling
+ this function you should call get_net() before you touch the net object
+ from the calling thread to ensure no other threads are still accessing
+ the network.
+ - #get_test_one_step_calls() == get_test_one_step_calls() + 1.
+ !*/
+
+ template <
+ typename data_iterator
+ >
+ void test_one_step (
+ data_iterator dbegin,
+ data_iterator dend
+ );
+ /*!
+ requires
+ - std::distance(dbegin, dend) > 0
+ - net_type uses an unsupervised loss.
+ i.e. net_type::training_label_type == no_label_type.
+ ensures
+ - Runs the given data through the network and computes and records the loss.
+ - This call does not modify network parameters. The point of
+                  test_one_step() is twofold: to allow you to observe the accuracy of the
+                  network on held-out data during training, and to allow the trainer to
+ automatically adjust the learning rate when the test loss stops
+ improving. It should be noted that you are not required to use
+ test_one_step() at all, but if you want to do this kind of thing it is
+ available.
+ - You can observe the current average loss value by calling get_average_test_loss().
+ - The computation will happen in another thread. Therefore, after calling
+ this function you should call get_net() before you touch the net object
+ from the calling thread to ensure no other threads are still accessing
+ the network.
+ - #get_test_one_step_calls() == get_test_one_step_calls() + 1.
+ !*/
+
+ void set_test_iterations_without_progress_threshold (
+ unsigned long thresh
+ );
+ /*!
+ ensures
+ - #get_test_iterations_without_progress_threshold() == thresh
+ - #get_learning_rate_schedule().size() == 0
+ - This function blocks until all threads inside the dnn_trainer have
+ stopped touching the net.
+ !*/
+
+ unsigned long get_test_iterations_without_progress_threshold (
+ ) const;
+ /*!
+ ensures
+ - This object monitors the progress of training and estimates if the
+ testing error is being reduced. It does this by looking at the previous
+ get_test_iterations_without_progress_threshold() mini-batch results from
+ test_one_step() and applying the statistical test defined by the
+ running_gradient object to see if the testing error is getting smaller.
+ If it isn't being reduced then get_learning_rate() is made smaller by a
+ factor of get_learning_rate_shrink_factor().
+
+ Therefore, get_test_iterations_without_progress_threshold() should always be
+ set to something sensibly large so that this test can be done with
+ reasonably high confidence. Think of this test as saying "if the testing loss
+ hasn't decreased for the previous get_test_iterations_without_progress_threshold()
+ calls to test_one_step() then shrink the learning rate".
+ !*/
+
+ unsigned long get_test_steps_without_progress (
+ ) const;
+ /*!
+ ensures
+ - if (get_learning_rate_shrink_factor() != 1) then
+ - returns an estimate of how many mini-batches have executed without us
+ observing a statistically significant decrease in the testing error
+ (i.e. the error on the data given to the trainer via test_one_step()
+ calls).
+ - else
+ - returns 0
+ !*/
+
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ template <
+ typename net_type,
+ typename solver_type
+ >
+ std::ostream& operator<< (
+ std::ostream& out,
+ dnn_trainer<net_type,solver_type>& trainer
+ );
+ /*!
+ ensures
+ - Prints a log of the current parameters of trainer to out.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_TRAINER_ABSTRACT_H_
+
+
diff --git a/ml/dlib/dlib/dnn/utilities.h b/ml/dlib/dlib/dnn/utilities.h
new file mode 100644
index 000000000..976128c81
--- /dev/null
+++ b/ml/dlib/dlib/dnn/utilities.h
@@ -0,0 +1,281 @@
+// Copyright (C) 2016 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_UTILITIES_H_
+#define DLIB_DNn_UTILITIES_H_
+
+#include "core.h"
+#include "utilities_abstract.h"
+#include "../geometry.h"
+#include <fstream>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ inline double log1pexp(double x)
+ {
+ using std::exp;
+ using namespace std; // Do this instead of using std::log1p because some compilers
+ // error out otherwise (E.g. gcc 4.9 in cygwin)
+ if (x <= -37)
+ return exp(x);
+ else if (-37 < x && x <= 18)
+ return log1p(exp(x));
+ else if (18 < x && x <= 33.3)
+ return x + exp(-x);
+ else
+ return x;
+ }
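+    // Rough numeric sanity check (illustrative):
+    //
+    //   log1pexp(0.0);      // == std::log(2.0), about 0.693
+    //   log1pexp(-50.0);    // ~= std::exp(-50); naive log(1+exp(x)) would round to 0
+    //   log1pexp(1000.0);   // ~= 1000; naive log(1+exp(x)) would overflow to inf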
+
+// ----------------------------------------------------------------------------------------
+
+ inline void randomize_parameters (
+ tensor& params,
+ unsigned long num_inputs_and_outputs,
+ dlib::rand& rnd
+ )
+ {
+ for (auto& val : params)
+ {
+ // Draw a random number to initialize the layer according to formula (16)
+ // from Understanding the difficulty of training deep feedforward neural
+ // networks by Xavier Glorot and Yoshua Bengio.
+ val = 2*rnd.get_random_float()-1;
+ val *= std::sqrt(6.0/(num_inputs_and_outputs));
+ }
+ }
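+    // Sketch of typical use when initializing a layer's parameter tensor
+    // (fan_in/fan_out are illustrative names for the layer's input and output sizes):
+    //
+    //   dlib::rand rnd;
+    //   resizable_tensor params;
+    //   params.set_size(fan_in, fan_out);
+    //   randomize_parameters(params, fan_in + fan_out, rnd);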
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+ class visitor_net_to_xml
+ {
+ public:
+
+ visitor_net_to_xml(std::ostream& out_) : out(out_) {}
+
+ template<typename input_layer_type>
+ void operator()(size_t idx, const input_layer_type& l)
+ {
+ out << "<layer idx='"<<idx<<"' type='input'>\n";
+ to_xml(l,out);
+ out << "</layer>\n";
+ }
+
+ template <typename T, typename U>
+ void operator()(size_t idx, const add_loss_layer<T,U>& l)
+ {
+ out << "<layer idx='"<<idx<<"' type='loss'>\n";
+ to_xml(l.loss_details(),out);
+ out << "</layer>\n";
+ }
+
+ template <typename T, typename U, typename E>
+ void operator()(size_t idx, const add_layer<T,U,E>& l)
+ {
+ out << "<layer idx='"<<idx<<"' type='comp'>\n";
+ to_xml(l.layer_details(),out);
+ out << "</layer>\n";
+ }
+
+ template <unsigned long ID, typename U, typename E>
+ void operator()(size_t idx, const add_tag_layer<ID,U,E>& l)
+ {
+ out << "<layer idx='"<<idx<<"' type='tag' id='"<<ID<<"'/>\n";
+ }
+
+ template <template<typename> class T, typename U>
+ void operator()(size_t idx, const add_skip_layer<T,U>& l)
+ {
+ out << "<layer idx='"<<idx<<"' type='skip' id='"<<(tag_id<T>::id)<<"'/>\n";
+ }
+
+ private:
+
+ std::ostream& out;
+ };
+ }
+
+ template <typename net_type>
+ void net_to_xml (
+ const net_type& net,
+ std::ostream& out
+ )
+ {
+ auto old_precision = out.precision(9);
+ out << "<net>\n";
+ visit_layers(net, impl::visitor_net_to_xml(out));
+ out << "</net>\n";
+ // restore the original stream precision.
+ out.precision(old_precision);
+ }
+
+ template <typename net_type>
+ void net_to_xml (
+ const net_type& net,
+ const std::string& filename
+ )
+ {
+ std::ofstream fout(filename);
+ net_to_xml(net, fout);
+ }
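+    // Usage sketch (the file name is illustrative):
+    //
+    //   net_to_xml(net, "net.xml");   // or: net_to_xml(net, std::cout);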
+
+// ----------------------------------------------------------------------------------------
+
+ namespace impl
+ {
+
+ class visitor_net_map_input_to_output
+ {
+ public:
+
+ visitor_net_map_input_to_output(dpoint& p_) : p(p_) {}
+
+ dpoint& p;
+
+ template<typename input_layer_type>
+ void operator()(const input_layer_type& net)
+ {
+ }
+
+ template <typename T, typename U>
+ void operator()(const add_loss_layer<T,U>& net)
+ {
+ (*this)(net.subnet());
+ }
+
+ template <typename T, typename U, typename E>
+ void operator()(const add_layer<T,U,E>& net)
+ {
+ (*this)(net.subnet());
+ p = net.layer_details().map_input_to_output(p);
+ }
+ template <bool B, typename T, typename U, typename E>
+ void operator()(const dimpl::subnet_wrapper<add_layer<T,U,E>,B>& net)
+ {
+ (*this)(net.subnet());
+ p = net.layer_details().map_input_to_output(p);
+ }
+
+
+ template <unsigned long ID, typename U, typename E>
+ void operator()(const add_tag_layer<ID,U,E>& net)
+ {
+ // tag layers are an identity transform, so do nothing
+ (*this)(net.subnet());
+ }
+ template <bool is_first, unsigned long ID, typename U, typename E>
+ void operator()(const dimpl::subnet_wrapper<add_tag_layer<ID,U,E>,is_first>& net)
+ {
+ // tag layers are an identity transform, so do nothing
+ (*this)(net.subnet());
+ }
+
+
+ template <template<typename> class TAG_TYPE, typename U>
+ void operator()(const add_skip_layer<TAG_TYPE,U>& net)
+ {
+ (*this)(layer<TAG_TYPE>(net));
+ }
+ template <bool is_first, template<typename> class TAG_TYPE, typename SUBNET>
+ void operator()(const dimpl::subnet_wrapper<add_skip_layer<TAG_TYPE,SUBNET>,is_first>& net)
+ {
+ // skip layers are an identity transform, so do nothing
+ (*this)(layer<TAG_TYPE>(net));
+ }
+
+ };
+
+ class visitor_net_map_output_to_input
+ {
+ public:
+ visitor_net_map_output_to_input(dpoint& p_) : p(p_) {}
+
+ dpoint& p;
+
+ template<typename input_layer_type>
+ void operator()(const input_layer_type& net)
+ {
+ }
+
+ template <typename T, typename U>
+ void operator()(const add_loss_layer<T,U>& net)
+ {
+ (*this)(net.subnet());
+ }
+
+ template <typename T, typename U, typename E>
+ void operator()(const add_layer<T,U,E>& net)
+ {
+ p = net.layer_details().map_output_to_input(p);
+ (*this)(net.subnet());
+ }
+ template <bool B, typename T, typename U, typename E>
+ void operator()(const dimpl::subnet_wrapper<add_layer<T,U,E>,B>& net)
+ {
+ p = net.layer_details().map_output_to_input(p);
+ (*this)(net.subnet());
+ }
+
+
+ template <unsigned long ID, typename U, typename E>
+ void operator()(const add_tag_layer<ID,U,E>& net)
+ {
+ // tag layers are an identity transform, so do nothing
+ (*this)(net.subnet());
+ }
+ template <bool is_first, unsigned long ID, typename U, typename E>
+ void operator()(const dimpl::subnet_wrapper<add_tag_layer<ID,U,E>,is_first>& net)
+ {
+ // tag layers are an identity transform, so do nothing
+ (*this)(net.subnet());
+ }
+
+
+ template <template<typename> class TAG_TYPE, typename U>
+ void operator()(const add_skip_layer<TAG_TYPE,U>& net)
+ {
+ (*this)(layer<TAG_TYPE>(net));
+ }
+ template <bool is_first, template<typename> class TAG_TYPE, typename SUBNET>
+ void operator()(const dimpl::subnet_wrapper<add_skip_layer<TAG_TYPE,SUBNET>,is_first>& net)
+ {
+ // skip layers are an identity transform, so do nothing
+ (*this)(layer<TAG_TYPE>(net));
+ }
+
+ };
+ }
+
+ template <typename net_type>
+ inline dpoint input_tensor_to_output_tensor(
+ const net_type& net,
+ dpoint p
+ )
+ {
+ impl::visitor_net_map_input_to_output temp(p);
+ temp(net);
+ return p;
+ }
+
+ template <typename net_type>
+ inline dpoint output_tensor_to_input_tensor(
+ const net_type& net,
+ dpoint p
+ )
+ {
+ impl::visitor_net_map_output_to_input temp(p);
+ temp(net);
+ return p;
+ }
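+    // Sketch of mapping coordinates through a fully convolutional network
+    // (assumes net has already run forward on some input image):
+    //
+    //   dpoint p_out = input_tensor_to_output_tensor(net, dpoint(123, 456));
+    //   dpoint p_in  = output_tensor_to_input_tensor(net, p_out);  // maps back near (123,456)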
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_UTILITIES_H_
+
+
+
diff --git a/ml/dlib/dlib/dnn/utilities_abstract.h b/ml/dlib/dlib/dnn/utilities_abstract.h
new file mode 100644
index 000000000..2a9a3d3fc
--- /dev/null
+++ b/ml/dlib/dlib/dnn/utilities_abstract.h
@@ -0,0 +1,127 @@
+// Copyright (C) 2016 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_DNn_UTILITIES_ABSTRACT_H_
+#ifdef DLIB_DNn_UTILITIES_ABSTRACT_H_
+
+#include "core_abstract.h"
+#include "../geometry/vector_abstract.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+ double log1pexp(
+ double x
+ );
+ /*!
+ ensures
+ - returns log(1+exp(x))
+ (except computes it using a numerically accurate method)
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ void randomize_parameters (
+ tensor& params,
+ unsigned long num_inputs_and_outputs,
+ dlib::rand& rnd
+ );
+ /*!
+ ensures
+ - This function assigns random values into params based on the given random
+ number generator. In particular, it uses the parameter initialization method
+ of formula 16 from the paper "Understanding the difficulty of training deep
+ feedforward neural networks" by Xavier Glorot and Yoshua Bengio.
+ - It is assumed that the total number of inputs and outputs from the layer is
+ num_inputs_and_outputs. That is, you should set num_inputs_and_outputs to
+ the sum of the dimensionalities of the vectors going into and out of the
+ layer that uses params as its parameters.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename net_type>
+ void net_to_xml (
+ const net_type& net,
+ std::ostream& out
+ );
+ /*!
+ requires
+ - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+ add_tag_layer.
+ - All layers in the net must provide to_xml() functions.
+ ensures
+ - Prints the given neural network object as an XML document to the given output
+ stream.
+ !*/
+
+ template <typename net_type>
+ void net_to_xml (
+ const net_type& net,
+ const std::string& filename
+ );
+ /*!
+ requires
+ - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+ add_tag_layer.
+ - All layers in the net must provide to_xml() functions.
+ ensures
+ - This function is just like the above net_to_xml(), except it writes to a file
+ rather than an ostream.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename net_type>
+ dpoint input_tensor_to_output_tensor(
+ const net_type& net,
+ dpoint p
+ );
+ /*!
+ requires
+ - net_type is an object of type add_layer, add_skip_layer, or add_tag_layer.
+ - All layers in the net must provide map_input_to_output() functions.
+ ensures
+ - Given a dpoint (i.e. a row,column coordinate) in the input tensor given to
+ net, this function returns the corresponding dpoint in the output tensor
+ net.get_output(). This kind of mapping is useful when working with fully
+ convolutional networks as you will often want to know what parts of the
+ output feature maps correspond to what parts of the input.
+ - If the network contains skip layers then any layers skipped over by the skip
+ layer are ignored for the purpose of computing this coordinate mapping. That
+ is, if you walk the network from the output layer to the input layer, where
+ each time you encounter a skip layer you jump to the layer indicated by the
+ skip layer, you will visit exactly the layers in the network involved in the
+ input_tensor_to_output_tensor() calculation. This behavior is useful since it
+ allows you to compute some auxiliary DNN as a separate branch of computation
+ that is separate from the main network's job of running some kind of fully
+ convolutional network over an image. For instance, you might want to have a
+ branch in your network that computes some global image level
+ summarization/feature.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ template <typename net_type>
+ dpoint output_tensor_to_input_tensor(
+ const net_type& net,
+ dpoint p
+ );
+ /*!
+ requires
+ - net_type is an object of type add_layer, add_skip_layer, or add_tag_layer.
+ - All layers in the net must provide map_output_to_input() functions.
+ ensures
+ - This function provides the reverse mapping of input_tensor_to_output_tensor().
+ That is, given a dpoint in net.get_output(), what is the corresponding dpoint
+ in the input tensor?
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_UTILITIES_ABSTRACT_H_
+
+
diff --git a/ml/dlib/dlib/dnn/validation.h b/ml/dlib/dlib/dnn/validation.h
new file mode 100644
index 000000000..c65cb4526
--- /dev/null
+++ b/ml/dlib/dlib/dnn/validation.h
@@ -0,0 +1,122 @@
+// Copyright (C) 2016 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_VALIDATION_H_
+#define DLIB_DNn_VALIDATION_H_
+
+#include "../svm/cross_validate_object_detection_trainer_abstract.h"
+#include "../svm/cross_validate_object_detection_trainer.h"
+#include "layers.h"
+#include <set>
+
+namespace dlib
+{
+ namespace impl
+ {
+ inline std::set<std::string> get_labels (
+ const std::vector<mmod_rect>& rects1,
+ const std::vector<mmod_rect>& rects2
+ )
+ {
+ std::set<std::string> labels;
+ for (auto& rr : rects1)
+ labels.insert(rr.label);
+ for (auto& rr : rects2)
+ labels.insert(rr.label);
+ return labels;
+ }
+ }
+
+ template <
+ typename SUBNET,
+ typename image_array_type
+ >
+ const matrix<double,1,3> test_object_detection_function (
+ loss_mmod<SUBNET>& detector,
+ const image_array_type& images,
+ const std::vector<std::vector<mmod_rect>>& truth_dets,
+ const test_box_overlap& overlap_tester = test_box_overlap(),
+ const double adjust_threshold = 0,
+ const test_box_overlap& overlaps_ignore_tester = test_box_overlap()
+ )
+ {
+ // make sure requires clause is not broken
+ DLIB_CASSERT( is_learning_problem(images,truth_dets) == true ,
+ "\t matrix test_object_detection_function()"
+ << "\n\t invalid inputs were given to this function"
+ << "\n\t is_learning_problem(images,truth_dets): " << is_learning_problem(images,truth_dets)
+ << "\n\t images.size(): " << images.size()
+ );
+
+
+
+ double correct_hits = 0;
+ double total_true_targets = 0;
+
+ std::vector<std::pair<double,bool> > all_dets;
+ unsigned long missing_detections = 0;
+
+ resizable_tensor temp;
+
+ for (unsigned long i = 0; i < images.size(); ++i)
+ {
+ std::vector<mmod_rect> hits;
+ detector.to_tensor(&images[i], &images[i]+1, temp);
+ detector.subnet().forward(temp);
+ detector.loss_details().to_label(temp, detector.subnet(), &hits, adjust_threshold);
+
+
+ for (auto& label : impl::get_labels(truth_dets[i], hits))
+ {
+ std::vector<full_object_detection> truth_boxes;
+ std::vector<rectangle> ignore;
+ std::vector<std::pair<double,rectangle>> boxes;
+ // copy hits and truth_dets into the above three objects
+ for (auto&& b : truth_dets[i])
+ {
+ if (b.ignore)
+ {
+ ignore.push_back(b);
+ }
+ else if (b.label == label)
+ {
+ truth_boxes.push_back(full_object_detection(b.rect));
+ ++total_true_targets;
+ }
+ }
+ for (auto&& b : hits)
+ {
+ if (b.label == label)
+ boxes.push_back(std::make_pair(b.detection_confidence, b.rect));
+ }
+
+ correct_hits += impl::number_of_truth_hits(truth_boxes, ignore, boxes, overlap_tester, all_dets, missing_detections, overlaps_ignore_tester);
+ }
+ }
+
+ std::sort(all_dets.rbegin(), all_dets.rend());
+
+ double precision, recall;
+
+ double total_hits = all_dets.size();
+
+ if (total_hits == 0)
+ precision = 1;
+ else
+ precision = correct_hits / total_hits;
+
+ if (total_true_targets == 0)
+ recall = 1;
+ else
+ recall = correct_hits / total_true_targets;
+
+ matrix<double, 1, 3> res;
+ res = precision, recall, average_precision(all_dets, missing_detections);
+ return res;
+ }
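+    // Usage sketch (mirrors how dlib's MMOD example programs report accuracy;
+    // variable names are illustrative):
+    //
+    //   // columns are: precision, recall, average precision
+    //   matrix<double,1,3> res = test_object_detection_function(net, testing_images, testing_boxes);
+    //   std::cout << "testing results: " << res;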
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_VALIDATION_H_
+