Diffstat (limited to 'ml/dlib/dlib/dnn')
38 files changed, 33283 insertions, 0 deletions
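Throughout core.h, optional layer methods are called through a tag-dispatch detection idiom: an overload taking dlib's special_ tag is SFINAE-constrained on int_<decltype(&T::method)>::type, and an overload taking general_ (the base of special_) serves as the fallback when the method doesn't exist. A minimal self-contained sketch of the pattern, using simplified stand-ins for dlib's special_/general_/int_ helpers rather than the real ones:

    #include <iostream>

    struct general_ {};
    struct special_ : general_ {};   // ranks above general_ in overload resolution
    template <typename T> struct int_ { typedef int type; };

    // Selected only when T has a get_learning_rate_multiplier() member; otherwise
    // the decltype fails and SFINAE removes this overload from consideration.
    template <typename T, typename int_<decltype(&T::get_learning_rate_multiplier)>::type = 0>
    double get_learning_rate_multiplier(const T& obj, special_)
    { return obj.get_learning_rate_multiplier(); }

    // Fallback for every other T.
    template <typename T>
    double get_learning_rate_multiplier(const T&, general_) { return 1; }

    struct custom_layer { double get_learning_rate_multiplier() const { return 0.1; } };
    struct plain_layer {};

    int main()
    {
        std::cout << get_learning_rate_multiplier(custom_layer(), special_()) << "\n"; // prints 0.1
        std::cout << get_learning_rate_multiplier(plain_layer(), special_()) << "\n";  // prints 1
    }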
diff --git a/ml/dlib/dlib/dnn/core.h b/ml/dlib/dlib/dnn/core.h new file mode 100644 index 000000000..5f1d05498 --- /dev/null +++ b/ml/dlib/dlib/dnn/core.h @@ -0,0 +1,3599 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNn_CORE_H_ +#define DLIB_DNn_CORE_H_ + +#include "core_abstract.h" +#include "tensor.h" +#include <iterator> +#include <memory> +#include <sstream> +#include <type_traits> +#include "../statistics.h" +#include "../rand.h" +#include "../algs.h" +#include <utility> +#include <tuple> +#include <cmath> +#include <vector> +#include "tensor_tools.h" +#include <type_traits> +#include "../metaprogramming.h" + +#ifdef _MSC_VER +// Tell Visual Studio not to recursively inline functions very much because otherwise it +// takes hours to compile the DNN code sometimes. It's crazy. Hopefully we can remove +// this some day when the visual studio compiler is more efficient. +#pragma inline_depth(2) +#endif + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <typename T, typename int_<decltype(&T::get_learning_rate_multiplier)>::type = 0> + double get_learning_rate_multiplier ( + const T& obj, + special_ + ) { return obj.get_learning_rate_multiplier(); } + + template <typename T> + double get_learning_rate_multiplier ( const T& , general_) { return 1; } + } + template <typename T> + double get_learning_rate_multiplier(const T& obj) { return impl::get_learning_rate_multiplier(obj, special_()); } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <typename T, typename int_<decltype(&T::get_weight_decay_multiplier)>::type = 0> + double get_weight_decay_multiplier ( + const T& obj, + special_ + ) { return obj.get_weight_decay_multiplier(); } + + template <typename T> + double get_weight_decay_multiplier ( const T& , general_) { return 1; } + } + template <typename T> + double get_weight_decay_multiplier(const T& obj) { return impl::get_weight_decay_multiplier(obj, special_()); } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + // The reason we return an int for this version rather than doing the more straight forward thing (like we do above) is to avoid a bug in visual studio 2015. + template <typename T> + auto call_clean_method_if_exists ( + T& obj, + special_ + ) -> typename int_<decltype(&T::clean)>::type { obj.clean(); return 0; } + + template <typename T> + void call_clean_method_if_exists (T& , general_) {} + } + template <typename T> + void call_clean_method_if_exists(T& obj) { impl::call_clean_method_if_exists(obj, special_()); } + /*! + ensures + - calls obj.clean() if obj has a .clean() method. + !*/ + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + class repeat_input_layer + { + /*! + None of the declarations in this object are really used. The only reason it + exists is to allow the repeat object to use a special input layer in its + internal networks which will cause add_tag_layer objects that happen to be + right at the input to not create copies of their input tensors. So + introducing the repeat_input_layer object allows us to optimize the + implementation of add_tag_layer for a special case that arises when it's + used in the context of the repeat layer. 
+            !*/
+        public:
+            typedef int input_type;
+
+            template <typename forward_iterator>
+            void to_tensor (
+                forward_iterator ,
+                forward_iterator ,
+                resizable_tensor&
+            ) const
+            {
+            }
+
+            friend void serialize(const repeat_input_layer&, std::ostream&){}
+            friend void deserialize(repeat_input_layer&, std::istream&){}
+            friend std::ostream& operator<<(std::ostream& out, const repeat_input_layer&) { return out; }
+        };
+
+        inline std::string tensor_to_str (
+            const tensor& t,
+            int& min_length
+        )
+        {
+            if (t.size() == 0)
+                return "";
+
+            std::ostringstream sout;
+            sout << "output size=(num:"<< t.num_samples() << ", ";
+            sout << "k:" << t.k() << ",";
+            while (sout.tellp() < 28) sout << " ";
+            sout << "nr:" << t.nr() << ",";
+            while (sout.tellp() < 28+8) sout << " ";
+            sout << "nc:" << t.nc() << ")";
+            while (sout.tellp() < min_length) sout << " ";
+            min_length = sout.tellp();
+            sout << "\t";
+            return sout.str();
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Tell us if T is one of the special layer types (i.e. add_layer, repeat, add_tag_layer, or
+    // add_skip_layer).
+    template <typename T> struct is_nonloss_layer_type : std::false_type {};
+    // Tell us if T is an instance of add_loss_layer.
+    template <typename T> struct is_loss_layer_type : std::false_type {};
+    // Tell us if T is an instance of add_layer
+    template <typename T> struct is_add_layer : std::false_type {};
+
+    namespace impl
+    {
+        template <size_t... indices, typename Tuple>
+        auto tuple_subset(
+            const Tuple& item,
+            compile_time_integer_list<indices...>
+        ) -> decltype(std::make_tuple(std::get<indices>(item)...))
+        {
+            return std::make_tuple(std::get<indices>(item)...);
+        }
+
+        template <typename Head, typename... Tail>
+        std::tuple<Tail...> basic_tuple_tail(
+            const std::tuple<Head, Tail...>& item
+        )
+        {
+            return tuple_subset(item, typename make_compile_time_integer_range<sizeof...(Tail)>::type());
+        }
+
+        template <typename T>
+        std::tuple<T> tuple_flatten(const T& t)
+        {
+            return std::make_tuple(t);
+        }
+
+        template <typename... T>
+        auto tuple_flatten(
+            const std::tuple<T...>& item
+        ) -> decltype(tuple_flatten(item, typename make_compile_time_integer_range<sizeof...(T)>::type()))
+        {
+            return tuple_flatten(item, typename make_compile_time_integer_range<sizeof...(T)>::type());
+        }
+
+        template <size_t... indices, typename... T>
+        auto tuple_flatten(
+            const std::tuple<T...>& item,
+            compile_time_integer_list<indices...>
+        ) -> decltype(std::tuple_cat(tuple_flatten(std::get<indices-1>(item))...))
+        {
+            return std::tuple_cat(tuple_flatten(std::get<indices-1>(item))...);
+        }
+
+        template <typename T>
+        struct tuple_head_helper
+        {
+            typedef T type;
+            static const type& get(const T& item)
+            {
+                return item;
+            }
+        };
+
+        template <typename T, typename... U>
+        struct tuple_head_helper<std::tuple<T, U...>>
+        {
+            typedef typename tuple_head_helper<T>::type type;
+            static const type& get(const std::tuple<T,U...>& item)
+            {
+                return tuple_head_helper<T>::get(std::get<0>(item));
+            }
+        };
+
+        template <typename T> struct alwaysbool { typedef bool type; };
+        // one more structure for VS 2015 UP3 support workaround
+        template <typename T> struct alwaysbool2 { typedef bool type; };
+
+        resizable_tensor& rt();
+
+        // The significance of a layer's backward method requiring forward's outputs is
+        // that such a layer can't have an in-place layer stacked on top of it because
+        // in-place layers overwrite the output of the layer they sit on top of.
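The overloads that follow (backward_requires_forward_output, has_inplace_backward, is_inplace_layer) probe which of two signature conventions a layer provides. Concretely, the conventions look like this on a layer class (hypothetical names; the authoritative interface spec is in layers_abstract.h):

    // Out-of-place convention: backward() receives the output computed by forward(),
    // so that tensor must be kept alive (backward_requires_forward_output -> true).
    class example_layer_
    {
    public:
        template <typename SUBNET>
        void backward(const tensor& computed_output, const tensor& gradient_input,
                      SUBNET& sub, tensor& params_grad);
    };

    // In-place convention: backward_inplace() writes straight into the subnetwork's
    // gradient tensor and never sees forward's output (has_inplace_backward -> true,
    // backward_requires_forward_output -> false).
    class example_inplace_layer_
    {
    public:
        void backward_inplace(const tensor& gradient_input, tensor& data_grad,
                              tensor& params_grad);
    };

Both conventions also come in a variant that takes computed_output as the leading argument; the presence or absence of that argument is exactly what backward_requires_forward_output detects.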
+ template <typename layer_type, typename SUBNET> + constexpr auto backward_requires_forward_output( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool<decltype(layer.backward(rt(),rt(),sub,rt()))>::type + { + return true; + } + + template <typename layer_type, typename SUBNET> + constexpr auto backward_requires_forward_output( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool<decltype(layer.backward(rt(),sub,rt()))>::type + { + return false; + } + + template <typename layer_type, typename SUBNET> + constexpr auto backward_requires_forward_output( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),rt(),sub.get_gradient_input(),rt()))>::type + { + return true; + } + + template <typename layer_type, typename SUBNET> + constexpr auto backward_requires_forward_output( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),sub.get_gradient_input(),rt()))>::type + { + return false; + } + + template <typename layer_type, typename SUBNET> + constexpr auto has_inplace_backward( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool2<decltype(layer.backward(rt(),rt(),sub,rt()))>::type + { + return false; + } + + template <typename layer_type, typename SUBNET> + constexpr auto has_inplace_backward( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool2<decltype(layer.backward(rt(),sub,rt()))>::type + { + return false; + } + + template <typename layer_type, typename SUBNET> + constexpr auto has_inplace_backward( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool2<decltype(layer.backward_inplace(rt(),rt(),sub.get_gradient_input(),rt()))>::type + { + return true; + } + + template <typename layer_type, typename SUBNET> + constexpr auto has_inplace_backward( + layer_type& layer, + SUBNET& sub + ) -> typename alwaysbool2<decltype(layer.backward_inplace(rt(),sub.get_gradient_input(),rt()))>::type + { + return true; + } + + template <typename layer_type, typename SUBNET> + constexpr auto is_inplace_layer( + layer_type& layer, + const SUBNET& sub + ) -> typename alwaysbool2<decltype(layer.forward(sub,rt()))>::type + { + return false; + } + + template <typename layer_type, typename SUBNET> + constexpr auto is_inplace_layer( + layer_type& layer, + const SUBNET& sub + ) -> typename alwaysbool<decltype(layer.forward_inplace(sub.get_output(),rt()))>::type + { + return true; + } + + template <typename layer_type, typename SUBNET> + auto call_layer_backward( + layer_type& layer, + const tensor& computed_output, + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ) -> decltype(layer.backward(computed_output,gradient_input,sub,params_grad)) + { + layer.backward(computed_output,gradient_input,sub,params_grad); + } + + template <typename layer_type, typename SUBNET> + auto call_layer_backward( + layer_type& layer, + const tensor& , + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ) -> decltype(layer.backward(gradient_input,sub,params_grad)) + { + layer.backward(gradient_input,sub,params_grad); + } + + template <typename layer_type, typename SUBNET> + auto call_layer_backward( + layer_type& layer, + const tensor& computed_output, + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ) -> decltype(layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad)) + { + layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad); + } + + template <typename layer_type, 
typename SUBNET> + auto call_layer_backward( + layer_type& layer, + const tensor& , + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ) -> decltype(layer.backward_inplace(gradient_input,sub.get_gradient_input(),params_grad)) + { + layer.backward_inplace(gradient_input,sub.get_gradient_input(),params_grad); + } + + + template <typename layer_type, typename SUBNET> + auto call_layer_forward( + layer_type& layer, + const SUBNET& sub, + tensor& /*data_output*/ + ) -> decltype(layer.forward(sub,rt())) + { + // This overload of call_layer_forward() is here because this template + // naturally gets instantiated but only on code paths that never get executed. + // So rather than writing a bunch of hard to read template magic around call + // sites we just have this overload that doesn't do anything (and an assert to + // make sure that's the case). + DLIB_CASSERT(false, "This should never happen"); + } + + template <typename layer_type, typename SUBNET> + auto call_layer_forward( + layer_type& layer, + const SUBNET& sub, + resizable_tensor& data_output + ) -> decltype(layer.forward(sub,data_output)) + { + layer.forward(sub,data_output); + } + + template <typename layer_type, typename SUBNET> + auto call_layer_forward( + layer_type& layer, + const SUBNET& sub, + tensor& data_output + ) -> decltype(layer.forward_inplace(sub.get_output(),data_output)) + { + layer.forward_inplace(sub.get_output(),data_output); + } + + template <typename layer_type, typename SUBNET> + auto call_layer_forward( + layer_type& layer, + const SUBNET& sub, + resizable_tensor& data_output + ) -> decltype(layer.forward_inplace(sub.get_output(),data_output)) + { + if (!have_same_dimensions(data_output, sub.get_output())) + data_output.copy_size(sub.get_output()); + layer.forward_inplace(sub.get_output(),static_cast<tensor&>(data_output)); + } + + + } // end namespace impl + + template <typename... T> + typename impl::tuple_head_helper<std::tuple<T...>>::type tuple_head ( + const std::tuple<T...>& item + ) + { + return impl::tuple_head_helper<std::tuple<T...>>::get(item); + } + + template <typename... 
T> + auto tuple_tail( + const std::tuple<T...>& item + ) -> decltype(impl::basic_tuple_tail(impl::tuple_flatten(item))) + { + return impl::basic_tuple_tail(impl::tuple_flatten(item)); + } + + inline std::tuple<> tuple_tail( + const std::tuple<>& item + ) + { + return item; + } +// ---------------------------------------------------------------------------------------- + + template <typename T> + class sstack + { + public: + typedef T value_type; + + sstack() = delete; + + sstack ( + T* data_, + size_t s + ) : data(data_), mysize(s) {} + + const T& top() const + { + DLIB_CASSERT(size() != 0, "You can't call top() on an empty stack"); + return *data; + } + T& top() + { + DLIB_CASSERT(size() != 0, "You can't call top() on an empty stack"); + return *data; + } + + size_t size() const { return mysize; } + + sstack pop(size_t num=1) + { + DLIB_CASSERT(num <= size(), "You can't pop more things from the stack than it has in it."); + return sstack(data+num, mysize-num); + } + + private: + + T* data; + size_t mysize; + }; + + template <typename T> + sstack<T> make_sstack(std::vector<T>& item) + { + return sstack<T>(item.data(), item.size()); + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + namespace dimpl + { + template <typename T, bool is_first = true, typename enabled=void> + class subnet_wrapper + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a tool that makes an add_layer or add_loss_layer object + expose only the part of its interface defined by the SUBNET + type in layers_abstract.h. This way, when we pass subnetwork + objects to the layer callbacks those callbacks won't be able to + interact with the subnetworks in a way other than specified + by the SUBNET interface spec. + + We also allow the top layer of a subnet_wrapper stack to call the + private_get_output() and private_get_gradient_input() functions. This + way, layers that have had their output/gradient overwritten by in-place + layers can only be accessed from the in-place layers that sit directly + on top of them since those in-place layers are the only layers that + know how to interact with them properly. + !*/ + + public: + subnet_wrapper(const subnet_wrapper&) = delete; + subnet_wrapper& operator=(const subnet_wrapper&) = delete; + + subnet_wrapper(T& l_, unsigned int sef) : l(l_),_sample_expansion_factor(sef) {} + // Not much here because in this case T is one of the input layer types + // that doesn't have anything in it. 
+ typedef T layer_details_type; + const layer_details_type& layer_details() const { return l; } + unsigned int sample_expansion_factor() const { return _sample_expansion_factor; } + private: + T& l; + unsigned int _sample_expansion_factor; + }; + + template <typename T> + class subnet_wrapper<T,true, typename std::enable_if<is_nonloss_layer_type<T>::value>::type> + { + + public: + subnet_wrapper(const subnet_wrapper&) = delete; + subnet_wrapper& operator=(const subnet_wrapper&) = delete; + + typedef T wrapped_type; + const static size_t num_computational_layers = T::num_computational_layers; + const static size_t num_layers = T::num_layers; + typedef typename T::layer_details_type layer_details_type; + + subnet_wrapper(T& l_, unsigned int = 0) : l(l_),subnetwork(l.subnet(), l.sample_expansion_factor()) {} + + const tensor& get_output() const { return l.private_get_output(); } + tensor& get_gradient_input() { return l.private_get_gradient_input(); } + + const layer_details_type& layer_details() const { return l.layer_details(); } + + const subnet_wrapper<typename T::subnet_type,false>& subnet() const { return subnetwork; } + subnet_wrapper<typename T::subnet_type,false>& subnet() { return subnetwork; } + unsigned int sample_expansion_factor() const { return l.sample_expansion_factor(); } + + private: + T& l; + subnet_wrapper<typename T::subnet_type,false> subnetwork; + }; + + template <typename T> + class subnet_wrapper<T,false, typename std::enable_if<is_nonloss_layer_type<T>::value>::type> + { + + public: + subnet_wrapper(const subnet_wrapper&) = delete; + subnet_wrapper& operator=(const subnet_wrapper&) = delete; + + typedef T wrapped_type; + const static size_t num_computational_layers = T::num_computational_layers; + const static size_t num_layers = T::num_layers; + typedef typename T::layer_details_type layer_details_type; + + subnet_wrapper(T& l_, unsigned int = 0) : l(l_),subnetwork(l.subnet(), l.sample_expansion_factor()) {} + + const tensor& get_output() const { return l.get_output(); } + tensor& get_gradient_input() { return l.get_gradient_input(); } + + const layer_details_type& layer_details() const { return l.layer_details(); } + + const subnet_wrapper<typename T::subnet_type,false>& subnet() const { return subnetwork; } + subnet_wrapper<typename T::subnet_type,false>& subnet() { return subnetwork; } + unsigned int sample_expansion_factor() const { return l.sample_expansion_factor(); } + + private: + T& l; + subnet_wrapper<typename T::subnet_type,false> subnetwork; + }; + } + +// ---------------------------------------------------------------------------------------- + + template <typename LAYER_DETAILS, typename SUBNET, typename enabled = void> + class add_layer; + + template <typename LAYER_DETAILS, typename SUBNET, typename enabled> + void serialize(const add_layer<LAYER_DETAILS,SUBNET,enabled>& item, std::ostream& out); + template <typename LAYER_DETAILS, typename SUBNET, typename enabled> + void deserialize(add_layer<LAYER_DETAILS,SUBNET,enabled>& item, std::istream& in); + + template <typename T, typename U> + struct is_nonloss_layer_type<add_layer<T,U>> : std::true_type {}; + + template <typename LAYER_DETAILS, typename SUBNET> + class add_layer<LAYER_DETAILS,SUBNET, + typename std::enable_if<is_nonloss_layer_type<SUBNET>::value>::type> + { + public: + typedef LAYER_DETAILS layer_details_type; + typedef SUBNET subnet_type; + typedef typename subnet_type::input_type input_type; + const static size_t num_layers = subnet_type::num_layers + 1; + const static size_t 
num_computational_layers = subnet_type::num_computational_layers + 1; + + add_layer( + ): + subnetwork(new subnet_type()), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + add_layer(const add_layer& item) + { + details = item.details; + subnetwork.reset(new subnet_type(*item.subnetwork)); + this_layer_setup_called = item.this_layer_setup_called; + gradient_input_is_stale = item.gradient_input_is_stale; + get_output_and_gradient_input_disabled = item.get_output_and_gradient_input_disabled; + x_grad = item.x_grad; + cached_output = item.cached_output; + params_grad = item.params_grad; + temp_tensor = item.temp_tensor; + } + add_layer& operator=(const add_layer& item) { add_layer(item).swap(*this); return *this;} + add_layer(add_layer&& item) : add_layer() { swap(item); } + add_layer& operator=(add_layer&& item) { swap(item); return *this; } + + template <typename T, typename U, typename E> + friend class add_layer; + template <typename T, bool is_first, typename E> + friend class dimpl::subnet_wrapper; + template <unsigned long T, typename U, typename E> + friend class add_tag_layer; + template <template<typename> class T, typename U> + friend class add_skip_layer; + template <size_t N, template<typename> class L, typename S> + friend class repeat; + + // Allow copying networks from one to another as long as their corresponding + // layers can be constructed from each other. + template <typename T, typename U, typename E> + add_layer( + const add_layer<T,U,E>& item + ) : + details(item.layer_details()), + subnetwork(new subnet_type(item.subnet())), + this_layer_setup_called(item.this_layer_setup_called), + gradient_input_is_stale(item.gradient_input_is_stale), + get_output_and_gradient_input_disabled(item.get_output_and_gradient_input_disabled), + x_grad(item.x_grad), + cached_output(item.cached_output) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + template <typename ...T> + add_layer( + const LAYER_DETAILS& layer_det, + T&& ...args + ) : + details(layer_det), + subnetwork(new subnet_type(std::forward<T>(args)...)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + template <typename T, typename ...U> + struct disable_forwarding_constr + { + const static bool value = std::is_constructible<LAYER_DETAILS,T>::value; + }; + template <typename ...T, typename ...U> + struct disable_forwarding_constr<std::tuple<T...>,U...> + { + const static bool value = disable_forwarding_constr<typename std::remove_reference<T>::type...>::value; + }; + template <typename T, typename ...U> + struct disable_forwarding_constr<std::tuple<T>,U...> + { + const static bool value = disable_forwarding_constr<typename std::remove_reference<T>::type>::value; + }; + template <typename ...U> + struct disable_forwarding_constr<std::tuple<>,U...> + { + const static bool value = true; + }; + template <typename ...T> + struct disable_forwarding_constr<add_layer<T...>> + { + const static bool value = true; + }; + + template < + typename ...T, + typename = typename std::enable_if<!disable_forwarding_constr<typename std::remove_reference<T>::type...>::value>::type + > + add_layer( + T&& ...args + ) : + subnetwork(new 
subnet_type(std::forward<T>(args)...)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + template <typename ...T> + add_layer( + LAYER_DETAILS&& layer_det, + T&& ...args + ) : + details(std::move(layer_det)), + subnetwork(new subnet_type(std::forward<T>(args)...)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + template <typename ...T, typename LD, typename ...U> + add_layer( + const std::tuple<LD,U...>& layer_det, + T&& ...args + ) : + details(tuple_head(layer_det)), + subnetwork(new subnet_type(tuple_tail(layer_det),std::forward<T>(args)...)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false) + { + if (this_layer_operates_inplace()) + subnetwork->disable_output_and_gradient_getters(); + } + + template <typename ...T, typename LD, typename ...U> + add_layer( + std::tuple<>, + const std::tuple<LD,U...>& layer_det, + T&& ...args + ) : add_layer(layer_det,args...) { } + + add_layer ( + std::tuple<> + ) : add_layer() {} + + template <typename ...T> + add_layer( + std::tuple<>, + LAYER_DETAILS&& layer_det, + T&& ...args + ) : add_layer(layer_det, args...) { } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + subnetwork->to_tensor(ibegin,iend,data); + } + + template <typename forward_iterator> + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + to_tensor(ibegin,iend,temp_tensor); + return forward(temp_tensor); + } + + + const tensor& operator() (const input_type& x) + { + return (*this)(&x, &x+1); + } + + const tensor& forward(const tensor& x) + { + subnetwork->forward(x); + const dimpl::subnet_wrapper<subnet_type> wsub(*subnetwork); + if (!this_layer_setup_called) + { + details.setup(wsub); + this_layer_setup_called = true; + } + if (this_layer_operates_inplace()) + impl::call_layer_forward(details, wsub, private_get_output()); + else + impl::call_layer_forward(details, wsub, cached_output); + + gradient_input_is_stale = true; + return private_get_output(); + } + + private: + tensor& private_get_output() const + { + if (const_cast<add_layer&>(*this).this_layer_operates_inplace()) + return subnetwork->private_get_output(); + else + return const_cast<resizable_tensor&>(cached_output); + } + tensor& private_get_gradient_input() + { + if (this_layer_operates_inplace()) + { + return subnetwork->private_get_gradient_input(); + } + else + { + if (gradient_input_is_stale) + { + gradient_input_is_stale = false; + x_grad.copy_size(private_get_output()); + x_grad = 0; + } + return x_grad; + } + } + void disable_output_and_gradient_getters ( + ) { get_output_and_gradient_input_disabled = true; } + public: + const tensor& get_output() const + { + if (get_output_and_gradient_input_disabled) + throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it."); + return private_get_output(); + } + tensor& get_gradient_input() + { + if (get_output_and_gradient_input_disabled) + throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it."); + return 
private_get_gradient_input(); + } + + const tensor& get_final_data_gradient( + ) const { return subnetwork->get_final_data_gradient(); } + + void back_propagate_error(const tensor& x) + { + back_propagate_error(x, private_get_gradient_input()); + } + void back_propagate_error(const tensor& x, const tensor& gradient_input) + { + dimpl::subnet_wrapper<subnet_type> wsub(*subnetwork); + params_grad.copy_size(details.get_layer_params()); + impl::call_layer_backward(details, private_get_output(), + gradient_input, wsub, static_cast<tensor&>(params_grad)); + + subnetwork->back_propagate_error(x); + + // zero out get_gradient_input() + gradient_input_is_stale = true; + } + + template <typename solver_type> + void update_parameters(sstack<solver_type> solvers, double learning_rate) + { + DLIB_CASSERT(solvers.size()>=num_computational_layers); + // Don't try to adjust the parameters if this layer doesn't have any or the + // learning rate is disabled for this layer. + if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0) + { + const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad)); + tt::add(details.get_layer_params(), details.get_layer_params(), step); + } + subnetwork->update_parameters(solvers.pop(), learning_rate); + } + + const tensor& get_parameter_gradient( + ) const { return params_grad; } + + tensor& get_parameter_gradient ( + ) { return params_grad; } + + const subnet_type& subnet() const { return *subnetwork; } + subnet_type& subnet() { return *subnetwork; } + + const layer_details_type& layer_details() const { return details; } + layer_details_type& layer_details() { return details; } + + unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } + + void clean() + { + x_grad.clear(); + cached_output.clear(); + params_grad.clear(); + temp_tensor.clear(); + gradient_input_is_stale = true; + subnetwork->clean(); + call_clean_method_if_exists(details); + } + + friend void serialize(const add_layer& item, std::ostream& out) + { + int version = 2; + serialize(version, out); + serialize(*item.subnetwork, out); + serialize(item.details, out); + serialize(item.this_layer_setup_called, out); + serialize(item.gradient_input_is_stale, out); + serialize(item.get_output_and_gradient_input_disabled, out); + serialize(item.x_grad, out); + serialize(item.cached_output, out); + serialize(item.params_grad, out); + } + + friend void deserialize(add_layer& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (!(1 <= version && version <= 2)) + throw serialization_error("Unexpected version found while deserializing dlib::add_layer."); + deserialize(*item.subnetwork, in); + deserialize(item.details, in); + deserialize(item.this_layer_setup_called, in); + deserialize(item.gradient_input_is_stale, in); + deserialize(item.get_output_and_gradient_input_disabled, in); + deserialize(item.x_grad, in); + deserialize(item.cached_output, in); + if (version == 2) + deserialize(item.params_grad, in); + } + + friend std::ostream& operator<< (std::ostream& out, const add_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<" << idx << ">\t" << impl::tensor_to_str(private_get_output(), min_length) << layer_details() << "\n"; + subnet().print(out, idx+1, min_length); + } + + private: + + bool this_layer_operates_inplace( + ) + { + // This layer can run in-place if it's 
an in-place capable layer and also if + // the layer it's on top of doesn't need its own output tensor (since in-place + // layers overwrite that tensor) + return impl::is_inplace_layer(details, *subnetwork) && !subnetwork->this_layer_requires_forward_output(); + } + bool this_layer_requires_forward_output( + ) + { + return impl::backward_requires_forward_output(details, *subnetwork); + } + + void swap(add_layer& item) + { + std::swap(subnetwork,item.subnetwork); + std::swap(details, item.details); + std::swap(this_layer_setup_called, item.this_layer_setup_called); + std::swap(gradient_input_is_stale, item.gradient_input_is_stale); + std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled); + std::swap(x_grad, item.x_grad); + std::swap(cached_output, item.cached_output); + std::swap(params_grad, item.params_grad); + } + + + LAYER_DETAILS details; + std::unique_ptr<subnet_type> subnetwork; + bool this_layer_setup_called; + bool gradient_input_is_stale; + bool get_output_and_gradient_input_disabled; + // Note that if this_layer_operates_inplace()==true then x_grad and cached_output + // are not used at all. Instead, this layer uses these variables from the lower + // layer. + resizable_tensor x_grad; + resizable_tensor cached_output; + + resizable_tensor params_grad; + + // temp_tensor doesn't logically contribute to the state of this object. + // It is here only to prevent it from being reallocated over and over. + resizable_tensor temp_tensor; + + }; + + template <typename T, typename U, typename E> + struct is_add_layer<add_layer<T,U,E>> : std::true_type {}; + template <typename T, typename U, typename E> + struct is_add_layer<const add_layer<T,U,E>> : std::true_type {}; + template <typename T, typename U, typename E> + struct is_add_layer<add_layer<T,U,E>&> : std::true_type {}; + template <typename T, typename U, typename E> + struct is_add_layer<const add_layer<T,U,E>&> : std::true_type {}; + +// ---------------------------------------------------------------------------------------- + +// This version of add_layer handles the special case where the subnetwork being given is +// just an input layer object. + template <typename LAYER_DETAILS, typename INPUT_LAYER, typename enabled> + class add_layer + { + public: + typedef LAYER_DETAILS layer_details_type; + typedef INPUT_LAYER subnet_type; + typedef typename INPUT_LAYER::input_type input_type; + const static size_t num_layers = 2; + const static size_t num_computational_layers = 1; + + add_layer( + ): + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(0) + {} + + add_layer(const add_layer&) = default; + add_layer(add_layer&& item) : add_layer() { swap(item); } + add_layer& operator=(const add_layer&) = default; + add_layer& operator=(add_layer&& item) { swap(item); return *this; } + + template <typename T, typename U, typename E> + friend class add_layer; + template <typename T, bool is_first, typename E> + friend class dimpl::subnet_wrapper; + template <unsigned long T, typename U, typename E> + friend class add_tag_layer; + template <template<typename> class T, typename U> + friend class add_skip_layer; + template <size_t N, template<typename> class L, typename S> + friend class repeat; + + // Allow copying networks from one to another as long as their corresponding + // layers can be constructed from each other. 
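        // For example (a sketch; fc, relu, bn_con, affine, and input are layer
        // templates from dlib's layers.h, and affine_ is constructible from bn_),
        // this constructor family is what lets a deployment network be built from
        // a trained one:
        //
        //     using train_net  = fc<10, relu<bn_con<fc<84, input<matrix<float>>>>>>;
        //     using deploy_net = fc<10, relu<affine<fc<84, input<matrix<float>>>>>>;
        //
        //     train_net tnet;         // ... train tnet ...
        //     deploy_net dnet(tnet);  // each layer copy-converts; bn_ -> affine_ here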
+ template <typename T, typename U, typename E> + add_layer( + const add_layer<T,U,E>& item + ): + input_layer(item.subnet()), + details(item.layer_details()), + this_layer_setup_called(item.this_layer_setup_called), + gradient_input_is_stale(item.gradient_input_is_stale), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(item._sample_expansion_factor), + x_grad(item.x_grad), + cached_output(item.cached_output), + grad_final(item.grad_final) + { + } + + add_layer( + const LAYER_DETAILS& layer_det + ) : + details(layer_det), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(0) + {} + + add_layer( + const INPUT_LAYER& il + ) : + input_layer(il), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(0) + {} + + add_layer( + LAYER_DETAILS&& layer_det + ) : + details(std::move(layer_det)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(0) + {} + + add_layer( + LAYER_DETAILS layer_det, + INPUT_LAYER il + ) : + details(std::move(layer_det)), + input_layer(std::move(il)), + this_layer_setup_called(false), + gradient_input_is_stale(true), + get_output_and_gradient_input_disabled(false), + _sample_expansion_factor(0) + {} + + add_layer( + std::tuple<>, + const LAYER_DETAILS& layer_det + ) : add_layer(layer_det) {} + + add_layer( + std::tuple<>, + LAYER_DETAILS&& layer_det + ) : add_layer(layer_det) {} + + add_layer( + std::tuple<>, + LAYER_DETAILS layer_det, + INPUT_LAYER il + ) : add_layer(layer_det,il) {} + + add_layer( + const std::tuple<LAYER_DETAILS>& layer_det + ) : add_layer(tuple_head(layer_det)) {} + + add_layer( + const std::tuple<LAYER_DETAILS>& layer_det, + INPUT_LAYER il + ) : add_layer(tuple_head(layer_det),il) {} + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + input_layer.to_tensor(ibegin, iend, data); + // make sure the input layer's to_tensor() function is implemented properly. 
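            // For example, an input layer that emitted, say, 5 jittered crops per
            // input image (a hypothetical layer) would produce
            // data.num_samples() == 5*std::distance(ibegin,iend) here, and the
            // division below would record a sample expansion factor of 5.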
+ DLIB_CASSERT(data.num_samples() >= std::distance(ibegin,iend), + "The input layer can't produce fewer output tensors than there are inputs."); + DLIB_CASSERT(data.num_samples()%std::distance(ibegin,iend) == 0, + "The number of tensors produced by the input layer must be an integer multiple of the number of input objects."); + + _sample_expansion_factor = data.num_samples()/std::distance(ibegin,iend); + data.async_copy_to_device(); + } + + + template <typename forward_iterator> + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + to_tensor(ibegin,iend,temp_tensor); + return forward(temp_tensor); + } + + + const tensor& operator() (const input_type& x) + { + return (*this)(&x, &x+1); + } + + const tensor& forward (const tensor& x) + { + DLIB_CASSERT(sample_expansion_factor() != 0, "You must call to_tensor() before this function can be used."); + DLIB_CASSERT(x.num_samples()%sample_expansion_factor() == 0); + subnet_wrapper wsub(x, grad_final, _sample_expansion_factor); + if (!this_layer_setup_called) + { + details.setup(wsub); + this_layer_setup_called = true; + } + impl::call_layer_forward(details, wsub, cached_output); + gradient_input_is_stale = true; + return private_get_output(); + } + + private: + tensor& private_get_output() const { return const_cast<resizable_tensor&>(cached_output); } + tensor& private_get_gradient_input() + { + if (gradient_input_is_stale) + { + gradient_input_is_stale = false; + x_grad.copy_size(private_get_output()); + x_grad = 0; + } + return x_grad; + } + void disable_output_and_gradient_getters ( + ) { get_output_and_gradient_input_disabled = true; } + public: + const tensor& get_output() const + { + if (get_output_and_gradient_input_disabled) + throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it."); + return private_get_output(); + } + tensor& get_gradient_input() + { + if (get_output_and_gradient_input_disabled) + throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it."); + return private_get_gradient_input(); + } + + const tensor& get_final_data_gradient( + ) const { return grad_final; } + + void back_propagate_error(const tensor& x) + { + back_propagate_error(x, private_get_gradient_input()); + } + void back_propagate_error(const tensor& x, const tensor& gradient_input) + { + // make sure grad_final is initialized to 0 + if (!have_same_dimensions(x, grad_final)) + grad_final.copy_size(x); + grad_final = 0; + + subnet_wrapper wsub(x, grad_final, _sample_expansion_factor); + params_grad.copy_size(details.get_layer_params()); + impl::call_layer_backward(details, private_get_output(), + gradient_input, wsub, static_cast<tensor&>(params_grad)); + + // zero out get_gradient_input() + gradient_input_is_stale = true; + } + + template <typename solver_type> + void update_parameters(sstack<solver_type> solvers, double learning_rate) + { + DLIB_CASSERT(solvers.size()>=num_computational_layers); + // Don't try to adjust the parameters if this layer doesn't have any or the + // learning rate is disabled for this layer. 
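            // (For context: solvers.top() is a solver object such as dlib's sgd, and
            // its operator() returns the step tensor that gets added to the parameters
            // below. The step already folds in the learning rate, this layer's
            // learning-rate and weight-decay multipliers, and any momentum state the
            // solver keeps.)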
+ if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0) + { + const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad)); + tt::add(details.get_layer_params(), details.get_layer_params(), step); + } + } + + const tensor& get_parameter_gradient( + ) const { return params_grad; } + + tensor& get_parameter_gradient ( + ) { return params_grad; } + + const subnet_type& subnet() const { return input_layer; } + subnet_type& subnet() { return input_layer; } + + const layer_details_type& layer_details() const { return details; } + layer_details_type& layer_details() { return details; } + + unsigned int sample_expansion_factor() const { return _sample_expansion_factor; } + + void clean() + { + x_grad.clear(); + grad_final.clear(); + cached_output.clear(); + params_grad.clear(); + temp_tensor.clear(); + gradient_input_is_stale = true; + call_clean_method_if_exists(details); + } + + friend void serialize(const add_layer& item, std::ostream& out) + { + int version = 3; + serialize(version, out); + serialize(item.input_layer, out); + serialize(item.details, out); + serialize(item.this_layer_setup_called, out); + serialize(item.gradient_input_is_stale, out); + serialize(item.get_output_and_gradient_input_disabled, out); + serialize(item.x_grad, out); + serialize(item.cached_output, out); + serialize(item.grad_final, out); + serialize(item._sample_expansion_factor, out); + } + + friend void deserialize(add_layer& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (!(2 <= version && version <= 3)) + throw serialization_error("Unexpected version found while deserializing dlib::add_layer."); + deserialize(item.input_layer, in); + deserialize(item.details, in); + deserialize(item.this_layer_setup_called, in); + deserialize(item.gradient_input_is_stale, in); + deserialize(item.get_output_and_gradient_input_disabled, in); + deserialize(item.x_grad, in); + deserialize(item.cached_output, in); + deserialize(item.grad_final, in); + if (version >= 3) + deserialize(item._sample_expansion_factor, in); + else + item._sample_expansion_factor = 1; // all layer types set this to 1 in older dlib versions, so that's what we put here. + } + + friend std::ostream& operator<< (std::ostream& out, const add_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<" << idx << ">\t" << impl::tensor_to_str(private_get_output(), min_length) << layer_details() << "\n"; + + // Don't print the repeat_input_layer since it doesn't exist from the user's + // point of view. It's just an artifact of how repeat<> works. 
+ if (!std::is_same<subnet_type, impl::repeat_input_layer>::value) + out << "layer<" << idx+1 << ">\t" << subnet() << "\n"; + } + + private: + + bool this_layer_requires_forward_output( + ) + { + subnet_wrapper wsub(grad_final, grad_final, _sample_expansion_factor); + return impl::backward_requires_forward_output(details, wsub); + } + + class subnet_wrapper + { + public: + subnet_wrapper(const tensor& x_, resizable_tensor& grad_final_, unsigned int sef) : + x(x_), grad_final(grad_final_), _sample_expansion_factor(sef) {} + + subnet_wrapper(const subnet_wrapper&) = delete; + subnet_wrapper& operator=(const subnet_wrapper&) = delete; + + unsigned int sample_expansion_factor() const { return _sample_expansion_factor;} + const tensor& get_output() const { return x; } + tensor& get_gradient_input() + { + if (!have_same_dimensions(x, grad_final)) + { + grad_final.copy_size(x); + grad_final = 0; + } + return grad_final; + } + + private: + const tensor& x; + resizable_tensor& grad_final; + unsigned int _sample_expansion_factor; + }; + + void swap(add_layer& item) + { + std::swap(input_layer, item.input_layer); + std::swap(details, item.details); + std::swap(this_layer_setup_called, item.this_layer_setup_called); + std::swap(gradient_input_is_stale, item.gradient_input_is_stale); + std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled); + std::swap(x_grad, item.x_grad); + std::swap(cached_output, item.cached_output); + std::swap(grad_final, item.grad_final); + std::swap(_sample_expansion_factor, item._sample_expansion_factor); + } + + subnet_type input_layer; + LAYER_DETAILS details; + bool this_layer_setup_called; + bool gradient_input_is_stale; + bool get_output_and_gradient_input_disabled; + mutable unsigned int _sample_expansion_factor; + resizable_tensor x_grad; + resizable_tensor cached_output; + resizable_tensor grad_final; + + // The following 2 objects don't logically contribute to the state of this class. + // They are only here to prevent them from being reallocated over and over in + // member functions. + resizable_tensor params_grad; + resizable_tensor temp_tensor; + }; + +// ---------------------------------------------------------------------------------------- + + template <unsigned long ID, typename SUBNET, typename enabled=void> + class add_tag_layer; + + template <template<typename SUBNET> class tag> + struct tag_id + { + const static unsigned long id = tag<impl::repeat_input_layer>::id; + }; + + template <unsigned long ID, typename SUBNET> + class add_tag_layer<ID,SUBNET, + typename std::enable_if<is_nonloss_layer_type<SUBNET>::value>::type> + { + public: + typedef SUBNET subnet_type; + typedef typename subnet_type::input_type input_type; + typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper. + const static size_t num_layers = subnet_type::num_layers + 1; + const static size_t num_computational_layers = subnet_type::num_computational_layers; + const static unsigned long id = ID; + + add_tag_layer() {}; + add_tag_layer(const add_tag_layer&) = default; + add_tag_layer(add_tag_layer&&) = default; + add_tag_layer& operator=(add_tag_layer&&) = default; + add_tag_layer& operator=(const add_tag_layer&) = default; + + template <typename T> + add_tag_layer( + const add_tag_layer<ID,T>& item + ) : subnetwork(item.subnet()) + {} + + template <typename ...T> + add_tag_layer( + T ...args + ) : + subnetwork(std::move(args)...) 
+ { + } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + subnetwork.to_tensor(ibegin,iend,data); + } + + template <typename forward_iterator> + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + return subnetwork(ibegin,iend); + } + + const tensor& operator() (const input_type& x) + { + return subnetwork(x); + } + + const tensor& forward(const tensor& x) + { + return subnetwork.forward(x); + } + + const tensor& get_output() const { return subnetwork.get_output(); } + + tensor& get_gradient_input() + { + return subnetwork.get_gradient_input(); + } + + const tensor& get_final_data_gradient( + ) const { return subnetwork.get_final_data_gradient(); } + + void back_propagate_error(const tensor& x) + { + subnetwork.back_propagate_error(x); + } + void back_propagate_error(const tensor& x, const tensor& gradient_input) + { + subnetwork.back_propagate_error(x,gradient_input); + } + + template <typename solver_type> + void update_parameters(sstack<solver_type> solvers, double learning_rate) + { + subnetwork.update_parameters(solvers, learning_rate); + } + + const tensor& get_parameter_gradient( + ) const { return params_grad; } + + tensor& get_parameter_gradient ( + ) { return params_grad; } + + const subnet_type& subnet() const { return subnetwork; } + subnet_type& subnet() { return subnetwork; } + + unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } + + void clean() + { + subnetwork.clean(); + } + + friend void serialize(const add_tag_layer& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.subnetwork, out); + } + + friend void deserialize(add_tag_layer& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::add_tag_layer."); + deserialize(item.subnetwork, in); + } + + friend std::ostream& operator<< (std::ostream& out, const add_tag_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<" << idx << ">\t" << impl::tensor_to_str(private_get_output(), min_length) << "tag" << ID << "\n"; + subnet().print(out, idx+1, min_length); + } + + private: + + template <typename T, typename U, typename E> + friend class add_layer; + template <typename T, bool is_first, typename E> + friend class dimpl::subnet_wrapper; + template <unsigned long T, typename U, typename E> + friend class add_tag_layer; + template <template<typename> class T, typename U> + friend class add_skip_layer; + template <size_t N, template<typename> class L, typename S> + friend class repeat; + + // You wouldn't put a tag on a layer if you didn't want to access its forward + // outputs. So this is always true. + bool this_layer_requires_forward_output( + ) { return true; } + + void disable_output_and_gradient_getters ( + ) + { + // This should never happen because only inplace layers call + // disable_output_and_gradient_getters(), however, putting a tag layer right + // before an inplace layer basically means you don't want the following layer + // to operate in place. So the inplace layer should turn itself into an + // out-of-place layer and not call disable_output_and_gradient_getters(). 
+ DLIB_CASSERT(false,"This should never happen"); + } + + tensor& private_get_output() const + { return subnetwork.private_get_output(); } + tensor& private_get_gradient_input() + { return subnetwork.private_get_gradient_input(); } + + subnet_type subnetwork; + + // This member doesn't logically contribute to the state of the object since it is + // always empty. It's just here so we can have the get_parameter_gradient() methods + // which have to return something. So they return this empty tensor. + resizable_tensor params_grad; + }; + +// ---------------------------------------------------------------------------------------- + + template <typename ...T> + struct decorator_repeat_group + { + decorator_repeat_group( + T&& ...args + ) : data(std::forward<T>(args)...) {} + + std::tuple<T...> data; + }; + template <typename ...T> + decorator_repeat_group<T...> repeat_group ( + T&& ...args + ) + { + return decorator_repeat_group<T...>(std::forward<T>(args)...); + } + + template < + size_t num, + template<typename> class REPEATED_LAYER, + typename SUBNET + > + class repeat + { + static_assert(num > 0, "You can't have a layer repeated 0 times."); + public: + typedef SUBNET subnet_type; + typedef typename SUBNET::input_type input_type; + typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper. + const static size_t comp_layers_in_each_group = (REPEATED_LAYER<SUBNET>::num_computational_layers-SUBNET::num_computational_layers); + const static size_t comp_layers_in_repeated_group = comp_layers_in_each_group*num; + const static size_t num_computational_layers = comp_layers_in_repeated_group + SUBNET::num_computational_layers; + + const static size_t layers_in_each_group = (REPEATED_LAYER<SUBNET>::num_layers-SUBNET::num_layers); + const static size_t layers_in_repeated_group = layers_in_each_group*num; + const static size_t num_layers = subnet_type::num_layers + layers_in_repeated_group; + + + typedef REPEATED_LAYER<impl::repeat_input_layer> repeated_layer_type; + + repeat( + ) : + details(num) + { + } + + size_t num_repetitions ( + ) const { return num; } + + const repeated_layer_type& get_repeated_layer ( + size_t i + ) const + { + DLIB_CASSERT(i < num_repetitions()); + return details[i]; + } + + repeated_layer_type& get_repeated_layer ( + size_t i + ) + { + DLIB_CASSERT(i < num_repetitions()); + return details[i]; + } + + repeat(const repeat&) = default; + repeat(repeat&&) = default; + repeat& operator=(repeat&&) = default; + repeat& operator=(const repeat&) = default; + + template <template<typename> class T, typename U> + repeat( + const repeat<num,T,U>& item + ) : + subnetwork(item.subnetwork) + { + for (auto&& d : item.details) + details.emplace_back(d); + } + + template <typename T, typename ...U> + repeat( + T arg1, + U ...args2 + ): + details(num, std::move(arg1)), + subnetwork(std::move(args2)...) + { + } + + template <typename ...T, typename ...U> + repeat( + decorator_repeat_group<T...>&& arg1, + U ...args2 + ): + details(num, arg1.data), + subnetwork(std::move(args2)...) + { + } + + template <typename T, typename ...U> + repeat( + std::tuple<>, + T arg1, + U ...args2 + ): + details(num, std::move(arg1)), + subnetwork(std::move(args2)...) 
+ { + } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + subnetwork.to_tensor(ibegin,iend,data); + // call to_tensor on the networks in details just to populate the + // _sample_expansion_factor values in those networks. Other than that this + // call is a noop. + for (auto& d : details) + d.to_tensor(ibegin, iend, data); + } + + template <typename forward_iterator> + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + to_tensor(ibegin,iend,temp_tensor); + return forward(temp_tensor); + } + + const tensor& operator() (const input_type& x) + { + return (*this)(&x, &x+1); + } + + const tensor& forward(const tensor& x) + { + subnetwork.forward(x); + details[details.size()-1].forward(subnetwork.get_output()); + for (long i = details.size()-2; i >= 0; --i) + details[i].forward(details[i+1].get_output()); + return private_get_output(); + } + + private: + tensor& private_get_output() const + { + return details[0].private_get_output(); + } + tensor& private_get_gradient_input() + { + return details[0].private_get_gradient_input(); + } + public: + const tensor& get_output() const + { + return details[0].get_output(); + } + tensor& get_gradient_input() + { + return details[0].get_gradient_input(); + } + + const tensor& get_parameter_gradient( + ) const { return details[0].get_parameter_gradient(); } + + tensor& get_parameter_gradient ( + ) { return details[0].get_parameter_gradient(); } + + void back_propagate_error(const tensor& x) + { + back_propagate_error(x, private_get_gradient_input()); + } + void back_propagate_error(const tensor& x, const tensor& gradient_input) + { + if (details.size() > 1) + { + details[0].back_propagate_error(details[1].get_output(), gradient_input); + for (size_t i = 1; i < details.size(); ++i) + { + if (i+1 < details.size()) + details[i].back_propagate_error(details[i+1].get_output(), details[i-1].get_final_data_gradient()); + else + details[i].back_propagate_error(subnetwork.get_output(), details[i-1].get_final_data_gradient()); + } + } + else + { + details[0].back_propagate_error(subnetwork.get_output(), gradient_input); + } + subnetwork.back_propagate_error(x, details.back().get_final_data_gradient()); + } + + template <typename solver_type> + void update_parameters(sstack<solver_type> solvers, double learning_rate) + { + for (size_t i = 0; i < details.size(); ++i) + details[i].update_parameters(solvers.pop(comp_layers_in_each_group*i),learning_rate); + subnetwork.update_parameters(solvers.pop(comp_layers_in_each_group*details.size()),learning_rate); + } + + const subnet_type& subnet() const { return subnetwork; } + subnet_type& subnet() { return subnetwork; } + + unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } + + void clean() + { + temp_tensor.clear(); + subnetwork.clean(); + for (auto&& d : details) + d.clean(); + } + + friend void serialize(const repeat& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.details, out); + serialize(item.subnetwork, out); + } + + friend void deserialize(repeat& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::repeat."); + deserialize(item.details, in); + deserialize(item.subnetwork, in); + } + + friend std::ostream& operator<< (std::ostream& out, const repeat& item) + { + int min_length 
= 0;
+            item.print(out, 0, min_length);
+            return out;
+        }
+
+        void print (std::ostream& out, unsigned long idx, int& min_length) const
+        {
+            for (size_t i = 0; i < num_repetitions(); ++i)
+            {
+                get_repeated_layer(i).print(out, idx, min_length);
+                idx += layers_in_each_group;
+            }
+            subnet().print(out, idx, min_length);
+        }
+    private:
+
+
+        template <typename T, typename U, typename E>
+        friend class add_layer;
+        template <typename T, bool is_first, typename E>
+        friend class dimpl::subnet_wrapper;
+        template <unsigned long T, typename U, typename E>
+        friend class add_tag_layer;
+        template <template<typename> class T, typename U>
+        friend class add_skip_layer;
+        template <size_t N, template<typename> class L, typename S>
+        friend class repeat;
+
+        bool this_layer_requires_forward_output(
+        )
+        {
+            return details[0].this_layer_requires_forward_output();
+        }
+
+        void disable_output_and_gradient_getters (
+        )
+        {
+            details[0].disable_output_and_gradient_getters();
+        }
+
+
+        std::vector<repeated_layer_type> details;
+        subnet_type subnetwork;
+
+        // temp_tensor doesn't logically contribute to the state of this class.
+        // It is here only to avoid needing to reallocate it over and over.
+        resizable_tensor temp_tensor;
+    };
+
+    template <
+        size_t num,
+        template<typename> class REPEATED_LAYER,
+        typename SUBNET
+        >
+    struct is_nonloss_layer_type<repeat<num,REPEATED_LAYER,SUBNET>> : std::true_type {};
+
+// ----------------------------------------------------------------------------------------
+
+// This version of add_tag_layer handles the special case where the subnetwork being given
+// is just an input layer object.
+    template <unsigned long ID, typename INPUT_LAYER, typename enabled>
+    class add_tag_layer
+    {
+    public:
+        typedef INPUT_LAYER subnet_type;
+        typedef typename subnet_type::input_type input_type;
+        typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper.
+        const static size_t num_computational_layers = 0;
+        const static size_t num_layers = 2;
+        const static unsigned long id = ID;
+
+        add_tag_layer():cached_output_ptr(nullptr),gradient_input_is_stale(true),_sample_expansion_factor(0) {}
+
+        add_tag_layer(const add_tag_layer&) = default;
+        add_tag_layer& operator=(const add_tag_layer&) = default;
+        add_tag_layer(add_tag_layer&& item) : add_tag_layer() { swap(item); }
+        add_tag_layer& operator=(add_tag_layer&& item) { swap(item); return *this; }
+
+        template <typename T, typename E>
+        add_tag_layer(
+            const add_tag_layer<ID,T,E>& item
+        ) : input_layer(item.subnet()),
+            cached_output(item.cached_output),
+            cached_output_ptr(nullptr),
+            grad_final(item.grad_final),
+            gradient_input_is_stale(item.gradient_input_is_stale),
+            _sample_expansion_factor(0)
+        {}
+
+        template <typename ...T>
+        add_tag_layer(
+            T ...args
+        ) :
+            input_layer(std::move(args)...),
+            cached_output_ptr(nullptr),
+            gradient_input_is_stale(true),
+            _sample_expansion_factor(0)
+        {
+        }
+
+        add_tag_layer (
+            std::tuple<>
+        ) :
+            cached_output_ptr(nullptr),
+            gradient_input_is_stale(true),
+            _sample_expansion_factor(0)
+        {}
+
+        template <typename forward_iterator>
+        void to_tensor (
+            forward_iterator ibegin,
+            forward_iterator iend,
+            resizable_tensor& data
+        ) const
+        {
+            input_layer.to_tensor(ibegin,iend,data);
+
+            // make sure the input layer's to_tensor() function is implemented properly.
+ DLIB_CASSERT(data.num_samples() >= std::distance(ibegin,iend), + "The input layer can't produce fewer output tensors than there are inputs."); + DLIB_CASSERT(data.num_samples()%std::distance(ibegin,iend) == 0, + "The number of tensors produced by the input layer must be an integer multiple of the number of input objects."); + + _sample_expansion_factor = data.num_samples()/std::distance(ibegin,iend); + data.async_copy_to_device(); + } + + unsigned int sample_expansion_factor() const { return _sample_expansion_factor; } + + template <typename forward_iterator> + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + input_layer.to_tensor(ibegin,iend,cached_output); + cached_output_ptr = nullptr; + return get_output(); + } + + const tensor& operator() (const input_type& x) + { + return (*this)(&x, &x+1); + } + + const tensor& forward(const tensor& x) + { + // If this tag is the first layer in one of the sub networks inside a repeat + // layer then we don't want it to be creating copies of x. This is because, we + // can just hold a pointer to x since the way repeat is constructed guarantees + // that x will have a lifetime larger than this pointer. + if (is_same_type<INPUT_LAYER, impl::repeat_input_layer>::value) + cached_output_ptr = const_cast<tensor*>(&x); + else + cached_output = x; + gradient_input_is_stale = true; + return get_output(); + } + + const tensor& get_output() const + { + if (cached_output_ptr) + return *cached_output_ptr; + else + return cached_output; + } + + const tensor& get_final_data_gradient( + ) const { return grad_final; } + + tensor& get_gradient_input() + { + if (!have_same_dimensions(get_output(), grad_final) || + gradient_input_is_stale) + { + grad_final.copy_size(get_output()); + grad_final = 0; + gradient_input_is_stale = false; + } + return grad_final; + } + + void back_propagate_error(const tensor& /*x*/) + { + // nothing to do + } + void back_propagate_error(const tensor& /*x*/, const tensor& /*gradient_input*/) + { + // nothing to do + } + + template <typename solver_type> + void update_parameters(sstack<solver_type> /*solvers*/, double /*learning_rate*/) + { + // nothing to do + } + + const subnet_type& subnet() const { return input_layer; } + subnet_type& subnet() { return input_layer; } + + void clean() + { + grad_final.clear(); + cached_output.clear(); + cached_output_ptr = 0; + } + + friend void serialize(const add_tag_layer& item, std::ostream& out) + { + int version = 2; + serialize(version, out); + serialize(item.input_layer, out); + serialize(item.cached_output, out); + serialize(item.grad_final, out); + serialize(item.gradient_input_is_stale, out); + serialize(item._sample_expansion_factor, out); + } + + friend void deserialize(add_tag_layer& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (!(1 <= version && version <= 2)) + throw serialization_error("Unexpected version found while deserializing dlib::add_tag_layer."); + deserialize(item.input_layer, in); + deserialize(item.cached_output, in); + deserialize(item.grad_final, in); + deserialize(item.gradient_input_is_stale, in); + item.cached_output_ptr = nullptr; + if (version >= 2) + deserialize(item._sample_expansion_factor, in); + else + item._sample_expansion_factor = 1; // all layer types set this to 1 in older dlib versions, so that's what we put here. 
+ + } + + friend std::ostream& operator<< (std::ostream& out, const add_tag_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<"<<idx << ">\t"<<impl::tensor_to_str(private_get_output(), min_length)<< "tag" << ID << "\n"; + // Don't print the repeat_input_layer since it doesn't exist from the user's + // point of view. It's just an artifact of how repeat<> works. + if (!std::is_same<subnet_type, impl::repeat_input_layer>::value) + out << "layer<"<< idx+1 << ">\t" << subnet() << "\n"; + } + + private: + + template <typename T, typename U, typename E> + friend class add_layer; + template <typename T, bool is_first, typename E> + friend class dimpl::subnet_wrapper; + template <unsigned long T, typename U, typename E> + friend class add_tag_layer; + template <template<typename> class T, typename U> + friend class add_skip_layer; + template <size_t N, template<typename> class L, typename S> + friend class repeat; + + // You wouldn't put a tag on a layer if you didn't want to access its forward + // outputs. So this is always true. + bool this_layer_requires_forward_output( + ) { return true; } + + void disable_output_and_gradient_getters ( + ) + { + // This should never happen because only inplace layers call + // disable_output_and_gradient_getters(), however, putting a tag layer right + // before an inplace layer basically means you don't want the following layer + // to operate in place. So the inplace layer should turn itself into an + // out-of-place layer and not call disable_output_and_gradient_getters(). + DLIB_CASSERT(false,"This should never happen"); + } + + tensor& private_get_output() const + { return const_cast<tensor&>(get_output()); } + tensor& private_get_gradient_input() + { return get_gradient_input(); } + + void swap(add_tag_layer& item) + { + std::swap(input_layer, item.input_layer); + std::swap(cached_output, item.cached_output); + std::swap(cached_output_ptr, item.cached_output_ptr); + std::swap(grad_final, item.grad_final); + std::swap(gradient_input_is_stale, item.gradient_input_is_stale); + std::swap(_sample_expansion_factor, item._sample_expansion_factor); + } + + subnet_type input_layer; + resizable_tensor cached_output; + tensor* cached_output_ptr; + resizable_tensor grad_final; + bool gradient_input_is_stale; + mutable unsigned int _sample_expansion_factor; + }; + + template <unsigned long ID, typename U, typename E> + struct is_nonloss_layer_type<add_tag_layer<ID,U,E>> : std::true_type {}; + + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + template <typename LOSS_DETAILS, typename SUBNET> + class add_loss_layer; + + class no_label_type + { + private: + // We don't want anyone making these no_label_type objects. They are here only to + // allow add_loss_layer::training_label_type and dnn_trainer::training_label_type + // to exist which avoids needing to overload add_loss_layer and dnn_trainer for + // supervised and unsupervised losses. It also can be a type to use in template + // metaprogramming to indicate "no label". So here we make the constructor private + // with the exception that add_loss_layer objects can make it (again, just to + // simplify add_loss_layer's implementation).
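Because unsupervised losses expose no_label_type as their training_label_type, generic code can detect them at compile time rather than carrying parallel overload sets. A minimal sketch (the trait name is illustrative, not part of dlib):

#include <dlib/dnn.h>
#include <type_traits>

// True when net_type's loss takes no labels (e.g. an autoencoder-style loss).
template <typename net_type>
struct is_unsupervised_net
    : std::is_same<typename net_type::training_label_type, dlib::no_label_type> {};

This is the same convention that lets dnn_trainer offer label-free training calls for such losses.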
+ no_label_type(){}; + template <typename LOSS_DETAILS, typename SUBNET> friend class add_loss_layer; + template < typename net_type, typename solver_type > friend class dnn_trainer; + }; + +// ---------------------------------------------------------------------------------------- + + template <typename LOSS_DETAILS, typename SUBNET> + class add_loss_layer + { + template <typename T, typename enabled=void> + struct get_loss_layer_training_label_type + { + typedef no_label_type type; + }; + template <typename T> + struct get_loss_layer_training_label_type<T,typename std::enable_if<sizeof(typename T::training_label_type)!=0>::type> + { + typedef typename T::training_label_type type; + }; + + template <typename T, typename enabled=void> + struct get_loss_layer_output_label_type + { + typedef no_label_type type; + }; + template <typename T> + struct get_loss_layer_output_label_type<T,typename std::enable_if<sizeof(typename T::output_label_type)!=0>::type> + { + typedef typename T::output_label_type type; + }; + + public: + typedef LOSS_DETAILS loss_details_type; + typedef SUBNET subnet_type; + typedef typename subnet_type::input_type input_type; + const static size_t num_layers = subnet_type::num_layers + 1; + // Note that the loss layer doesn't count as an additional computational layer. + const static size_t num_computational_layers = subnet_type::num_computational_layers; + typedef typename get_loss_layer_training_label_type<LOSS_DETAILS>::type training_label_type; + typedef typename get_loss_layer_output_label_type<LOSS_DETAILS>::type output_label_type; + + static_assert(is_nonloss_layer_type<SUBNET>::value, + "SUBNET must be of type add_layer, add_skip_layer, or add_tag_layer."); + + + add_loss_layer() {}; + add_loss_layer(const add_loss_layer&) = default; + add_loss_layer& operator=(const add_loss_layer&) = default; + add_loss_layer(add_loss_layer&& item) : add_loss_layer() { swap(item); } + add_loss_layer& operator=(add_loss_layer&& item) { swap(item); return *this; } + + template <typename T, typename U> + add_loss_layer( + const add_loss_layer<T,U>& item + ) : + loss(item.loss_details()), + subnetwork(item.subnet()) + {} + + template <typename ...T> + add_loss_layer( + const LOSS_DETAILS& layer_det, + T&& ...args + ) : + loss(layer_det), + subnetwork(std::forward<T>(args)...) + { + } + + template <typename ...T> + add_loss_layer( + LOSS_DETAILS&& layer_det, + T&& ...args + ) : + loss(std::move(layer_det)), + subnetwork(std::forward<T>(args)...) + { + } + + template <typename T, typename ...U> + struct disable_forwarding_constr + { + const static bool value = std::is_constructible<LOSS_DETAILS,T>::value; + }; + template <typename ...T> + struct disable_forwarding_constr<add_loss_layer<T...>> + { + const static bool value = true; + }; + + template < + typename ...T, + typename = typename std::enable_if<!disable_forwarding_constr<typename std::remove_reference<T>::type...>::value>::type + > + add_loss_layer( + T&& ...args + ) : + subnetwork(std::forward<T>(args)...) 
+ { + } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + subnetwork.to_tensor(ibegin,iend,data); + } + + unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } + + template <typename output_iterator> + void operator() ( + const tensor& x, + output_iterator obegin + ) + { + subnetwork.forward(x); + const dimpl::subnet_wrapper<subnet_type> wsub(subnetwork); + loss.to_label(x, wsub, obegin); + } + + template <typename forward_iterator, typename output_iterator> + void operator() ( + forward_iterator ibegin, + forward_iterator iend, + output_iterator obegin + ) + { + to_tensor(ibegin,iend,temp_tensor); + (*this)(temp_tensor, obegin); + } + + const output_label_type& operator() (const input_type& x) + { + (*this)(&x, &x+1, &temp_label); + return temp_label; + } + + template <typename ...T> + const output_label_type& process (const input_type& x, T&& ...args) + { + to_tensor(&x,&x+1,temp_tensor); + subnetwork.forward(temp_tensor); + const dimpl::subnet_wrapper<subnet_type> wsub(subnetwork); + loss.to_label(temp_tensor, wsub, &temp_label, std::forward<T>(args)...); + return temp_label; + } + + template <typename iterable_type, typename ...T> + std::vector<output_label_type> process_batch (const iterable_type& data, size_t batch_size, T&& ...args) + { + std::vector<output_label_type> results(std::distance(data.begin(), data.end())); + auto o = results.begin(); + auto i = data.begin(); + auto num_remaining = results.size(); + while(num_remaining != 0) + { + auto inc = std::min(batch_size, num_remaining); + to_tensor(i,i+inc,temp_tensor); + subnetwork.forward(temp_tensor); + const dimpl::subnet_wrapper<subnet_type> wsub(subnetwork); + loss.to_label(temp_tensor, wsub, o, std::forward<T>(args)...); + + i += inc; + o += inc; + num_remaining -= inc; + } + return results; + } + + template <typename iterable_type> + std::vector<output_label_type> operator() ( + const iterable_type& data, + size_t batch_size = 128 + ) + { + std::vector<output_label_type> results(std::distance(data.begin(), data.end())); + auto o = results.begin(); + auto i = data.begin(); + auto num_remaining = results.size(); + while(num_remaining != 0) + { + auto inc = std::min(batch_size, num_remaining); + (*this)(i, i+inc, o); + i += inc; + o += inc; + num_remaining -= inc; + } + return results; + } + + template <typename label_iterator> + double compute_loss ( + const tensor& x, + label_iterator lbegin + ) + { + subnetwork.forward(x); + dimpl::subnet_wrapper<subnet_type> wsub(subnetwork); + return loss.compute_loss_value_and_gradient(x, lbegin, wsub); + } + + template <typename forward_iterator, typename label_iterator> + double compute_loss ( + forward_iterator ibegin, + forward_iterator iend, + label_iterator lbegin + ) + { + to_tensor(ibegin,iend,temp_tensor); + return compute_loss(temp_tensor, lbegin); + } + + double compute_loss ( + const tensor& x + ) + { + subnetwork.forward(x); + dimpl::subnet_wrapper<subnet_type> wsub(subnetwork); + return loss.compute_loss_value_and_gradient(x, wsub); + } + + template <typename forward_iterator> + double compute_loss ( + forward_iterator ibegin, + forward_iterator iend + ) + { + to_tensor(ibegin,iend,temp_tensor); + return compute_loss(temp_tensor); + } + + template <typename label_iterator> + double compute_parameter_gradients ( + const tensor& x, + label_iterator lbegin + ) + { + subnetwork.forward(x); + dimpl::subnet_wrapper<subnet_type> 
wsub(subnetwork); + double l = loss.compute_loss_value_and_gradient(x, lbegin, wsub); + subnetwork.back_propagate_error(x); + return l; + } + template <typename forward_iterator, typename label_iterator> + double compute_parameter_gradients ( + forward_iterator ibegin, + forward_iterator iend, + label_iterator lbegin + ) + { + to_tensor(ibegin,iend,temp_tensor); + return compute_parameter_gradients(temp_tensor, lbegin); + } + double compute_parameter_gradients ( + const tensor& x + ) + { + subnetwork.forward(x); + dimpl::subnet_wrapper<subnet_type> wsub(subnetwork); + double l = loss.compute_loss_value_and_gradient(x, wsub); + subnetwork.back_propagate_error(x); + return l; + } + template <typename forward_iterator> + double compute_parameter_gradients ( + forward_iterator ibegin, + forward_iterator iend + ) + { + to_tensor(ibegin,iend,temp_tensor); + return compute_parameter_gradients(temp_tensor); + } + + template <typename solver_type> + void update_parameters ( + sstack<solver_type> solvers, + double learning_rate + ) + { + subnetwork.update_parameters(solvers, learning_rate); + } + + const subnet_type& subnet() const { return subnetwork; } + subnet_type& subnet() { return subnetwork; } + const loss_details_type& loss_details() const { return loss; } + loss_details_type& loss_details() { return loss; } + + void clean ( + ) + { + temp_tensor.clear(); + subnetwork.clean(); + } + + template <typename T, typename U> + friend void serialize(const add_loss_layer<T,U>& item, std::ostream& out); + template <typename T, typename U> + friend void deserialize(add_loss_layer<T,U>& item, std::istream& in); + + friend std::ostream& operator<< (std::ostream& out, const add_loss_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<" << idx << ">\t" << loss_details() << "\n"; + subnet().print(out, idx+1, min_length); + } + + private: + + + void swap(add_loss_layer& item) + { + std::swap(loss, item.loss); + std::swap(subnetwork, item.subnetwork); + } + + loss_details_type loss; + subnet_type subnetwork; + + // These two objects don't logically contribute to the state of this object. They + // are here to prevent them from being reallocated over and over. 
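The members above are the whole training-side surface of add_loss_layer, and dlib's dnn_trainer is essentially a driver around the compute_parameter_gradients()/update_parameters() pair. A hand-rolled SGD step might look like this sketch (the toy network and function name are illustrative only):

#include <dlib/dnn.h>
using namespace dlib;

using toy_net = loss_multiclass_log<fc<10, relu<fc<32, input<matrix<float>>>>>>;

void sgd_step (
    toy_net& net,
    const std::vector<matrix<float>>& samples,
    const std::vector<unsigned long>& labels,
    std::vector<sgd>& solvers,  // must hold >= toy_net::num_computational_layers solvers
    double learning_rate
)
{
    // Forward pass, loss evaluation, and error backpropagation in one call.
    net.compute_parameter_gradients(samples.begin(), samples.end(), labels.begin());
    // Pass each layer's parameter gradient through its solver and apply the deltas.
    net.update_parameters(make_sstack(solvers), learning_rate);
}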
+ output_label_type temp_label; + resizable_tensor temp_tensor; + }; + + template <typename LOSS_DETAILS, typename SUBNET> + void serialize(const add_loss_layer<LOSS_DETAILS,SUBNET>& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.loss, out); + serialize(item.subnetwork, out); + } + + template <typename LOSS_DETAILS, typename SUBNET> + void deserialize(add_loss_layer<LOSS_DETAILS,SUBNET>& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::add_loss_layer."); + deserialize(item.loss, in); + deserialize(item.subnetwork, in); + } + + + template <typename T, typename U> + struct is_loss_layer_type<add_loss_layer<T,U>> : std::true_type {}; + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <unsigned int i, typename T, typename enabled = void> + struct layer_helper + { + static_assert(i < T::num_layers, "Call to layer() attempted to access non-existing layer in neural network."); + static T& makeT(); + using next_type = typename std::remove_reference<decltype(makeT().subnet())>::type; + using type = typename layer_helper<i-1,next_type>::type; + static type& layer(T& n) + { + return layer_helper<i-1,next_type>::layer(n.subnet()); + } + }; + template < + unsigned int i, + size_t N, template<typename> class L, typename S + > + struct layer_helper<i,repeat<N,L,S>, typename std::enable_if<(i!=0&&i>=repeat<N,L,S>::layers_in_repeated_group)>::type> + { + const static size_t layers_in_repeated_group = repeat<N,L,S>::layers_in_repeated_group; + + static repeat<N,L,S>& makeT(); + using next_type = typename std::remove_reference<decltype(makeT().subnet())>::type; + using type = typename layer_helper<i-layers_in_repeated_group,next_type>::type; + static type& layer(repeat<N,L,S>& n) + { + return layer_helper<i-layers_in_repeated_group,next_type>::layer(n.subnet()); + } + }; + template < + unsigned int i, + size_t N, template<typename> class L, typename S + > + struct layer_helper<i,repeat<N,L,S>, typename std::enable_if<(i!=0&&i<repeat<N,L,S>::layers_in_repeated_group)>::type> + { + const static size_t layers_in_each_group = repeat<N,L,S>::layers_in_each_group; + typedef typename repeat<N,L,S>::repeated_layer_type repeated_layer_type; + using next_type = repeated_layer_type; + using type = typename layer_helper<i%layers_in_each_group,next_type>::type; + static type& layer(repeat<N,L,S>& n) + { + return layer_helper<i%layers_in_each_group,next_type>::layer(n.get_repeated_layer(i/layers_in_each_group)); + } + }; + template < + size_t N, template<typename> class L, typename S + > + struct layer_helper<0,repeat<N,L,S>, void> + { + typedef typename repeat<N,L,S>::repeated_layer_type repeated_layer_type; + using type = repeated_layer_type; + static type& layer(repeat<N,L,S>& n) + { + return n.get_repeated_layer(0); + } + }; + + + + template < + unsigned int i, + size_t N, template<typename> class L, typename S + > + struct layer_helper<i,const repeat<N,L,S>, typename std::enable_if<(i!=0&&i>=repeat<N,L,S>::layers_in_repeated_group)>::type> + { + const static size_t layers_in_repeated_group = repeat<N,L,S>::layers_in_repeated_group; + + static const repeat<N,L,S>& makeT(); + using next_type 
= const typename std::remove_reference<decltype(makeT().subnet())>::type; + using type = const typename layer_helper<i-layers_in_repeated_group,next_type>::type; + static type& layer(const repeat<N,L,S>& n) + { + return layer_helper<i-layers_in_repeated_group,next_type>::layer(n.subnet()); + } + }; + template < + unsigned int i, + size_t N, template<typename> class L, typename S + > + struct layer_helper<i,const repeat<N,L,S>, typename std::enable_if<(i!=0&&i<repeat<N,L,S>::layers_in_repeated_group)>::type> + { + const static size_t layers_in_each_group = repeat<N,L,S>::layers_in_each_group; + typedef typename repeat<N,L,S>::repeated_layer_type repeated_layer_type; + using next_type = const repeated_layer_type; + using type = const typename layer_helper<i%layers_in_each_group,next_type>::type; + static type& layer(const repeat<N,L,S>& n) + { + return layer_helper<i%layers_in_each_group,next_type>::layer(n.get_repeated_layer(i/layers_in_each_group)); + } + }; + template < + size_t N, template<typename> class L, typename S + > + struct layer_helper<0,const repeat<N,L,S>, void> + { + typedef typename repeat<N,L,S>::repeated_layer_type repeated_layer_type; + using type = const repeated_layer_type; + static type& layer(const repeat<N,L,S>& n) + { + return n.get_repeated_layer(0); + } + }; + + + + template <typename T> + struct layer_helper<0,T,void> + { + using type = T; + static type& layer(T& n) + { + return n; + } + }; + + template <template<typename> class Match, typename T, unsigned int i, typename enabled = void> + struct layer_helper_match + { + static T& makeT(); + using next_type = typename std::remove_reference<decltype(makeT().subnet())>::type; + using type = typename layer_helper_match<Match,next_type,i>::type; + static type& layer(T& n) + { + return layer_helper_match<Match,next_type,i>::layer(n.subnet()); + } + }; + // This overload catches add_layer and add_loss_layer templates. + template <template<typename> class Match, typename T, unsigned int i> + struct layer_helper_match<Match,T,i, + typename std::enable_if<std::is_same<const T,const Match<typename T::subnet_type>>::value>::type> + { + using type = typename layer_helper<i,T>::type; + static type& layer(T& n) + { + return layer_helper<i,T>::layer(n); + } + }; + // This overload catches input templates. + template <template<typename> class Match, typename T, unsigned int i> + struct layer_helper_match<Match,T,i, + typename std::enable_if<std::is_same<const T,const Match<typename T::input_type>>::value>::type> + { + using type = typename layer_helper<i,T>::type; + static type& layer(T& n) + { + return layer_helper<i,T>::layer(n); + } + }; + // This overload catches subnet_wrapper templates. 
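These layer_helper_match overloads are what back the template-matching form of the layer() accessors defined just below: a layer can be addressed either by its index from the top of the network or by naming a layer template such as a tag. A sketch of both forms (toy network for illustration):

#include <dlib/dnn.h>
using namespace dlib;

using net_type = loss_multiclass_log<fc<10, tag1<relu<fc<32, input<matrix<float>>>>>>>;

void inspect (net_type& net)
{
    auto& by_tag   = layer<tag1>(net);  // jump straight to the tag1 layer
    auto& by_index = layer<2>(net);     // the same layer, addressed by index
    // Either reference exposes the forward output cached at that point:
    const tensor& t = by_tag.get_output();
}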
+ template <template<typename> class Match, typename T, unsigned int i> + struct layer_helper_match<Match,T,i, + typename std::enable_if<std::is_same<const typename T::wrapped_type, + const Match<typename T::wrapped_type::subnet_type>>::value>::type> + { + using type = typename layer_helper<i,T>::type; + static type& layer(T& n) + { + return layer_helper<i,T>::layer(n); + } + }; + } + + template <unsigned int i, typename T> + typename impl::layer_helper<i,T>::type& layer (T& n) + { + return impl::layer_helper<i,T>::layer(n); + } + + template <template<typename> class Match, typename T> + typename impl::layer_helper_match<Match,T,0>::type& layer (T& n) + { + return impl::layer_helper_match<Match,T,0>::layer(n); + } + + template <template<typename> class Match, unsigned int i, typename T> + typename impl::layer_helper_match<Match,T,i>::type& layer (T& n) + { + return impl::layer_helper_match<Match,T,i>::layer(n); + } + +// ---------------------------------------------------------------------------------------- + + + namespace dimpl + { + template <typename T> + T& get_input_details ( + T& net + ) + { + return net; + } + + template <typename T, bool is_first, typename enabled> + auto get_input_details ( + dimpl::subnet_wrapper<T,is_first,enabled>& net + ) -> decltype(net.layer_details())& + { + return net.layer_details(); + } + + template <typename T, bool is_first, typename enabled> + auto get_input_details ( + const dimpl::subnet_wrapper<T,is_first,enabled>& net + ) -> decltype(net.layer_details())& + { + return net.layer_details(); + } + } + + template <typename net_type> + auto input_layer ( + net_type& net + ) -> decltype(dimpl::get_input_details(layer<net_type::num_layers-1>(net)))& + { + // Calling input_layer() on a subnet_wrapper is a little funny since the behavior of + // .subnet() returns another subnet_wrapper rather than an input details object as it + // does in add_layer. + return dimpl::get_input_details(layer<net_type::num_layers-1>(net)); + } + +// ---------------------------------------------------------------------------------------- + + template <template<typename> class TAG_TYPE, typename SUBNET> + class add_skip_layer + { + public: + typedef SUBNET subnet_type; + typedef typename subnet_type::input_type input_type; + typedef int layer_details_type; // not really used anywhere, but required by subnet_wrapper. + const static size_t num_layers = subnet_type::num_layers + 1; + const static size_t num_computational_layers = subnet_type::num_computational_layers; + const static unsigned long id = tag_id<TAG_TYPE>::id; + + add_skip_layer() {}; + add_skip_layer(const add_skip_layer&) = default; + add_skip_layer(add_skip_layer&&) = default; + add_skip_layer& operator=(add_skip_layer&&) = default; + add_skip_layer& operator=(const add_skip_layer&) = default; + + template <typename T> + add_skip_layer( + const add_skip_layer<TAG_TYPE,T>& item + ) : subnetwork(item.subnet()) + {} + + template <typename ...T> + add_skip_layer( + T ...args + ) : + subnetwork(std::move(args)...) 
+ { + } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + subnetwork.to_tensor(ibegin,iend,data); + } + + template <typename forward_iterator> + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ) + { + subnetwork(ibegin,iend); + return layer<TAG_TYPE>(subnetwork).get_output(); + } + + const tensor& operator() (const input_type& x) + { + subnetwork(x); + return layer<TAG_TYPE>(subnetwork).get_output(); + } + + const tensor& forward(const tensor& x) + { + subnetwork.forward(x); + return layer<TAG_TYPE>(subnetwork).get_output(); + } + + const tensor& get_output() const + { + return layer<TAG_TYPE>(subnetwork).get_output(); + } + + tensor& get_gradient_input() + { + return layer<TAG_TYPE>(subnetwork).get_gradient_input(); + } + + const tensor& get_final_data_gradient( + ) const + { + return subnetwork.get_final_data_gradient(); + } + + void back_propagate_error(const tensor& x) + { + subnetwork.back_propagate_error(x); + } + + template <typename solver_type> + void update_parameters(sstack<solver_type> solvers, double learning_rate) + { + subnetwork.update_parameters(solvers, learning_rate); + } + + const tensor& get_parameter_gradient( + ) const { return params_grad; } + + tensor& get_parameter_gradient ( + ) { return params_grad; } + + + const subnet_type& subnet() const + { + return subnetwork; + } + + subnet_type& subnet() + { + return subnetwork; + } + + unsigned int sample_expansion_factor() const { return subnet().sample_expansion_factor(); } + + void clean() + { + subnetwork.clean(); + } + + friend void serialize(const add_skip_layer& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.subnetwork, out); + } + + friend void deserialize(add_skip_layer& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::add_skip_layer."); + deserialize(item.subnetwork, in); + } + + friend std::ostream& operator<< (std::ostream& out, const add_skip_layer& item) + { + int min_length = 0; + item.print(out, 0, min_length); + return out; + } + + void print (std::ostream& out, unsigned long idx, int& min_length) const + { + out << "layer<" << idx << ">\t"<<impl::tensor_to_str(private_get_output(), min_length) <<"skip"<<id<<"\n"; + subnet().print(out, idx+1, min_length); + } + + private: + + + template <typename T, typename U, typename E> + friend class add_layer; + template <typename T, bool is_first, typename E> + friend class dimpl::subnet_wrapper; + template <unsigned long T, typename U, typename E> + friend class add_tag_layer; + template <template<typename> class T, typename U> + friend class add_skip_layer; + template <size_t N, template<typename> class L, typename S> + friend class repeat; + + bool this_layer_requires_forward_output( + ) { return layer<TAG_TYPE>(subnetwork).this_layer_requires_forward_output(); } + + void disable_output_and_gradient_getters ( + ) { layer<TAG_TYPE>(subnetwork).disable_output_and_gradient_getters(); } + + tensor& private_get_output() const + { return layer<TAG_TYPE>(subnetwork).private_get_output(); } + tensor& private_get_gradient_input() + { return layer<TAG_TYPE>(subnetwork).private_get_gradient_input(); } + + subnet_type subnetwork; + + // This member doesn't logically contribute to the state of the object since it is + // always empty. 
It's just here so we can have the get_parameter_gradient() methods + // which have to return something. So they return this empty tensor. + resizable_tensor params_grad; + }; + template <template<typename> class T, typename U> + struct is_nonloss_layer_type<add_skip_layer<T,U>> : std::true_type {}; + + template <typename SUBNET> using tag1 = add_tag_layer< 1, SUBNET>; + template <typename SUBNET> using tag2 = add_tag_layer< 2, SUBNET>; + template <typename SUBNET> using tag3 = add_tag_layer< 3, SUBNET>; + template <typename SUBNET> using tag4 = add_tag_layer< 4, SUBNET>; + template <typename SUBNET> using tag5 = add_tag_layer< 5, SUBNET>; + template <typename SUBNET> using tag6 = add_tag_layer< 6, SUBNET>; + template <typename SUBNET> using tag7 = add_tag_layer< 7, SUBNET>; + template <typename SUBNET> using tag8 = add_tag_layer< 8, SUBNET>; + template <typename SUBNET> using tag9 = add_tag_layer< 9, SUBNET>; + template <typename SUBNET> using tag10 = add_tag_layer<10, SUBNET>; + + template <typename SUBNET> using skip1 = add_skip_layer< tag1, SUBNET>; + template <typename SUBNET> using skip2 = add_skip_layer< tag2, SUBNET>; + template <typename SUBNET> using skip3 = add_skip_layer< tag3, SUBNET>; + template <typename SUBNET> using skip4 = add_skip_layer< tag4, SUBNET>; + template <typename SUBNET> using skip5 = add_skip_layer< tag5, SUBNET>; + template <typename SUBNET> using skip6 = add_skip_layer< tag6, SUBNET>; + template <typename SUBNET> using skip7 = add_skip_layer< tag7, SUBNET>; + template <typename SUBNET> using skip8 = add_skip_layer< tag8, SUBNET>; + template <typename SUBNET> using skip9 = add_skip_layer< tag9, SUBNET>; + template <typename SUBNET> using skip10 = add_skip_layer<tag10, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + namespace timpl + { + inline void fill_with_gassuan_random_numbers ( + tensor& t, + dlib::rand& rnd, + double sigma = 1 + ) + { + float* data = t.host(); + for (size_t i = 0; i < t.size(); ++i) + data[i] = rnd.get_random_gaussian()*sigma; + } + + class test_layer_subnet + { + public: + test_layer_subnet ( + dlib::rand& rnd_ + ) : rnd(rnd_) + { + // Output and gradient_input have to have the same dimensions in each + // layer. + const long num_samples = rnd.get_random_32bit_number()%4+3; + const long k = rnd.get_random_32bit_number()%4+2; + const long nr = rnd.get_random_32bit_number()%4+2; + const long nc = rnd.get_random_32bit_number()%4+2; + + output.set_size(num_samples, k, nr, nc); + gradient_input.set_size(num_samples, k, nr, nc); + + // Use a non-zero initial gradient to make sure the layers add to it + // rather than assign and blow away the initial value. 
+ fill_with_gassuan_random_numbers(gradient_input, rnd, 0.01); + + fill_with_gassuan_random_numbers(output, rnd); + } + + + tensor& get_mutable_output() { return output; } + const tensor& get_output() const { return output; } + const tensor& private_get_output() const { return get_output(); } + const test_layer_subnet& subnet() const { init_sub(); return *subnetwork; } + + tensor& get_gradient_input() { return gradient_input; } + tensor& private_get_gradient_input() { return get_gradient_input(); } + test_layer_subnet& subnet() { init_sub(); return *subnetwork; } + + + + unsigned long count_outputs() const + { + if (subnetwork) + return subnetwork->count_outputs() + output.size(); + else + return output.size(); + } + + float& get_output_element(unsigned long i) + { + if (i < output.size()) + return output.host()[i]; + else + return subnet().get_output_element(i-output.size()); + } + + float get_gradient_input_element(unsigned long i) const + { + if (i < gradient_input.size()) + return gradient_input.host()[i]; + else + return subnet().get_gradient_input_element(i-gradient_input.size()); + } + + + private: + // We lazily initialize sub-layers as needed when someone tries to call + // subnet() + void init_sub() const + { + if (!subnetwork) + subnetwork.reset(new test_layer_subnet(rnd)); + } + + dlib::rand& rnd; + mutable std::unique_ptr<test_layer_subnet> subnetwork; + resizable_tensor output; + resizable_tensor gradient_input; + }; + + } + + struct layer_test_results + { + layer_test_results() : was_good(true) {} + explicit layer_test_results(const std::string& l) : log(l),was_good(false) {} + + std::string log; + bool was_good; + + operator bool() const { return was_good; } + }; + + inline std::ostream& operator<< (std::ostream& out, const layer_test_results& item) + { + out << item.log; + return out; + } + + template < + typename layer_details_type + > + layer_test_results impl_test_layer ( + layer_details_type l, + const float base_eps + ) + { + using namespace timpl; + // Do some setup + running_stats<double> rs_data, rs_params; + dlib::rand rnd; + std::ostringstream sout; + for (int iter = 0; iter < 10; ++iter) + { + test_layer_subnet subnetwork(rnd); + resizable_tensor output, out2, out3; + // Run setup() and forward() as well to make sure any calls to subnet() have + // happened before we start assuming we know how many data elements there are + // (since we do a lazy layer creation thing based on calls to subnet() inside + // test_layer_subnet). + l.setup(subnetwork); + impl::call_layer_forward(l, subnetwork, output); + + resizable_tensor input_grad; + input_grad.copy_size(output); + fill_with_gassuan_random_numbers(input_grad, rnd); + + + // The f() we are computing gradients of is this thing. It's value at the current + // parameter and data values is: + //sout << "f(data,params): " << dot(output, input_grad) << std::endl; + + // We are going to save a copy of the subnetwork.get_gradient_input() data before we do + // backpropagation since the backward() function is supposed to *add* to the + // gradients rather than overwrite them. We will use this saved data to check if + // that is the case. + const unsigned long num_data_inputs = subnetwork.count_outputs(); + std::vector<float> initial_gradient_input(num_data_inputs); + for (unsigned long i = 0; i < num_data_inputs; ++i) + initial_gradient_input[i] = subnetwork.get_gradient_input_element(i); + + + // Now tell the layer to compute all the gradients. 
In the rest of this function + // we will just be checking that these gradients were computed correctly by + // comparing them to a central differences approximation. + resizable_tensor params_grad; + params_grad.copy_size(l.get_layer_params()); + // But first, set the params grad to something crazy so that it's very obvious if + // it doesn't get fully assigned. + params_grad = std::numeric_limits<float>::infinity(); + impl::call_layer_backward(l, output, input_grad, subnetwork, params_grad); + + static_assert(impl::is_inplace_layer(l, subnetwork) == impl::has_inplace_backward(l, subnetwork), + "Layer not defined correctly. forward and backward methods must either both be in-place or both out-of-place. "); + + // Make sure the outputs of forward() and backward() are the same when they are run + // in in-place mode. + if (impl::is_inplace_layer(l, subnetwork)) + { + test_layer_subnet subnetwork2(rnd); + layer_details_type ll(l); + ll.setup(subnetwork2); + resizable_tensor ip_out; + impl::call_layer_forward(ll, subnetwork2, ip_out); + impl::call_layer_forward(ll, subnetwork2, subnetwork2.get_mutable_output()); + const auto forward_error = max(abs(mat(ip_out) - mat(subnetwork2.get_output()))); + if (forward_error > 0.00001) + { + using namespace std; + sout << "This layer is supposed to support in-place computations but the output of forward_inplace()\n"; + sout << "changes when invoked in-place vs. out-of-place. The error was: " << forward_error << endl; + return layer_test_results(sout.str()); + } + + resizable_tensor params_grad; + params_grad.copy_size(ll.get_layer_params()); + params_grad = std::numeric_limits<float>::infinity(); + + resizable_tensor input_grad; + input_grad.copy_size(ip_out); + fill_with_gassuan_random_numbers(input_grad, rnd); + resizable_tensor params_grad1, params_grad2, data_grad1, data_grad2; + params_grad1 = params_grad; + params_grad2 = params_grad; + // Now call backward() and make sure it works as well. Recall that when an + // in-place layer works in-place it assigns to it's outputs but when it's + // not running in-place it adds. So we initialize to a non-zero value to + // check that this is the behavior that really executes. + subnetwork2.get_gradient_input() = 9; + impl::call_layer_backward(ll, ip_out, input_grad, subnetwork2, params_grad1); + data_grad1 = subnetwork2.get_gradient_input(); + + subnetwork2.get_gradient_input() = mat(input_grad); + impl::call_layer_backward(ll, ip_out, subnetwork2.get_gradient_input(), subnetwork2, params_grad2); + data_grad2 = subnetwork2.get_gradient_input(); + if (params_grad.size() != 0) + { + const auto backward_param_error = max(abs(mat(params_grad1) - mat(params_grad2))); + if (backward_param_error > 0.00001) + { + using namespace std; + sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n"; + sout << "changes when invoked in-place vs. out-of-place. The error was: " << backward_param_error << endl; + return layer_test_results(sout.str()); + } + } + const auto backward_data_error = max(abs(mat(data_grad1)-9 - mat(data_grad2))); + if (backward_data_error > 0.00001) + { + using namespace std; + sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n"; + sout << "changes when invoked in-place vs. out-of-place. 
The error was: " << backward_data_error << endl; + return layer_test_results(sout.str()); + } + } + + // ================================================================== + // first validate the way the parameter gradients are computed + for (unsigned long i = 0; i < params_grad.size(); ++i) + { + layer_details_type l1(l); + + float eps = l1.get_layer_params().host()[i]*base_eps; + if (eps == 0) + eps = base_eps; + const float oldval = l1.get_layer_params().host()[i]; + l1.get_layer_params().host()[i] = oldval+eps; + impl::call_layer_forward(l1, subnetwork, out2); + l1.get_layer_params().host()[i] = oldval-eps; + impl::call_layer_forward(l1, subnetwork, out3); + l1.get_layer_params().host()[i] = oldval; + + // Compute a reference derivative via a central differences approximation and + // compare it to the one output by the layer and make sure they match. + double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps); + double output_derivative = params_grad.host()[i]; + double relative_error; + if (reference_derivative*output_derivative != 0) + relative_error = (reference_derivative - output_derivative)/(reference_derivative); + else + relative_error = (reference_derivative - output_derivative); + double absolute_error = (reference_derivative - output_derivative); + rs_params.add(std::abs(relative_error)); + if (std::abs(relative_error) > 0.05 && std::abs(absolute_error) > 0.006) + { + using namespace std; + sout << "Gradient error in parameter #" << i <<". Relative error: "<< relative_error << endl; + sout << "expected derivative: " << reference_derivative << endl; + sout << "output derivative: " << output_derivative << endl; + sout << "iteration: " << iter << endl; + return layer_test_results(sout.str()); + } + } + + // ================================================================== + // now validate the data gradients + for (unsigned long i = 0; i < num_data_inputs; ++i) + { + const float oldval = subnetwork.get_output_element(i); + float eps = oldval*base_eps; + if (eps == 0) + eps = base_eps; + subnetwork.get_output_element(i) = oldval+eps; + impl::call_layer_forward(l, subnetwork, out2); + subnetwork.get_output_element(i) = oldval-eps; + impl::call_layer_forward(l, subnetwork, out3); + subnetwork.get_output_element(i) = oldval; + + // Compute a reference derivative via a central differences approximation and + // compare it to the one output by the layer and make sure they match. + double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps); + double output_derivative = subnetwork.get_gradient_input_element(i); + output_derivative -= initial_gradient_input[i]; + double relative_error; + if (reference_derivative*output_derivative != 0) + relative_error = (reference_derivative - output_derivative)/(reference_derivative); + else + relative_error = (reference_derivative - output_derivative); + double absolute_error = (reference_derivative - output_derivative); + rs_data.add(std::abs(relative_error)); + if (std::abs(relative_error) > 0.05 && std::abs(absolute_error) > 0.006) + { + using namespace std; + sout << "Gradient error in data variable #" << i <<". 
Relative error: "<< relative_error << endl; + sout << "expected derivative: " << reference_derivative << endl; + sout << "output derivative: " << output_derivative << endl; + sout << "iteration: " << iter << endl; + return layer_test_results(sout.str()); + } + } + + } // end for (int iter = 0; iter < 10; ++iter) + + if (rs_params.mean() > 0.003) + { + using namespace std; + sout << "Average parameter gradient error is somewhat large at: "<< rs_params.mean() << endl; + return layer_test_results(sout.str()); + } + if (rs_data.mean() > 0.003) + { + using namespace std; + sout << "Average data gradient error is somewhat large at: "<< rs_data.mean() << endl; + return layer_test_results(sout.str()); + } + + return layer_test_results(); + } + + template < + typename layer_details_type + > + layer_test_results test_layer ( + layer_details_type l + ) + { + // Try a few different derivative step sizes to see if any work. + for (float base_eps = 0.0001; base_eps < 0.1; base_eps *= 2) + { + auto result = impl_test_layer(l, base_eps); + if (result) + return result; + } + // However, if none of the step sizes worked then try this one and probably result + // in returning an error. + return impl_test_layer(l, 0.01); + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <size_t i, size_t num> + struct vlp_loop + { + template <typename T, typename U> + static typename std::enable_if<!is_add_layer<U>::value>::type invoke_functor(T&& , size_t& , U&& ) + { + // intentionally left empty + } + + template <typename T, typename U> + static typename std::enable_if<is_add_layer<U>::value>::type invoke_functor(T&& v , size_t& comp_i, U&& l ) + { + v(comp_i, l.layer_details().get_layer_params()); + ++comp_i; + } + + template < + typename net_type, + typename visitor + > + static void visit( + size_t comp_i, + net_type& net, + visitor&& v + ) + { + invoke_functor(v, comp_i, layer<i>(net)); + vlp_loop<i+1, num>::visit(comp_i, net,v); + } + }; + + template <size_t num> + struct vlp_loop<num,num> + { + template < + typename net_type, + typename visitor + > + static void visit( + size_t, + net_type&, + visitor&& + ) + { + // Base case of recursion. Don't do anything. + } + }; + + } + + template < + typename net_type, + typename visitor + > + void visit_layer_parameters( + net_type& net, + visitor v + ) + { + size_t comp_i = 0; + impl::vlp_loop<0, net_type::num_layers>::visit(comp_i, net, v); + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <size_t i, size_t num> + struct vlpg_loop + { + template <typename T, typename U> + static typename std::enable_if<!is_add_layer<U>::value>::type invoke_functor(T&& , size_t& , U&& ) + { + // intentionally left empty + } + + template <typename T, typename U> + static typename std::enable_if<is_add_layer<U>::value>::type invoke_functor(T&& v , size_t& comp_i, U&& l ) + { + v(comp_i, l.get_parameter_gradient()); + ++comp_i; + } + + template < + typename net_type, + typename visitor + > + static void visit( + size_t comp_i, + net_type& net, + visitor&& v + ) + { + invoke_functor(v, comp_i, layer<i>(net)); + vlpg_loop<i+1, num>::visit(comp_i, net,v); + } + }; + + template <size_t num> + struct vlpg_loop<num,num> + { + template < + typename net_type, + typename visitor + > + static void visit( + size_t, + net_type&, + visitor&& + ) + { + // Base case of recursion. Don't do anything. 
+ } + }; + + } + + template < + typename net_type, + typename visitor + > + void visit_layer_parameter_gradients( + net_type& net, + visitor v + ) + { + size_t comp_i = 0; + impl::vlpg_loop<0, net_type::num_layers>::visit(comp_i, net, v); + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <size_t i, size_t num> + struct vl_loop + { + template < + typename net_type, + typename visitor + > + static void visit( + net_type& net, + visitor&& v + ) + { + v(i, layer<i>(net)); + vl_loop<i+1, num>::visit(net,v); + } + }; + + template <size_t num> + struct vl_loop<num,num> + { + template < + typename net_type, + typename visitor + > + static void visit( + net_type&, + visitor&& + ) + { + // Base case of recursion. Don't do anything. + } + }; + + template <size_t i, size_t num> + struct vl_loop_backwards + { + template < + typename net_type, + typename visitor + > + static void visit( + net_type& net, + visitor&& v + ) + { + vl_loop_backwards<i+1, num>::visit(net,v); + v(i, layer<i>(net)); + } + }; + + template <size_t num> + struct vl_loop_backwards<num,num> + { + template < + typename net_type, + typename visitor + > + static void visit( + net_type&, + visitor&& + ) + { + // Base case of recursion. Don't do anything. + } + }; + + } + + template < + typename net_type, + typename visitor + > + void visit_layers( + net_type& net, + visitor v + ) + { + impl::vl_loop<0, net_type::num_layers>::visit(net, v); + } + + template < + typename net_type, + typename visitor + > + void visit_layers_backwards( + net_type& net, + visitor v + ) + { + impl::vl_loop_backwards<0, net_type::num_layers>::visit(net, v); + } + + template < + size_t begin, + size_t end, + typename net_type, + typename visitor + > + void visit_layers_range( + net_type& net, + visitor v + ) + { + static_assert(begin <= end, "Invalid range"); + static_assert(end <= net_type::num_layers, "Invalid range"); + impl::vl_loop<begin,end>::visit(net, v); + } + + template < + size_t begin, + size_t end, + typename net_type, + typename visitor + > + void visit_layers_backwards_range( + net_type& net, + visitor v + ) + { + static_assert(begin <= end, "Invalid range"); + static_assert(end <= net_type::num_layers, "Invalid range"); + impl::vl_loop_backwards<begin,end>::visit(net, v); + } + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <size_t i, unsigned long tag_id> + struct vl_until_tag + { + template < + typename net_type, + typename next_net_type, + typename visitor + > + static void visit( + net_type& net, + next_net_type& next_net, + visitor&& v + ) + { + v(next_net); + vl_until_tag<i+1,tag_id>::visit(net,layer<i+1>(net),v); + } + + template < + typename net_type, + typename SUBNET, + typename visitor + > + static void visit( + net_type& net, + const add_tag_layer<tag_id,SUBNET>& next_net, + visitor&& v + ) + { + v(next_net); + } + + template < + typename net_type, + typename SUBNET, + typename visitor + > + static void visit( + net_type& net, + add_tag_layer<tag_id,SUBNET>& next_net, + visitor&& v + ) + { + v(next_net); + } + }; + } + + template < + unsigned long tag_id, + typename net_type, + typename visitor + > + void visit_layers_until_tag( + net_type& net, + visitor v + ) + { + impl::vl_until_tag<0,tag_id>::visit(net, net, v); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_CORE_H_ + + diff --git 
a/ml/dlib/dlib/dnn/core_abstract.h b/ml/dlib/dlib/dnn/core_abstract.h new file mode 100644 index 000000000..db168a88b --- /dev/null +++ b/ml/dlib/dlib/dnn/core_abstract.h @@ -0,0 +1,1700 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_DNn_CORE_ABSTRACT_H_ +#ifdef DLIB_DNn_CORE_ABSTRACT_H_ + +#include "tensor_abstract.h" +#include <memory> +#include <type_traits> +#include <tuple> +#include <vector> +#include "../rand.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename... T + > + auto tuple_tail( + const std::tuple<T...>& item + ); + /*! + ensures + - returns a tuple that contains everything in item except for tuple_head(item). + The items will be in the same order as they are in item, just without + tuple_head(item). + - This function will correctly handle nested tuples. + !*/ + + template <typename... T> + auto tuple_head ( + const std::tuple<T...>& item + ); + /*! + ensures + - returns a copy of the first thing in the tuple that isn't a std::tuple. + Essentially, this function calls std::get<0>() recursively on item until + a non-std::tuple object is found. + !*/ + +// ---------------------------------------------------------------------------------------- + + template <typename T> + double get_learning_rate_multiplier( + const T& obj + ); + /*! + ensures + - if (obj has a get_learning_rate_multiplier() member function) then + - returns obj.get_learning_rate_multiplier() + - else + - returns 1 + !*/ + + template <typename T> + double get_weight_decay_multiplier( + const T& obj + ); + /*! + ensures + - if (obj has a get_weight_decay_multiplier() member function) then + - returns obj.get_weight_decay_multiplier() + - else + - returns 1 + !*/ + +// ---------------------------------------------------------------------------------------- + + bool dnn_prefer_fastest_algorithms( + ); + /*! + ensures + - If dlib should prefer to use fast algorithms rather than ones that use less + RAM then this function returns true and false otherwise. + - On program startup this function will default to true. + !*/ + + void set_dnn_prefer_fastest_algorithms( + ); + /*! + ensures + - #dnn_prefer_fastest_algorithms() == true + !*/ + + void set_dnn_prefer_smallest_algorithms( + ); + /*! + ensures + - #dnn_prefer_fastest_algorithms() == false + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename T + > + class sstack + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a basic stack of T objects. It contains no data itself but simply + points to a memory range of T objects and allows you to access that block of + T objects as a stack. + !*/ + + public: + typedef T value_type; + + sstack() = delete; + + sstack ( + T* data, + size_t s + ); + /*! + ensures + - #size() == s + - #top() == *data + - #pop(i).top() == data[i] + !*/ + + const T& top( + ) const; + /*! + requires + - size() != 0 + ensures + - returns the top element of the stack. + !*/ + + T& top( + ); + /*! + requires + - size() != 0 + ensures + - returns the top element of the stack. + !*/ + + size_t size( + ) const; + /*! + ensures + - returns the number of elements in this stack. + !*/ + + sstack pop( + size_t num = 1 + ); + /*! + requires + - num <= size() + ensures + - returns a sub-stack S such that: + - S.size() == size()-num. + - S.top() is num elements down the stack.
+ !*/ + }; + + template < + typename T + > + sstack<T> make_sstack( + std::vector<T>& item + ) { return sstack<T>(item.data(), item.size()); } + /*! + ensures + - returns a sstack that sits on top of the given std::vector. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename LAYER_DETAILS, + typename SUBNET + > + class add_layer + { + /*! + REQUIREMENTS ON LAYER_DETAILS + - Must be a type that implements the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined in layers_abstract.h + + REQUIREMENTS ON SUBNET + - One of the following must be true: + - SUBNET implements the EXAMPLE_INPUT_LAYER interface defined in + input_abstract.h. + - SUBNET is an add_layer object. + - SUBNET is an add_tag_layer object. + - SUBNET is an add_skip_layer object. + - SUBNET is a repeat object. + + WHAT THIS OBJECT REPRESENTS + This object represents a deep neural network. In particular, it is a tool + for adding another layer on top of the neural network of type SUBNET, which + is specified as a template argument. The specific layer added is defined + by the LAYER_DETAILS details template argument. + !*/ + + public: + typedef LAYER_DETAILS layer_details_type; + typedef SUBNET subnet_type; + typedef typename subnet_type::input_type input_type; + // num_computational_layers will always give the number of layers in the network + // that transform tensors (i.e. layers defined by something that implements the + // EXAMPLE_COMPUTATIONAL_LAYER_ interface). This is all the layers except for + // loss, tag, and skip layers. + const static size_t num_computational_layers = subnet_type::num_computational_layers + 1; + // num_layers counts all the layers in the network regardless of their type. + const static size_t num_layers = subnet_type::num_layers + 1; + + add_layer( + ); + /*! + ensures + - default constructs all the layers in this network. + - #sample_expansion_factor() == 0 + !*/ + + add_layer(const add_layer&) = default; + add_layer(add_layer&&) = default; + add_layer& operator=(add_layer&&) = default; + add_layer& operator=(const add_layer&) = default; + /*! + ensures + - this object is copyable and movable. + !*/ + + template <typename T, typename U> + add_layer( + const add_layer<T,U>& item + ); + /*! + ensures + - This constructor allows you to copy neural network objects from one to + another as long as their corresponding layers can be constructed from + each other. + - #layer_details() == layer_details_type(item.layer_details()) + - #subnet() == subnet_type(item.subnet()) + - #sample_expansion_factor() == item.sample_expansion_factor() + !*/ + + template <typename ...T, typename LD, typename ...U> + add_layer( + const std::tuple<LD,U...>& layer_det, + T&& ...args + ); + /*! + ensures + - #layer_details() == layer_details_type(tuple_head(layer_det)) + - #subnet() == subnet_type(tuple_tail(layer_det),args) + - #sample_expansion_factor() == 0 + !*/ + + template <typename ...T> + add_layer( + const layer_details_type& layer_det, + T&& ...args + ); + /*! + ensures + - #layer_details() == layer_details_type(layer_det) + - #subnet() == subnet_type(args) + - #sample_expansion_factor() == 0 + !*/ + + template <typename ...T> + add_layer( + T&& ...args + ); + /*! + ensures + - This version of the constructor is only called if layer_details_type + can't be constructed from the first thing in args. In this case, the + args are simply passed on to the sub layers in their entirety. 
+ - #layer_details() == layer_details_type() + - #subnet() == subnet_type(args) + - #sample_expansion_factor() == 0 + !*/ + + template <typename ...T> + add_layer( + layer_details_type&& layer_det, + T&& ...args + ); + /*! + ensures + - #layer_details() == layer_det + - #subnet() == subnet_type(args) + - #sample_expansion_factor() == 0 + !*/ + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const; + /*! + requires + - [ibegin, iend) is an iterator range over input_type objects. + - std::distance(ibegin,iend) > 0 + ensures + - Converts the iterator range into a tensor and stores it into #data. + - #data.num_samples()%distance(ibegin,iend) == 0. + - #sample_expansion_factor() == #data.num_samples()/distance(ibegin,iend). + - #sample_expansion_factor() > 0 + - The data in the ith sample of #data corresponds to the input_type object + *(ibegin+i/#sample_expansion_factor()). + - Invokes data.async_copy_to_device() so that the data begins transferring + to the GPU device, if present. + - This function is implemented by calling the to_tensor() routine defined + at the input layer of this network. + !*/ + + unsigned int sample_expansion_factor ( + ) const; + /*! + ensures + - When to_tensor() is invoked on this network's input layer it converts N + input objects into M samples, all stored inside a resizable_tensor. It + is always the case that M is some integer multiple of N. + sample_expansion_factor() returns the value of this multiplier. To be + very specific, it is always true that M==I*N where I is some integer. + This integer I is what is returned by sample_expansion_factor(). + !*/ + + const subnet_type& subnet( + ) const; + /*! + ensures + - returns the immediate subnetwork of *this network. + !*/ + + subnet_type& subnet( + ); + /*! + ensures + - returns the immediate subnetwork of *this network. + !*/ + + const layer_details_type& layer_details( + ) const; + /*! + ensures + - returns the layer_details_type instance that defines the behavior of the + layer at the top of this network. I.e. returns the layer details that + defines the behavior of the layer nearest to the network output rather + than the input layer. + !*/ + + layer_details_type& layer_details( + ); + /*! + ensures + - returns the layer_details_type instance that defines the behavior of the + layer at the top of this network. I.e. returns the layer details that + defines the behavior of the layer nearest to the network output rather + than the input layer. + !*/ + + template <typename forward_iterator> + const tensor& operator() ( + forward_iterator ibegin, + forward_iterator iend + ); + /*! + requires + - [ibegin, iend) is an iterator range over input_type objects. + - std::distance(ibegin,iend) > 0 + ensures + - runs [ibegin,iend) through the network and returns the results. + In particular, this function performs: + to_tensor(ibegin,iend,temp_tensor); + return forward(temp_tensor); + - The return value from this function is also available in #get_output(). + i.e. this function returns #get_output(). + - have_same_dimensions(#get_gradient_input(), #get_output()) == true. + - All elements of #get_gradient_input() are set to 0. + i.e. calling this function clears out #get_gradient_input() and ensures + it has the same dimensions as the most recent output. + !*/ + + const tensor& operator() ( + const input_type& x + ); + /*! + ensures + - runs a single x through the network and returns the output. + I.e. 
returns (*this)(&x, &x+1);
+        !*/
+
+        const tensor& forward(
+            const tensor& x
+        );
+        /*!
+            requires
+                - sample_expansion_factor() != 0
+                  (i.e. to_tensor() must have been called to set sample_expansion_factor()
+                  to something non-zero.)
+                - x.num_samples()%sample_expansion_factor() == 0
+                - x.num_samples() > 0
+            ensures
+                - Runs x through the network and returns the results.  In particular, this
+                  function performs the equivalent of:
+                    subnet().forward(x);
+                    if (this is the first time forward() has been called) then
+                        layer_details().setup(subnet());
+                    layer_details().forward(subnet(), get_output());
+                - The return value from this function is also available in #get_output().
+                  i.e. this function returns #get_output().
+                - have_same_dimensions(#get_gradient_input(), #get_output()) == true
+                - All elements of #get_gradient_input() are set to 0.
+                  i.e. calling this function clears out #get_gradient_input() and ensures
+                  it has the same dimensions as the most recent output.
+        !*/
+
+        const tensor& get_output(
+        ) const;
+        /*!
+            ensures
+                - returns the output for the last tensor that was run through the network.
+                  If nothing has been run through the network yet then returns an empty
+                  tensor.
+        !*/
+
+        tensor& get_gradient_input(
+        );
+        /*!
+            ensures
+                - returns the error gradient for this network.  That is, this is the error
+                  gradient that this network will use to compute parameter gradients when
+                  back_propagate_error() is called.  Therefore, when performing back
+                  propagation, layers that sit on top of this network layer write their
+                  back-propagated error gradients into get_gradient_input().  Or to put it
+                  another way, during back-propagation, layers take the contents of their
+                  get_gradient_input() and back-propagate it through themselves and store
+                  the result into their subnetwork's get_gradient_input().
+
+                  This means you should consider get_gradient_input() as an input to the
+                  back_propagate_error() method.
+        !*/
+
+        const tensor& get_final_data_gradient(
+        ) const;
+        /*!
+            ensures
+                - if back_propagate_error() has been called to back-propagate a gradient
+                  through this network then you can call get_final_data_gradient() to
+                  obtain the last data gradient computed.  That is, this function returns
+                  the gradient of the network with respect to its inputs.
+                - Note that there is only one "final data gradient" for an entire network,
+                  not one per layer, since there is only one input to the entire network.
+        !*/
+
+        const tensor& get_parameter_gradient(
+        ) const;
+        /*!
+            ensures
+                - if back_propagate_error() has been called then you can call
+                  get_parameter_gradient() to find the gradient of this layer's parameters.
+                  When we update the parameters by calling update_parameters(), it will use
+                  the gradient in get_parameter_gradient() to perform the update.
+                  Therefore, you should consider get_parameter_gradient() as an input to
+                  update_parameters().
+        !*/
+
+        tensor& get_parameter_gradient (
+        );
+        /*!
+            ensures
+                - returns a non-const reference to the tensor returned by the above
+                  get_parameter_gradient() method.  You could use this method to modify the
+                  parameter gradient in some way before invoking update_parameters().
+        !*/
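+
+        // A bare-bones manual training step might look like the sketch below ("net",
+        // "x", and "solvers" are illustrative names; most programs should use
+        // dnn_trainer rather than calling these members directly):
+        //   net.forward(x);
+        //   ... write the loss gradient into net.get_gradient_input() ...
+        //   net.back_propagate_error(x);
+        //   net.update_parameters(make_sstack(solvers), 0.01);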
+
+        void back_propagate_error(
+            const tensor& x
+        );
+        /*!
+            requires
+                - forward(x) was called to forward propagate x through the network.
+                  Moreover, this was the most recent call to forward() and x has not been
+                  subsequently modified in any way.
+                - get_gradient_input() has been set equal to the gradient of this network's
+                  output with respect to some loss function.
+            ensures
+                - Back propagates the error gradient, get_gradient_input(), through this
+                  network and computes parameter and data gradients, via backpropagation.
+                  Specifically, this function populates get_final_data_gradient() and also,
+                  for each layer, the tensor returned by get_parameter_gradient().
+                - All elements of #get_gradient_input() are set to 0.
+                - have_same_dimensions(#get_final_data_gradient(), x) == true.
+                - have_same_dimensions(#get_parameter_gradient(), layer_details().get_layer_params()) == true.
+                - #get_final_data_gradient() contains the gradient of the network with
+                  respect to x.
+        !*/
+
+        void back_propagate_error(
+            const tensor& x,
+            const tensor& gradient_input
+        );
+        /*!
+            requires
+                - forward(x) was called to forward propagate x through the network.
+                  Moreover, this was the most recent call to forward() and x has not been
+                  subsequently modified in any way.
+                - have_same_dimensions(gradient_input, get_output()) == true
+            ensures
+                - This function is identical to the version of back_propagate_error()
+                  defined immediately above except that it back-propagates gradient_input
+                  through the network instead of get_gradient_input().  Therefore, this
+                  version of back_propagate_error() is equivalent to performing:
+                    get_gradient_input() = gradient_input;
+                    back_propagate_error(x);
+                  Except that calling back_propagate_error(x,gradient_input) avoids the
+                  copy and is therefore slightly more efficient.
+                - All elements of #get_gradient_input() are set to 0.
+                - have_same_dimensions(#get_final_data_gradient(), x) == true.
+                - have_same_dimensions(#get_parameter_gradient(), layer_details().get_layer_params()) == true.
+                - #get_final_data_gradient() contains the gradient of the network with
+                  respect to x.
+        !*/
+
+        template <typename solver_type>
+        void update_parameters(
+            sstack<solver_type> solvers,
+            double learning_rate
+        );
+        /*!
+            requires
+                - solver_type is an implementation of the EXAMPLE_SOLVER interface defined
+                  in solvers_abstract.h
+                - back_propagate_error() has been called.
+                - The given solvers have only ever been used with this network.  That is,
+                  if you want to call update_parameters() on some other neural network
+                  object then you must NOT reuse the same solvers object.
+                - solvers.size() >= num_computational_layers
+                - 0 < learning_rate <= 1
+            ensures
+                - Updates all the parameters in the network.  In particular, we pass each
+                  layer's parameter gradient (i.e. the tensor returned by the layer's
+                  get_parameter_gradient() member) through that layer's corresponding
+                  solver object.  This produces a parameter delta vector which we add to
+                  the layer's parameters.
+                - The solvers use the given learning rate.
+        !*/
+
+        void clean(
+        );
+        /*!
+            ensures
+                - Causes the network to forget about everything but its parameters.
+                  That is, for each layer we will have:
+                    - get_output().num_samples() == 0
+                    - get_gradient_input().num_samples() == 0
+                  However, running new input data through this network will still produce
+                  the same output it would have produced regardless of any calls to
+                  clean().  The purpose of clean() is to compact the network object prior
+                  to saving it to disk so that it takes up less space and the IO is
+                  quicker.
+                - This also calls the .clean() method on any layer details objects that
+                  define a .clean() method.
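+
+                  For example, a common pattern before saving a network to disk is
+                  (a sketch; "net.dat" is just an illustrative file name):
+                    net.clean();
+                    serialize("net.dat") << net;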
+ !*/ + + }; + + template <typename T, typename U> + std::ostream& operator<<(std::ostream& out, const add_layer<T,U>& item); + /*! + prints the network architecture to the given output stream. + !*/ + + template <typename T, typename U> + void serialize(const add_layer<T,U>& item, std::ostream& out); + template <typename T, typename U> + void deserialize(add_layer<T,U>& item, std::istream& in); + /*! + provides serialization support + !*/ + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + class no_label_type; + + template < + typename LOSS_DETAILS, + typename SUBNET + > + class add_loss_layer + { + /*! + REQUIREMENTS ON LOSS_DETAILS + - Must be a type that implements the EXAMPLE_LOSS_LAYER_ interface defined + in loss_abstract.h + + REQUIREMENTS ON SUBNET + - One of the following must be true: + - SUBNET is an add_layer object. + - SUBNET is an add_tag_layer object. + - SUBNET is an add_skip_layer object. + - SUBNET is a repeat object. + + WHAT THIS OBJECT REPRESENTS + This object represents a deep neural network. In particular, it is a tool + for adding a loss layer on top of the neural network of type SUBNET, which + is specified as a template argument. The specific layer added is defined + by the LOSS_DETAILS details template argument. Importantly, a loss layer + is the last layer in a deep neural network. So once it is added you can't + add any other layers of any type. + !*/ + + public: + typedef LOSS_DETAILS loss_details_type; + typedef SUBNET subnet_type; + typedef typename subnet_type::input_type input_type; + const static size_t num_computational_layers = subnet_type::num_computational_layers; + const static size_t num_layers = subnet_type::num_layers + 1; + // If LOSS_DETAILS is an unsupervised loss then training_label_type==no_label_type. + // Otherwise it is defined as follows: + typedef typename LOSS_DETAILS::training_label_type training_label_type; + // Similarly, if LOSS_DETAILS doesn't provide any output conversion then + // output_label_type==no_label_type. + typedef typename LOSS_DETAILS::output_label_type output_label_type; + + + + add_loss_layer() = default; + /*! + ensures + - default constructs all the layers in this network. + !*/ + + add_loss_layer(const add_loss_layer&) = default; + add_loss_layer(add_loss_layer&&) = default; + add_loss_layer& operator=(add_loss_layer&&) = default; + add_loss_layer& operator=(const add_loss_layer&) = default; + /*! + ensures + - this object is copyable and movable. + !*/ + + template <typename T, typename U> + add_loss_layer( + const add_loss_layer<T,U>& item + ); + /*! + ensures + - This constructor allows you to copy neural network objects from one to + another as long as their corresponding layers can be constructed from + each other. + - #loss_details() == loss_details_type(item.loss_details()) + - #subnet() == subnet_type(item.subnet()) + !*/ + + template <typename ...T> + add_loss_layer( + const LOSS_DETAILS& layer_det, + T&& ...args + ); + /*! + ensures + - #loss_details() == loss_details_type(layer_det) + - #subnet() == subnet_type(args) + !*/ + + template <typename ...T> + add_loss_layer( + LOSS_DETAILS&& layer_det, + T&& ...args + ); + /*! 
+ ensures + - #loss_details() == loss_details_type(layer_det) + - #subnet() == subnet_type(args) + !*/ + + template <typename ...T> + add_loss_layer( + T&& ...args + ); + /*! + ensures + - This version of the constructor is only called if loss_details_type can't + be constructed from the first thing in args. In this case, the args are + simply passed on to the sub layers in their entirety. + - #loss_details() == loss_details_type() + - #subnet() == subnet_type(args) + !*/ + + const subnet_type& subnet( + ) const; + /*! + ensures + - returns the immediate subnetwork of *this network. + !*/ + + subnet_type& subnet( + ); + /*! + ensures + - returns the immediate subnetwork of *this network. + !*/ + + const loss_details_type& loss_details( + ) const; + /*! + ensures + - returns the loss_details_type instance that defines the behavior of the + loss layer used by this network. + !*/ + + loss_details_type& loss_details( + ); + /*! + ensures + - returns the loss_details_type instance that defines the behavior of the + loss layer used by this network. + !*/ + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const; + /*! + requires + - [ibegin, iend) is an iterator range over input_type objects. + - std::distance(ibegin,iend) > 0 + ensures + - Converts the iterator range into a tensor and stores it into #data. + - #data.num_samples()%distance(ibegin,iend) == 0. + - #sample_expansion_factor() == #data.num_samples()/distance(ibegin,iend). + - #sample_expansion_factor() > 0 + - The data in the ith sample of #data corresponds to the input_type object + *(ibegin+i/sample_expansion_factor()). + - Invokes data.async_copy_to_device() so that the data begins transferring + to the GPU device, if present. + - This function is implemented by calling the to_tensor() routine defined + at the input layer of this network. + !*/ + + unsigned int sample_expansion_factor ( + ) const; + /*! + ensures + - When to_tensor() is invoked on this network's input layer it converts N + input objects into M samples, all stored inside a resizable_tensor. It + is always the case that M is some integer multiple of N. + sample_expansion_factor() returns the value of this multiplier. To be + very specific, it is always true that M==I*N where I is some integer. + This integer I is what is returned by sample_expansion_factor(). + !*/ + + // ------------- + + template <typename output_iterator> + void operator() ( + const tensor& x, + output_iterator obegin + ); + /*! + requires + - sample_expansion_factor() != 0 + (i.e. to_tensor() must have been called to set sample_expansion_factor() + to something non-zero.) + - x.num_samples()%sample_expansion_factor() == 0 + - x.num_samples() > 0 + - obegin == iterator pointing to the start of a range of + x.num_samples()/sample_expansion_factor() output_label_type elements. + ensures + - runs x through the network and writes the output to the range at obegin. + - loss_details().to_label() is used to write the network output into + obegin. + !*/ + + template <typename forward_iterator, typename label_iterator> + void operator() ( + forward_iterator ibegin, + forward_iterator iend, + label_iterator obegin + ); + /*! + requires + - [ibegin, iend) is an iterator range over input_type objects. + - std::distance(ibegin,iend) > 0 + - obegin == iterator pointing to the start of a range of + std::distance(ibegin,iend) output_label_type elements. 
+            ensures
+                - runs [ibegin,iend) through the network and writes the output to the range
+                  at obegin.
+                - loss_details().to_label() is used to write the network output into
+                  obegin.
+        !*/
+
+        // -------------
+
+        const output_label_type& operator() (
+            const input_type& x
+        );
+        /*!
+            ensures
+                - runs a single object, x, through the network and returns the output.
+                - loss_details().to_label() is used to convert the network output into an
+                  output_label_type.
+        !*/
+
+        template <typename iterable_type>
+        std::vector<output_label_type> operator() (
+            const iterable_type& data,
+            size_t batch_size = 128
+        );
+        /*!
+            requires
+                - batch_size > 0
+                - data must have a .begin() and .end() that supply iterators over a
+                  sequence of input_type elements.  E.g. data could have a type of
+                  std::vector<input_type>
+            ensures
+                - runs all the objects in data through the network and returns their
+                  predicted labels.  This means this function returns a vector V such that:
+                    - V.size() == data.size()
+                    - for all valid i: V[i] == the predicted label of data[i].
+                - Elements of data are run through the network in batches of batch_size
+                  items.  Using a batch_size > 1 can be faster because it better exploits
+                  the available hardware parallelism.
+                - loss_details().to_label() is used to convert the network output into an
+                  output_label_type.
+        !*/
+
+        template <typename ...T>
+        const output_label_type& process (
+            const input_type& x,
+            T&& ...args
+        );
+        /*!
+            ensures
+                - This function is just like (*this)(x), i.e. it runs a single object, x,
+                  through the network and returns the output.  But we additionally pass the
+                  given args to loss_details().to_label() as the 4th argument (or more,
+                  depending on how many things are in args) when converting the network
+                  output to an output_label_type.  This is useful, for instance, with loss
+                  layers like loss_mmod_ which has an optional adjust_threshold argument to
+                  to_label() that adjusts the detection threshold.  Therefore, for such
+                  networks you could call them like: net.process(some_image, -0.5), and -0.5
+                  would be passed as the adjust_threshold argument to to_label().
+        !*/
+
+        template <typename iterable_type, typename ...T>
+        std::vector<output_label_type> process_batch (
+            const iterable_type& data,
+            size_t batch_size,
+            T&& ...args
+        );
+        /*!
+            requires
+                - batch_size > 0
+                - data must have a .begin() and .end() that supply iterators over a
+                  sequence of input_type elements.  E.g. data could have a type of
+                  std::vector<input_type>
+            ensures
+                - This function is just like (*this)(data,batch_size), i.e. it runs a
+                  bunch of objects through the network and returns the outputs.  But we
+                  additionally pass the given args to loss_details().to_label() as the 4th
+                  argument (or more, depending on how many things are in args) when
+                  converting the network output to output_label_types.  This is useful,
+                  for instance, with loss layers like loss_mmod_ which has an optional
+                  adjust_threshold argument to to_label() that adjusts the detection
+                  threshold.  Therefore, for such networks you could call them like:
+                  net.process_batch(std::vector<image_type>({some_image, another_image}), 128, -0.5),
+                  and -0.5 would be passed as the adjust_threshold argument to to_label().
+        !*/
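+
+        // For instance, assuming a detection network whose loss layer's to_label()
+        // takes an adjust_threshold argument ("net" and "img" are illustrative names):
+        //   auto dets = net.process(img, -0.5);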
+
+        // -------------
+
+        template <typename label_iterator>
+        double compute_loss (
+            const tensor& x,
+            label_iterator lbegin
+        );
+        /*!
+            requires
+                - sample_expansion_factor() != 0
+                  (i.e. to_tensor() must have been called to set sample_expansion_factor()
+                  to something non-zero.)
+                - x.num_samples()%sample_expansion_factor() == 0
+                - x.num_samples() > 0
+                - lbegin == iterator pointing to the start of a range of
+                  x.num_samples()/sample_expansion_factor() training_label_type elements.
+            ensures
+                - runs x through the network, compares the output to the expected output
+                  pointed to by lbegin, and returns the resulting loss.
+                - for all valid k:
+                    - the expected label of the kth sample in x is *(lbegin+k/sample_expansion_factor()).
+                - This function does not update the network parameters.
+        !*/
+
+        template <typename forward_iterator, typename label_iterator>
+        double compute_loss (
+            forward_iterator ibegin,
+            forward_iterator iend,
+            label_iterator lbegin
+        );
+        /*!
+            requires
+                - [ibegin, iend) is an iterator range over input_type objects.
+                - std::distance(ibegin,iend) > 0
+                - lbegin == iterator pointing to the start of a range of
+                  std::distance(ibegin,iend) training_label_type elements.
+            ensures
+                - runs [ibegin,iend) through the network, compares the output to the
+                  expected output pointed to by lbegin, and returns the resulting loss.
+                - for all valid k:
+                    - the expected label of *(ibegin+k) is *(lbegin+k).
+                - This function does not update the network parameters.
+        !*/
+
+        // -------------
+
+        double compute_loss (
+            const tensor& x
+        );
+        /*!
+            requires
+                - LOSS_DETAILS is an unsupervised loss.  i.e. training_label_type==no_label_type.
+                - sample_expansion_factor() != 0
+                  (i.e. to_tensor() must have been called to set sample_expansion_factor()
+                  to something non-zero.)
+                - x.num_samples()%sample_expansion_factor() == 0
+                - x.num_samples() > 0
+            ensures
+                - runs x through the network and returns the resulting loss.
+                - This function does not update the network parameters.
+        !*/
+
+        template <typename forward_iterator>
+        double compute_loss (
+            forward_iterator ibegin,
+            forward_iterator iend
+        );
+        /*!
+            requires
+                - LOSS_DETAILS is an unsupervised loss.  i.e. training_label_type==no_label_type.
+                - [ibegin, iend) is an iterator range over input_type objects.
+                - std::distance(ibegin,iend) > 0
+            ensures
+                - runs [ibegin,iend) through the network and returns the resulting loss.
+                - This function does not update the network parameters.
+        !*/
+
+        // -------------
+
+        template <typename label_iterator>
+        double compute_parameter_gradients (
+            const tensor& x,
+            label_iterator lbegin
+        );
+        /*!
+            requires
+                - sample_expansion_factor() != 0
+                  (i.e. to_tensor() must have been called to set sample_expansion_factor()
+                  to something non-zero.)
+                - x.num_samples()%sample_expansion_factor() == 0
+                - x.num_samples() > 0
+                - lbegin == iterator pointing to the start of a range of
+                  x.num_samples()/sample_expansion_factor() training_label_type elements.
+            ensures
+                - runs x through the network, compares the output to the expected output
+                  pointed to by lbegin, and computes parameter and data gradients with
+                  respect to the loss, via backpropagation.  Specifically, this function
+                  updates get_final_data_gradient() and also, for each layer, the tensor
+                  returned by get_parameter_gradient().
+                - for all valid k:
+                    - the expected label of the kth sample in x is *(lbegin+k/sample_expansion_factor()).
+                - returns compute_loss(x,lbegin)
+        !*/
+
+        template <typename forward_iterator, typename label_iterator>
+        double compute_parameter_gradients (
+            forward_iterator ibegin,
+            forward_iterator iend,
+            label_iterator lbegin
+        );
+        /*!
+            requires
+                - [ibegin, iend) is an iterator range over input_type objects.
+ - std::distance(ibegin,iend) > 0 + - lbegin == iterator pointing to the start of a range of + std::distance(ibegin,iend) training_label_type elements. + ensures + - runs [ibegin,iend) through the network, compares the output to the + expected output pointed to by lbegin, and computes parameter and data + gradients with respect to the loss, via backpropagation. Specifically, + this function updates get_final_data_gradient() and also, for each layer, + the tensor returned by get_parameter_gradient(). + - for all valid k: + - the expected label of *(ibegin+k) is *(lbegin+k). + - returns compute_loss(ibegin,iend,lbegin) + !*/ + + double compute_parameter_gradients ( + const tensor& x + ); + /*! + requires + - LOSS_DETAILS is an unsupervised loss. i.e. training_label_type==no_label_type. + - sample_expansion_factor() != 0 + (i.e. to_tensor() must have been called to set sample_expansion_factor() + to something non-zero.) + - x.num_samples()%sample_expansion_factor() == 0 + - x.num_samples() > 0 + ensures + - runs x through the network and computes parameter and data gradients with + respect to the loss, via backpropagation. Specifically, this function + updates get_final_data_gradient() and also, for each layer, the tensor + returned by get_parameter_gradient(). + - returns compute_loss(x) + !*/ + + template <typename forward_iterator> + double compute_parameter_gradients ( + forward_iterator ibegin, + forward_iterator iend + ); + /*! + requires + - LOSS_DETAILS is an unsupervised loss. i.e. training_label_type==no_label_type. + - [ibegin, iend) is an iterator range over input_type objects. + - std::distance(ibegin,iend) > 0 + ensures + - runs [ibegin,iend) through the network and computes parameter and data + gradients with respect to the loss, via backpropagation. Specifically, + this function updates get_final_data_gradient() and also, for each layer, + the tensor returned by get_parameter_gradient(). + - returns compute_loss(ibegin,iend) + !*/ + + template <typename solver_type> + void update_parameters ( + sstack<solver_type> solvers, + double learning_rate + ); + /*! + requires + - solver_type is an implementation of the EXAMPLE_SOLVER interface defined + in solvers_abstract.h + - compute_parameter_gradients() has been called. + - The given solvers have only ever been used with this network. That + is, if you want to call update_parameters() on some other neural network + object then you must NOT reuse the same solvers object. + - solvers.size() >= num_computational_layers + - 0 < learning_rate <= 1 + ensures + - Updates all the parameters in the network. In particular, we pass each + layer's parameter gradient (i.e. the tensor returned by the layer's + get_parameter_gradient() member) through that layer's corresponding + solver object. This produces a parameter delta vector which we add to + the layer's parameters. + - The solvers use the given learning rate. + !*/ + + // ------------- + + void clean ( + ); + /*! + ensures + - Causes the network to forget about everything but its parameters. + - invokes subnet().clean() + !*/ + }; + + template <typename T, typename U> + std::ostream& operator<<(std::ostream& out, const add_loss_layer<T,U>& item); + /*! + prints the network architecture to the given output stream. + !*/ + + template <typename T, typename U> + void serialize(const add_loss_layer<T,U>& item, std::ostream& out); + template <typename T, typename U> + void deserialize(add_loss_layer<T,U>& item, std::istream& in); + /*! 
+        provides serialization support
+    !*/
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    template <typename ...T>
+    decorator_repeat_group<T...> repeat_group (
+        T&& ...args
+    );
+    /*!
+        ensures
+            - Decorates a group of variables.  This is essentially like std::make_tuple()
+              except its only purpose is to group variables together so they can be passed
+              to the repeat object's constructor.
+    !*/
+
+    template <
+        size_t num,
+        template<typename> class REPEATED_LAYER,
+        typename SUBNET
+        >
+    class repeat
+    {
+        /*!
+            REQUIREMENTS ON num
+                - num > 0
+
+            REQUIREMENTS ON REPEATED_LAYER
+                - REPEATED_LAYER must be a template that stacks more layers onto a deep neural
+                  network.  For example, if net_type were a network without a loss layer,
+                  then it should be legal to create a deeper network with a type of
+                  REPEATED_LAYER<net_type>.
+
+            REQUIREMENTS ON SUBNET
+                - One of the following must be true:
+                    - SUBNET is an add_layer object.
+                    - SUBNET is an add_tag_layer object.
+                    - SUBNET is an add_skip_layer object.
+                    - SUBNET is a repeat object.
+
+            WHAT THIS OBJECT REPRESENTS
+                This object adds more layers to a deep neural network.  In particular, it
+                adds REPEATED_LAYER on top of SUBNET num times.  So for example, if num were 2 then
+                repeat<2,REPEATED_LAYER,SUBNET> would create a network equivalent to REPEATED_LAYER<REPEATED_LAYER<SUBNET>>.
+
+                Also, this object provides an interface identical to the one defined by the
+                add_layer object except that we add the num_repetitions() and
+                get_repeated_layer() methods.  These additions are shown below along with
+                some additional explanatory comments.
+        !*/
+
+    public:
+
+        typedef SUBNET subnet_type;
+        typedef typename SUBNET::input_type input_type;
+        const static size_t num_computational_layers = (REPEATED_LAYER<SUBNET>::num_computational_layers-SUBNET::num_computational_layers)*num + SUBNET::num_computational_layers;
+        const static size_t num_layers = (REPEATED_LAYER<SUBNET>::num_layers-SUBNET::num_layers)*num + SUBNET::num_layers;
+        typedef REPEATED_LAYER<an_unspecified_input_type> repeated_layer_type;
+
+        template <typename T, typename ...U>
+        repeat(
+            T arg1,
+            U ...args2
+        );
+        /*!
+            ensures
+                - arg1 is used to initialize the num_repetitions() copies of REPEATED_LAYER inside
+                  this object.  That is, all the REPEATED_LAYER elements are initialized identically
+                  by being given copies of arg1.
+                - The rest of the arguments to the constructor, i.e. args2, are passed to
+                  SUBNET's constructor.
+        !*/
+
+        template <typename ...T, typename ...U>
+        repeat(
+            decorator_repeat_group<T...>&& arg1,
+            U ...args2
+        );
+        /*!
+            ensures
+                - arg1 is used to initialize the num_repetitions() copies of REPEATED_LAYER inside
+                  this object.  That is, all the REPEATED_LAYER elements are initialized identically
+                  by being given copies of an undecorated arg1.
+                - The rest of the arguments to the constructor, i.e. args2, are passed to
+                  SUBNET's constructor.
+        !*/
+
+        size_t num_repetitions (
+        ) const;
+        /*!
+            ensures
+                - returns num (i.e. the number of times REPEATED_LAYER was stacked on top of SUBNET)
+        !*/
+
+        const repeated_layer_type& get_repeated_layer (
+            size_t i
+        ) const;
+        /*!
+            requires
+                - i < num_repetitions()
+            ensures
+                - returns a reference to the i-th instance of REPEATED_LAYER.
For example, + get_repeated_layer(0) returns the instance of REPEATED_LAYER that is on the top of + the network while get_repeated_layer(num_repetitions()-1) returns the + instance of REPEATED_LAYER that is stacked immediately on top of SUBNET. + !*/ + + repeated_layer_type& get_repeated_layer ( + size_t i + ); + /*! + requires + - i < num_repetitions() + ensures + - returns a reference to the i-th instance of REPEATED_LAYER. For example, + get_repeated_layer(0) returns the instance of REPEATED_LAYER that is on the top of + the network while get_repeated_layer(num_repetitions()-1) returns the + instance of REPEATED_LAYER that is stacked immediately on top of SUBNET. + !*/ + + const subnet_type& subnet( + ) const; + /*! + ensures + - returns the SUBNET base network that repeat sits on top of. If you want + to access the REPEATED_LAYER components then you must use get_repeated_layer(). + !*/ + + subnet_type& subnet( + ); + /*! + ensures + - returns the SUBNET base network that repeat sits on top of. If you want + to access the REPEATED_LAYER components then you must use get_repeated_layer(). + !*/ + }; + + template < size_t num, template<typename> class T, typename U > + std::ostream& operator<<(std::ostream& out, const repeat<num,T,U>& item); + /*! + prints the network architecture to the given output stream. + !*/ + + template < size_t num, template<typename> class T, typename U > + void serialize(const repeat<num,T,U>& item, std::ostream& out); + template < size_t num, template<typename> class T, typename U > + void deserialize(repeat<num,T,U>& item, std::istream& in); + /*! + provides serialization support + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + unsigned long ID, + typename SUBNET + > + class add_tag_layer + { + /*! + REQUIREMENTS ON SUBNET + - One of the following must be true: + - SUBNET implements the EXAMPLE_INPUT_LAYER interface defined in + input_abstract.h. + - SUBNET is an add_layer object. + - SUBNET is an add_tag_layer object. + - SUBNET is an add_skip_layer object. + - SUBNET is a repeat object. + + WHAT THIS OBJECT REPRESENTS + This object adds a new layer to a deep neural network. However, this layer + simply performs the identity transform. This means it is a no-op and its + presence does not change the behavior of the network. It exists solely to + be used by add_skip_layer to reference a particular part of a network. + + Also, this object provides an interface identical to the one defined by the + add_layer object. + !*/ + }; + + template <unsigned long ID, typename U> + std::ostream& operator<<(std::ostream& out, const add_tag_layer<ID,U>& item); + /*! + prints the network architecture to the given output stream. + !*/ + + template <unsigned long ID, typename U> + void serialize(const add_tag_layer<ID,U>& item, std::ostream& out); + template <unsigned long ID, typename U> + void deserialize(add_tag_layer<ID,U>& item, std::istream& in); + /*! 
+ provides serialization support + !*/ + + template <typename SUBNET> using tag1 = add_tag_layer< 1, SUBNET>; + template <typename SUBNET> using tag2 = add_tag_layer< 2, SUBNET>; + template <typename SUBNET> using tag3 = add_tag_layer< 3, SUBNET>; + template <typename SUBNET> using tag4 = add_tag_layer< 4, SUBNET>; + template <typename SUBNET> using tag5 = add_tag_layer< 5, SUBNET>; + template <typename SUBNET> using tag6 = add_tag_layer< 6, SUBNET>; + template <typename SUBNET> using tag7 = add_tag_layer< 7, SUBNET>; + template <typename SUBNET> using tag8 = add_tag_layer< 8, SUBNET>; + template <typename SUBNET> using tag9 = add_tag_layer< 9, SUBNET>; + template <typename SUBNET> using tag10 = add_tag_layer<10, SUBNET>; + + template <template<typename SUBNET> class tag> + struct tag_id + { + /*! + REQUIREMENTS ON tag + Tag should be an add_tag_layer template such as tag1, tag2, etc. + + WHAT THIS OBJECT REPRESENTS + This is a tool for finding the numeric ID of a tag layer. For example, + tag_id<tag3>::id == 3. + !*/ + + const static unsigned long id; + }; + +// ---------------------------------------------------------------------------------------- + + template < + template<typename> class TAG_TYPE, + typename SUBNET + > + class add_skip_layer + { + /*! + REQUIREMENTS ON SUBNET + - One of the following must be true: + - SUBNET is an add_layer object. + - SUBNET is an add_tag_layer object. + - SUBNET is an add_skip_layer object. + - SUBNET is a repeat object. + + WHAT THIS OBJECT REPRESENTS + This object adds a new layer to a deep neural network which draws its + inputs from layer<TAG_TYPE>(subnet()) and performs the identity transform. + + Also, this object provides an interface identical to the one defined by the + add_layer object. + !*/ + }; + + template <template<typename> class T, typename U> + std::ostream& operator<<(std::ostream& out, const add_skip_layer<T,U>& item); + /*! + prints the network architecture to the given output stream. + !*/ + + template <template<typename> class T, typename U> + void serialize(const add_skip_layer<T,U>& item, std::ostream& out); + template <template<typename> class T, typename U> + void deserialize(add_skip_layer<T,U>& item, std::istream& in); + /*! + provides serialization support + !*/ + + template <typename SUBNET> using skip1 = add_skip_layer< tag1, SUBNET>; + template <typename SUBNET> using skip2 = add_skip_layer< tag2, SUBNET>; + template <typename SUBNET> using skip3 = add_skip_layer< tag3, SUBNET>; + template <typename SUBNET> using skip4 = add_skip_layer< tag4, SUBNET>; + template <typename SUBNET> using skip5 = add_skip_layer< tag5, SUBNET>; + template <typename SUBNET> using skip6 = add_skip_layer< tag6, SUBNET>; + template <typename SUBNET> using skip7 = add_skip_layer< tag7, SUBNET>; + template <typename SUBNET> using skip8 = add_skip_layer< tag8, SUBNET>; + template <typename SUBNET> using skip9 = add_skip_layer< tag9, SUBNET>; + template <typename SUBNET> using skip10 = add_skip_layer<tag10, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + unsigned int i, + typename net_type + > + auto& layer ( + net_type& n + ); + /*! + requires + - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or + add_tag_layer. + - i < net_type::num_layers + ensures + - This function allows you to access any layer in a network by its layer index + i. Therefore, it will walk i steps down the network and return the layer + object there. 
Since networks can be big, the best way to find layer index
+              numbers is to print a network to the screen since the print out will include
+              indexes for each layer.
+            - In general, this function chains together i calls to n.subnet() and returns
+              the result.  So for example:
+                - if (i == 0)
+                    - returns n
+                - else if (i == 1)
+                    - returns n.subnet()
+                - else if (i == 2)
+                    - returns n.subnet().subnet()
+                - else if (i == 3)
+                    - returns n.subnet().subnet().subnet()
+                - else
+                    - etc.
+              Except that when it hits a repeat layer it recurses into the repeated layers
+              contained inside.  That is, if the layer index indicates a layer in a repeat
+              object this function will make the appropriate call to get_repeated_layer()
+              and do the right thing.
+    !*/
+
+    template <
+        template<typename> class Match,
+        typename net_type
+        >
+    auto& layer (
+        net_type& n
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+        ensures
+            - returns the first layer in n that is of type Match.  E.g. if net_type is
+              fc<relu<fc<input<sample_type>>>> then calling layer<relu>(n) would return
+              layer<1>(n), that is, a reference to the relu layer.
+    !*/
+
+    template <
+        template<typename> class Match,
+        unsigned int i,
+        typename net_type
+        >
+    auto& layer (
+        net_type& n
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+        ensures
+            - returns layer<i>(layer<Match>(n))
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename net_type>
+    auto& input_layer (
+        net_type& net
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+        ensures
+            - returns the input layer of the given network object.  Specifically, this
+              function is equivalent to calling:
+                layer<net_type::num_layers-1>(net);
+              That is, you get the input layer details object for the network.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename net_type,
+        typename visitor
+        >
+    void visit_layer_parameters(
+        net_type& net,
+        visitor v
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+            - v is a function object with a signature equivalent to:
+                v(size_t idx, tensor& t)
+        ensures
+            - Loops over all the computational layers (i.e. layers with parameters, as
+              opposed to loss, tag, or input layers) in net and passes their parameters to
+              v().  To be specific, this function essentially performs the following:
+
+                size_t computational_layer_idx = 0;
+                for (size_t i = 0; i < net_type::num_layers; ++i)
+                {
+                    if (layer<i>(net) is a computational layer)
+                    {
+                        v(computational_layer_idx, layer<i>(net).layer_details().get_layer_params());
+                        ++computational_layer_idx;
+                    }
+                }
+            - When v() is called, the first argument is always < net_type::num_computational_layers.
+    !*/
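+
+    // For example, one way to count the total number of parameters in a network
+    // (a sketch; "net" is an illustrative, already-constructed network object):
+    //   size_t count = 0;
+    //   visit_layer_parameters(net, [&count](size_t, tensor& t){ count += t.size(); });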
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename net_type,
+        typename visitor
+        >
+    void visit_layer_parameter_gradients(
+        net_type& net,
+        visitor v
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+            - v is a function object with a signature equivalent to:
+                v(size_t idx, tensor& t)
+        ensures
+            - Loops over all the computational layers (i.e. layers with parameters, as
+              opposed to loss, tag, or input layers) in net and passes their parameter
+              gradients to v().  To be specific, this function essentially performs the
+              following:
+
+                size_t computational_layer_idx = 0;
+                for (size_t i = 0; i < net_type::num_layers; ++i)
+                {
+                    if (layer<i>(net) is a computational layer)
+                    {
+                        v(computational_layer_idx, layer<i>(net).get_parameter_gradient());
+                        ++computational_layer_idx;
+                    }
+                }
+            - When v() is called, the first argument is always < net_type::num_computational_layers.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename net_type,
+        typename visitor
+        >
+    void visit_layers(
+        net_type& net,
+        visitor v
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+            - v is a function object with a signature equivalent to:
+                v(size_t idx, any_net_type& t)
+              That is, it must take a size_t and then any of the network types such as
+              add_layer, add_loss_layer, etc.
+        ensures
+            - Loops over all the layers in net and calls v() on them.  To be specific, this
+              function essentially performs the following:
+
+                for (size_t i = 0; i < net_type::num_layers; ++i)
+                    v(i, layer<i>(net));
+    !*/
+
+    template <
+        typename net_type,
+        typename visitor
+        >
+    void visit_layers_backwards(
+        net_type& net,
+        visitor v
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+            - v is a function object with a signature equivalent to:
+                v(size_t idx, any_net_type& t)
+              That is, it must take a size_t and then any of the network types such as
+              add_layer, add_loss_layer, etc.
+        ensures
+            - Loops over all the layers in net and calls v() on them.  The loop happens in
+              the reverse order of visit_layers().  To be specific, this function
+              essentially performs the following:
+
+                for (size_t i = net_type::num_layers; i != 0; --i)
+                    v(i-1, layer<i-1>(net));
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        size_t begin,
+        size_t end,
+        typename net_type,
+        typename visitor
+        >
+    void visit_layers_range(
+        net_type& net,
+        visitor v
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+            - v is a function object with a signature equivalent to:
+                v(size_t idx, any_net_type& t)
+              That is, it must take a size_t and then any of the network types such as
+              add_layer, add_loss_layer, etc.
+            - begin <= end <= net_type::num_layers
+        ensures
+            - Loops over the layers in the range [begin,end) in net and calls v() on them.
+              The loop happens in the same order as visit_layers(), i.e. in order of
+              increasing layer index.  To be specific, this function essentially performs
+              the following:
+
+                for (size_t i = begin; i < end; ++i)
+                    v(i, layer<i>(net));
+    !*/
+
+    template <
+        size_t begin,
+        size_t end,
+        typename net_type,
+        typename visitor
+        >
+    void visit_layers_backwards_range(
+        net_type& net,
+        visitor v
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+            - v is a function object with a signature equivalent to:
+                v(size_t idx, any_net_type& t)
+              That is, it must take a size_t and then any of the network types such as
+              add_layer, add_loss_layer, etc.
+            - begin <= end <= net_type::num_layers
+        ensures
+            - Loops over the layers in the range [begin,end) in net and calls v() on them.
+ The loop happens in the reverse order of visit_layers_range(). To be specific, + this function essentially performs the following: + + for (size_t i = end; i != begin; --i) + v(i-1, layer<i-1>(net)); + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + unsigned long tag_id, + typename net_type, + typename visitor + > + void visit_layers_until_tag( + net_type& net, + visitor v + ); + /*! + requires + - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or + add_tag_layer. + - v is a function object with a signature equivalent to: + v(any_net_type& t) + That is, it must take any of the network types such as add_layer, + add_loss_layer, etc. + ensures + - Loops over all the layers in net beginning with layer<0>(net) and going until + a tag layer with an ID of tag_id is encountered. To be specific, this + function essentially performs the following: + + size_t i = 0; + while(layer<i>(net) isn't an add_tag_layer with ID == tag_id) { + v(layer<i>(net)); + ++i; + } + v(layer<i>(net)); // also visits the tag layer itself at the very end. + !*/ + +// ---------------------------------------------------------------------------------------- + + struct layer_test_results + { + std::string log; + bool was_good; + + operator bool() const { return was_good; } + }; + + inline std::ostream& operator<< (std::ostream& out, const layer_test_results& item) + { + out << item.log; + return out; + } + + template < + typename layer_details_type + > + layer_test_results test_layer ( + layer_details_type l + ); + /*! + ensures + - Checks if l correctly implements the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined in layers_abstract.h. Importantly, it computes numerical approximations + to the gradients and compares them to the outputs of the layer. + - The results of the testing are returned. In particular, if the returned object + is RESULT then we will have: + - RESULT.was_good == false if and only if the layer failed the testing. + - RESULT.log == a string describing why the testing failed if was_good==false. + - Note that this function is only capable of checking layers that take + arbitrary subnetworks as input. So if you have designed a layer that expects + only a certain restricted type of subnetwork then you might get a compile or + runtime error when you call this function. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_CORE_ABSTRACT_H_ + diff --git a/ml/dlib/dlib/dnn/cpu_dlib.cpp b/ml/dlib/dlib/dnn/cpu_dlib.cpp new file mode 100644 index 000000000..ed5661102 --- /dev/null +++ b/ml/dlib/dlib/dnn/cpu_dlib.cpp @@ -0,0 +1,2170 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
+#ifndef DLIB_DNN_CPU_cPP_ +#define DLIB_DNN_CPU_cPP_ + +// This file contains CPU implementations of the GPU based functions in cuda_dlib.h + +#include "cpu_dlib.h" +#include "tensor_tools.h" +#include "../image_transforms/interpolation.h" +#include "../threads.h" + +namespace dlib +{ + namespace cpu + { + + // ----------------------------------------------------------------------------------- + + void multiply ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + DLIB_CASSERT(dest.k() == src1.k() && src1.k() == src2.k() && + dest.nr() == src1.nr() && src1.nr() == src2.nr() && + dest.nc() == src1.nc() && src1.nc() == src2.nc() ); + const long MD = std::max(std::max(dest.num_samples(),src1.num_samples()),src2.num_samples()); + DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) && + (src1.num_samples()==1 || src1.num_samples()==MD) && + (src2.num_samples()==1 || src2.num_samples()==MD) ); + + if (dest.size() == 0) + return; + + const size_t max_size = std::max(std::max(dest.size(),src1.size()),src2.size()); + const auto d = dest.host(); + const auto s1 = src1.host(); + const auto s2 = src2.host(); + if (dest.size() == src1.size() && src1.size() == src2.size()) + { + if (add_to) + { + for (size_t i = 0; i < src1.size(); ++i) + d[i] += s1[i]*s2[i]; + } + else + { + for (size_t i = 0; i < src1.size(); ++i) + d[i] = s1[i]*s2[i]; + } + } + else if (dest.num_samples() == 1) + { + if (!add_to) + { + for (size_t i = 0; i < dest.size(); ++i) + d[i] = 0; + } + for (size_t i = 0; i < max_size; ++i) + d[i%dest.size()] += s1[i%src1.size()]*s2[i%src2.size()]; + } + else + { + if (add_to) + { + for (size_t i = 0; i < max_size; ++i) + d[i] += s1[i%src1.size()]*s2[i%src2.size()]; + } + else + { + for (size_t i = 0; i < max_size; ++i) + d[i] = s1[i%src1.size()]*s2[i%src2.size()]; + } + } + } + + // ------------------------------------------------------------------------------------ + + void multiply_conv ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + auto d = dest.host(); + auto s1 = src1.host(); + auto s2 = src2.host(); + if (have_same_dimensions(dest,src1)) + { + DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 && src2.nc() == 1 && src2.k() == src1.k()); + + if (add_to) + { + for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + *d++ += (*s1++)*s2[k]; + } + } + } + } + } + else + { + for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + *d++ = (*s1++)*s2[k]; + } + } + } + } + } + } + else + { + DLIB_CASSERT(have_same_dimensions(src1,src2)); + DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 && dest.nc() == 1 && dest.k() == src1.k()); + + if (!add_to) + { + for (long k = 0; k < src1.k(); ++k) + d[k] = 0; + } + + for (long n = 0; n < src1.num_samples(); ++n) + { + for (long k = 0; k < src1.k(); ++k) + { + for (long r = 0; r < src1.nr(); ++r) + { + for (long c = 0; c < src1.nc(); ++c) + { + d[k] += (*s1++)*(*s2++); + } + } + } + } + } + } + + // ------------------------------------------------------------------------------------ + + void scale_channels ( + bool add_to, + tensor& dest, + const tensor& src, + const tensor& scales + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src) && + scales.num_samples() == src.num_samples() && + scales.k() == src.k() && + 
scales.nr() == 1 && + scales.nc() == 1 ); + + if (dest.size() == 0) + return; + + if (add_to) + { + auto d = dest.host(); + auto s = src.host(); + auto scal = scales.host(); + + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < src.k(); ++k) + { + const auto scale = scal[n*scales.k() + k]; + for (long r = 0; r < src.nr(); ++r) + { + for (long c = 0; c < src.nc(); ++c) + { + *d++ += (*s++) * scale; + } + } + } + } + + + } + else + { + auto d = dest.host_write_only(); + auto s = src.host(); + auto scal = scales.host(); + + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < src.k(); ++k) + { + const auto scale = scal[n*scales.k() + k]; + for (long r = 0; r < src.nr(); ++r) + { + for (long c = 0; c < src.nc(); ++c) + { + *d++ = (*s++) * scale; + } + } + } + } + } + } + + // ------------------------------------------------------------------------------------ + + void add( + float beta, + tensor& dest, + float alpha, + const tensor& src + ) + { + DLIB_CASSERT( + (have_same_dimensions(src, dest) || + (src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1) || + (src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc()) || + (src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc()) || + (src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1)) && + is_same_object(src,dest) == false , + "\n\t dest.num_samples(): " << dest.num_samples() + <<"\n\t dest.k(): " << dest.k() + <<"\n\t dest.nr(): " << dest.nr() + <<"\n\t dest.nc(): " << dest.nc() + <<"\n\t src.num_samples(): " << src.num_samples() + <<"\n\t src.k(): " << src.k() + <<"\n\t src.nr(): " << src.nr() + <<"\n\t src.nc(): " << src.nc() + ); + + + if (beta == 0 && alpha == 0) + { + dest = 0; + return; + } + + auto d = dest.host(); + auto s = src.host(); + for (long n = 0; n < dest.num_samples(); ++n) + { + const auto sn = src.num_samples()==1 ? 0:n; + for (long k = 0; k < dest.k(); ++k) + { + const auto sk = src.k()==1 ? 0:k; + for (long r = 0; r < dest.nr(); ++r) + { + const auto sr = src.nr()==1 ? 0:r; + for (long c = 0; c < dest.nc(); ++c) + { + const auto sc = src.nc()==1 ? 0:c; + + const auto s_idx = ((sn*src.k() + sk)*src.nr() + sr)*src.nc() + sc; + *d = beta*(*d) + alpha*s[s_idx]; + ++d; + } + } + } + } + } + + // ---------------------------------------------------------------------------------------- + + void add ( + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + auto d = dest.host(); + auto s1 = src1.host(); + auto s2 = src2.host(); + + // Do the simple and fast version if everything has the same dimensions + if (have_same_dimensions(dest, src1) && + have_same_dimensions(dest, src2)) + { + for (size_t i = 0; i < dest.size(); ++i) + d[i] = s1[i] + s2[i]; + return; + } + + // Otherwise, do the more complex version with bounds checking. 
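+            // The tensors may differ in shape, so each (n,k,r,c) coordinate of dest
+            // gets the sum of the corresponding src1 and src2 elements, where an
+            // element that falls outside a source tensor is treated as 0.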
+ for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + float v1 = 0; + float v2 = 0; + + // if this index is inside src1 + if (n < src1.num_samples() && + k < src1.k() && + r < src1.nr() && + c < src1.nc() ) + { + const auto s_idx = ((n*src1.k() + k)*src1.nr() + r)*src1.nc() + c; + v1 = s1[s_idx]; + } + + // if this index is inside src2 + if (n < src2.num_samples() && + k < src2.k() && + r < src2.nr() && + c < src2.nc() ) + { + const auto s_idx = ((n*src2.k() + k)*src2.nr() + r)*src2.nc() + c; + v2 = s2[s_idx]; + } + + *d = v1 + v2; + ++d; + } + } + } + } + } + + // ---------------------------------------------------------------------------------------- + + void multiply_zero_padded ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + auto d = dest.host(); + auto s1 = src1.host(); + auto s2 = src2.host(); + + // Do the simple and fast version if everything has the same dimensions + if (have_same_dimensions(dest, src1) && + have_same_dimensions(dest, src2)) + { + if (add_to) + { + for (size_t i = 0; i < dest.size(); ++i) + d[i] += s1[i] * s2[i]; + } + else + { + for (size_t i = 0; i < dest.size(); ++i) + d[i] = s1[i] * s2[i]; + } + return; + } + + // Otherwise, do the more complex version with bounds checking. + for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + float v1 = 0; + float v2 = 0; + + // if this index is inside src1 + if (n < src1.num_samples() && + k < src1.k() && + r < src1.nr() && + c < src1.nc() ) + { + const auto s_idx = ((n*src1.k() + k)*src1.nr() + r)*src1.nc() + c; + v1 = s1[s_idx]; + } + + // if this index is inside src2 + if (n < src2.num_samples() && + k < src2.k() && + r < src2.nr() && + c < src2.nc() ) + { + const auto s_idx = ((n*src2.k() + k)*src2.nr() + r)*src2.nc() + c; + v2 = s2[s_idx]; + } + + if (add_to) + *d += v1 * v2; + else + *d = v1 * v2; + ++d; + } + } + } + } + } + + // ---------------------------------------------------------------------------------------- + + void assign_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + grad.num_samples() == 1 && + gradient_input.k() == grad.k() && + gradient_input.nr() == grad.nr() && + gradient_input.nc() == grad.nc() && + gradient_input.size() > 0); + + auto out = grad.host(); + auto in = gradient_input.host(); + + for (size_t i = 0; i < grad.size(); ++i) + out[i] = *in++; + + for (long j = 1; j < gradient_input.num_samples(); ++j) + { + for (size_t i = 0; i < grad.size(); ++i) + out[i] += *in++; + } + } + + // ------------------------------------------------------------------------------------ + + void assign_conv_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + grad.num_samples() == 1 && + grad.k() >= 1 && + grad.nr() == 1 && + grad.nc() == 1 && + gradient_input.k() == grad.k() && + gradient_input.size() > 0 && + is_same_object(grad,gradient_input) == false + ); + + auto g = grad.host(); + auto gi = gradient_input.host(); + + for (long k = 0; k < gradient_input.k(); ++k) + g[k] = 0; + + for (long n = 0; n < gradient_input.num_samples(); ++n) + { + for (long k = 0; k < gradient_input.k(); ++k) + { + for (long r = 0; r < gradient_input.nr(); ++r) + { + for (long c = 0; c < gradient_input.nc(); ++c) + { + g[k] += (*gi++); + } + } + } + } + } + + // 
----------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const float A, + const float B + ) + { + DLIB_CASSERT(dest.size()==src.size()); + const auto d = dest.host(); + const auto s = src.host(); + for (size_t i = 0; i < src.size(); ++i) + d[i] = A*s[i] + B; + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B, + const float C + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + const auto d = dest.host(); + const auto s1 = src1.host(); + const auto s2 = src2.host(); + for (size_t i = 0; i < src1.size(); ++i) + d[i] = A*s1[i] + B*s2[i] + C; + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C, + const float D + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + DLIB_CASSERT(dest.size()==src3.size()); + const auto d = dest.host(); + const auto s1 = src1.host(); + const auto s2 = src2.host(); + const auto s3 = src3.host(); + for (size_t i = 0; i < src1.size(); ++i) + d[i] = A*s1[i] + B*s2[i] + C*s3[i] + D; + } + + void affine_transform_range( + size_t begin, + size_t end, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + DLIB_CASSERT(dest.size()==src3.size()); + DLIB_CASSERT(begin <= end && end <= dest.size()); + const auto d = dest.host(); + const auto s1 = src1.host(); + const auto s2 = src2.host(); + const auto s3 = src3.host(); + for (size_t i = begin; i < end; ++i) + d[i] = A*s1[i] + B*s2[i] + C*s3[i]; + } + + // ----------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + DLIB_CASSERT( + ((A.num_samples()==1 && B.num_samples()==1) || + (A.num_samples()==src.num_samples() && B.num_samples()==src.num_samples())) && + A.nr()==B.nr() && B.nr()==src.nr() && + A.nc()==B.nc() && B.nc()==src.nc() && + A.k() ==B.k() && B.k()==src.k()); + + auto d = dest.host(); + auto s = src.host(); + const auto a = A.host(); + const auto b = B.host(); + if (A.num_samples() == 1) + { + const long num = src.size()/src.num_samples(); + for (long i = 0; i < src.num_samples(); ++i) + { + for (long j = 0; j < num; ++j) + { + *d = a[j]*(*s) + b[j]; + d++; + s++; + } + } + } + else + { + for (size_t i = 0; i < src.size(); ++i) + d[i] = a[i]*s[i] + b[i]; + } + } + + // ----------------------------------------------------------------------------------- + + void affine_transform_conv( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + DLIB_CASSERT(have_same_dimensions(A,B)); + DLIB_CASSERT(A.num_samples() == 1 && + A.nr() == 1 && + A.nc() == 1 && + A.k() == src.k()); + + auto d = dest.host(); + auto s = src.host(); + const auto a = A.host(); + const auto b = B.host(); + for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + *d++ = a[k]*(*s++) + b[k]; + } + } + } + } + } + + // 
---------------------------------------------------------------------------------------- + + void affine_transform( + const rectangle& rect, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + float A, + float B, + float C + ) + { + DLIB_CASSERT(dest.size() == src1.size()); + DLIB_CASSERT(dest.size() == src2.size()); + DLIB_CASSERT(dest.size() == src3.size()); + DLIB_CASSERT(dest.num_samples() == src1.num_samples()); + DLIB_CASSERT(dest.num_samples() == src2.num_samples()); + DLIB_CASSERT(dest.num_samples() == src3.num_samples()); + DLIB_CASSERT(rectangle(0,0, dest.size()/dest.num_samples()-1, dest.num_samples()-1).contains(rect)); + + + auto d = dest.host(); + auto s1 = src1.host(); + auto s2 = src2.host(); + auto s3 = src3.host(); + + const auto nc = dest.size()/dest.num_samples(); + + for (long r = rect.top(); r <= rect.bottom(); ++r) + { + for (long c = rect.left(); c <= rect.right(); ++c) + { + auto idx = r*nc + c; + d[idx] = s1[idx]*A + s2[idx]*B + s3[idx]*C; + } + } + + } + + // ----------------------------------------------------------------------------------- + + void compute_adam_update ( + size_t begin, + size_t end, + tensor& s, + tensor& m, + tensor& v, + const float t, + const float learning_rate, + const float weight_decay, + const float momentum1, + const float momentum2, + const tensor& params, + const tensor& params_grad + ) + { + DLIB_CASSERT(s.size() == m.size() && + s.size() == v.size() && + s.size() == params.size() && + s.size() == params_grad.size()); + DLIB_CASSERT(begin <= end && end <= params.size()); + const float eps = 1e-8; + const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t)); + + // The loop is equivalent to doing this: + // m = momentum1*m + (1-momentum1) * (weight_decay*params + params_grad); + // v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad); + // s = -alpha*m/(sqrt(v) + eps); + auto pm = m.host(); + auto pv = v.host(); + auto ps = s.host_write_only(); + auto pparams = params.host(); + auto ppgrad = params_grad.host(); + for (size_t i = begin; i < end; ++i) + { + float g = weight_decay*pparams[i] + ppgrad[i]; + pm[i] = momentum1*pm[i] + (1-momentum1)*g; + pv[i] = momentum2*pv[i] + (1-momentum2)*g*g; + ps[i] = -alpha*pm[i]/(std::sqrt(pv[i]) + eps); + } + } + + // ----------------------------------------------------------------------------------- + + void batch_normalize_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ) + { + DLIB_CASSERT( + gamma.num_samples() == 1 && + gamma.nr() == src.nr() && + gamma.nc() == src.nc() && + gamma.k() == src.k() && + have_same_dimensions(gamma, beta) && + have_same_dimensions(gamma, running_means) && + have_same_dimensions(gamma, running_variances) && + eps > 0, + "\ngamma.num_samples(): " << gamma.num_samples() << + "\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\nbeta.num_samples(): " << beta.num_samples() << + "\nbeta.k(): " << beta.k() << + "\nbeta.nr(): " << beta.nr() << + "\nbeta.nc(): " << beta.nc() << + "\nrunning_means.num_samples(): " << running_means.num_samples() << + "\nrunning_means.k(): " << running_means.k() << + "\nrunning_means.nr(): " << running_means.nr() << + "\nrunning_means.nc(): " << running_means.nc() << + "\nrunning_variances.num_samples(): " << running_variances.num_samples() << + 
"\nrunning_variances.k(): " << running_variances.k() << + "\nrunning_variances.nr(): " << running_variances.nr() << + "\nrunning_variances.nc(): " << running_variances.nc() << + "\nsrc.k(): " << src.k() << + "\nsrc.nr(): " << src.nr() << + "\nsrc.nc(): " << src.nc() << + "\neps: " << eps + ); + dest.copy_size(src); + + auto d = dest.host(); + auto s = src.host(); + auto g = gamma.host(); + auto b = beta.host(); + auto m = running_means.host(); + auto v = running_variances.host(); + + const long num = src.k()*src.nr()*src.nc(); + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < num; ++k) + { + *d = g[k]*(*s - m[k])/std::sqrt(v[k]+eps) + b[k]; + ++d; + ++s; + } + } + } + + void batch_normalize ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& invstds, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ) + { + DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor); + DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means)); + DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds)); + DLIB_CASSERT( + src.num_samples() > 1 && + gamma.num_samples() == 1 && + beta.num_samples() == 1 && + gamma.nr() == beta.nr() && beta.nr() == src.nr() && + gamma.nc() == beta.nc() && beta.nc() == src.nc() && + gamma.k() == beta.k() && beta.k() == src.k() && + eps > 0, + "\ngamma.num_samples(): " << gamma.num_samples() << + "\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\nbeta.num_samples(): " << beta.num_samples() << + "\nbeta.k(): " << beta.k() << + "\nbeta.nr(): " << beta.nr() << + "\nbeta.nc(): " << beta.nc() << + "\nsrc.k(): " << src.k() << + "\nsrc.nr(): " << src.nr() << + "\nsrc.nc(): " << src.nc() << + "\neps: " << eps + ); + + dest.copy_size(src); + means.set_size(1, src.k(), src.nr(), src.nc()); + invstds.set_size(1, src.k(), src.nr(), src.nc()); + + // first compute means and invstds + means = 0; + invstds = 0; + const auto p_invstds = invstds.host(); + const auto p_means = means.host(); + auto p_src = src.host(); + const long num = src.k()*src.nr()*src.nc(); + // compute means, and sum of squares + for (long i = 0; i < num; ++i) + { + for (long n = 0; n < src.num_samples(); ++n) + { + float val = p_src[n*num+i]; + p_means[i] += val; + p_invstds[i] += val*val; + } + } + means /= src.num_samples(); + invstds /= src.num_samples(); + // copy data back to host + invstds.host(); means.host(); + + // compute variances + running_variances.copy_size(invstds); + auto rvar = running_variances.host(); + // This scale makes the running variances unbiased. 
+ const double scale = (src.num_samples())/(src.num_samples()-1.0); + for (long i = 0; i < num; ++i) + { + auto actual_var = p_invstds[i] - p_means[i]*p_means[i]; + if (averaging_factor == 1) + rvar[i] = scale*actual_var; + else + rvar[i] = (1-averaging_factor)*rvar[i] + scale*averaging_factor*actual_var; + + p_invstds[i] = 1.0f/std::sqrt(actual_var + eps); + } + + p_src = src.host(); + auto p_dest = dest.host(); + const auto p_gamma = gamma.host(); + const auto p_beta = beta.host(); + for (long n = 0; n < src.num_samples(); ++n) + { + for (long i = 0; i < num; ++i) + { + *p_dest = (*p_src - p_means[i])*p_invstds[i]; + *p_dest = (*p_dest)*p_gamma[i] + p_beta[i]; + ++p_src; + ++p_dest; + } + } + + // now keep track of the running means + running_means.copy_size(means); + if (averaging_factor != 1) + running_means = (1-averaging_factor)*mat(running_means) + averaging_factor*mat(means); + else + running_means = means; + } + + void batch_normalize_gradient ( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ) + { + + const long num = src.k()*src.nr()*src.nc(); + DLIB_CASSERT(src.num_samples() > 1); + DLIB_CASSERT(num == (long)means.size()); + DLIB_CASSERT(num == (long)invstds.size()); + DLIB_CASSERT(num == (long)gamma.size()); + DLIB_CASSERT(num == (long)gamma_grad.size()); + DLIB_CASSERT(num == (long)beta_grad.size()); + DLIB_CASSERT(have_same_dimensions(gradient_input, src)); + DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad)); + DLIB_CASSERT(eps > 0); + + beta_grad = 0; + gamma_grad = 0; + auto p_grad = gradient_input.host(); + auto p_src = src.host(); + const auto p_gamma = gamma.host(); + const auto p_gamma_grad = gamma_grad.host(); + const auto p_beta_grad = beta_grad.host(); + const auto p_invstds = invstds.host(); + const auto p_means = means.host(); + + resizable_tensor dvars, dmeans; + dvars.copy_size(invstds); + dmeans.copy_size(means); + dvars = 0; + dmeans = 0; + const auto p_dvars = dvars.host(); + const auto p_dmeans = dmeans.host(); + + for (long n = 0; n < src.num_samples(); ++n) + { + for (long i = 0; i < num; ++i) + { + const float x_hat = (*p_src - p_means[i])*p_invstds[i]; + p_beta_grad[i] += *p_grad; + p_gamma_grad[i] += (*p_grad)*x_hat; + + const float dx = *p_grad * p_gamma[i]; + + p_dvars[i] += dx*(*p_src - p_means[i])*-0.5*std::pow(p_invstds[i], 3.0f); + + ++p_grad; + ++p_src; + } + } + + const float invnum = 1.0f/src.num_samples(); + p_grad = gradient_input.host(); + p_src = src.host(); + for (long n = 0; n < src.num_samples(); ++n) + { + for (long i = 0; i < num; ++i) + { + const float dx = *p_grad * p_gamma[i]; + + p_dmeans[i] += dx*-p_invstds[i] + p_dvars[i] * -2*(*p_src - p_means[i])*invnum; + + ++p_grad; + ++p_src; + } + } + p_grad = gradient_input.host(); + p_src = src.host(); + auto p_src_grad = src_grad.host(); + for (long n = 0; n < src.num_samples(); ++n) + { + for (long i = 0; i < num; ++i) + { + const float dx = *p_grad * p_gamma[i]; + + *p_src_grad += dx*p_invstds[i] + + p_dvars[i] *2*(*p_src - p_means[i])*invnum + + p_dmeans[i]*invnum; + + + ++p_grad; + ++p_src; + ++p_src_grad; + } + } + } + + // ---------------------------------------------------------------------------------------- + + void batch_normalize_conv_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& 
running_variances + ) + { + DLIB_CASSERT( + gamma.num_samples() == 1 && + gamma.nr() == 1 && + gamma.nc() == 1 && + gamma.k() == src.k() && + have_same_dimensions(gamma, beta) && + have_same_dimensions(gamma, running_means) && + have_same_dimensions(gamma, running_variances) && + eps > 0, + "\ngamma.num_samples(): " << gamma.num_samples() << + "\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\nbeta.num_samples(): " << beta.num_samples() << + "\nbeta.k(): " << beta.k() << + "\nbeta.nr(): " << beta.nr() << + "\nbeta.nc(): " << beta.nc() << + "\nrunning_means.num_samples(): " << running_means.num_samples() << + "\nrunning_means.k(): " << running_means.k() << + "\nrunning_means.nr(): " << running_means.nr() << + "\nrunning_means.nc(): " << running_means.nc() << + "\nrunning_variances.num_samples(): " << running_variances.num_samples() << + "\nrunning_variances.k(): " << running_variances.k() << + "\nrunning_variances.nr(): " << running_variances.nr() << + "\nrunning_variances.nc(): " << running_variances.nc() << + "\nsrc.k(): " << src.k() << + "\nsrc.nr(): " << src.nr() << + "\nsrc.nc(): " << src.nc() << + "\neps: " << eps + ); + dest.copy_size(src); + + auto d = dest.host(); + auto s = src.host(); + auto g = gamma.host(); + auto b = beta.host(); + auto m = running_means.host(); + auto v = running_variances.host(); + + const long num = src.nr()*src.nc(); + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < src.k(); ++k) + { + const float invstd = 1.0f/std::sqrt(v[k] + eps); + for (long j = 0; j < num; ++j) + { + *d = g[k]*(*s - m[k])*invstd + b[k]; + ++d; + ++s; + } + } + } + } + + void batch_normalize_conv ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& invstds, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ) + { + DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor); + DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means)); + DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds)); + DLIB_CASSERT( + src.num_samples() > 1 && + gamma.num_samples() == 1 && + beta.num_samples() == 1 && + gamma.nr() == 1 && + beta.nr() == 1 && + gamma.nc() == 1 && + beta.nc() == 1 && + gamma.k() == beta.k() && beta.k() == src.k() && + eps > 0, + "\ngamma.num_samples(): " << gamma.num_samples() << + "\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\nbeta.num_samples(): " << beta.num_samples() << + "\nbeta.k(): " << beta.k() << + "\nbeta.nr(): " << beta.nr() << + "\nbeta.nc(): " << beta.nc() << + "\nsrc.k(): " << src.k() << + "\nsrc.nr(): " << src.nr() << + "\nsrc.nc(): " << src.nc() << + "\neps: " << eps + ); + + dest.copy_size(src); + means.set_size(1, src.k()); + invstds.set_size(1, src.k()); + + // first compute means and invstds + means = 0; + invstds = 0; + const auto p_invstds = invstds.host(); + const auto p_means = means.host(); + const auto p_gamma = gamma.host(); + const auto p_beta = beta.host(); + auto p_src = src.host(); + const long num = src.nr()*src.nc(); + // compute means, and sum of squares + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < src.k(); ++k) + { + for (long i = 0; i < num; ++i) + { + p_means[k] += *p_src; + p_invstds[k] += (*p_src)*(*p_src); + ++p_src; 
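+                    // (p_means[k] and p_invstds[k] hold per-channel sums of x and x*x;
+                    // the divisions just below turn them into E[x] and E[x^2])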
+ } + } + } + means /= src.num_samples()*num; + invstds /= src.num_samples()*num; + // copy data back to host + invstds.host(); means.host(); + + p_src = src.host(); + // compute variances + running_variances.copy_size(invstds); + auto rvar = running_variances.host(); + // This scale makes the running variances unbiased. + const double scale = (src.num_samples()*num)/(src.num_samples()*num-1.0); + for (long k = 0; k < src.k(); ++k) + { + float actual_var = p_invstds[k] - p_means[k]*p_means[k]; + if (averaging_factor == 1) + rvar[k] = scale*actual_var; + else + rvar[k] = (1-averaging_factor)*rvar[k] + scale*averaging_factor*actual_var; + + p_invstds[k] = 1.0f/std::sqrt(actual_var + eps); + } + + p_src = src.host(); + auto p_dest = dest.host(); + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < src.k(); ++k) + { + for (long i = 0; i < num; ++i) + { + *p_dest = (*p_src - p_means[k])*p_invstds[k]; + *p_dest = (*p_dest)*p_gamma[k] + p_beta[k]; + ++p_src; + ++p_dest; + } + } + } + + // now keep track of the running means + running_means.copy_size(means); + if (averaging_factor != 1) + running_means = (1-averaging_factor)*mat(running_means) + averaging_factor*mat(means); + else + running_means = means; + } + + void batch_normalize_conv_gradient( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ) + { + + const long num = src.nr()*src.nc(); + DLIB_CASSERT(src.num_samples() > 1); + DLIB_CASSERT(src.k() == (long)means.size()); + DLIB_CASSERT(src.k() == (long)invstds.size()); + DLIB_CASSERT(src.k() == (long)gamma.size()); + DLIB_CASSERT(src.k() == (long)gamma_grad.size()); + DLIB_CASSERT(src.k() == (long)beta_grad.size()); + DLIB_CASSERT(have_same_dimensions(gradient_input, src)); + DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad)); + DLIB_CASSERT(eps > 0); + + beta_grad = 0; + gamma_grad = 0; + + auto p_grad = gradient_input.host(); + auto p_src = src.host(); + const auto p_gamma = gamma.host(); + const auto p_gamma_grad = gamma_grad.host(); + const auto p_beta_grad = beta_grad.host(); + const auto p_invstds = invstds.host(); + const auto p_means = means.host(); + + resizable_tensor dvars, dmeans; + dvars.copy_size(invstds); + dmeans.copy_size(means); + dvars = 0; + dmeans = 0; + const auto p_dvars = dvars.host(); + const auto p_dmeans = dmeans.host(); + + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < src.k(); ++k) + { + const float invstd_pow = -0.5*std::pow(p_invstds[k], 3.0f); + for (long i = 0; i < num; ++i) + { + const float x_hat = (*p_src - p_means[k])*p_invstds[k]; + p_beta_grad[k] += *p_grad; + p_gamma_grad[k] += (*p_grad)*x_hat; + + const float dx = *p_grad * p_gamma[k]; + + p_dvars[k] += dx*(*p_src - p_means[k])*invstd_pow; + + ++p_grad; + ++p_src; + } + } + } + + p_grad = gradient_input.host(); + p_src = src.host(); + const float invnum = 1.0f/(src.num_samples()*num); + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < src.k(); ++k) + { + for (long i = 0; i < num; ++i) + { + const float dx = *p_grad * p_gamma[k]; + + p_dmeans[k] += -dx*p_invstds[k] + p_dvars[k] * -2*(*p_src - p_means[k])*invnum; + + ++p_grad; + ++p_src; + } + } + } + p_grad = gradient_input.host(); + p_src = src.host(); + auto p_src_grad = src_grad.host(); + for (long n = 0; n < src.num_samples(); ++n) + { + for (long k = 0; k < src.k(); ++k) + { + for (long i = 0; i < num; ++i) + { + 
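// Final pass of the chain rule, applied per element:
+                        //   dL/dx = dL/dx_hat*invstd + dL/dvar*2*(x-mean)/N + dL/dmean/N
+                        // with N == src.num_samples()*num; dx below is dL/dx_hat.
+                        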
const float dx = *p_grad * p_gamma[k]; + + *p_src_grad += dx*p_invstds[k] + + p_dvars[k]*2*(*p_src - p_means[k])*invnum + + p_dmeans[k]*invnum; + + + ++p_grad; + ++p_src; + ++p_src_grad; + } + } + } + } + + // ----------------------------------------------------------------------------------- + + void threshold ( + tensor& data, + float thresh + ) + { + const auto d = data.host(); + for (size_t i = 0; i < data.size(); ++i) + d[i] = d[i]>thresh ? 1:0; + } + + void dot ( + const tensor& a, + const tensor& b, + tensor& result, + size_t idx + ) + { + DLIB_CASSERT(a.size() == b.size()); + DLIB_CASSERT(idx < result.size()); + + const auto aa = a.host(); + const auto bb = b.host(); + auto r = result.host(); + for (size_t i = 0; i < a.size(); ++i) + r[idx] += aa[i]*bb[i]; + } + + // ----------------------------------------------------------------------------------- + // ----------------------------------------------------------------------------------- + // ----------------------------------------------------------------------------------- + + namespace ttimpl + { + void softmax ( + const long num_locations, + const long num_channels, + tensor& dest, + const tensor& src + ) + { + DLIB_ASSERT(num_channels*num_locations == src.nr()*src.nc()*src.k()); + DLIB_CASSERT(have_same_dimensions(dest,src)); + const auto d = dest.host(); + const auto s = src.host(); + + // Note that we subtract out the max values in each channel before applying + // exp() to avoid numeric overflow in the subsequent computations. Doing this + // doesn't change the resulting output, it just makes it more numerically + // stable. + for (long n = 0; n < src.num_samples(); ++n) + { + auto ss = s + num_locations*num_channels*n; + auto dd = d + num_locations*num_channels*n; + for (long i = 0; i < num_locations; ++i) + { + float max_val = -std::numeric_limits<float>::infinity(); + for (long k = 0; k < num_channels; ++k) + max_val = std::max(max_val, ss[k*num_locations]); + + for (long k = 0; k < num_channels; ++k) + dd[k*num_locations] = std::exp(ss[k*num_locations]-max_val); + + ++ss; + ++dd; + } + } + + // Now normalize each channel so they sum to 1. 
+ for (long n = 0; n < src.num_samples(); ++n) + { + const auto dd = d + num_locations*num_channels*n; + for (long i = 0; i < num_locations; ++i) + { + const auto ddd = dd+i; + + float temp = 0; + for (long k = 0; k < num_channels; ++k) + temp += ddd[k*num_locations]; + for (long k = 0; k < num_channels; ++k) + ddd[k*num_locations] /= temp; + } + } + } + + void softmax_gradient ( + const long num_locations, + const long num_channels, + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + DLIB_ASSERT(num_channels*num_locations == grad.nr()*grad.nc()*grad.k()); + DLIB_CASSERT(have_same_dimensions(grad,dest)); + DLIB_CASSERT(have_same_dimensions(grad,gradient_input)); + const auto d = dest.host(); + const auto g = grad.host(); + const auto in = gradient_input.host(); + + + for (long n = 0; n < grad.num_samples(); ++n) + { + const auto d2 = d + num_locations*num_channels*n; + const auto g2 = g + num_locations*num_channels*n; + const auto in2 = in + num_locations*num_channels*n; + for (long i = 0; i < num_locations; ++i) + { + const auto d3 = d2+i; + const auto g3 = g2+i; + const auto in3 = in2+i; + + float temp = 0; + for (long k = 0; k < num_channels; ++k) + temp += -d3[k*num_locations]*in3[k*num_locations]; + if (is_same_object(gradient_input, grad)) + { + for (long k = 0; k < num_channels; ++k) + g3[k*num_locations] = d3[k*num_locations]*(temp+in3[k*num_locations]); + } + else + { + for (long k = 0; k < num_channels; ++k) + g3[k*num_locations] += d3[k*num_locations]*(temp+in3[k*num_locations]); + } + } + } + } + } + + // ---------------------------------------------------------------------------------------- + + void softmax ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + ttimpl::softmax(src.nr()*src.nc(), src.k(), dest, src); + } + + void softmax_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + DLIB_CASSERT(have_same_dimensions(grad,dest)); + DLIB_CASSERT(have_same_dimensions(grad,gradient_input)); + ttimpl::softmax_gradient(grad.nr()*grad.nc(), grad.k(), grad, dest, gradient_input); + } + + // ------------------------------------------------------------------------------------ + + void softmax_all ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + ttimpl::softmax(1, src.nr()*src.nc()*src.k(), dest, src); + } + + void softmax_all_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + DLIB_CASSERT(have_same_dimensions(grad,dest)); + DLIB_CASSERT(have_same_dimensions(grad,gradient_input)); + ttimpl::softmax_gradient(1, grad.nr()*grad.nc()*grad.k(), grad, dest, gradient_input); + } + + // ------------------------------------------------------------------------------------ + + void sigmoid ( + tensor& dest, + const tensor& src + ) + { + const auto d = dest.host(); + const auto s = src.host(); + for (size_t i = 0; i < src.size(); ++i) + d[i] = 1/(1+std::exp(-s[i])); + } + + void sigmoid_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + const auto g = grad.host(); + const auto d = dest.host(); + const auto in = gradient_input.host(); + if (is_same_object(gradient_input, grad)) + { + for (size_t i = 0; i < dest.size(); ++i) + g[i] = in[i]*d[i]*(1-d[i]); + } + else + { + for (size_t i = 0; i < dest.size(); ++i) + g[i] += in[i]*d[i]*(1-d[i]); + } + } + + // ------------------------------------------------------------------------------------ + + void relu ( + tensor& 
dest, + const tensor& src + ) + { + dest = lowerbound(mat(src), 0); + } + + void relu_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + const float* gi = gradient_input.host(); + const float* in = dest.host(); + float* out = grad.host(); + if (is_same_object(grad, gradient_input)) + { + for (size_t i = 0; i < dest.size(); ++i) + { + if (in[i] > 0) + out[i] = gi[i]; + else + out[i] = 0; + } + } + else + { + for (size_t i = 0; i < dest.size(); ++i) + { + if (in[i] > 0) + out[i] += gi[i]; + } + } + } + + // ---------------------------------------------------------------------------------------- + + void prelu ( + tensor& dest, + const tensor& src, + const tensor& param + ) + { + const float p = param.host()[0]; + const float* s = src.host(); + float* d = dest.host(); + for (size_t i = 0; i < dest.size(); ++i) + { + if (s[i] > 0) + d[i] = s[i]; + else + d[i] = p*s[i]; + } + } + + void prelu_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input, + const tensor& param, + tensor& params_grad + ) + { + DLIB_CASSERT(is_same_object(grad, gradient_input) == false); + const float p = param.host()[0]; + const float* gi = gradient_input.host(); + const float* s = src.host(); + float* out = grad.host(); + float pgrad = 0; + for (size_t i = 0; i < src.size(); ++i) + { + if (s[i] > 0) + { + out[i] += gi[i]; + } + else + { + out[i] += p*gi[i]; + pgrad += gi[i]*s[i]; + } + } + params_grad.host()[0] = pgrad; + } + + // ------------------------------------------------------------------------------------ + + void tanh ( + tensor& dest, + const tensor& src + ) + { + const auto d = dest.host(); + const auto s = src.host(); + for (size_t i = 0; i < src.size(); ++i) + d[i] = std::tanh(s[i]); + } + + void tanh_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + const auto g = grad.host(); + const auto d = dest.host(); + const auto in = gradient_input.host(); + if (is_same_object(grad, gradient_input)) + { + for (size_t i = 0; i < dest.size(); ++i) + g[i] = in[i]*(1-d[i]*d[i]); + } + else + { + for (size_t i = 0; i < dest.size(); ++i) + g[i] += in[i]*(1-d[i]*d[i]); + } + } + + // ---------------------------------------------------------------------------------------- + + void resize_bilinear ( + tensor& dest, + long dest_row_stride, + long dest_channel_stride, + const tensor& src, + long src_row_stride, + long src_channel_stride + ) + { + DLIB_CASSERT(is_same_object(dest, src)==false); + DLIB_CASSERT(dest.num_samples() == src.num_samples()); + DLIB_CASSERT(dest.k() == src.k()); + + if (dest.size() == 0 || src.size() == 0) + return; + + const float* s = src.host(); + float* d = dest.host(); + + parallel_for(0, dest.k()*dest.num_samples(), [&](long i) + { + auto simg = sub_image(s+i*src_channel_stride, src.nr(), src.nc(), src_row_stride); + auto dimg = sub_image(d+i*dest_channel_stride, dest.nr(), dest.nc(), dest_row_stride); + + resize_image(simg, dimg); + }); + } + + void resize_bilinear_gradient ( + tensor& grad, + long grad_row_stride, + long grad_channel_stride, + const tensor& gradient_input, + long gradient_input_row_stride, + long gradient_input_channel_stride + ) + { + DLIB_CASSERT(is_same_object(grad, gradient_input)==false); + DLIB_CASSERT(gradient_input.num_samples() == grad.num_samples()); + DLIB_CASSERT(gradient_input.k() == grad.k()); + + if (gradient_input.size() == 0 || grad.size() == 0) + return; + + const float* gi = gradient_input.host(); + float* g = grad.host(); + const float x_scale = 
(grad.nc()-1)/(float)std::max<long>((gradient_input.nc()-1),1); + const float y_scale = (grad.nr()-1)/(float)std::max<long>((gradient_input.nr()-1),1); + for (long long samp = 0; samp < gradient_input.num_samples(); ++samp) + { + for (long long k = 0; k < gradient_input.k(); ++k) + { + for (long long r = 0; r < gradient_input.nr(); ++r) + { + const float y = r*y_scale; + const long long top = static_cast<long long>(std::floor(y)); + const long long bottom = std::min(top+1, grad.nr()-1); + const float tb_frac = y - top; + for (long long c = 0; c < gradient_input.nc(); ++c) + { + const float x = c*x_scale; + const long long left = static_cast<long long>(std::floor(x)); + const long long right = std::min(left+1, grad.nc()-1); + const float lr_frac = x - left; + + const float tmp = gi[r*gradient_input_row_stride+c]; + + g[top*grad_row_stride+left] += tmp*(1-tb_frac)*(1-lr_frac); + g[top*grad_row_stride+right] += tmp*(1-tb_frac)*(lr_frac); + g[bottom*grad_row_stride+left] += tmp*(tb_frac)*(1-lr_frac); + g[bottom*grad_row_stride+right] += tmp*(tb_frac)*(lr_frac); + } + } + + g += grad_channel_stride; + gi += gradient_input_channel_stride; + } + } + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + pooling::pooling ( + ) : window_height(0),window_width(0),stride_y(0),stride_x(0),padding_y(0),padding_x(0),do_max_pooling(true) + { + } + + void pooling:: + clear( + ) + { + window_height = 0; + window_width = 0; + stride_y = 0; + stride_x = 0; + padding_y = 0; + padding_x = 0; + } + + void pooling:: + setup_max_pooling( + int window_height_, + int window_width_, + int stride_y_, + int stride_x_, + int padding_y_, + int padding_x_ + ) + { + DLIB_CASSERT(window_width_ > 0); + DLIB_CASSERT(window_height_ > 0); + DLIB_CASSERT(stride_y_ > 0); + DLIB_CASSERT(stride_x_ > 0); + DLIB_CASSERT(0 <= padding_y_ && padding_y_ < window_height_); + DLIB_CASSERT(0 <= padding_x_ && padding_x_ < window_width_); + + window_height = window_height_; + window_width = window_width_; + stride_y = stride_y_; + stride_x = stride_x_; + padding_y = padding_y_; + padding_x = padding_x_; + do_max_pooling = true; + } + + void pooling:: + setup_avg_pooling( + int window_height_, + int window_width_, + int stride_y_, + int stride_x_, + int padding_y_, + int padding_x_ + ) + { + DLIB_CASSERT(window_width_ > 0); + DLIB_CASSERT(window_height_ > 0); + DLIB_CASSERT(stride_y_ > 0); + DLIB_CASSERT(stride_x_ > 0); + DLIB_CASSERT(0 <= padding_y_ && padding_y_ < window_height_); + DLIB_CASSERT(0 <= padding_x_ && padding_x_ < window_width_); + + window_height = window_height_; + window_width = window_width_; + stride_y = stride_y_; + stride_x = stride_x_; + padding_y = padding_y_; + padding_x = padding_x_; + do_max_pooling = false; + } + + void pooling:: + operator() ( + resizable_tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(window_width > 0); + DLIB_CASSERT(window_height > 0); + DLIB_CASSERT(stride_y > 0); + DLIB_CASSERT(stride_x > 0); + DLIB_CASSERT(0 <= padding_y && padding_y < window_height); + DLIB_CASSERT(0 <= padding_x && padding_x < window_width); + DLIB_CASSERT(window_width <= src.nc() + 2*padding_x, + "Pooling windows must be small enough to fit into the padded image."); + DLIB_CASSERT(window_height <= src.nr() + 2*padding_y, + "Pooling windows must be small enough to fit into the padded image."); + 
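+        // There is one output element per window placement, hence:
+        //   out_nr = 1 + (src.nr() + 2*padding_y - window_height)/stride_y
+        //   out_nc = 1 + (src.nc() + 2*padding_x - window_width)/stride_x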
+ dest.set_size( + src.num_samples(), + src.k(), + 1+(src.nr()+2*padding_y-window_height)/stride_y, + 1+(src.nc()+2*padding_x-window_width)/stride_x + ); + + if (src.size() == 0) + { + dest = 0; + return; + } + + + auto d = dest.host(); + const long x_offset = window_width/2 - padding_x; + const long y_offset = window_height/2 - padding_y; + if (does_max_pooling()) + { + for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + auto simg = image_plane(src,n,k); + auto dimg = d + (n*dest.k() + k)*dest.nr()*dest.nc(); + + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + auto win = centered_rect(c*stride_x+x_offset, + r*stride_y+y_offset, + window_width, + window_height); + dimg[r*dest.nc() + c] = max(subm_clipped(simg,win)); + } + } + } + } + } + else + { + for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + auto simg = image_plane(src,n,k); + auto dimg = d + (n*dest.k() + k)*dest.nr()*dest.nc(); + + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + auto win = centered_rect(c*stride_x+x_offset, + r*stride_y+y_offset, + window_width, + window_height); + dimg[r*dest.nc() + c] = mean(subm_clipped(simg,win)); + } + } + } + } + } + + } + + void pooling::get_gradient( + const tensor& gradient_input, + const tensor& dest, + const tensor& src, + tensor& grad + ) + { + DLIB_CASSERT(have_same_dimensions(gradient_input,dest)); + DLIB_CASSERT(have_same_dimensions(src,grad)); + + + if (src.size() == 0) + { + return; + } + + + auto gi = gradient_input.host(); + auto g = grad.host(); + const long x_offset = window_width/2 - padding_x; + const long y_offset = window_height/2 - padding_y; + if (does_max_pooling()) + { + for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + auto simg = image_plane(src,n,k); + auto gimg = g + (n*grad.k() + k)*grad.nr()*grad.nc(); + auto giimg = gi + (n*dest.k() + k)*dest.nr()*dest.nc(); + auto imgbox = get_rect(simg); + + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + auto win = centered_rect(c*stride_x+x_offset, + r*stride_y+y_offset, + window_width, + window_height).intersect(imgbox); + auto p = max_point(subm(simg,win))+win.tl_corner(); + gimg[p.y()*grad.nc()+p.x()] += giimg[r*dest.nc()+c]; + } + } + } + } + } + else + { + for (long n = 0; n < dest.num_samples(); ++n) + { + for (long k = 0; k < dest.k(); ++k) + { + auto simg = image_plane(src,n,k); + auto gimg = g + (n*grad.k() + k)*grad.nr()*grad.nc(); + auto giimg = gi + (n*dest.k() + k)*dest.nr()*dest.nc(); + auto imgbox = get_rect(simg); + + for (long r = 0; r < dest.nr(); ++r) + { + for (long c = 0; c < dest.nc(); ++c) + { + auto win = centered_rect(c*stride_x+x_offset, + r*stride_y+y_offset, + window_width, + window_height).intersect(imgbox); + const float delta = giimg[r*dest.nc()+c]/win.area(); + for (long y = win.top(); y <= win.bottom(); ++y) + { + for (long x = win.left(); x <= win.right(); ++x) + { + gimg[y*grad.nc()+x] += delta; + } + } + } + } + } + } + } + + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + void img2col( + matrix<float>& output, + const tensor& data, + long n, + long filter_nr, + long filter_nc, + long stride_y, + long stride_x, + long 
padding_y, + long padding_x + ) + { + const auto d = data.host() + data.k()*data.nr()*data.nc()*n; + const rectangle boundary = get_rect(data); + + const long out_nr = 1+(data.nr()+2*padding_y-filter_nr)/stride_y; + const long out_nc = 1+(data.nc()+2*padding_x-filter_nc)/stride_x; + + output.set_size(out_nr*out_nc, + data.k()*filter_nr*filter_nc); + DLIB_CASSERT(output.size() != 0); + float* t = &output(0,0); + + // now fill in the Toeplitz output matrix for the n-th sample in data. + size_t cnt = 0; + const long max_r = data.nr() + padding_y-(filter_nr-1); + const long max_c = data.nc() + padding_x-(filter_nc-1); + for (long r = -padding_y; r < max_r; r+=stride_y) + { + for (long c = -padding_x; c < max_c; c+=stride_x) + { + for (long k = 0; k < data.k(); ++k) + { + for (long y = 0; y < filter_nr; ++y) + { + for (long x = 0; x < filter_nc; ++x) + { + DLIB_ASSERT(cnt < output.size()); + long xx = c+x; + long yy = r+y; + if (boundary.contains(xx,yy)) + *t = d[(k*data.nr() + yy)*data.nc() + xx]; + else + *t = 0; + ++t; + ++cnt; + } + } + } + } + } + } + + void col2img( + const matrix<float>& output, + tensor& data, + long n, + long filter_nr, + long filter_nc, + long stride_y, + long stride_x, + long padding_y, + long padding_x + ) + { + const auto d = data.host() + data.k()*data.nr()*data.nc()*n; + const rectangle boundary = get_rect(data); + + DLIB_CASSERT(output.size() != 0); + const float* t = &output(0,0); + + // now fill in the Toeplitz output matrix for the n-th sample in data. + const long max_r = data.nr() + padding_y-(filter_nr-1); + const long max_c = data.nc() + padding_x-(filter_nc-1); + for (long r = -padding_y; r < max_r; r+=stride_y) + { + for (long c = -padding_x; c < max_c; c+=stride_x) + { + for (long k = 0; k < data.k(); ++k) + { + for (long y = 0; y < filter_nr; ++y) + { + for (long x = 0; x < filter_nc; ++x) + { + long xx = c+x; + long yy = r+y; + if (boundary.contains(xx,yy)) + d[(k*data.nr() + yy)*data.nc() + xx] += *t; + ++t; + } + } + } + } + } + } + + void tensor_conv::operator() ( + const bool add_to_output, + resizable_tensor& output, + const tensor& data, + const tensor& filters + ) + { + DLIB_CASSERT(last_stride_y > 0 && last_stride_x > 0, "You must call setup() before calling this function."); + output.set_size(data.num_samples(), + filters.num_samples(), + 1+(data.nr()+2*last_padding_y-filters.nr())/last_stride_y, + 1+(data.nc()+2*last_padding_x-filters.nc())/last_stride_x); + (*this)(add_to_output, static_cast<tensor&>(output),data,filters); + } + + void tensor_conv::operator() ( + const bool add_to_output, + tensor& output, + const tensor& data, + const tensor& filters + ) + { + DLIB_CASSERT(is_same_object(output,data) == false); + DLIB_CASSERT(is_same_object(output,filters) == false); + DLIB_CASSERT(filters.k() == data.k()); + DLIB_CASSERT(last_stride_y > 0 && last_stride_x > 0, "You must call setup() before calling this function."); + DLIB_CASSERT(filters.nr() <= data.nr() + 2*last_padding_y, + "Filter windows must be small enough to fit into the padded image."); + DLIB_CASSERT(filters.nc() <= data.nc() + 2*last_padding_x, + "Filter windows must be small enough to fit into the padded image."); + + DLIB_CASSERT(output.num_samples() == data.num_samples()); + DLIB_CASSERT(output.k() == filters.num_samples()); + DLIB_CASSERT(output.nr() == 1+(data.nr()+2*last_padding_y-filters.nr())/last_stride_y); + DLIB_CASSERT(output.nc() == 1+(data.nc()+2*last_padding_x-filters.nc())/last_stride_x); + + + matrix<float> temp; + for (long n = 0; n < data.num_samples(); ++n) 
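// one im2col per sample followed by a single GEMM: mat(filters)*trans(temp)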
+ { + img2col(temp, data, n, filters.nr(), filters.nc(), last_stride_y, last_stride_x, last_padding_y, last_padding_x); + + if (add_to_output) + output.add_to_sample(n, mat(filters)*trans(temp)); + else + output.set_sample(n, mat(filters)*trans(temp)); + } + } + + // ------------------------------------------------------------------------------------ + + void tensor_conv:: + get_gradient_for_data ( + const bool add_to_output, + const tensor& gradient_input, + const tensor& filters, + tensor& data_gradient + ) + { + matrix<float> temp; + if (!add_to_output) + data_gradient = 0; + for (long n = 0; n < gradient_input.num_samples(); ++n) + { + auto gi = mat(gradient_input.host()+gradient_input.k()*gradient_input.nr()*gradient_input.nc()*n, + gradient_input.k(), + gradient_input.nr()*gradient_input.nc()); + + + temp = trans(gi)*mat(filters); + col2img(temp, data_gradient, n, filters.nr(), filters.nc(), last_stride_y, last_stride_x, last_padding_y, last_padding_x); + } + } + + // ------------------------------------------------------------------------------------ + + void tensor_conv:: + get_gradient_for_filters ( + const bool add_to_output, + const tensor& gradient_input, + const tensor& data, + tensor& filters_gradient + ) + { + matrix<float> temp; + for (long n = 0; n < gradient_input.num_samples(); ++n) + { + auto gi = mat(gradient_input.host()+gradient_input.k()*gradient_input.nr()*gradient_input.nc()*n, + gradient_input.k(), + gradient_input.nr()*gradient_input.nc()); + + + img2col(temp, data, n, filters_gradient.nr(), filters_gradient.nc(), last_stride_y, last_stride_x, last_padding_y, last_padding_x); + if (n == 0) + { + if (add_to_output) + filters_gradient += gi*temp; + else + filters_gradient = gi*temp; + } + else + { + filters_gradient += gi*temp; + } + } + } + + // ------------------------------------------------------------------------------------ + + void copy_tensor( + bool add_to, + tensor& dest, + size_t dest_k_offset, + const tensor& src, + size_t src_k_offset, + size_t count_k + ) + { + const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k()); + const size_t src_sample_size = static_cast<size_t>(src.nc() * src.nr() * src.k()); + + const size_t block_size = count_k * dest.nc() * dest.nr(); + + DLIB_CASSERT(dest.num_samples() == src.num_samples() && + dest.nc() == src.nc() && dest.nr() == src.nr(), "All sources should fit into dest tensor size"); + DLIB_CASSERT(dest.k() - dest_k_offset >= count_k, "Not enough space in dest tensor"); + DLIB_CASSERT(src.k() - src_k_offset >= count_k, "Not enough space in src tensor"); + + float* dest_p = dest.host() + dest_k_offset * dest.nc() * dest.nr(); + const float* src_p = src.host() + src_k_offset * src.nc() * src.nr(); + + for (long i = 0; i < src.num_samples(); ++i) + { + if (add_to) + { + for (size_t j = 0; j < block_size; ++j) + dest_p[j] += src_p[j]; + } + else + { + ::memcpy(dest_p, src_p, block_size * sizeof(float)); + } + + dest_p += dest_sample_size; + src_p += src_sample_size; + } + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + } +} + + +#endif // DLIB_DNN_CPU_cPP_ + + diff --git a/ml/dlib/dlib/dnn/cpu_dlib.h b/ml/dlib/dlib/dnn/cpu_dlib.h new file mode 100644 index 000000000..330df01a2 --- /dev/null +++ b/ml/dlib/dlib/dnn/cpu_dlib.h @@ -0,0 +1,505 @@ +// Copyright (C) 2015 
Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_CPU_H_ +#define DLIB_DNN_CPU_H_ + +// This file contains CPU implementations of the GPU based functions in cuda_dlib.h +// and cudnn_dlibapi.h + +#include "tensor.h" +#include "../geometry/rectangle.h" + +namespace dlib +{ + namespace cpu + { + + // ----------------------------------------------------------------------------------- + + void multiply ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + + void multiply_conv ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + + void multiply_zero_padded ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + + void scale_channels ( + bool add_to, + tensor& dest, + const tensor& src, + const tensor& scales + ); + + void add( + float beta, + tensor& dest, + float alpha, + const tensor& src + ); + + void assign_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ); + + void add ( + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + + void assign_conv_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ); + + // ----------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const float A, + const float B + ); + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B, + const float C + ); + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C, + const float D + ); + + void affine_transform_range( + size_t begin, + size_t end, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C + ); + + // ----------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ); + + // ----------------------------------------------------------------------------------- + + void affine_transform_conv( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ); + + // ----------------------------------------------------------------------------------- + + void affine_transform( + const rectangle& rect, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + float A, + float B, + float C + ); + + // ----------------------------------------------------------------------------------- + + void compute_adam_update ( + size_t begin, + size_t end, + tensor& s, + tensor& m, + tensor& v, + const float t, + const float learning_rate, + const float weight_decay, + const float momentum1, + const float momentum2, + const tensor& params, + const tensor& params_grad + ); + + // ----------------------------------------------------------------------------------- + + void batch_normalize_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ); + + void batch_normalize ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& invstds, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + 
const tensor& beta + ); + + void batch_normalize_gradient ( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ); + + void batch_normalize_conv_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ); + + void batch_normalize_conv ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& invstds, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ); + + void batch_normalize_conv_gradient ( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ); + + // ----------------------------------------------------------------------------------- + + void threshold ( + tensor& data, + float thresh + ); + + void dot ( + const tensor& a, + const tensor& b, + tensor& result, + size_t idx + ); + + // ----------------------------------------------------------------------------------- + + void softmax ( + tensor& dest, + const tensor& src + ); + + void softmax_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + + // ------------------------------------------------------------------------------------ + + void softmax_all ( + tensor& dest, + const tensor& src + ); + + void softmax_all_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + + // ------------------------------------------------------------------------------------ + + void sigmoid ( + tensor& dest, + const tensor& src + ); + + void sigmoid_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + + // ------------------------------------------------------------------------------------ + + void relu ( + tensor& dest, + const tensor& src + ); + + void relu_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + + // ---------------------------------------------------------------------------------------- + + void prelu ( + tensor& dest, + const tensor& src, + const tensor& param + ); + + void prelu_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input, + const tensor& param, + tensor& params_grad + ); + + // ------------------------------------------------------------------------------------ + + void tanh ( + tensor& dest, + const tensor& src + ); + + void tanh_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + + // ---------------------------------------------------------------------------------------- + + void resize_bilinear ( + tensor& dest, + long dest_row_stride, + long dest_channel_stride, + const tensor& src, + long src_row_stride, + long src_channel_stride + ); + + void resize_bilinear_gradient ( + tensor& grad, + long grad_row_stride, + long grad_channel_stride, + const tensor& gradient_input, + long gradient_input_row_stride, + long gradient_input_channel_stride + ); + + inline void resize_bilinear ( + tensor& dest, + const tensor& src + ) { resize_bilinear(dest, dest.nc(), dest.nr()*dest.nc(), src, src.nc(), src.nr()*src.nc()); } + + inline void 
resize_bilinear_gradient ( + tensor& grad, + const tensor& gradient_input + ) { resize_bilinear_gradient(grad, grad.nc(), grad.nr()*grad.nc(), gradient_input, gradient_input.nc(), gradient_input.nr()*gradient_input.nc()); } + + // ----------------------------------------------------------------------------------- + + class pooling + { + public: + + pooling(const pooling&) = delete; + pooling& operator=(const pooling&) = delete; + + pooling ( + ); + + void clear( + ); + + void setup_max_pooling( + int window_height, + int window_width, + int stride_y, + int stride_x, + int padding_y, + int padding_x + ); + + void setup_avg_pooling( + int window_height, + int window_width, + int stride_y, + int stride_x, + int padding_y, + int padding_x + ); + + bool does_max_pooling( + ) const { return do_max_pooling; } + + void operator() ( + resizable_tensor& dest, + const tensor& src + ); + + void get_gradient( + const tensor& gradient_input, + const tensor& dest, + const tensor& src, + tensor& grad + ); + + private: + int window_height; + int window_width; + int stride_y; + int stride_x; + int padding_y; + int padding_x; + bool do_max_pooling; + + }; + + // ----------------------------------------------------------------------------------- + + class tensor_conv + { + public: + tensor_conv(const tensor_conv&) = delete; + tensor_conv& operator=(const tensor_conv&) = delete; + + tensor_conv() {} + + void clear( + ) {} + + void setup( + const tensor& data, /* not used but required for interface */ + const tensor& filters, /* not used but required for interface */ + int stride_y, + int stride_x, + int padding_y, + int padding_x + ) + { + (void)data; /* silence compiler */ + DLIB_CASSERT(stride_y > 0 && stride_x > 0); + DLIB_CASSERT(0 <= padding_y && padding_y < filters.nr()); + DLIB_CASSERT(0 <= padding_x && padding_x < filters.nc()); + last_stride_y = stride_y; + last_stride_x = stride_x; + last_padding_y = padding_y; + last_padding_x = padding_x; + } + + void operator() ( + const bool add_to_output, + resizable_tensor& output, + const tensor& data, + const tensor& filters + ); + + void operator() ( + const bool add_to_output, + tensor& output, + const tensor& data, + const tensor& filters + ); + + void get_gradient_for_data ( + const bool add_to_output, + const tensor& gradient_input, + const tensor& filters, + tensor& data_gradient + ); + + void get_gradient_for_filters ( + const bool add_to_output, + const tensor& gradient_input, + const tensor& data, + tensor& filters_gradient + ); + + private: + + long last_stride_y = 0; + long last_stride_x = 0; + long last_padding_y = 0; + long last_padding_x = 0; + }; + + // ----------------------------------------------------------------------------------- + + void copy_tensor( + bool add_to, + tensor& dest, + size_t dest_k_offset, + const tensor& src, + size_t src_k_offset, + size_t count_k + ); + + // ----------------------------------------------------------------------------------- + + } +} + +#ifdef NO_MAKEFILE +#include "cpu_dlib.cpp" +#endif + +#endif // DLIB_DNN_CPU_H_ + + diff --git a/ml/dlib/dlib/dnn/cublas_dlibapi.cpp b/ml/dlib/dlib/dnn/cublas_dlibapi.cpp new file mode 100644 index 000000000..376cc9f00 --- /dev/null +++ b/ml/dlib/dlib/dnn/cublas_dlibapi.cpp @@ -0,0 +1,165 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
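+// This file provides the cuBLAS-backed implementation of the gemm() routine
+// declared in cublas_dlibapi.h.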
+#ifndef DLIB_DNN_CuBLAS_CPP_ +#define DLIB_DNN_CuBLAS_CPP_ + +#ifdef DLIB_USE_CUDA + +#include "cublas_dlibapi.h" +#include "cuda_utils.h" + +#include <cublas_v2.h> +#include <vector> + +static const char* cublas_get_error_string(cublasStatus_t s) +{ + switch(s) + { + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUDA Runtime API initialization failed."; + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUDA Resources could not be allocated."; + default: + return "A call to cuBLAS failed"; + } +} + +// Check the return value of a call to the cuBLAS runtime for an error condition. +#define CHECK_CUBLAS(call) \ +do{ \ + const cublasStatus_t error = call; \ + if (error != CUBLAS_STATUS_SUCCESS) \ + { \ + std::ostringstream sout; \ + sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\ + sout << "code: " << error << ", reason: " << cublas_get_error_string(error);\ + throw dlib::cublas_error(sout.str()); \ + } \ +}while(false) + +namespace dlib +{ + namespace cuda + { + + // ----------------------------------------------------------------------------------- + + class cublas_context + { + public: + // not copyable + cublas_context(const cublas_context&) = delete; + cublas_context& operator=(const cublas_context&) = delete; + + cublas_context() + { + handles.resize(16); + } + ~cublas_context() + { + for (auto h : handles) + { + if (h) + cublasDestroy(h); + } + } + + cublasHandle_t get_handle ( + ) + { + int new_device_id; + CHECK_CUDA(cudaGetDevice(&new_device_id)); + // make room for more devices if needed + if (new_device_id >= (long)handles.size()) + handles.resize(new_device_id+16); + + // If we don't have a handle already for this device then make one + if (!handles[new_device_id]) + CHECK_CUBLAS(cublasCreate(&handles[new_device_id])); + + // Finally, return the handle for the current device + return handles[new_device_id]; + } + + private: + + std::vector<cublasHandle_t> handles; + }; + + static cublasHandle_t context() + { + thread_local cublas_context c; + return c.get_handle(); + } + + // ----------------------------------------------------------------------------------- + + void gemm ( + float beta, + tensor& dest, + float alpha, + const tensor& lhs, + bool trans_lhs, + const tensor& rhs, + bool trans_rhs + ) + { + // Recall that BLAS uses column major order so to deal with that we flip the + // order of the lhs and rhs arguments. + const auto transa = trans_lhs ? CUBLAS_OP_T : CUBLAS_OP_N; + const auto transb = trans_rhs ? CUBLAS_OP_T : CUBLAS_OP_N; + + const int dest_nr = dest.num_samples(); + const int dest_nc = dest.size()/dest_nr; + const int lhs_nr = lhs.num_samples(); + const int lhs_nc = lhs.size()/lhs_nr; + const int rhs_nr = rhs.num_samples(); + const int rhs_nc = rhs.size()/rhs_nr; + if (trans_lhs && trans_rhs) + { + DLIB_ASSERT( dest_nr == lhs_nc && + dest_nc == rhs_nr && + lhs_nr == rhs_nc) + } + else if (!trans_lhs && trans_rhs) + { + DLIB_ASSERT( dest_nr == lhs_nr && + dest_nc == rhs_nr && + lhs_nc == rhs_nc) + } + else if (trans_lhs && !trans_rhs) + { + DLIB_ASSERT( dest_nr == lhs_nc && + dest_nc == rhs_nc && + lhs_nr == rhs_nr) + } + else + { + DLIB_ASSERT( dest_nr == lhs_nr && + dest_nc == rhs_nc && + lhs_nc == rhs_nr) + } + + const int k = trans_rhs ? 
rhs_nc : rhs_nr; + CHECK_CUBLAS(cublasSgemm(context(), + transb, + transa, + dest_nc, dest_nr, k, + &alpha, + rhs.device(), rhs_nc, + lhs.device(), lhs_nc, + &beta, + dest.device(),dest_nc)); + } + + // ------------------------------------------------------------------------------------ + + } +} + +#endif // DLIB_USE_CUDA + +#endif // DLIB_DNN_CuBLAS_CPP_ + + + diff --git a/ml/dlib/dlib/dnn/cublas_dlibapi.h b/ml/dlib/dlib/dnn/cublas_dlibapi.h new file mode 100644 index 000000000..b46fd25ca --- /dev/null +++ b/ml/dlib/dlib/dnn/cublas_dlibapi.h @@ -0,0 +1,50 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_CuBLAS_H_ +#define DLIB_DNN_CuBLAS_H_ + +#ifdef DLIB_USE_CUDA + +#include "tensor.h" +#include "cuda_errors.h" + +namespace dlib +{ + namespace cuda + { + + // ----------------------------------------------------------------------------------- + + void gemm ( + float beta, + tensor& dest, + float alpha, + const tensor& lhs, + bool trans_lhs, + const tensor& rhs, + bool trans_rhs + ); + /*! + requires + - The dimensions of lhs and rhs must be compatible for matrix + multiplication. In particular: + - Let L == trans_lhs ? trans(mat(lhs)) : mat(lhs) + - Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs) + - Let D == mat(dest) + - D.nr() == L.nr() && D.nc() == R.nc() + (i.e. dest must be preallocated and have the correct output dimensions) + - L.nc() == R.nr() + ensures + - performs: dest = alpha*L*R + beta*mat(dest) + !*/ + + // ------------------------------------------------------------------------------------ + + } +} + +#endif // DLIB_USE_CUDA + +#endif // DLIB_DNN_CuBLAS_H_ + + diff --git a/ml/dlib/dlib/dnn/cuda_data_ptr.cpp b/ml/dlib/dlib/dnn/cuda_data_ptr.cpp new file mode 100644 index 000000000..8abce0695 --- /dev/null +++ b/ml/dlib/dlib/dnn/cuda_data_ptr.cpp @@ -0,0 +1,71 @@ +// Copyright (C) 2017 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_CuDA_DATA_PTR_CPP_ +#define DLIB_DNN_CuDA_DATA_PTR_CPP_ + +#ifdef DLIB_USE_CUDA + +#include "cuda_data_ptr.h" +#include "cuda_utils.h" + +namespace dlib +{ + namespace cuda + { + + // ----------------------------------------------------------------------------------- + + cuda_data_void_ptr:: + cuda_data_void_ptr( + size_t n + ) : num(n) + { + if (n == 0) + return; + + void* data = nullptr; + + CHECK_CUDA(cudaMalloc(&data, n)); + pdata.reset(data, [](void* ptr){ + auto err = cudaFree(ptr); + if(err!=cudaSuccess) + std::cerr << "cudaFree() failed. 
Reason: " << cudaGetErrorString(err) << std::endl;
+            });
+        }
+
+        // ------------------------------------------------------------------------------------
+
+        void memcpy(
+            void* dest,
+            const cuda_data_void_ptr& src
+        )
+        {
+            if (src.size() != 0)
+            {
+                CHECK_CUDA(cudaMemcpy(dest, src.data(), src.size(), cudaMemcpyDefault));
+            }
+        }
+
+        // ------------------------------------------------------------------------------------
+
+        void memcpy(
+            cuda_data_void_ptr& dest,
+            const void* src
+        )
+        {
+            if (dest.size() != 0)
+            {
+                CHECK_CUDA(cudaMemcpy(dest.data(), src, dest.size(), cudaMemcpyDefault));
+            }
+        }
+
+        // ------------------------------------------------------------------------------------
+
+    }
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuDA_DATA_PTR_CPP_
+
+
diff --git a/ml/dlib/dlib/dnn/cuda_data_ptr.h b/ml/dlib/dlib/dnn/cuda_data_ptr.h
new file mode 100644
index 000000000..7eca608a0
--- /dev/null
+++ b/ml/dlib/dlib/dnn/cuda_data_ptr.h
@@ -0,0 +1,184 @@
+// Copyright (C) 2017 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuDA_DATA_PTR_H_
+#define DLIB_DNN_CuDA_DATA_PTR_H_
+
+#ifdef DLIB_USE_CUDA
+
+#include <memory>
+#include <vector>
+
+namespace dlib
+{
+    namespace cuda
+    {
+
+    // ------------------------------------------------------------------------------------
+
+        class cuda_data_void_ptr
+        {
+            /*!
+                WHAT THIS OBJECT REPRESENTS
+                    This is a block of memory on a CUDA device.
+            !*/
+        public:
+
+            cuda_data_void_ptr() = default;
+
+            cuda_data_void_ptr(size_t n);
+            /*!
+                ensures
+                    - This object will allocate a device memory buffer of n bytes.
+                    - #size() == n
+            !*/
+
+            void* data() { return pdata.get(); }
+            const void* data() const { return pdata.get(); }
+            operator void*() { return pdata.get(); }
+            operator const void*() const { return pdata.get(); }
+
+            void reset() { pdata.reset(); }
+
+            size_t size() const { return num; }
+            /*!
+                ensures
+                    - returns the length of this buffer, in bytes.
+            !*/
+
+        private:
+
+            size_t num = 0;
+            std::shared_ptr<void> pdata;
+        };
+
+    // ------------------------------------------------------------------------------------
+
+        void memcpy(
+            void* dest,
+            const cuda_data_void_ptr& src
+        );
+        /*!
+            requires
+                - dest == a pointer to at least src.size() bytes on the host machine.
+            ensures
+                - copies the GPU data from src into dest.
+        !*/
+
+    // ------------------------------------------------------------------------------------
+
+        void memcpy(
+            cuda_data_void_ptr& dest,
+            const void* src
+        );
+        /*!
+            requires
+                - src == a pointer to at least dest.size() bytes on the host machine.
+            ensures
+                - copies the host data from src to the GPU memory buffer dest.
+        !*/
+
+    // ------------------------------------------------------------------------------------
+    // ------------------------------------------------------------------------------------
+    // ------------------------------------------------------------------------------------
+
+        template <typename T>
+        class cuda_data_ptr
+        {
+            /*!
+                WHAT THIS OBJECT REPRESENTS
+                    This is a block of memory on a CUDA device. It is just a type safe
+                    version of cuda_data_void_ptr.
+            !*/
+
+        public:
+
+            static_assert(std::is_standard_layout<T>::value, "You can only create basic standard layout types on the GPU");
+
+            cuda_data_ptr() = default;
+            cuda_data_ptr(size_t n) : num(n)
+            /*!
+                ensures
+                    - This object will allocate a device memory buffer of n T objects.
+ - #size() == n + !*/ + { + if (n == 0) + return; + + pdata = cuda_data_void_ptr(n*sizeof(T)); + } + + T* data() { return (T*)pdata.data(); } + const T* data() const { return (T*)pdata.data(); } + + operator T*() { return (T*)pdata.data(); } + operator const T*() const { return (T*)pdata.data(); } + + void reset() { pdata.reset(); } + + size_t size() const { return num; } + + + friend void memcpy( + std::vector<T>& dest, + const cuda_data_ptr& src + ) + { + dest.resize(src.size()); + if (src.size() != 0) + memcpy(dest.data(), src.pdata); + } + + friend void memcpy( + cuda_data_ptr& src, + const std::vector<T>& dest + ) + { + if (dest.size() != src.size()) + dest = cuda_data_ptr<T>(src.size()); + + if (src.size() != 0) + memcpy(src.pdata, dest.data()); + } + + private: + + size_t num = 0; + cuda_data_void_ptr pdata; + }; + + // ------------------------------------------------------------------------------------ + + class resizable_cuda_buffer + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a block of memory on a CUDA device that will be automatically + resized if requested size is larger than allocated. + !*/ + public: + cuda_data_void_ptr get(size_t size) + /*! + ensures + - This object will return the buffer of requested size of larger + - buffer.size() >= size + !*/ + { + if (buffer.size() < size) + { + buffer.reset(); + buffer = cuda_data_void_ptr(size); + } + return buffer; + } + private: + cuda_data_void_ptr buffer; + }; + + } +} + +#endif // DLIB_USE_CUDA + +#endif // DLIB_DNN_CuDA_DATA_PTR_H_ + diff --git a/ml/dlib/dlib/dnn/cuda_dlib.cu b/ml/dlib/dlib/dnn/cuda_dlib.cu new file mode 100644 index 000000000..6c37593f1 --- /dev/null +++ b/ml/dlib/dlib/dnn/cuda_dlib.cu @@ -0,0 +1,1630 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
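+
+// A minimal usage sketch for the data-pointer utilities declared above in
+// cuda_data_ptr.h.  This is illustrative only; it assumes DLIB_USE_CUDA is
+// defined and a working CUDA device is available:
+//
+//     std::vector<float> host = {1, 2, 3, 4};
+//     cuda_data_ptr<float> dev(host.size());    // allocates 4 floats on the device
+//     memcpy(dev, host);                        // host -> device
+//     memcpy(host, dev);                        // device -> host round trip
+//
+//     resizable_cuda_buffer scratch;
+//     cuda_data_void_ptr p = scratch.get(1024); // p.size() >= 1024; get() only
+//                                               // reallocates when more bytes are
+//                                               // requested than currently held.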
+ +#include "cuda_utils.h" +#include "cuda_dlib.h" + + +namespace dlib +{ + namespace cuda + { + + // ----------------------------------------------------------------------------------- + + void set_device ( + int dev + ) + { + CHECK_CUDA(cudaSetDevice(dev)); + } + + int get_device ( + ) + { + int dev = 0; + CHECK_CUDA(cudaGetDevice(&dev)); + return dev; + } + + std::string get_device_name ( + int device + ) + { + cudaDeviceProp props; + CHECK_CUDA(cudaGetDeviceProperties(&props, device)); + return props.name; + } + + void set_current_device_blocking_sync( + ) + { + CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); + } + + int get_num_devices ( + ) + { + int num_devices; + CHECK_CUDA(cudaGetDeviceCount(&num_devices)); + return num_devices; + } + + bool can_access_peer (int device_id, int peer_device_id) + { + int can_access; + CHECK_CUDA(cudaDeviceCanAccessPeer(&can_access, device_id, peer_device_id)); + return can_access != 0; + } + bool can_access_peer (const tensor& device, const tensor& peer_device) + { + return can_access_peer(device.device_id(), peer_device.device_id()); + } + + void device_synchronize (int dev) + { + raii_set_device set_dev(dev); + CHECK_CUDA(cudaDeviceSynchronize()); + } + void device_synchronize (const tensor& dev) { device_synchronize(dev.device_id()); } + + enable_peer_access:: + enable_peer_access( + int device_id, + int peer_device_id + ) : call_disable(false), device_id(device_id), peer_device_id(peer_device_id) + { + raii_set_device set_dev(device_id); + + auto err = cudaDeviceEnablePeerAccess(peer_device_id, 0); + if (err == cudaSuccess) + { + call_disable = true; + } + else if (err == cudaErrorPeerAccessAlreadyEnabled) + { + // call cudaGetLastError() to dispose of this error since we don't + // care. + auto err2 = cudaGetLastError(); + if (err2 != cudaErrorPeerAccessAlreadyEnabled) + CHECK_CUDA(err2); + } + else + { + CHECK_CUDA(err); + } + } + + + enable_peer_access:: + ~enable_peer_access() noexcept(false) + { + if (call_disable) + { + raii_set_device set_dev(device_id); + CHECK_CUDA(cudaDeviceDisablePeerAccess(peer_device_id)); + } + } + + // ----------------------------------------------------------------------------------- + // ----------------------------------------------------------------------------------- + // ----------------------------------------------------------------------------------- + + __global__ void _cuda_inverse_norms(float* invnorms, const float* data, size_t nr, size_t nc, const float eps) + { + // initialize invnorms before we begin. 
+ for (auto i : grid_stride_range_y(0, nr)) + for (auto j : grid_stride_range(0, 1)) + invnorms[i] = eps; + __syncthreads(); + + for (auto i : grid_stride_range_y(0, nr)) + { + auto p = data + i*nc; + float temp = 0; + for (auto j : grid_stride_range(0, nc)) + temp += p[j]*p[j]; + + // and store the sum into invnorms[i] + warp_reduce_atomic_add(invnorms[i], temp); + } + __syncthreads(); + + for (auto i : grid_stride_range_y(0, nr)) + for (auto j : grid_stride_range(0, 1)) + invnorms[i] = 1.0/std::sqrt(invnorms[i]); + } + + void inverse_norms ( + resizable_tensor& invnorms, + const tensor& data, + const double eps + ) + { + invnorms.set_size(data.num_samples()); + launch_kernel(_cuda_inverse_norms, max_jobs(data.size()/data.num_samples(), data.num_samples()), + invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_dot_prods(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc) + { + // initialize out before we begin. + for (auto i : grid_stride_range_y(0, nr)) + for (auto j : grid_stride_range(0, 1)) + out[i] = 0; + __syncthreads(); + + for (auto i : grid_stride_range_y(0, nr)) + { + auto l = lhs + i*nc; + auto r = rhs + i*nc; + float temp = 0; + for (auto j : grid_stride_range(0, nc)) + temp += l[j]*r[j]; + + // and store the sum into out[i] + warp_reduce_atomic_add(out[i], temp); + } + } + + __global__ void _cuda_dot_prods_add_to(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc) + { + for (auto i : grid_stride_range_y(0, nr)) + { + auto l = lhs + i*nc; + auto r = rhs + i*nc; + float temp = 0; + for (auto j : grid_stride_range(0, nc)) + temp += l[j]*r[j]; + + // and store the sum into out[i] + warp_reduce_atomic_add(out[i], temp); + } + } + + void dot_prods ( + resizable_tensor& out, + const tensor& lhs, + const tensor& rhs + ) + { + DLIB_CASSERT(have_same_dimensions(lhs,rhs)); + + out.set_size(lhs.num_samples()); + if (out.size() == 0) + return; + + const auto nr = lhs.num_samples(); + const auto nc = lhs.size()/lhs.num_samples(); + + launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc); + } + + void dot_prods ( + bool add_to, + tensor& out, + const tensor& lhs, + const tensor& rhs + ) + { + DLIB_CASSERT(have_same_dimensions(lhs,rhs)); + DLIB_CASSERT(out.k() == 1 && out.nr() == 1 && out.nc() == 1); + DLIB_CASSERT(out.size() == lhs.num_samples()); + + const auto nr = lhs.num_samples(); + const auto nc = lhs.size()/lhs.num_samples(); + + if (add_to) + launch_kernel(_cuda_dot_prods_add_to, max_jobs(nc,nr), out.device(), lhs.device(), rhs.device(), nr, nc); + else + launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_scale_columns(float* out, const float* m, const float* v, size_t nr, size_t nc) + { + for (auto j : grid_stride_range(0, nr*nc)) + { + out[j] = m[j]*v[j%nc]; + } + } + + void scale_columns ( + tensor& out, + const tensor& m, + const tensor& v + ) + { + launch_kernel(_cuda_scale_columns, max_jobs(m.size()), out.device(), m.device(), v.device(), m.num_samples(), m.size()/m.num_samples()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_scale_rows(float* out, const float* m, 
const float* v, size_t nr, size_t nc) + { + for (auto j : grid_stride_range(0, nr*nc)) + { + out[j] = m[j]*v[j/nc]; + } + } + + void scale_rows ( + tensor& out, + const tensor& m, + const tensor& v + ) + { + launch_kernel(_cuda_scale_rows, max_jobs(m.size()), out.device(), m.device(), v.device(), m.num_samples(), m.size()/m.num_samples()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_scale_rows2(float* out, const float* m1, const float* m2, const float* v1, const float* v2, size_t nr, size_t nc) + { + for (auto j : grid_stride_range(0, nr*nc)) + { + out[j] = (m1[j] - m2[j]*v1[j/nc]) * v2[j/nc]; + } + } + + __global__ void _cuda_scale_rows2_beta(const float beta, float* out, const float* m1, const float* m2, const float* v1, const float* v2, size_t nr, size_t nc) + { + for (auto j : grid_stride_range(0, nr*nc)) + { + out[j] = beta*out[j] + (m1[j] - m2[j]*v1[j/nc]) * v2[j/nc]; + } + } + + void scale_rows2 ( + float beta, + tensor& out, + const tensor& m1, + const tensor& m2, + const tensor& v1, + const tensor& v2 + ) + { + if (beta == 0) + { + launch_kernel(_cuda_scale_rows2, max_jobs(m1.size()), out.device(), + m1.device(), m2.device(), v1.device(), v2.device(), m1.num_samples(), + m1.size()/m1.num_samples()); + } + else + { + launch_kernel(_cuda_scale_rows2_beta, max_jobs(m1.size()), beta, + out.device(), m1.device(), m2.device(), v1.device(), v2.device(), + m1.num_samples(), m1.size()/m1.num_samples()); + } + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_exp(float* dest, const float* src, size_t n) + { + for (auto i : grid_stride_range(0, n)) + dest[i] = ::exp(src[i]); + } + + void exp ( + tensor& dest, + const tensor& src + ) + { + DLIB_ASSERT(dest.size() == src.size()); + launch_kernel(_cuda_exp, max_jobs(src.size()), dest.device(), src.device(), src.size()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_log(float* dest, const float* src, size_t n) + { + for (auto i : grid_stride_range(0, n)) + dest[i] = ::log(src[i]); + } + + void log ( + tensor& dest, + const tensor& src + ) + { + DLIB_ASSERT(dest.size() == src.size()); + launch_kernel(_cuda_log, max_jobs(src.size()), dest.device(), src.device(), src.size()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_log10(float* dest, const float* src, size_t n) + { + for (auto i : grid_stride_range(0, n)) + dest[i] = ::log10(src[i]); + } + + void log10 ( + tensor& dest, + const tensor& src + ) + { + DLIB_ASSERT(dest.size() == src.size()); + launch_kernel(_cuda_log10, max_jobs(src.size()), dest.device(), src.device(), src.size()); + } + + // ----------------------------------------------------------------------------------- + + __global__ void _cuda_multiply1(float* d, const float* s1, const float* s2, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = s1[i]*s2[i]; + } + } + __global__ void _cuda_multiply2(float* d, const float* s1, const float* s2, + size_t n, size_t s1_n, size_t s2_n, size_t max_size) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = 0; + for (size_t j = i; j < max_size; j += n) + d[i] += s1[j%s1_n]*s2[j%s2_n]; + } + } + + __global__ void _cuda_multiply3(float* d, const float* s1, const float* s2, + size_t n, size_t s1_n, size_t s2_n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] 
= s1[i%s1_n]*s2[i%s2_n]; + } + } + + __global__ void _cuda_multiply1_add_to(float* d, const float* s1, const float* s2, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] += s1[i]*s2[i]; + } + } + __global__ void _cuda_multiply2_add_to(float* d, const float* s1, const float* s2, + size_t n, size_t s1_n, size_t s2_n, size_t max_size) + { + for (auto i : grid_stride_range(0, n)) + { + for (size_t j = i; j < max_size; j += n) + d[i] += s1[j%s1_n]*s2[j%s2_n]; + } + } + + __global__ void _cuda_multiply3_add_to(float* d, const float* s1, const float* s2, + size_t n, size_t s1_n, size_t s2_n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] += s1[i%s1_n]*s2[i%s2_n]; + } + } + + void multiply ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + + DLIB_CASSERT(dest.k() == src1.k() && src1.k() == src2.k() && + dest.nr() == src1.nr() && src1.nr() == src2.nr() && + dest.nc() == src1.nc() && src1.nc() == src2.nc() ); + const long MD = std::max(std::max(dest.num_samples(),src1.num_samples()),src2.num_samples()); + DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) && + (src1.num_samples()==1 || src1.num_samples()==MD) && + (src2.num_samples()==1 || src2.num_samples()==MD) ); + + if (dest.size() == 0) + return; + + const size_t max_size = std::max(std::max(dest.size(),src1.size()),src2.size()); + const auto d = dest.host(); + const auto s1 = src1.host(); + const auto s2 = src2.host(); + if (dest.size() == src1.size() && src1.size() == src2.size()) + { + if (add_to) + launch_kernel(_cuda_multiply1_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size()); + else + launch_kernel(_cuda_multiply1,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size()); + } + else if (dest.num_samples() == 1) + { + if (add_to) + launch_kernel(_cuda_multiply2_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), + dest.size(), src1.size(), src2.size(), max_size); + else + launch_kernel(_cuda_multiply2,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), + dest.size(), src1.size(), src2.size(), max_size); + } + else + { + if (add_to) + launch_kernel(_cuda_multiply3_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), + dest.size(), src1.size(), src2.size()); + else + launch_kernel(_cuda_multiply3,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), + dest.size(), src1.size(), src2.size()); + } + } + + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_multiply_conv(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks) + { + for (auto i : grid_stride_range(0, n)) + { + auto k = (i/bs)%ks; + d[i] = s1[i]*s2[k]; + } + } + + __global__ void _cuda_multiply_conv2(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks) + { + // zero initialize d before we begin. 
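+            // d holds one accumulator per channel, ks in total.  One thread per y-index
+            // clears each d[i], and the __syncthreads() below keeps any thread from
+            // adding its per-plane partial sum into d before the clearing has finished.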
+ for (auto i : grid_stride_range_y(0, ks)) + for (auto j : grid_stride_range(0, 1)) + d[i] = 0; + __syncthreads(); + + // loop over all the image planes + for (auto i : grid_stride_range_y(0, n)) + { + // sum all the elements in the i-th image plane + float temp = 0; + for (auto j : grid_stride_range(i*bs, (i+1)*bs)) + temp += s1[j]*s2[j]; + auto k = i%ks; + // and store the sum into d[k] + warp_reduce_atomic_add(d[k], temp); + } + } + + __global__ void _cuda_multiply_conv_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks) + { + for (auto i : grid_stride_range(0, n)) + { + auto k = (i/bs)%ks; + d[i] += s1[i]*s2[k]; + } + } + + __global__ void _cuda_multiply_conv2_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks) + { + // loop over all the image planes + for (auto i : grid_stride_range_y(0, n)) + { + // sum all the elements in the i-th image plane + float temp = 0; + for (auto j : grid_stride_range(i*bs, (i+1)*bs)) + temp += s1[j]*s2[j]; + auto k = i%ks; + // and store the sum into d[k] + warp_reduce_atomic_add(d[k], temp); + } + } + + + void multiply_conv ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + if (have_same_dimensions(dest,src1)) + { + DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 && src2.nc() == 1 && src2.k() == src1.k()); + if (dest.size() == 0) + return; + + if (add_to) + launch_kernel(_cuda_multiply_conv_add_to,max_jobs(dest.size()), + dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k()); + else + launch_kernel(_cuda_multiply_conv,max_jobs(dest.size()), + dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k()); + } + else + { + DLIB_CASSERT(have_same_dimensions(src1,src2)); + DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 && dest.nc() == 1 && dest.k() == src1.k()); + if (dest.size() == 0) + return; + + + const auto bs = src1.nr()*src1.nc(); + const auto n = src1.num_samples()*src1.k(); + if (add_to) + launch_kernel(_cuda_multiply_conv2_add_to, max_jobs(bs,n), + dest.device(), src1.device(), n, src2.device(), bs, src1.k()); + else + launch_kernel(_cuda_multiply_conv2, max_jobs(bs,n), + dest.device(), src1.device(), n, src2.device(), bs, src1.k()); + } + + } + + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_scale_channels_add_to(float* d, const float* src, size_t n, const float* scales, size_t bs) + { + for (auto i : grid_stride_range(0, n)) + { + auto k = i/bs; + d[i] += src[i]*scales[k]; + } + } + + __global__ void _cuda_scale_channels(float* d, const float* src, size_t n, const float* scales, size_t bs) + { + for (auto i : grid_stride_range(0, n)) + { + auto k = i/bs; + d[i] = src[i]*scales[k]; + } + } + + void scale_channels ( + bool add_to, + tensor& dest, + const tensor& src, + const tensor& scales + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src) && + scales.num_samples() == src.num_samples() && + scales.k() == src.k() && + scales.nr() == 1 && + scales.nc() == 1 ); + + if (dest.size() == 0) + return; + + if (add_to) + launch_kernel(_cuda_scale_channels_add_to,max_jobs(dest.size()), + dest.device(), src.device(), src.size(), scales.device(), src.nr()*src.nc()); + else + launch_kernel(_cuda_scale_channels,max_jobs(dest.size()), + dest.device_write_only(), src.device(), src.size(), scales.device(), src.nr()*src.nc()); + } + + // ------------------------------------------------------------------------------------ + + 
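+        // scale_channels() computes, for every sample n and channel k,
+        //     dest(n,k,r,c) = src(n,k,r,c) * scales(n,k,1,1)    (+= when add_to is true)
+        // i.e. it multiplies each (r,c) plane by a single per-channel scalar.  A toy
+        // example of what that means (a sketch, not code from this translation unit):
+        //
+        //     resizable_tensor src(1,2,2,2), scales(1,2,1,1), dest(1,2,2,2);
+        //     src = 1;
+        //     scales.host()[0] = 2;  scales.host()[1] = 3;
+        //     scale_channels(false, dest, src, scales);
+        //     // dest's first 2x2 plane is now all 2s and its second all 3s.
+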
__global__ void _cuda_mult1(float* d, const float* s1, const float* s2, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = s1[i]*s2[i]; + } + } + + __global__ void _cuda_mult1_add_to(float* d, const float* s1, const float* s2, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] += s1[i]*s2[i]; + } + } + + __global__ void _cuda_mult2(float* d, const float* s1, const float* s2, + size_t dn, size_t dk, size_t dr, size_t dc, + size_t s1n, size_t s1k, size_t s1r, size_t s1c, + size_t s2n, size_t s2k, size_t s2r, size_t s2c) + { + for (auto i : grid_stride_range(0, dn*dk*dr*dc)) + { + size_t n,k,r,c; + unpack_idx(i, dk,dr,dc, n,k,r,c); + + float v1 = 0; + float v2 = 0; + + if (n < s1n && + k < s1k && + r < s1r && + c < s1c ) + { + v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)]; + } + + if (n < s2n && + k < s2k && + r < s2r && + c < s2c ) + { + v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)]; + } + + d[i] = v1*v2; + } + } + + __global__ void _cuda_mult2_add_to(float* d, const float* s1, const float* s2, + size_t dn, size_t dk, size_t dr, size_t dc, + size_t s1n, size_t s1k, size_t s1r, size_t s1c, + size_t s2n, size_t s2k, size_t s2r, size_t s2c) + { + for (auto i : grid_stride_range(0, dn*dk*dr*dc)) + { + size_t n,k,r,c; + unpack_idx(i, dk,dr,dc, n,k,r,c); + + float v1 = 0; + float v2 = 0; + + if (n < s1n && + k < s1k && + r < s1r && + c < s1c ) + { + v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)]; + } + + if (n < s2n && + k < s2k && + r < s2r && + c < s2c ) + { + v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)]; + } + + d[i] += v1*v2; + } + } + + void multiply_zero_padded ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + if (dest.size() == 0) + return; + + // Do the simple and fast version if everything has the same dimensions + if (have_same_dimensions(dest, src1) && + have_same_dimensions(dest, src2)) + { + if (add_to) + launch_kernel(_cuda_mult1_add_to,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size()); + else + launch_kernel(_cuda_mult1,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size()); + } + else + { + if (add_to) + { + // Otherwise, do the more complex version with bounds checking. + launch_kernel(_cuda_mult2_add_to,max_jobs(dest.size()), + dest.device(), src1.device(), src2.device(), + dest.num_samples(), dest.k(), dest.nr(), dest.nc(), + src1.num_samples(), src1.k(), src1.nr(), src1.nc(), + src2.num_samples(), src2.k(), src2.nr(), src2.nc() + ); + } + else + { + // Otherwise, do the more complex version with bounds checking. 
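+                // Each thread unpacks its flat index into (n,k,r,c) coordinates and
+                // reads s1/s2 only where those coordinates fall inside the respective
+                // tensor; anything outside is treated as zero padding.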
+ launch_kernel(_cuda_mult2,max_jobs(dest.size()), + dest.device(), src1.device(), src2.device(), + dest.num_samples(), dest.k(), dest.nr(), dest.nc(), + src1.num_samples(), src1.k(), src1.nr(), src1.nc(), + src2.num_samples(), src2.k(), src2.nr(), src2.nc() + ); + } + } + } + + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_add1(float* d, const float* s1, const float* s2, size_t n) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = s1[i]+s2[i]; + } + } + + __global__ void _cuda_add2(float* d, const float* s1, const float* s2, + size_t dn, size_t dk, size_t dr, size_t dc, + size_t s1n, size_t s1k, size_t s1r, size_t s1c, + size_t s2n, size_t s2k, size_t s2r, size_t s2c) + { + for (auto i : grid_stride_range(0, dn*dk*dr*dc)) + { + size_t n,k,r,c; + unpack_idx(i, dk,dr,dc, n,k,r,c); + + float v1 = 0; + float v2 = 0; + + if (n < s1n && + k < s1k && + r < s1r && + c < s1c ) + { + v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)]; + } + + if (n < s2n && + k < s2k && + r < s2r && + c < s2c ) + { + v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)]; + } + + d[i] = v1+v2; + } + } + + void add ( + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + if (dest.size() == 0) + return; + + // Do the simple and fast version if everything has the same dimensions + if (have_same_dimensions(dest, src1) && + have_same_dimensions(dest, src2)) + { + launch_kernel(_cuda_add1,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size()); + } + else + { + // Otherwise, do the more complex version with bounds checking. + launch_kernel(_cuda_add2,max_jobs(dest.size()), + dest.device(), src1.device(), src2.device(), + dest.num_samples(), dest.k(), dest.nr(), dest.nc(), + src1.num_samples(), src1.k(), src1.nr(), src1.nc(), + src2.num_samples(), src2.k(), src2.nr(), src2.nc() + ); + } + + } + + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_affine_transform1(float* d, const float* s, size_t n, float A, float B) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A*s[i] + B; + } + } + + __global__ void _cuda_affine_transform1_0(float* d, const float* s, size_t n, float A) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A*s[i]; + } + } + + void affine_transform( + tensor& dest, + const tensor& src, + const float A, + const float B + ) + { + DLIB_CASSERT(dest.size()==src.size()); + if (B != 0) + launch_kernel(_cuda_affine_transform1,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A, B); + else + launch_kernel(_cuda_affine_transform1_0,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A); + } + + void affine_transform( + tensor& dest, + const tensor& src, + const float A + ) + { + DLIB_CASSERT(dest.size()==src.size()); + launch_kernel(_cuda_affine_transform1_0,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform_rect( + float* d, + const float* s1, + const float* s2, + const float* s3, + float A, + float B, + float C, + size_t start_idx, + size_t n, + size_t rect_nc, + size_t total_nc + ) + { + for (auto i : grid_stride_range(0, n)) + { + size_t r = i/rect_nc; + size_t c = i%rect_nc; + size_t idx = r*total_nc + c + start_idx; + d[idx] = A*s1[idx] + B*s2[idx] + C*s3[idx]; + } + } + + void affine_transform( + const rectangle& rect, + tensor& dest, + const tensor& src1, + const 
tensor& src2, + const tensor& src3, + float A, + float B, + float C + ) + { + DLIB_CASSERT(dest.size() == src1.size()); + DLIB_CASSERT(dest.size() == src2.size()); + DLIB_CASSERT(dest.size() == src3.size()); + DLIB_CASSERT(dest.num_samples() == src1.num_samples()); + DLIB_CASSERT(dest.num_samples() == src2.num_samples()); + DLIB_CASSERT(dest.num_samples() == src3.num_samples()); + DLIB_CASSERT(rectangle(0,0, dest.size()/dest.num_samples()-1, dest.num_samples()-1).contains(rect)); + launch_kernel(_cuda_affine_transform_rect,max_jobs(rect.area()), + dest.device(), src1.device(), src2.device(), src3.device(), A, B, C, + rect.left() + rect.top()*(dest.size()/dest.num_samples()), + rect.area(), + rect.width(), + dest.size()/dest.num_samples()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform4(float* d, const float* s1, const float* s2, size_t n, float A, float B, float C) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A*s1[i] + B*s2[i] + C; + } + } + + __global__ void _cuda_affine_transform4_0(float* d, const float* s1, const float* s2, size_t n, float A, float B) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A*s1[i] + B*s2[i]; + } + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B, + const float C + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + if (C != 0) + launch_kernel(_cuda_affine_transform4,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B, C); + else + launch_kernel(_cuda_affine_transform4_0,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B); + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + launch_kernel(_cuda_affine_transform4_0,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_add_scaled(float* d, const float* s, size_t n, float scale) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] += scale*s[i]; + } + } + + void add_scaled( + tensor& dest, + const float scale, + const tensor& src + ) + { + DLIB_CASSERT(dest.size()==src.size()); + launch_kernel(_cuda_add_scaled,max_jobs(dest.size()),dest.device(), src.device(), dest.size(), scale); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_add_cv_to_all_columns(float beta, float* dest, float alpha, const float* src, size_t size, size_t stride) + { + for (auto i : grid_stride_range(0, size)) + { + dest[i] = beta*dest[i] + alpha*src[i/stride]; + } + } + + __global__ void _cuda_add_cv_to_all_columns_no_beta(float* dest, float alpha, const float* src, size_t size, size_t stride) + { + for (auto i : grid_stride_range(0, size)) + { + dest[i] = alpha*src[i/stride]; + } + } + + void add_cv_to_all_columns( + float beta, + tensor& dest, + float alpha, + const tensor& src + ) + { + DLIB_CASSERT(dest.num_samples() == src.num_samples() && src.num_samples() == src.size()); + if (beta == 0) + launch_kernel(_cuda_add_cv_to_all_columns_no_beta, max_jobs(dest.size()), dest.device(), alpha, src.device(), dest.size(), dest.size()/dest.num_samples()); + 
else + launch_kernel(_cuda_add_cv_to_all_columns, max_jobs(dest.size()), beta, dest.device(), alpha, src.device(), dest.size(), dest.size()/dest.num_samples()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform5( + float* d, const float* s1, const float* s2, const float* s3, size_t n, float A, float B, float C, float D + ) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A*s1[i] + B*s2[i] + C*s3[i] + D; + } + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C, + const float D + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + DLIB_CASSERT(dest.size()==src3.size()); + launch_kernel(_cuda_affine_transform5,max_jobs(dest.size()),dest.device(), src1.device(), + src2.device(), src3.device(), dest.size(), A, B, C, D); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform_range( + float* d, const float* s1, const float* s2, const float* s3, size_t begin, size_t end, float A, float B, float C + ) + { + for (auto i : grid_stride_range(begin, end)) + { + d[i] = A*s1[i] + B*s2[i] + C*s3[i]; + } + } + + + void affine_transform_range( + size_t begin, + size_t end, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C + ) + { + DLIB_CASSERT(dest.size()==src1.size()); + DLIB_CASSERT(dest.size()==src2.size()); + DLIB_CASSERT(dest.size()==src3.size()); + DLIB_CASSERT(begin <= end && end <= dest.size()); + launch_kernel(_cuda_affine_transform_range,max_jobs(end-begin), + dest.device(), src1.device(), + src2.device(), src3.device(), begin, end, A, B, C); + } + + // ----------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform2(float* d, const float* s, size_t n, const float* A, const float* B) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A[i]*s[i] + B[i]; + } + } + __global__ void _cuda_affine_transform3(float* d, const float* s, size_t n, const float* A, const float* B, size_t bs) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = A[i%bs]*s[i] + B[i%bs]; + } + } + + void affine_transform( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ) + { + DLIB_CASSERT(have_same_dimensions(dest, src)); + DLIB_CASSERT( + ((A.num_samples()==1 && B.num_samples()==1) || + (A.num_samples()==src.num_samples() && B.num_samples()==src.num_samples()))); + DLIB_CASSERT( + A.nr()==B.nr() && B.nr()==src.nr() && + A.nc()==B.nc() && B.nc()==src.nc() && + A.k() ==B.k() && B.k()==src.k(), + "\nA.nr(): " << A.nr() << "\nB.nr(): " << B.nr() << "\nsrc.nr(): " << src.nr() + <<"\nA.nc(): " << A.nc() << "\nB.nc(): " << B.nc() << "\nsrc.nc(): " << src.nc() + <<"\nA.k(): " << A.k() << "\nB.k(): " << B.k() << "\nsrc.k(): " << src.k() + ); + + if (A.num_samples() == 1) + { + launch_kernel(_cuda_affine_transform3,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A.device(), B.device(), A.size()); + } + else + { + launch_kernel(_cuda_affine_transform2,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A.device(), B.device()); + } + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_compute_adam_update( + size_t begin, + size_t 
end, + float* s, + float* m, + float* v, + const float alpha, + const float weight_decay, + const float momentum1, + const float momentum2, + const float* params, + const float* params_grad + ) + { + const float eps = 1e-8; + // The loop is equivalent to doing this: + // m = momentum1*m + (1-momentum1) * (weight_decay*params + params_grad); + // v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad); + // s = -alpha*m/(sqrt(v) + eps); + for (auto i : grid_stride_range(begin, end)) + { + float g = (weight_decay*params[i] + params_grad[i]); + m[i] = momentum1*m[i] + (1-momentum1)*g; + v[i] = momentum2*v[i] + (1-momentum2)*g*g; + s[i] = -alpha*m[i]/(std::sqrt(v[i]) + eps); + } + } + + void compute_adam_update ( + size_t begin, + size_t end, + tensor& s, + tensor& m, + tensor& v, + const float t, + const float learning_rate, + const float weight_decay, + const float momentum1, + const float momentum2, + const tensor& params, + const tensor& params_grad + ) + { + DLIB_CASSERT(s.size() == m.size() && + s.size() == v.size() && + s.size() == params.size() && + s.size() == params_grad.size()); + DLIB_CASSERT(begin <= end && end <= params.size()); + const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t)); + + launch_kernel(_cuda_compute_adam_update,max_jobs(end-begin), + begin, end, s.device(), m.device(), v.device(), alpha, weight_decay, + momentum1, momentum2, params.device(), params_grad.device()); + } + + // ----------------------------------------------------------------------------------- + + __global__ void _cuda_affine_transform_conv(float* d, const float* s, size_t n, const float* A, const float* B, size_t bs, size_t ks) + { + for (auto i : grid_stride_range(0, n)) + { + auto k = (i/bs)%ks; + d[i] = A[k]*s[i] + B[k]; + } + } + + void affine_transform_conv( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ) + { + DLIB_CASSERT(have_same_dimensions(dest, src)); + DLIB_CASSERT(have_same_dimensions(A, B)); + DLIB_CASSERT(A.num_samples() == 1 && A.nr() == 1 && A.nc() == 1 && A.k() == src.k()); + + launch_kernel(_cuda_affine_transform_conv,max_jobs(dest.size()), + dest.device(), src.device(), src.size(), A.device(), B.device(), src.nr()*src.nc(), src.k()); + } + + // ----------------------------------------------------------------------------------- + + __global__ void _add_bias_gradient(float* out, const float* in, size_t n, size_t total_n) + { + for (auto i : grid_stride_range(0, n)) + { + out[i] = in[i]; + for (size_t j = i+n; j < total_n; j+=n) + out[i] += in[j]; + } + } + + void assign_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + grad.num_samples() == 1 && + gradient_input.k() == grad.k() && + gradient_input.nr() == grad.nr() && + gradient_input.nc() == grad.nc() && + gradient_input.size() > 0); + + launch_kernel(_add_bias_gradient,max_jobs(grad.size()),grad.device(), gradient_input.device(), grad.size(), gradient_input.size()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _set_tensor(float* out, size_t n, const float val) + { + for (auto i : grid_stride_range(0, n)) + out[i] = val; + } + + void set_tensor ( + tensor& t, + float value + ) + { + launch_kernel(_set_tensor, max_jobs(t.size()), t.device(), t.size(), value); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _scale_tensor(float* out, size_t n, const float val) 
+ { + for (auto i : grid_stride_range(0, n)) + out[i] *= val; + } + + void scale_tensor ( + tensor& t, + float value + ) + { + launch_kernel(_scale_tensor, max_jobs(t.size()), t.device(), t.size(), value); + } + + // ----------------------------------------------------------------------------------- + // ----------------------------------------------------------------------------------- + + __global__ void _cuda_threshold(float* d, size_t n, float thresh) + { + for (auto i : grid_stride_range(0, n)) + { + d[i] = d[i]>thresh ? 1:0; + } + } + + void threshold ( + tensor& data, + float thresh + ) + { + launch_kernel(_cuda_threshold,max_jobs(data.size()),data.device(), data.size(), thresh); + } + + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_dot(const float* a, const float* b, size_t n, float* result) + { + // Parallel sum everything into local temp variables. + float temp = 0; + for(auto i : grid_stride_range(0, n)) + temp += a[i]*b[i]; + + // Then do the warp reduce add thing to merge into one output value. + warp_reduce_atomic_add(*result, temp); + } + + + void dot ( + const tensor& a, + const tensor& b, + tensor& result, + size_t idx + ) + { + DLIB_CASSERT(a.size() == b.size()); + DLIB_CASSERT(idx < result.size()); + + launch_kernel(_cuda_dot, max_jobs(a.size()), a.device(), b.device(), a.size(), result.device()+idx); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_prelu(const float* s, float* d, size_t n, const float* pp) + { + const float p = *pp; + for (auto i : grid_stride_range(0, n)) + { + if (s[i] > 0) + d[i] = s[i]; + else + d[i] = p*s[i]; + } + } + + void prelu ( + tensor& dest, + const tensor& src, + const tensor& param + ) + { + launch_kernel(_cuda_prelu, max_jobs(dest.size()), + src.device(), dest.device(), src.size(), param.device()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_prelu_gradient(float* out, const float* s, const float* gi, size_t n, const float* pp, float* ppgrad) + { + const float p = *pp; + float pgrad = 0; + for(auto i : grid_stride_range(0, n)) + { + if (s[i] > 0) + { + out[i] += gi[i]; + } + else + { + out[i] += p*gi[i]; + pgrad += gi[i]*s[i]; + } + } + + // Then do the warp reduce add thing to merge into one output value. 
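+            // Each warp sums its threads' pgrad values and issues one atomicAdd, so
+            // *ppgrad ends up holding the sum of gradient_input[i]*src[i] over every
+            // element with src[i] <= 0, i.e. the gradient of the loss w.r.t. p.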
+ warp_reduce_atomic_add(*ppgrad, pgrad); + } + + void prelu_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input, + const tensor& param, + tensor& params_grad + ) + { + params_grad = 0; + launch_kernel(_cuda_prelu_gradient, max_jobs(grad.size()), + grad.device(), src.device(), gradient_input.device(), grad.size(), + param.device(), params_grad.device()); + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_resize_bilinear(size_t dsize, size_t dchan_size, size_t dnc, float* d, + size_t schan_size, int snr, int snc, const float* s, + const float x_scale, const float y_scale) + { + for(auto i : grid_stride_range(0, dsize)) + { + const int idx = i%dchan_size; + const int channel = i/dchan_size; + const int sidx = channel*schan_size; + const int r = idx/dnc; + const int c = idx%dnc; + + const float y = r*y_scale; + const int top = static_cast<int>(::floor(y)); + const int bottom = ::min(top+1, snr-1); + const float tb_frac = y - top; + + const float x = c*x_scale; + const int left = static_cast<int>(::floor(x)); + const int right = ::min(left+1, snc-1); + const float lr_frac = x - left; + + float tl = s[sidx+top*snc+left]; + float tr = s[sidx+top*snc+right]; + float bl = s[sidx+bottom*snc+left]; + float br = s[sidx+bottom*snc+right]; + + float temp = (1-tb_frac)*((1-lr_frac)*tl + lr_frac*tr) + + tb_frac*((1-lr_frac)*bl + lr_frac*br); + + d[i] = temp; + } + } + + __global__ void _cuda_resize_bilinear_strided(size_t dsize, size_t dchan_size, size_t dnc, float* d, + size_t schan_size, int snr, int snc, const float* s, + const float x_scale, const float y_scale, + size_t dest_row_stride, size_t src_row_stride, size_t dest_chan_size_strided + ) + { + for(auto i : grid_stride_range(0, dsize)) + { + const int idx = i%dchan_size; + const int channel = i/dchan_size; + const int sidx = channel*schan_size; + const int r = idx/dnc; + const int c = idx%dnc; + const int didx = channel*dest_chan_size_strided + r*dest_row_stride+c; + + const float y = r*y_scale; + const int top = static_cast<int>(::floor(y)); + const int bottom = ::min(top+1, snr-1); + const float tb_frac = y - top; + + const float x = c*x_scale; + const int left = static_cast<int>(::floor(x)); + const int right = ::min(left+1, snc-1); + const float lr_frac = x - left; + + float tl = s[sidx+top*src_row_stride+left]; + float tr = s[sidx+top*src_row_stride+right]; + float bl = s[sidx+bottom*src_row_stride+left]; + float br = s[sidx+bottom*src_row_stride+right]; + + float temp = (1-tb_frac)*((1-lr_frac)*tl + lr_frac*tr) + + tb_frac*((1-lr_frac)*bl + lr_frac*br); + + d[didx] = temp; + } + } + + void resize_bilinear ( + tensor& dest, + long dest_row_stride, + long dest_channel_stride, + const tensor& src, + long src_row_stride, + long src_channel_stride + ) + { + DLIB_CASSERT(is_same_object(dest, src)==false); + DLIB_CASSERT(dest.num_samples() == src.num_samples()); + DLIB_CASSERT(dest.k() == src.k()); + + if (dest.size() == 0 || src.size() == 0) + return; + + const float x_scale = (src.nc()-1)/(float)std::max<long>((dest.nc()-1),1); + const float y_scale = (src.nr()-1)/(float)std::max<long>((dest.nr()-1),1); + + if (dest.nc() == dest_row_stride && dest.nr()*dest.nc()==dest_channel_stride && + src.nc() == src_row_stride && src.nr()*src.nc()==src_channel_stride) + { + launch_kernel(_cuda_resize_bilinear, + dest.size(), dest.nr()*dest.nc(), dest.nc(), dest.device(), + src.nr()*src.nc(), src.nr(), src.nc(), src.device(), + x_scale, y_scale); + } + else 
+ { + launch_kernel(_cuda_resize_bilinear_strided, + dest.size(), dest.nr()*dest.nc(), dest.nc(), dest.device(), + src_channel_stride, src.nr(), src.nc(), src.device(), + x_scale, y_scale, dest_row_stride, src_row_stride, dest_channel_stride); + } + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_resize_bilinear_gradient(size_t dsize, size_t dchan_size, size_t dnc, const float* d, + size_t schan_size, int snr, int snc, float* s, + const float x_scale, const float y_scale) + { + for(auto i : grid_stride_range(0, dsize)) + { + const float tmp = d[i]; + + const int idx = i%dchan_size; + const int channel = i/dchan_size; + const int sidx = channel*schan_size; + const int r = idx/dnc; + const int c = idx%dnc; + + const float y = r*y_scale; + const int top = static_cast<int>(::floor(y)); + const int bottom = ::min(top+1, snr-1); + const float tb_frac = y - top; + + const float x = c*x_scale; + const int left = static_cast<int>(::floor(x)); + const int right = ::min(left+1, snc-1); + const float lr_frac = x - left; + + + atomicAdd(s+sidx+top*snc+left, tmp*(1-tb_frac)*(1-lr_frac)); + atomicAdd(s+sidx+top*snc+right, tmp*(1-tb_frac)*(lr_frac)); + atomicAdd(s+sidx+bottom*snc+left, tmp*(tb_frac)*(1-lr_frac)); + atomicAdd(s+sidx+bottom*snc+right, tmp*(tb_frac)*(lr_frac)); + } + } + + __global__ void _cuda_resize_bilinear_gradient_strided(size_t dsize, size_t dchan_size, size_t dnc, const float* d, + size_t schan_size, int snr, int snc, float* s, + const float x_scale, const float y_scale, + size_t dest_row_stride, size_t src_row_stride, size_t dest_chan_size_strided + ) + { + for(auto i : grid_stride_range(0, dsize)) + { + + const int idx = i%dchan_size; + const int channel = i/dchan_size; + const int didx = channel*dest_chan_size_strided; + const int sidx = channel*schan_size; + const int r = idx/dnc; + const int c = idx%dnc; + + const float tmp = d[didx + r*dest_row_stride+c]; + + const float y = r*y_scale; + const int top = static_cast<int>(::floor(y)); + const int bottom = ::min(top+1, snr-1); + const float tb_frac = y - top; + + const float x = c*x_scale; + const int left = static_cast<int>(::floor(x)); + const int right = ::min(left+1, snc-1); + const float lr_frac = x - left; + + + atomicAdd(s+sidx+top*src_row_stride+left, tmp*(1-tb_frac)*(1-lr_frac)); + atomicAdd(s+sidx+top*src_row_stride+right, tmp*(1-tb_frac)*(lr_frac)); + atomicAdd(s+sidx+bottom*src_row_stride+left, tmp*(tb_frac)*(1-lr_frac)); + atomicAdd(s+sidx+bottom*src_row_stride+right, tmp*(tb_frac)*(lr_frac)); + } + } + + void resize_bilinear_gradient ( + tensor& grad, + long grad_row_stride, + long grad_channel_stride, + const tensor& gradient_input, + long gradient_input_row_stride, + long gradient_input_channel_stride + ) + { + DLIB_CASSERT(is_same_object(grad, gradient_input)==false); + DLIB_CASSERT(gradient_input.num_samples() == grad.num_samples()); + DLIB_CASSERT(gradient_input.k() == grad.k()); + + if (grad.size() == 0 || gradient_input.size() == 0) + return; + + const float x_scale = (grad.nc()-1)/(float)std::max<long>((gradient_input.nc()-1),1); + const float y_scale = (grad.nr()-1)/(float)std::max<long>((gradient_input.nr()-1),1); + + if (grad.nc() == grad_row_stride && grad.nr()*grad.nc()==grad_channel_stride && + gradient_input.nc() == gradient_input_row_stride && gradient_input.nr()*gradient_input.nc()==gradient_input_channel_stride) + { + launch_kernel(_cuda_resize_bilinear_gradient, + gradient_input.size(), 
gradient_input.nr()*gradient_input.nc(), gradient_input.nc(), gradient_input.device(), + grad.nr()*grad.nc(), grad.nr(), grad.nc(), grad.device(), + x_scale, y_scale); + } + else + { + launch_kernel(_cuda_resize_bilinear_gradient_strided, + gradient_input.size(), gradient_input.nr()*gradient_input.nc(), gradient_input.nc(), gradient_input.device(), + grad_channel_stride, grad.nr(), grad.nc(), grad.device(), + x_scale, y_scale, gradient_input_row_stride, grad_row_stride, gradient_input_channel_stride); + } + } + + // ---------------------------------------------------------------------------------------- + + __global__ void _cuda_copy_tensor_add_to (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size) + { + for(auto i : grid_stride_range(0, size)) + { + size_t blk = i/block_size; + size_t j = i%block_size; + dest[blk*dest_stride + j] += src[blk*src_stride + j]; + } + } + + __global__ void _cuda_copy_tensor (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size) + { + for(auto i : grid_stride_range(0, size)) + { + size_t blk = i/block_size; + size_t j = i%block_size; + dest[blk*dest_stride + j] = src[blk*src_stride + j]; + } + } + + void copy_tensor( + bool add_to, + tensor& dest, + size_t dest_k_offset, + const tensor& src, + size_t src_k_offset, + size_t count_k + ) + { + const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k()); + const size_t src_sample_size = static_cast<size_t>(src.nc() * src.nr() * src.k()); + + const size_t block_size = count_k * dest.nc() * dest.nr(); + + DLIB_CASSERT(dest.num_samples() == src.num_samples() && + dest.nc() == src.nc() && dest.nr() == src.nr(), "All sources should fit into dest tensor size"); + DLIB_CASSERT(dest.k() - dest_k_offset >= count_k, "Not enough space in dest tensor"); + DLIB_CASSERT(src.k() - src_k_offset >= count_k, "Not enough space in src tensor"); + + float* dest_p = dest.device() + dest_k_offset * dest.nc() * dest.nr(); + const float* src_p = src.device() + src_k_offset * src.nc() * src.nr();; + + if (add_to) + { + launch_kernel(_cuda_copy_tensor_add_to, max_jobs(dest.size()), + dest_p, block_size*dest.num_samples(), + src_p, dest_sample_size, src_sample_size, block_size); + } + else + { + launch_kernel(_cuda_copy_tensor, max_jobs(dest.size()), + dest_p, block_size*dest.num_samples(), + src_p, dest_sample_size, src_sample_size, block_size); + } + } + + // ---------------------------------------------------------------------------------------- + + } +} + diff --git a/ml/dlib/dlib/dnn/cuda_dlib.h b/ml/dlib/dlib/dnn/cuda_dlib.h new file mode 100644 index 000000000..3a057ffc4 --- /dev/null +++ b/ml/dlib/dlib/dnn/cuda_dlib.h @@ -0,0 +1,469 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
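+
+// A sketch of how the copy_tensor() routine implemented above is typically used:
+// it copies count_k consecutive channels from src into dest for every sample,
+// which is the primitive a depth-concatenation layer needs.  Illustrative only:
+//
+//     // join a (N,k1,R,C) tensor A and a (N,k2,R,C) tensor B into a
+//     // (N,k1+k2,R,C) tensor dest:
+//     copy_tensor(false, dest, 0,     A, 0, A.k());
+//     copy_tensor(false, dest, A.k(), B, 0, B.k());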
+#ifndef DLIB_DNN_CuDA_H_ +#define DLIB_DNN_CuDA_H_ + + +#include "tensor.h" +#include "../geometry/rectangle.h" + +namespace dlib +{ + namespace cuda + { + + // ---------------------------------------------------------------------------------------- + + void set_device ( + int dev + ); + + int get_device ( + ); + + int get_num_devices ( + ); + + std::string get_device_name ( + int device + ); + + void set_current_device_blocking_sync( + ); + + bool can_access_peer (int device_id, int peer_device_id); + bool can_access_peer (const tensor& device, const tensor& peer_device); + + void device_synchronize (int dev); + void device_synchronize (const tensor& dev); + + + class raii_set_device + { + public: + raii_set_device() = delete; + raii_set_device(const raii_set_device&) = delete; + raii_set_device& operator=(const raii_set_device&) = delete; + + raii_set_device(int dev) + { + prev_dev = get_device(); + set_device(dev); + } + + raii_set_device(const tensor& dev) + { + prev_dev = get_device(); + set_device(dev.device_id()); + } + + void operator() (int dev) + { + set_device(dev); + } + + void operator() (const tensor& dev) + { + set_device(dev.device_id()); + } + + ~raii_set_device() noexcept(false) + { + set_device(prev_dev); + } + + private: + int prev_dev; + }; + + +#ifdef DLIB_USE_CUDA + + class enable_peer_access + { + public: + + enable_peer_access() = delete; + enable_peer_access(const enable_peer_access&) = delete; + enable_peer_access& operator=(const enable_peer_access&) = delete; + + enable_peer_access( + int device_id, + int peer_device_id + ); + + enable_peer_access( + const tensor& device, + const tensor& peer_device + ) : enable_peer_access(device.device_id(), peer_device.device_id()) + {} + + ~enable_peer_access() noexcept(false); + + private: + + bool call_disable; + int device_id; + int peer_device_id; + }; + + // ----------------------------------------------------------------------------------- + + void inverse_norms ( + resizable_tensor& invnorms, + const tensor& data, + const double eps + ); + + void dot_prods ( + resizable_tensor& out, + const tensor& lhs, + const tensor& rhs + ); + + void dot_prods ( + bool add_to, + tensor& out, + const tensor& lhs, + const tensor& rhs + ); + + void scale_columns ( + tensor& out, + const tensor& m, + const tensor& v + ); + + void scale_rows ( + tensor& out, + const tensor& m, + const tensor& v + ); + + void scale_rows2 ( + float beta, + tensor& out, + const tensor& m1, + const tensor& m2, + const tensor& v1, + const tensor& v2 + ); + + void exp ( + tensor& dest, + const tensor& src + ); + + void log ( + tensor& dest, + const tensor& src + ); + + void log10 ( + tensor& dest, + const tensor& src + ); + + // ------------------------------------------------------------------------------------ + + void set_tensor ( + tensor& t, + float value + ); + + void scale_tensor ( + tensor& t, + float value + ); + + // ------------------------------------------------------------------------------------ + + void multiply ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + + void multiply_conv ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + + void multiply_zero_padded ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + + void scale_channels ( + bool add_to, + tensor& dest, + const tensor& src, + const tensor& scales + ); + + void add ( + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + + // 
----------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const float A, + const float B + ); + + void affine_transform( + tensor& dest, + const tensor& src, + const float A + ); + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B, + const float C + ); + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B + ); + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C, + const float D + ); + + void affine_transform_range( + size_t begin, + size_t end, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C + ); + + void affine_transform( + const rectangle& rect, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + float A, + float B, + float C + ); + + // Note that this function isn't in the tt:: namespace because add_scaled() is + // called by cuda::add() so we don't need a tt:: version of add_scaled(). + void add_scaled( + tensor& dest, + const float scale, + const tensor& src + ); + + void add_cv_to_all_columns( + float beta, + tensor& dest, + float alpha, + const tensor& src + ); + + // ----------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ); + + // ----------------------------------------------------------------------------------- + + void affine_transform_conv( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ); + + // ---------------------------------------------------------------------------------------- + + void compute_adam_update ( + size_t begin, + size_t end, + tensor& s, + tensor& m, + tensor& v, + const float t, + const float learning_rate, + const float weight_decay, + const float momentum1, + const float momentum2, + const tensor& params, + const tensor& params_grad + ); + + // ----------------------------------------------------------------------------------- + + void assign_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ); + + // ----------------------------------------------------------------------------------- + + void threshold ( + tensor& data, + float thresh + ); + + // ---------------------------------------------------------------------------------------- + + void dot ( + const tensor& a, + const tensor& b, + tensor& result, + size_t idx + ); + + // ---------------------------------------------------------------------------------------- + + void prelu ( + tensor& dest, + const tensor& src, + const tensor& param + ); + + void prelu_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input, + const tensor& param, + tensor& params_grad + ); + + + // ---------------------------------------------------------------------------------------- + + void resize_bilinear ( + tensor& dest, + long dest_row_stride, + long dest_channel_stride, + const tensor& src, + long src_row_stride, + long src_channel_stride + ); + + void resize_bilinear_gradient ( + tensor& grad, + long grad_row_stride, + long grad_channel_stride, + const tensor& gradient_input, + long gradient_input_row_stride, + long gradient_input_channel_stride + ); + + inline void resize_bilinear ( + 
tensor& dest, + const tensor& src + ) { resize_bilinear(dest, dest.nc(), dest.nr()*dest.nc(), src, src.nc(), src.nr()*src.nc()); } + + inline void resize_bilinear_gradient ( + tensor& grad, + const tensor& gradient_input + ) { resize_bilinear_gradient(grad, grad.nc(), grad.nr()*grad.nc(), gradient_input, gradient_input.nc(), gradient_input.nr()*gradient_input.nc()); } + + // ---------------------------------------------------------------------------------------- + + void copy_tensor( + bool add_to, + tensor& dest, + size_t dest_k_offset, + const tensor& src, + size_t src_k_offset, + size_t count_k + ); + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + +#else // if DLIB_USE_CUDA NOT DEFINED + + inline void set_device ( + int id + ) + { + DLIB_CASSERT(id == 0, "dlib::cuda::set_device(id) called with an invalid device id."); + } + + inline int get_device ( + ){ return 0; } + + inline int get_num_devices ( + ) { return 1; } + + inline std::string get_device_name ( + int device + ) + { + DLIB_CASSERT(device == 0, "dlib::cuda::set_device(id) called with an invalid device id."); + return "CUDA_DISABLED"; + } + + inline void set_current_device_blocking_sync( + ) {} + + + inline bool can_access_peer (int , int ) + { return false; } + inline bool can_access_peer (const tensor& , const tensor& ) + { return false; } + + inline void device_synchronize (int ){} + inline void device_synchronize (const tensor& ){} + + class enable_peer_access + { + public: + enable_peer_access() = delete; + enable_peer_access(const enable_peer_access&) = delete; + enable_peer_access& operator=(const enable_peer_access&) = delete; + enable_peer_access( int, int ){} + enable_peer_access( const tensor&, const tensor& ) {} + }; + +#endif // DLIB_USE_CUDA + + } +} + + +#endif // DLIB_DNN_CuDA_H_ + diff --git a/ml/dlib/dlib/dnn/cuda_errors.h b/ml/dlib/dlib/dnn/cuda_errors.h new file mode 100644 index 000000000..fd28693c2 --- /dev/null +++ b/ml/dlib/dlib/dnn/cuda_errors.h @@ -0,0 +1,70 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_CUDA_ERRORs_H_ +#define DLIB_CUDA_ERRORs_H_ + + +#include "../error.h" + +namespace dlib +{ + struct cuda_error : public error + { + /*! + WHAT THIS OBJECT REPRESENTS + This is the exception thrown if any calls to the NVIDIA CUDA runtime + returns an error. + !*/ + + cuda_error(const std::string& message): error(message) {} + }; + + + struct cudnn_error : public cuda_error + { + /*! + WHAT THIS OBJECT REPRESENTS + This is the exception thrown if any calls to the NVIDIA cuDNN library + returns an error. + !*/ + + cudnn_error(const std::string& message): cuda_error(message) {} + }; + + struct curand_error : public cuda_error + { + /*! + WHAT THIS OBJECT REPRESENTS + This is the exception thrown if any calls to the NVIDIA cuRAND library + returns an error. + !*/ + + curand_error(const std::string& message): cuda_error(message) {} + }; + + struct cublas_error : public cuda_error + { + /*! + WHAT THIS OBJECT REPRESENTS + This is the exception thrown if any calls to the NVIDIA cuBLAS library + returns an error. 
+ !*/ + + cublas_error(const std::string& message): cuda_error(message) {} + }; + + struct cusolver_error : public cuda_error + { + /*! + WHAT THIS OBJECT REPRESENTS + This is the exception thrown if any call to the NVIDIA cuSolver library + returns an error. + !*/ + + cusolver_error(const std::string& message): cuda_error(message) {} + }; +} + + +#endif // DLIB_CUDA_ERRORs_H_ + diff --git a/ml/dlib/dlib/dnn/cuda_utils.h b/ml/dlib/dlib/dnn/cuda_utils.h new file mode 100644 index 000000000..673a4e8ad --- /dev/null +++ b/ml/dlib/dlib/dnn/cuda_utils.h @@ -0,0 +1,413 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_CUDA_UtILS_H_ +#define DLIB_CUDA_UtILS_H_ + +#ifndef DLIB_USE_CUDA +#error "This file shouldn't be #included unless DLIB_USE_CUDA is #defined" +#endif + +#include "cuda_errors.h" +#include "../algs.h" +#include <cmath> + +#include <cuda_runtime.h> +#include <sstream> +#include <iostream> +#include <memory> +#include <vector> +#include <type_traits> + + +// Check the return value of a call to the CUDA runtime for an error condition. +#define CHECK_CUDA(call) \ +do{ \ + const cudaError_t error = call; \ + if (error != cudaSuccess) \ + { \ + std::ostringstream sout; \ + sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\ + sout << "code: " << error << ", reason: " << cudaGetErrorString(error);\ + throw dlib::cuda_error(sout.str()); \ + } \ +}while(false) + +// ---------------------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +namespace dlib +{ + namespace cuda + { + + // ------------------------------------------------------------------------------------ + + __inline__ __device__ size_t pack_idx ( + size_t dim_size3, + size_t dim_size2, + size_t dim_size1, + size_t idx4, + size_t idx3, + size_t idx2, + size_t idx1 + ) + /*! + ensures + - Converts a 4D array index into a 1D index assuming row major layout. To + understand precisely what this function does, imagine we had an array + declared like this: + int ARRAY[anything][dim_size3][dim_size2][dim_size1]; + Then we could index it like this: + ARRAY[idx4][idx3][idx2][idx1] + or equivalently like this: + ((int*)ARRAY)[pack_idx(dim_size3,dim_size2,dim_size1, idx4,idx3,idx2,idx1)] + !*/ + { + return ((idx4*dim_size3 + idx3)*dim_size2 + idx2)*dim_size1 + idx1; + } + + __inline__ __device__ void unpack_idx ( + size_t idx, + size_t dim_size3, + size_t dim_size2, + size_t dim_size1, + size_t& idx4, + size_t& idx3, + size_t& idx2, + size_t& idx1 + ) + /*! + ensures + - This function computes the inverse of pack_idx(). 
Therefore, + if PACKED == pack_idx(dim_size3,dim_size2,dim_size1, idx4,idx3,idx2,idx1) + then unpack_idx(PACKED,dim_size3,dim_size2,dim_size1, IDX4,IDX3,IDX2,IDX1) + results in: + - IDX1 == idx1 + - IDX2 == idx2 + - IDX3 == idx3 + - IDX4 == idx4 + !*/ + { + idx1 = idx%dim_size1; + + idx /= dim_size1; + idx2 = idx%dim_size2; + + idx /= dim_size2; + idx3 = idx%dim_size3; + + idx /= dim_size3; + idx4 = idx; + } + + // ------------------------------------------------------------------------------------ + + // This function is from the article: + // http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/ + __inline__ __device__ float warp_reduce_sum(float val) + { + for (int offset = warpSize/2; offset > 0; offset /= 2) +#if CUDART_VERSION >= 9000 + val += __shfl_down_sync(0xFFFFFFFF,val, offset); +#else + val += __shfl_down(val, offset); +#endif + return val; + } + + __inline__ __device__ bool is_first_thread_in_warp() + { + return (threadIdx.x & (warpSize - 1)) == 0; + } + + __inline__ __device__ void warp_reduce_atomic_add( + float& out, + float val + ) + /*! + ensures + - Atomically adds all the val variables in the current warp to out. + See this page for an extended discussion: + http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/ + !*/ + { + val = warp_reduce_sum(val); + if (is_first_thread_in_warp()) + atomicAdd(&out, val); + } + + // ------------------------------------------------------------------------------------ + + struct max_jobs + { + max_jobs(int x) : num_x(x) {} + max_jobs(int x, int y) : num_x(x), num_y(y) {} + int num_x; + int num_y = 1; + }; + + template <typename Kernel, typename... T> + void launch_kernel ( + Kernel K, + T ...args + ) + /*! + ensures + - launches the given kernel K(args...). The point of this function is to + automatically set the kernel launch parameters to something reasonable + based on the properties of the kernel and the current GPU card. + !*/ + { + int num_blocks, num_threads; + CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&num_blocks,&num_threads,K)); + K<<<num_blocks,num_threads>>>(args...); + } + + template <typename Kernel, typename... T> + void launch_kernel ( + Kernel K, + max_jobs m, + T ...args + ) + /*! + ensures + - This function is just like launch_kernel(K,args...) except that you can + additionally supply a max_jobs number that tells it how many possible + total threads could be used. This is useful when launching potentially + small jobs that might not need the number of threads suggested by + launch_kernel(). + !*/ + { + if (m.num_x == 0 || m.num_y == 0) + return; + int num_blocks, num_threads; + CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&num_blocks,&num_threads,K)); + // Check if the job is really small and we don't really need to launch a kernel + // with this many blocks and threads. + if (num_blocks*num_threads > m.num_x*m.num_y) + num_blocks = (m.num_x*m.num_y+num_threads-1)/num_threads; + + if (m.num_y == 1) + { + K<<<num_blocks,num_threads>>>(args...); + } + else + { + /* + In general, the reason m.num_y!=1 (i.e. the reason you are in this + code path) is because we are using nested grid-stride loops. There are + two important things to note about what we are doing here. To + illustrate them we will talk about this little CUDA code snippet: + + // initialize out before we begin. 
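+ // (only the threads whose global x index is 0 run the j loop below, so each out[i] is zeroed exactly once)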
+ for (auto i : grid_stride_range_y(0, nr)) + for (auto j : grid_stride_range(0, 1)) + out[i] = 0; + + __syncthreads(); // synchronize threads in block + + // loop over some 2D thing and sum and store things into out. + for (auto i : grid_stride_range_y(0, nr)) + { + float temp = 0; + for (auto j : grid_stride_range(0, nc)) + temp += whatever[i*nc+j]; + + // store the sum into out[i] + warp_reduce_atomic_add(out[i], temp); + } + + First, we make sure the number of x threads is a multiple of 32 so that + you can use warp_reduce_atomic_add() inside the y loop. + + Second, we set the number of x blocks to 1 so that inter-block + synchronization is easier to get right. For example, if the number of x + blocks wasn't 1 the above code would have a race condition in it. This is + because the execution of out[i]=0 would be done by blocks with + blockIdx.x==0, but then in the second set of loops, *all* the x blocks use + out[i]. Since __syncthreads() doesn't do any synchronization between + blocks some of the blocks might begin before the out[i]=0 statements + finished and that would be super bad. + */ + + // Try to make sure that the ratio of x to y threads is reasonable given the + // respective sizes of our loops. (E.g. if num_threads is 1024 and m.num_x is + // at least 32 times m.num_y, then ratio saturates at 32 and we get a 1024x1 + // arrangement of threads.) + int x_threads = 32; + int y_threads = num_threads/32; + const int ratio = static_cast<int>(std::round(put_in_range(1, y_threads, m.num_x/(double)m.num_y))); + x_threads *= ratio; + y_threads /= ratio; + + dim3 blocks(1,num_blocks); + dim3 threads(x_threads,y_threads); + K<<<blocks,threads>>>(args...); + } + } + + // ------------------------------------------------------------------------------------ + + class grid_stride_range + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a tool for making a for loop that loops over an entire block of + memory inside a kernel, but doing so in a way that parallelizes + appropriately across all the threads in a kernel launch. For example, + the following kernel would add the vector a to the vector b and store + the output in out (assuming all vectors are of dimension n): + __global__ void add_arrays( + const float* a, + const float* b, + float* out, + size_t n + ) + { + for (auto i : grid_stride_range(0, n)) + { + out[i] = a[i]+b[i]; + } + } + !*/ + + public: + __device__ grid_stride_range( + size_t ibegin_, + size_t iend_ + ) : + ibegin(ibegin_), + iend(iend_) + {} + + class iterator + { + public: + __device__ iterator() {} + __device__ iterator(size_t pos_) : pos(pos_) {} + + __device__ size_t operator*() const + { + return pos; + } + + __device__ iterator& operator++() + { + pos += gridDim.x * blockDim.x; + return *this; + } + + __device__ bool operator!=(const iterator& item) const + { return pos < item.pos; } + + private: + size_t pos; + }; + + __device__ iterator begin() const + { + return iterator(ibegin+blockDim.x * blockIdx.x + threadIdx.x); + } + __device__ iterator end() const + { + return iterator(iend); + } + private: + + size_t ibegin; + size_t iend; + }; + + // ------------------------------------------------------------------------------------ + + class grid_stride_range_y + { + /*! + WHAT THIS OBJECT REPRESENTS + This object is just like grid_stride_range except that it looks at + CUDA's y thread index (e.g. threadIdx.y) instead of the x index. + Therefore, if you launch a cuda kernel with a statement like: + dim3 blocks(1,10); + dim3 threads(32,32); // You need to have x and y not equal to 1 to get parallelism over both loops. 
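+ // (rows are then strided over by gridDim.y*blockDim.y == 10*32 y-threads, while columns are strided over by blockDim.x == 32 x-threads)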
+ add_arrays<<<blocks,threads>>>(a,b,out,nr,nc); + You can perform a nested 2D parallel for loop rather than doing just a + 1D for loop. + + So the code in the kernel would look like this if you wanted to add two + 2D matrices: + __global__ void add_arrays( + const float* a, + const float* b, + float* out, + size_t nr, + size_t nc + ) + { + for (auto r : grid_stride_range_y(0, nr)) + { + for (auto c : grid_stride_range(0, nc)) + { + auto i = r*nc+c; + out[i] = a[i]+b[i]; + } + } + } + !*/ + + public: + __device__ grid_stride_range_y( + size_t ibegin_, + size_t iend_ + ) : + ibegin(ibegin_), + iend(iend_) + {} + + class iterator + { + public: + __device__ iterator() {} + __device__ iterator(size_t pos_) : pos(pos_) {} + + __device__ size_t operator*() const + { + return pos; + } + + __device__ iterator& operator++() + { + pos += gridDim.y * blockDim.y; + return *this; + } + + __device__ bool operator!=(const iterator& item) const + { return pos < item.pos; } + + private: + size_t pos; + }; + + __device__ iterator begin() const + { + return iterator(ibegin+blockDim.y * blockIdx.y + threadIdx.y); + } + __device__ iterator end() const + { + return iterator(iend); + } + private: + + size_t ibegin; + size_t iend; + }; + + // ------------------------------------------------------------------------------------ + + } +} + +#endif // __CUDACC__ + +// ---------------------------------------------------------------------------------------- + +#endif // DLIB_CUDA_UtILS_H_ + diff --git a/ml/dlib/dlib/dnn/cudnn_dlibapi.cpp b/ml/dlib/dlib/dnn/cudnn_dlibapi.cpp new file mode 100644 index 000000000..6926561f1 --- /dev/null +++ b/ml/dlib/dlib/dnn/cudnn_dlibapi.cpp @@ -0,0 +1,1604 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_CuDNN_CPP_ +#define DLIB_DNN_CuDNN_CPP_ + +#ifdef DLIB_USE_CUDA + +#include "cudnn_dlibapi.h" +#include "tensor.h" +#include <cudnn.h> +#include <iostream> +#include <string> +#include <vector> +#include "cuda_utils.h" +#include "cpu_dlib.h" +#include "cuda_dlib.h" +#include "tensor_tools.h" + +static const char* cudnn_get_error_string(cudnnStatus_t s) +{ + switch(s) + { + case CUDNN_STATUS_NOT_INITIALIZED: + return "CUDA Runtime API initialization failed."; + case CUDNN_STATUS_ALLOC_FAILED: + return "CUDA Resources could not be allocated."; + case CUDNN_STATUS_BAD_PARAM: + return "CUDNN_STATUS_BAD_PARAM"; + case CUDNN_STATUS_EXECUTION_FAILED: + return "CUDNN_STATUS_EXECUTION_FAILED"; + case CUDNN_STATUS_NOT_SUPPORTED: + return "CUDNN_STATUS_NOT_SUPPORTED"; + case CUDNN_STATUS_ARCH_MISMATCH: + return "CUDNN_STATUS_ARCH_MISMATCH: Your GPU is too old and not supported by cuDNN"; + default: + return "A call to cuDNN failed"; + } +} + +// Check the return value of a call to the cuDNN runtime for an error condition. +#define CHECK_CUDNN(call) \ +do{ \ + const cudnnStatus_t error = call; \ + if (error != CUDNN_STATUS_SUCCESS) \ + { \ + std::ostringstream sout; \ + sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". 
";\ + sout << "code: " << error << ", reason: " << cudnn_get_error_string(error);\ + throw dlib::cudnn_error(sout.str()); \ + } \ +}while(false) + + +namespace dlib +{ + + namespace cuda + { + + // ------------------------------------------------------------------------------------ + + static cudnnTensorDescriptor_t descriptor(const tensor& t) + { + return (const cudnnTensorDescriptor_t)t.get_cudnn_tensor_descriptor().get_handle(); + } + static cudnnTensorDescriptor_t descriptor(const tensor_descriptor& t) + { + return (const cudnnTensorDescriptor_t)t.get_handle(); + } + + // ------------------------------------------------------------------------------------ + + class cudnn_context + { + public: + // not copyable + cudnn_context(const cudnn_context&) = delete; + cudnn_context& operator=(const cudnn_context&) = delete; + + cudnn_context() + { + handles.resize(16); + } + ~cudnn_context() + { + for (auto h : handles) + { + if (h) + cudnnDestroy(h); + } + } + + cudnnHandle_t get_handle ( + ) + { + int new_device_id; + CHECK_CUDA(cudaGetDevice(&new_device_id)); + // make room for more devices if needed + if (new_device_id >= (long)handles.size()) + handles.resize(new_device_id+16); + + // If we don't have a handle already for this device then make one + if (!handles[new_device_id]) + CHECK_CUDNN(cudnnCreate(&handles[new_device_id])); + + // Finally, return the handle for the current device + return handles[new_device_id]; + } + + private: + + std::vector<cudnnHandle_t> handles; + }; + + static cudnnHandle_t context() + { + thread_local cudnn_context c; + return c.get_handle(); + } + // ------------------------------------------------------------------------------------ + + class cudnn_device_buffer + { + public: + // not copyable + cudnn_device_buffer(const cudnn_device_buffer&) = delete; + cudnn_device_buffer& operator=(const cudnn_device_buffer&) = delete; + + cudnn_device_buffer() + { + buffers.resize(16); + } + ~cudnn_device_buffer() + { + } + + std::shared_ptr<resizable_cuda_buffer> get_buffer ( + ) + { + int new_device_id; + CHECK_CUDA(cudaGetDevice(&new_device_id)); + // make room for more devices if needed + if (new_device_id >= (long)buffers.size()) + buffers.resize(new_device_id+16); + + // If we don't have a buffer already for this device then make one + std::shared_ptr<resizable_cuda_buffer> buff = buffers[new_device_id].lock(); + if (!buff) + { + buff = std::make_shared<resizable_cuda_buffer>(); + buffers[new_device_id] = buff; + } + + // Finally, return the buffer for the current device + return buff; + } + + private: + + std::vector<std::weak_ptr<resizable_cuda_buffer>> buffers; + }; + + + static std::shared_ptr<resizable_cuda_buffer> device_global_buffer() + { + thread_local cudnn_device_buffer buffer; + return buffer.get_buffer(); + } + // ------------------------------------------------------------------------------------ + + class cudnn_activation_descriptor + { + public: + // not copyable + cudnn_activation_descriptor(const cudnn_activation_descriptor&) = delete; + cudnn_activation_descriptor& operator=(const cudnn_activation_descriptor&) = delete; + + cudnn_activation_descriptor( + cudnnActivationMode_t mode, + cudnnNanPropagation_t reluNanOpt, + double reluCeiling + ) + { + CHECK_CUDNN(cudnnCreateActivationDescriptor(&handle)); + CHECK_CUDNN(cudnnSetActivationDescriptor(handle, mode, reluNanOpt, reluCeiling)); + } + + ~cudnn_activation_descriptor() + { + cudnnDestroyActivationDescriptor(handle); + } + + cudnnActivationDescriptor_t get_handle ( + ) + { + return handle; + } 
+ private: + cudnnActivationDescriptor_t handle; + }; + + static cudnnActivationDescriptor_t relu_activation_descriptor() + { + thread_local cudnn_activation_descriptor des(CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN,0); + return des.get_handle(); + } + + static cudnnActivationDescriptor_t sigmoid_activation_descriptor() + { + thread_local cudnn_activation_descriptor des(CUDNN_ACTIVATION_SIGMOID, CUDNN_PROPAGATE_NAN,0); + return des.get_handle(); + } + + static cudnnActivationDescriptor_t tanh_activation_descriptor() + { + thread_local cudnn_activation_descriptor des(CUDNN_ACTIVATION_TANH, CUDNN_PROPAGATE_NAN,0); + return des.get_handle(); + } + + // ------------------------------------------------------------------------------------ + + tensor_descriptor:: + tensor_descriptor( + ) : handle(nullptr) + { + } + + tensor_descriptor:: + ~tensor_descriptor() + { + set_size(0,0,0,0); + } + + void tensor_descriptor:: + set_size( + int n, + int k, + int nr, + int nc + ) + { + if (handle) + { + cudnnDestroyTensorDescriptor((cudnnTensorDescriptor_t)handle); + handle = nullptr; + } + + if (n != 0 && nr != 0 && nc != 0 && k != 0) + { + cudnnTensorDescriptor_t h; + CHECK_CUDNN(cudnnCreateTensorDescriptor(&h)); + handle = h; + + CHECK_CUDNN(cudnnSetTensor4dDescriptor((cudnnTensorDescriptor_t)handle, + CUDNN_TENSOR_NCHW, + CUDNN_DATA_FLOAT, + n, + k, + nr, + nc)); + } + } + + void tensor_descriptor:: + get_size ( + int& n, + int& k, + int& nr, + int& nc + ) const + { + if (handle) + { + int nStride, cStride, hStride, wStride; + cudnnDataType_t datatype; + CHECK_CUDNN(cudnnGetTensor4dDescriptor((cudnnTensorDescriptor_t)handle, + &datatype, + &n, + &k, + &nr, + &nc, + &nStride, + &cStride, + &hStride, + &wStride)); + } + else + { + n = 0; + k = 0; + nr = 0; + nc = 0; + } + } + + // ------------------------------------------------------------------------------------ + + void add( + float beta, + tensor& dest, + float alpha, + const tensor& src + ) + { + DLIB_CASSERT( + (have_same_dimensions(src, dest) || + (src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1) || + (src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc()) || + (src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc()) || + (src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1)) && + is_same_object(src,dest) == false , + "\n\t dest.num_samples(): " << dest.num_samples() + <<"\n\t dest.k(): " << dest.k() + <<"\n\t dest.nr(): " << dest.nr() + <<"\n\t dest.nc(): " << dest.nc() + <<"\n\t src.num_samples(): " << src.num_samples() + <<"\n\t src.k(): " << src.k() + <<"\n\t src.nr(): " << src.nr() + <<"\n\t src.nc(): " << src.nc() + ); + + if (dest.size() == src.size() && beta == 1) + { + // Call the dlib function in this case since it's faster than the one that + // comes with cuDNN (at least as of cuDNN v4). 
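+ // add_scaled() computes dest += alpha*src, which is exactly the beta == 1 case.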
+ add_scaled(dest, alpha, src); + return; + } + else if (src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1) + { + add_cv_to_all_columns(beta, dest, alpha, src); + return; + } + + CHECK_CUDNN(cudnnAddTensor(context(), + &alpha, + descriptor(src), + src.device(), + &beta, + descriptor(dest), + dest.device())); + } + + void assign_conv_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + grad.num_samples() == 1 && + grad.k() >= 1 && + grad.nr() == 1 && + grad.nc() == 1 && + gradient_input.k() == grad.k() && + gradient_input.size() > 0 && + is_same_object(grad,gradient_input) == false + ); + + const float alpha = 1; + const float beta = 0; + CHECK_CUDNN(cudnnConvolutionBackwardBias(context(), + &alpha, + descriptor(gradient_input), + gradient_input.device(), + &beta, + descriptor(grad), + grad.device())); + } + + // ------------------------------------------------------------------------------------ + + void batch_normalize_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ) + { + DLIB_CASSERT( + gamma.num_samples() == 1 && + gamma.nr() == src.nr() && + gamma.nc() == src.nc() && + gamma.k() == src.k() && + have_same_dimensions(gamma, beta) && + have_same_dimensions(gamma, running_means) && + have_same_dimensions(gamma, running_variances) && + eps > 0, + "\ngamma.num_samples(): " << gamma.num_samples() << + "\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\nbeta.num_samples(): " << beta.num_samples() << + "\nbeta.k(): " << beta.k() << + "\nbeta.nr(): " << beta.nr() << + "\nbeta.nc(): " << beta.nc() << + "\nrunning_means.num_samples(): " << running_means.num_samples() << + "\nrunning_means.k(): " << running_means.k() << + "\nrunning_means.nr(): " << running_means.nr() << + "\nrunning_means.nc(): " << running_means.nc() << + "\nrunning_variances.num_samples(): " << running_variances.num_samples() << + "\nrunning_variances.k(): " << running_variances.k() << + "\nrunning_variances.nr(): " << running_variances.nr() << + "\nrunning_variances.nc(): " << running_variances.nc() << + "\nsrc.k(): " << src.k() << + "\nsrc.nr(): " << src.nr() << + "\nsrc.nc(): " << src.nc() << + "\neps: " << eps + ); + const float in_scale = 1; + const float out_scale = 0; + + dest.copy_size(src); + + CHECK_CUDNN(cudnnBatchNormalizationForwardInference( + context(), + CUDNN_BATCHNORM_PER_ACTIVATION, + &in_scale, + &out_scale, + descriptor(src), + src.device(), + descriptor(dest), + dest.device(), + descriptor(gamma), + gamma.device(), + beta.device(), + running_means.device(), + running_variances.device(), + eps)); + } + + void batch_normalize ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& invstds, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ) + { + DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor); + DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means)); + DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds)); + DLIB_CASSERT( + src.num_samples() > 1 && + gamma.num_samples() == 1 && + beta.num_samples() == 1 && + gamma.nr() == beta.nr() && beta.nr() == src.nr() && + gamma.nc() == 
beta.nc() && beta.nc() == src.nc() && + gamma.k() == beta.k() && beta.k() == src.k() && + eps > 0, + "\ngamma.num_samples(): " << gamma.num_samples() << + "\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\nbeta.num_samples(): " << beta.num_samples() << + "\nbeta.k(): " << beta.k() << + "\nbeta.nr(): " << beta.nr() << + "\nbeta.nc(): " << beta.nc() << + "\nsrc.k(): " << src.k() << + "\nsrc.nr(): " << src.nr() << + "\nsrc.nc(): " << src.nc() << + "\neps: " << eps + ); + + const float in_scale = 1; + const float out_scale = 0; + + dest.copy_size(src); + means.set_size(1, src.k(), src.nr(), src.nc()); + invstds.copy_size(means); + running_means.copy_size(means); + running_variances.copy_size(means); + // cuDNN requires that running_means and running_variances be initialized to + // some valid float values even if the averaging factor would have ignored + // them. + if (averaging_factor == 1) + { + running_means = 0; + running_variances = 1; + } + + CHECK_CUDNN(cudnnBatchNormalizationForwardTraining( + context(), + CUDNN_BATCHNORM_PER_ACTIVATION, + &in_scale, + &out_scale, + descriptor(src), + src.device(), + descriptor(dest), + dest.device(), + descriptor(gamma), + gamma.device(), + beta.device(), + averaging_factor, + running_means.device(), + running_variances.device(), + eps, + means.device(), + invstds.device())); + } + + void batch_normalize_gradient( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ) + { + const long num = src.k()*src.nr()*src.nc(); + DLIB_CASSERT(src.num_samples() > 1); + DLIB_CASSERT(num == (long)means.size()); + DLIB_CASSERT(num == (long)invstds.size()); + DLIB_CASSERT(num == (long)gamma.size()); + DLIB_CASSERT(num == (long)gamma_grad.size()); + DLIB_CASSERT(num == (long)beta_grad.size()); + DLIB_CASSERT(have_same_dimensions(gradient_input, src)); + DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad)); + DLIB_CASSERT(eps > 0); + + const float in_scale = 1; + const float out_scale = 1; + const float in_scale_params = 1; + const float out_scale_params = 0; + + CHECK_CUDNN(cudnnBatchNormalizationBackward( + context(), + CUDNN_BATCHNORM_PER_ACTIVATION, + &in_scale, + &out_scale, + &in_scale_params, + &out_scale_params, + descriptor(src), + src.device(), + descriptor(gradient_input), + gradient_input.device(), + descriptor(src_grad), + src_grad.device(), + descriptor(gamma), + gamma.device(), + gamma_grad.device(), + beta_grad.device(), + eps, + means.device(), + invstds.device())); + } + + // ------------------------------------------------------------------------------------ + + void batch_normalize_conv_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ) + { + DLIB_CASSERT( + gamma.num_samples() == 1 && + gamma.nr() == 1 && + gamma.nc() == 1 && + gamma.k() == src.k() && + have_same_dimensions(gamma, beta) && + have_same_dimensions(gamma, running_means) && + have_same_dimensions(gamma, running_variances) && + eps > 0, + "\ngamma.num_samples(): " << gamma.num_samples() << + "\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\nbeta.num_samples(): " << beta.num_samples() << + "\nbeta.k(): " << beta.k() << + "\nbeta.nr(): " << beta.nr() << + "\nbeta.nc(): " << 
beta.nc() << + "\nrunning_means.num_samples(): " << running_means.num_samples() << + "\nrunning_means.k(): " << running_means.k() << + "\nrunning_means.nr(): " << running_means.nr() << + "\nrunning_means.nc(): " << running_means.nc() << + "\nrunning_variances.num_samples(): " << running_variances.num_samples() << + "\nrunning_variances.k(): " << running_variances.k() << + "\nrunning_variances.nr(): " << running_variances.nr() << + "\nrunning_variances.nc(): " << running_variances.nc() << + "\nsrc.k(): " << src.k() << + "\nsrc.nr(): " << src.nr() << + "\nsrc.nc(): " << src.nc() << + "\neps: " << eps + ); + const float in_scale = 1; + const float out_scale = 0; + + dest.copy_size(src); + + CHECK_CUDNN(cudnnBatchNormalizationForwardInference( + context(), + CUDNN_BATCHNORM_SPATIAL, + &in_scale, + &out_scale, + descriptor(src), + src.device(), + descriptor(dest), + dest.device(), + descriptor(gamma), + gamma.device(), + beta.device(), + running_means.device(), + running_variances.device(), + eps)); + } + + void batch_normalize_conv ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& invstds, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ) + { + DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor); + DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means)); + DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds)); + DLIB_CASSERT( + src.num_samples() > 1 && + gamma.num_samples() == 1 && + beta.num_samples() == 1 && + gamma.nr() == 1 && + beta.nr() == 1 && + gamma.nc() == 1 && + beta.nc() == 1 && + gamma.k() == beta.k() && beta.k() == src.k() && + eps > 0, + "\ngamma.num_samples(): " << gamma.num_samples() << + "\ngamma.k(): " << gamma.k() << + "\ngamma.nr(): " << gamma.nr() << + "\ngamma.nc(): " << gamma.nc() << + "\nbeta.num_samples(): " << beta.num_samples() << + "\nbeta.k(): " << beta.k() << + "\nbeta.nr(): " << beta.nr() << + "\nbeta.nc(): " << beta.nc() << + "\nsrc.k(): " << src.k() << + "\nsrc.nr(): " << src.nr() << + "\nsrc.nc(): " << src.nc() << + "\neps: " << eps + ); + const float in_scale = 1; + const float out_scale = 0; + + dest.copy_size(src); + means.set_size(1, src.k()); + invstds.copy_size(means); + running_means.copy_size(means); + running_variances.copy_size(means); + // cuDNN requires that running_means and running_variances be initialized to + // some valid float values even if the averaging factor would have ignored + // them. 
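+ // (when averaging_factor == 1 the running statistics are completely overwritten, so 0 and 1 are just convenient placeholder values)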
+ if (averaging_factor == 1) + { + running_means = 0; + running_variances = 1; + } + + CHECK_CUDNN(cudnnBatchNormalizationForwardTraining( + context(), + CUDNN_BATCHNORM_SPATIAL, + &in_scale, + &out_scale, + descriptor(src), + src.device(), + descriptor(dest), + dest.device(), + descriptor(gamma), + gamma.device(), + beta.device(), + averaging_factor, + running_means.device(), + running_variances.device(), + eps, + means.device(), + invstds.device())); + } + + void batch_normalize_conv_gradient( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ) + { + DLIB_CASSERT(src.k() == (long)means.size()); + DLIB_CASSERT(src.k() == (long)invstds.size()); + DLIB_CASSERT(src.k() == (long)gamma.size()); + DLIB_CASSERT(src.k() == (long)gamma_grad.size()); + DLIB_CASSERT(src.k() == (long)beta_grad.size()); + DLIB_CASSERT(have_same_dimensions(gradient_input, src)); + DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad)); + DLIB_CASSERT(eps > 0); + + const float in_scale = 1; + const float out_scale = 1; + const float in_scale_params = 1; + const float out_scale_params = 0; + + CHECK_CUDNN(cudnnBatchNormalizationBackward( + context(), + CUDNN_BATCHNORM_SPATIAL, + &in_scale, + &out_scale, + &in_scale_params, + &out_scale_params, + descriptor(src), + src.device(), + descriptor(gradient_input), + gradient_input.device(), + descriptor(src_grad), + src_grad.device(), + descriptor(gamma), + gamma.device(), + gamma_grad.device(), + beta_grad.device(), + eps, + means.device(), + invstds.device())); + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + tensor_conv:: + tensor_conv( + ) : + filter_handle(nullptr), + conv_handle(nullptr), + forward_algo(0), + backward_data_algo(0), + backward_filters_algo(0) + { + clear(); + } + + void tensor_conv:: + clear ( + ) + { + if (filter_handle) + cudnnDestroyFilterDescriptor((cudnnFilterDescriptor_t)filter_handle); + if (conv_handle) + cudnnDestroyConvolutionDescriptor((cudnnConvolutionDescriptor_t)conv_handle); + filter_handle = nullptr; + conv_handle = nullptr; + out_num_samples = 0; + out_k = 0; + out_nr = 0; + out_nc = 0; + + stride_y = 0; + stride_x = 0; + padding_y = 0; + padding_x = 0; + data_num_samples = 0; + data_k = 0; + data_nr = 0; + data_nc = 0; + filters_num_samples = 0; + filters_k = 0; + filters_nr = 0; + filters_nc = 0; + + forward_algo = 0; + backward_data_algo = 0; + backward_filters_algo = 0; + + forward_workspace_size_in_bytes = 0; + backward_data_workspace_size_in_bytes = 0; + backward_filters_workspace_size_in_bytes = 0; + + forward_workspace.reset(); + backward_data_workspace.reset(); + backward_filters_workspace.reset(); + workspace.reset(); + } + + void tensor_conv:: + setup( + const tensor& data, + const tensor& filters, + int stride_y_, + int stride_x_, + int padding_y_, + int padding_x_ + ) + { + DLIB_CASSERT(data.k() == filters.k()); + + // if the last call to setup gave the same exact settings then don't do + // anything. 
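+ // Recreating the cuDNN descriptors and re-running the algorithm selection below is relatively expensive, so we cache the last settings and skip the work when they repeat.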
+ if (stride_y_ == stride_y && + stride_x_ == stride_x && + padding_y_ == padding_y && + padding_x_ == padding_x && + data_num_samples == data.num_samples() && + data_k == data.k() && + data_nr == data.nr() && + data_nc == data.nc() && + filters_num_samples == filters.num_samples() && + filters_k == filters.k() && + filters_nr == filters.nr() && + filters_nc == filters.nc()) + { + return; + } + + clear(); + try + { + stride_y = stride_y_; + stride_x = stride_x_; + padding_y = padding_y_; + padding_x = padding_x_; + data_num_samples = data.num_samples(); + data_k = data.k(); + data_nr = data.nr(); + data_nc = data.nc(); + filters_num_samples = filters.num_samples(); + filters_k = filters.k(); + filters_nr = filters.nr(); + filters_nc = filters.nc(); + + CHECK_CUDNN(cudnnCreateFilterDescriptor((cudnnFilterDescriptor_t*)&filter_handle)); + CHECK_CUDNN(cudnnSetFilter4dDescriptor((cudnnFilterDescriptor_t)filter_handle, + CUDNN_DATA_FLOAT, + CUDNN_TENSOR_NCHW, + filters.num_samples(), + filters.k(), + filters.nr(), + filters.nc())); + + CHECK_CUDNN(cudnnCreateConvolutionDescriptor((cudnnConvolutionDescriptor_t*)&conv_handle)); +#if CUDNN_MAJOR >= 6 + CHECK_CUDNN(cudnnSetConvolution2dDescriptor((cudnnConvolutionDescriptor_t)conv_handle, + padding_y, // vertical padding + padding_x, // horizontal padding + stride_y, + stride_x, + 1, 1, // must be 1,1 + CUDNN_CROSS_CORRELATION, + CUDNN_DATA_FLOAT)); // could also be CUDNN_CONVOLUTION +#else + CHECK_CUDNN(cudnnSetConvolution2dDescriptor((cudnnConvolutionDescriptor_t)conv_handle, + padding_y, // vertical padding + padding_x, // horizontal padding + stride_y, + stride_x, + 1, 1, // must be 1,1 + CUDNN_CROSS_CORRELATION)); // could also be CUDNN_CONVOLUTION +#endif + + CHECK_CUDNN(cudnnGetConvolution2dForwardOutputDim( + (const cudnnConvolutionDescriptor_t)conv_handle, + descriptor(data), + (const cudnnFilterDescriptor_t)filter_handle, + &out_num_samples, + &out_k, + &out_nr, + &out_nc)); + + tensor_descriptor dest_desc; + dest_desc.set_size(out_num_samples,out_k,out_nr,out_nc); + + // Pick which forward algorithm we will use and allocate the necessary + // workspace buffer. + cudnnConvolutionFwdAlgo_t forward_best_algo; + CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithm( + context(), + descriptor(data), + (const cudnnFilterDescriptor_t)filter_handle, + (const cudnnConvolutionDescriptor_t)conv_handle, + descriptor(dest_desc), + dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_FWD_PREFER_FASTEST:CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, + std::numeric_limits<size_t>::max(), + &forward_best_algo)); + forward_algo = forward_best_algo; + CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize( + context(), + descriptor(data), + (const cudnnFilterDescriptor_t)filter_handle, + (const cudnnConvolutionDescriptor_t)conv_handle, + descriptor(dest_desc), + forward_best_algo, + &forward_workspace_size_in_bytes)); + + // Pick which backward data algorithm we will use and allocate the + // necessary workspace buffer. 
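+ // As with the forward algorithm, we ask for the fastest method when dnn_prefer_fastest_algorithms() is true and otherwise for one that needs no workspace.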
+ cudnnConvolutionBwdDataAlgo_t backward_data_best_algo; + CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithm( + context(), + (const cudnnFilterDescriptor_t)filter_handle, + descriptor(dest_desc), + (const cudnnConvolutionDescriptor_t)conv_handle, + descriptor(data), + dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST:CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE, + std::numeric_limits<size_t>::max(), + &backward_data_best_algo)); + backward_data_algo = backward_data_best_algo; + + CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize( + context(), + (const cudnnFilterDescriptor_t)filter_handle, + descriptor(dest_desc), + (const cudnnConvolutionDescriptor_t)conv_handle, + descriptor(data), + backward_data_best_algo, + &backward_data_workspace_size_in_bytes)); + + // Pick which backward filters algorithm we will use and allocate the + // necessary workspace buffer. + cudnnConvolutionBwdFilterAlgo_t backward_filters_best_algo; + CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithm( + context(), + descriptor(data), + descriptor(dest_desc), + (const cudnnConvolutionDescriptor_t)conv_handle, + (const cudnnFilterDescriptor_t)filter_handle, + dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST:CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE, + std::numeric_limits<size_t>::max(), + &backward_filters_best_algo)); + // cuDNN 5.1 has a bug that causes + // cudnnGetConvolutionBackwardFilterAlgorithm() to pick the winograd + // algorithm even for cases where cuDNN doesn't support it, leading to + // incorrect outputs. So here we check if we are in a case where winograd + // isn't supported and manually overrule + // cudnnGetConvolutionBackwardFilterAlgorithm() by picking a safe + // algorithm. + if (dnn_prefer_fastest_algorithms() && + !(stride_x == 1 && stride_y == 1 && ((filters_nr==3&&filters_nc==3) || (filters_nr==5&&filters_nc==5))) + ) + { + backward_filters_best_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; + } + backward_filters_algo = backward_filters_best_algo; + + CHECK_CUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize( + context(), + descriptor(data), + descriptor(dest_desc), + (const cudnnConvolutionDescriptor_t)conv_handle, + (const cudnnFilterDescriptor_t)filter_handle, + backward_filters_best_algo, + &backward_filters_workspace_size_in_bytes)); + + workspace = device_global_buffer(); + } + catch(...) + { + clear(); + throw; + } + } + + tensor_conv:: + ~tensor_conv ( + ) + { + clear(); + } + + void tensor_conv::operator() ( + const bool add_to_output, + resizable_tensor& output, + const tensor& data, + const tensor& filters + ) + { + DLIB_CASSERT(stride_y > 0 && stride_x > 0, "You must call setup() before calling this function"); + + output.set_size(out_num_samples, out_k, out_nr, out_nc); + (*this)(add_to_output, static_cast<tensor&>(output), data, filters); + } + + void tensor_conv::operator() ( + const bool add_to_output, + tensor& output, + const tensor& data, + const tensor& filters + ) + { + DLIB_CASSERT(is_same_object(output,data) == false); + DLIB_CASSERT(is_same_object(output,filters) == false); + DLIB_CASSERT(filters.k() == data.k()); + DLIB_CASSERT(stride_y > 0 && stride_x > 0, "You must call setup() before calling this function"); + DLIB_CASSERT(filters.nc() <= data.nc() + 2*padding_x, + "Filter windows must be small enough to fit into the padded image." 
+ << "\n\t filters.nc(): " << filters.nc() + << "\n\t data.nc(): " << data.nc() + << "\n\t padding_x: " << padding_x + ); + DLIB_CASSERT(filters.nr() <= data.nr() + 2*padding_y, + "Filter windows must be small enough to fit into the padded image." + << "\n\t filters.nr(): " << filters.nr() + << "\n\t data.nr(): " << data.nr() + << "\n\t padding_y: " << padding_y + ); + + + DLIB_CASSERT(output.num_samples() == data.num_samples(),out_num_samples << " " << data.num_samples()); + DLIB_CASSERT(output.k() == filters.num_samples()); + DLIB_CASSERT(output.nr() == 1+(data.nr()+2*padding_y-filters.nr())/stride_y); + DLIB_CASSERT(output.nc() == 1+(data.nc()+2*padding_x-filters.nc())/stride_x); + + + + const float alpha = 1; + const float beta = add_to_output ? 1 : 0; + + // Since cudnnConvolutionForward() is an asynchronous call, we need to hold a + // reference to the workspace buffer so we can be sure it isn't reallocated + // while the function is still executing on the device. But each time we come + // here, we make sure to grab the latest workspace buffer so that, globally, we + // minimize the number of such buffers. + forward_workspace = workspace->get(forward_workspace_size_in_bytes); + + CHECK_CUDNN(cudnnConvolutionForward( + context(), + &alpha, + descriptor(data), + data.device(), + (const cudnnFilterDescriptor_t)filter_handle, + filters.device(), + (const cudnnConvolutionDescriptor_t)conv_handle, + (cudnnConvolutionFwdAlgo_t)forward_algo, + forward_workspace, + forward_workspace_size_in_bytes, + &beta, + descriptor(output), + output.device())); + } + + void tensor_conv::get_gradient_for_data ( + const bool add_to_output, + const tensor& gradient_input, + const tensor& filters, + tensor& data_gradient + ) + { + const float alpha = 1; + const float beta = add_to_output ? 1 : 0; + + // Since cudnnConvolutionBackwardData() is an asynchronous call, we need to hold a + // reference to the workspace buffer so we can be sure it isn't reallocated + // while the function is still executing on the device. But each time we come + // here, we make sure to grab the latest workspace buffer so that, globally, we + // minimize the number of such buffers. + backward_data_workspace = workspace->get(backward_data_workspace_size_in_bytes); + + + CHECK_CUDNN(cudnnConvolutionBackwardData(context(), + &alpha, + (const cudnnFilterDescriptor_t)filter_handle, + filters.device(), + descriptor(gradient_input), + gradient_input.device(), + (const cudnnConvolutionDescriptor_t)conv_handle, + (cudnnConvolutionBwdDataAlgo_t)backward_data_algo, + backward_data_workspace, + backward_data_workspace_size_in_bytes, + &beta, + descriptor(data_gradient), + data_gradient.device())); + } + + void tensor_conv:: + get_gradient_for_filters ( + const bool add_to_output, + const tensor& gradient_input, + const tensor& data, + tensor& filters_gradient + ) + { + const float alpha = 1; + const float beta = add_to_output ? 1 : 0; + + // Since cudnnConvolutionBackwardFilter() is an asynchronous call, we need to hold a + // reference to the workspace buffer so we can be sure it isn't reallocated + // while the function is still executing on the device. But each time we come + // here, we make sure to grab the latest workspace buffer so that, globally, we + // minimize the number of such buffers. 
+ backward_filters_workspace = workspace->get(backward_filters_workspace_size_in_bytes); + + CHECK_CUDNN(cudnnConvolutionBackwardFilter(context(), + &alpha, + descriptor(data), + data.device(), + descriptor(gradient_input), + gradient_input.device(), + (const cudnnConvolutionDescriptor_t)conv_handle, + (cudnnConvolutionBwdFilterAlgo_t)backward_filters_algo, + backward_filters_workspace, + backward_filters_workspace_size_in_bytes, + &beta, + (const cudnnFilterDescriptor_t)filter_handle, + filters_gradient.device())); + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + pooling::pooling ( + ) : handle(nullptr),window_height(0),window_width(0),stride_y(0),stride_x(0),padding_y(0), padding_x(0) + { + } + + pooling::~pooling( + ) + { + clear(); + } + + void pooling:: + clear( + ) + { + if (handle) + cudnnDestroyPoolingDescriptor((cudnnPoolingDescriptor_t)handle); + handle = nullptr; + window_height = 0; + window_width = 0; + stride_y = 0; + stride_x = 0; + padding_y = 0; + padding_x = 0; + } + + void pooling:: + setup_max_pooling( + int window_height_, + int window_width_, + int stride_y_, + int stride_x_, + int padding_y_, + int padding_x_ + ) + { + setup(window_height_, window_width_, stride_y_, stride_x_, padding_y_, padding_x_, CUDNN_POOLING_MAX); + do_max_pooling = true; + } + + void pooling:: + setup_avg_pooling( + int window_height_, + int window_width_, + int stride_y_, + int stride_x_, + int padding_y_, + int padding_x_ + ) + { + setup(window_height_, window_width_, stride_y_, stride_x_, padding_y_, padding_x_, CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING); + do_max_pooling = false; + } + + void pooling:: + setup( + int window_height_, + int window_width_, + int stride_y_, + int stride_x_, + int padding_y_, + int padding_x_, + int pooling_mode + ) + { + DLIB_CASSERT (window_height_ > 0 && window_width_ > 0 && + stride_y_ > 0 && stride_x_ > 0 , + "window_height_: " << window_height_ + << "\t\n window_width_: " << window_width_ + << "\t\n stride_y_: " << stride_y_ + << "\t\n stride_x_: " << stride_x_ ); + DLIB_CASSERT( 0 <= padding_y_ && padding_y_ < window_height_ && + 0 <= padding_x_ && padding_x_ < window_width_, + "window_height_: " << window_height_ + << "\t\n window_width_: " << window_width_ + << "\t\n padding_y_: " << padding_y_ + << "\t\n padding_x_: " << padding_x_ ); + + if (window_height == window_height_ && + window_width == window_width_ && + stride_y == stride_y_ && + stride_x == stride_x_ && + padding_y == padding_y_ && + padding_x == padding_x_ + ) + { + return; + } + + clear(); + try + { + window_height = window_height_; + window_width = window_width_; + stride_x = stride_x_; + stride_y = stride_y_; + padding_y = padding_y_; + padding_x = padding_x_; + cudnnPoolingDescriptor_t poolingDesc; + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&poolingDesc)); + handle = poolingDesc; + + CHECK_CUDNN(cudnnSetPooling2dDescriptor(poolingDesc, + (cudnnPoolingMode_t)pooling_mode, + CUDNN_PROPAGATE_NAN, + window_height, + window_width, + padding_y, + padding_x, + stride_y, + stride_x)); + } + catch(...) + { + clear(); + throw; + } + } + + void pooling:: + operator() ( + resizable_tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(window_width <= src.nc() + 2*padding_x, + "Pooling windows must be small enough to fit into the padded image." 
+ << "\n\t window_width: " << window_width + << "\n\t src.nc(): " << src.nc() + << "\n\t padding_x: " << padding_x + ); + DLIB_CASSERT(window_height <= src.nr() + 2*padding_y, + "Pooling windows must be small enough to fit into the padded image." + << "\n\t window_height: " << window_height + << "\n\t src.nr(): " << src.nr() + << "\n\t padding_y: " << padding_y + ); + const float alpha = 1; + const float beta = 0; + int outN; + int outC; + int outH; + int outW; + CHECK_CUDNN(cudnnGetPooling2dForwardOutputDim((const cudnnPoolingDescriptor_t)handle, + descriptor(src), + &outN, + &outC, + &outH, + &outW)); + + + dest.set_size(outN,outC,outH,outW); + + DLIB_CASSERT(dest.num_samples() == src.num_samples()); + DLIB_CASSERT(dest.k() == src.k()); + DLIB_CASSERT(dest.nr() == 1 + (src.nr() + 2*padding_y - window_height)/stride_y, + "\n stride_y: " << stride_y << + "\n padding_y: " << padding_y << + "\n window_height: " << window_height << + "\n src.nr(): " << src.nr() << + "\n dest.nr(): " << dest.nr() << + "\n src.nr()/stride_y: " << src.nr()/stride_y); + DLIB_CASSERT(dest.nc() == 1 + (src.nc() + 2*padding_x - window_width)/stride_x, + "\n stride_x: " << stride_x << + "\n padding_x: " << padding_x << + "\n window_width: " << window_width << + "\n src.nc(): " << src.nc() << + "\n dest.nc(): " << dest.nc() << + "\n src.nc()/stride_x: " << src.nc()/stride_x); + + CHECK_CUDNN(cudnnPoolingForward(context(), + (const cudnnPoolingDescriptor_t)handle, + &alpha, + descriptor(src), + src.device(), + &beta, + descriptor(dest), + dest.device())); + } + + void pooling::get_gradient( + const tensor& gradient_input, + const tensor& dest, + const tensor& src, + tensor& grad + ) + { + DLIB_CASSERT(have_same_dimensions(gradient_input,dest)); + DLIB_CASSERT(have_same_dimensions(src,grad)); + + const float alpha = 1; + const float beta = 1; + CHECK_CUDNN(cudnnPoolingBackward(context(), + (const cudnnPoolingDescriptor_t)handle, + &alpha, + descriptor(dest), + dest.device(), + descriptor(gradient_input), + gradient_input.device(), + descriptor(src), + src.device(), + &beta, + descriptor(grad), + grad.device())); + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + void softmax ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + if (src.size() == 0) + return; + + const float alpha = 1; + const float beta = 0; + + CHECK_CUDNN(cudnnSoftmaxForward(context(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + descriptor(src), + src.device(), + &beta, + descriptor(dest), + dest.device())); + } + + + void softmax_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + have_same_dimensions(dest,gradient_input) == true && + have_same_dimensions(dest,grad) == true ); + if (dest.size() == 0) + return; + + const float alpha = 1; + const float beta = is_same_object(grad,gradient_input) ? 
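/* overwrite grad when computing in-place, otherwise accumulate into it; the other *_gradient functions below use the same pattern */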
0 : 1; + CHECK_CUDNN(cudnnSoftmaxBackward(context(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + descriptor(dest), + dest.device(), + descriptor(gradient_input), + gradient_input.device(), + &beta, + descriptor(grad), + grad.device())); + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + void softmax_all ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + if (src.size() == 0) + return; + + const float alpha = 1; + const float beta = 0; + + CHECK_CUDNN(cudnnSoftmaxForward(context(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_INSTANCE, + &alpha, + descriptor(src), + src.device(), + &beta, + descriptor(dest), + dest.device())); + } + + + void softmax_all_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + have_same_dimensions(dest,gradient_input) == true && + have_same_dimensions(dest,grad) == true ); + if (dest.size() == 0) + return; + + const float alpha = 1; + const float beta = is_same_object(grad,gradient_input) ? 0 : 1; + CHECK_CUDNN(cudnnSoftmaxBackward(context(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_INSTANCE, + &alpha, + descriptor(dest), + dest.device(), + descriptor(gradient_input), + gradient_input.device(), + &beta, + descriptor(grad), + grad.device())); + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + void sigmoid ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + if (src.size() == 0) + return; + + const float alpha = 1; + const float beta = 0; + CHECK_CUDNN(cudnnActivationForward(context(), + sigmoid_activation_descriptor(), + &alpha, + descriptor(src), + src.device(), + &beta, + descriptor(dest), + dest.device())); + } + + void sigmoid_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + have_same_dimensions(dest,gradient_input) == true && + have_same_dimensions(dest,grad) == true ); + if (dest.size() == 0) + return; + + const float alpha = 1; + const float beta = is_same_object(grad,gradient_input) ? 0 : 1; + CHECK_CUDNN(cudnnActivationBackward(context(), + sigmoid_activation_descriptor(), + &alpha, + descriptor(dest), + dest.device(), + descriptor(gradient_input), + gradient_input.device(), + descriptor(dest), + dest.device(), + &beta, + descriptor(grad), + grad.device())); + } + + // ------------------------------------------------------------------------------------ + + void relu ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + if (src.size() == 0) + return; + + const float alpha = 1; + const float beta = 0; + CHECK_CUDNN(cudnnActivationForward(context(), + relu_activation_descriptor(), + &alpha, + descriptor(src), + src.device(), + &beta, + descriptor(dest), + dest.device())); + } + + void relu_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + have_same_dimensions(dest,gradient_input) == true && + have_same_dimensions(dest,grad) == true ); + if (dest.size() == 0) + return; + + const float alpha = 1; + const float beta = is_same_object(grad,gradient_input) ? 
0 : 1; + CHECK_CUDNN(cudnnActivationBackward(context(), + relu_activation_descriptor(), + &alpha, + descriptor(dest), + dest.device(), + descriptor(gradient_input), + gradient_input.device(), + descriptor(dest), + dest.device(), + &beta, + descriptor(grad), + grad.device())); + } + + // ------------------------------------------------------------------------------------ + + void tanh ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(have_same_dimensions(dest,src)); + if (src.size() == 0) + return; + + const float alpha = 1; + const float beta = 0; + CHECK_CUDNN(cudnnActivationForward(context(), + tanh_activation_descriptor(), + &alpha, + descriptor(src), + src.device(), + &beta, + descriptor(dest), + dest.device())); + } + + void tanh_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { + DLIB_CASSERT( + have_same_dimensions(dest,gradient_input) == true && + have_same_dimensions(dest,grad) == true); + if (dest.size() == 0) + return; + + const float alpha = 1; + const float beta = is_same_object(grad,gradient_input) ? 0 : 1; + CHECK_CUDNN(cudnnActivationBackward(context(), + tanh_activation_descriptor(), + &alpha, + descriptor(dest), + dest.device(), + descriptor(gradient_input), + gradient_input.device(), + descriptor(dest), + dest.device(), + &beta, + descriptor(grad), + grad.device())); + } + + // ------------------------------------------------------------------------------------ + } +} + +#endif // DLIB_USE_CUDA + +#endif // DLIB_DNN_CuDNN_CPP_ + + diff --git a/ml/dlib/dlib/dnn/cudnn_dlibapi.h b/ml/dlib/dlib/dnn/cudnn_dlibapi.h new file mode 100644 index 000000000..e9ffe5f6d --- /dev/null +++ b/ml/dlib/dlib/dnn/cudnn_dlibapi.h @@ -0,0 +1,518 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_CuDNN_H_ +#define DLIB_DNN_CuDNN_H_ + +#ifdef DLIB_USE_CUDA + +#include "cuda_errors.h" +#include <memory> +#include "cuda_data_ptr.h" + +namespace dlib +{ + class tensor; + class resizable_tensor; + + namespace cuda + { + + // ----------------------------------------------------------------------------------- + + class tensor_descriptor + { + /*! + Each tensor object will carry a tensor_descriptor in it when compiled with + CUDA. + !*/ + + public: + // not copyable + tensor_descriptor(const tensor_descriptor&) = delete; + tensor_descriptor& operator=(const tensor_descriptor&) = delete; + // but is movable + tensor_descriptor(tensor_descriptor&& item) : tensor_descriptor() { swap(item); } + tensor_descriptor& operator=(tensor_descriptor&& item) { swap(item); return *this; } + + tensor_descriptor(); + ~tensor_descriptor(); + + void set_size( + int n, + int k, + int nr, + int nc + ); + /*! + ensures + - if any of the arguments are 0 then they are all set to 0 in the tensor. + !*/ + + void get_size ( + int& n, + int& k, + int& nr, + int& nc + ) const; + + const void* get_handle ( + ) const { return handle; } + + private: + + void swap(tensor_descriptor& item) { std::swap(handle, item.handle); } + + void* handle; + }; + + // ------------------------------------------------------------------------------------ + + void add( + float beta, + tensor& dest, + float alpha, + const tensor& src + ); + /*! 
+ requires + - One of the following is true: + - have_same_dimensions(src, dest) + - src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1 + - src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc() + - src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc() + - src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1 + - is_same_object(src,dest) == false + ensures + - performs: dest = beta*dest + alpha*src + However, how the addition happens depends on the dimensions of src. In + particular, this function adds the scaled values of one src tensor to + dest. Each dimension of the src tensor must match the corresponding + dimension of the dest tensor or must be equal to 1. In the latter case, + the single src value along that dimension is broadcast and added to every + corresponding position in dest. + !*/ + + // ------------------------------------------------------------------------------------ + + void assign_conv_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ); + /*! + requires + - grad.num_samples() == 1 + - grad.k() >= 1 + - grad.nr() == 1 + - grad.nc() == 1 + - gradient_input.k() == grad.k() + - gradient_input.size() > 0 + - is_same_object(grad,gradient_input) == false + ensures + - let BIAS be a tensor with all dimensions equal to 1 except for k which is >= 1. + - let OUT be the output of add(1,OUT,1,BIAS) + - let f(gradient_input,BIAS) == dot(gradient_input,OUT) + - Then this function computes the gradient of f() with respect to BIAS and + assigns it to grad. + !*/ + + // ------------------------------------------------------------------------------------ + + void batch_normalize_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ); + + void batch_normalize ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& invstds, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ); + + void batch_normalize_gradient( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ); + + // ------------------------------------------------------------------------------------ + + void batch_normalize_conv_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ); + + void batch_normalize_conv ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& invstds, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ); + + void batch_normalize_conv_gradient( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ); + + // ------------------------------------------------------------------------------------ + + class tensor_conv + { + public: + tensor_conv(const tensor_conv&) = delete; + tensor_conv& operator=(const tensor_conv&) = delete; + + tensor_conv(); + + void clear( + ); + + ~tensor_conv ( + 
); + + void operator() ( + const bool add_to_output, + tensor& output, + const tensor& data, + const tensor& filters + ); + + void operator() ( + const bool add_to_output, + resizable_tensor& output, + const tensor& data, + const tensor& filters + ); + + void get_gradient_for_data ( + const bool add_to_output, + const tensor& gradient_input, + const tensor& filters, + tensor& data_gradient + ); + + void get_gradient_for_filters ( + const bool add_to_output, + const tensor& gradient_input, + const tensor& data, + tensor& filters_gradient + ); + + void setup( + const tensor& data, + const tensor& filters, + int stride_y, + int stride_x, + int padding_y, + int padding_x + ); + + private: + + // These variables record the type of data given to the last call to setup(). + int stride_y; + int stride_x; + int padding_y; + int padding_x; + long data_num_samples, data_k, data_nr, data_nc; + long filters_num_samples, filters_k, filters_nr, filters_nc; + + + void* filter_handle; + void* conv_handle; + + // dimensions of the output tensor from operator() + int out_num_samples; + int out_k; + int out_nr; + int out_nc; + + int forward_algo; + int backward_data_algo; + int backward_filters_algo; + + size_t forward_workspace_size_in_bytes; + size_t backward_data_workspace_size_in_bytes; + size_t backward_filters_workspace_size_in_bytes; + std::shared_ptr<resizable_cuda_buffer> workspace; + cuda_data_void_ptr forward_workspace; + cuda_data_void_ptr backward_data_workspace; + cuda_data_void_ptr backward_filters_workspace; + }; + + // ------------------------------------------------------------------------------------ + + class pooling + { + public: + + pooling(const pooling&) = delete; + pooling& operator=(const pooling&) = delete; + + pooling ( + ); + + ~pooling( + ); + + void clear( + ); + + void setup_max_pooling( + int window_height, + int window_width, + int stride_y, + int stride_x, + int padding_y, + int padding_x + ); + + void setup_avg_pooling( + int window_height, + int window_width, + int stride_y, + int stride_x, + int padding_y, + int padding_x + ); + + bool does_max_pooling( + ) const { return do_max_pooling; } + + void operator() ( + resizable_tensor& dest, + const tensor& src + ); + + void get_gradient( + const tensor& gradient_input, + const tensor& dest, + const tensor& src, + tensor& grad + ); + + private: + + void setup( + int window_height, + int window_width, + int stride_y, + int stride_x, + int padding_y, + int padding_x, + int pooling_mode + ); + + void* handle; + int window_height; + int window_width; + int stride_y; + int stride_x; + int padding_y; + int padding_x; + bool do_max_pooling; + }; + + // ------------------------------------------------------------------------------------ + + void softmax ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - Note that the softmax function is a vector valued function: + s(x) == exp(x)/sum(exp(x)) + - Computes the softmax function on src and writes the results to dest. The + softmax is computed per spatial location across the different channels at + each location. That is, softmax() outputs a new tensor, #dest, where + each of the spatial locations in dest (i.e. image idx, row idx, and + column idx) contains the output of s() evaluated over the channel values + at each location. + - This function supports in-place operation, i.e. 
having + is_same_object(dest, src)==true + !*/ + + void softmax_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! + requires + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + - is_same_object(grad, dest)==false + ensures + - We interpret dest as the output of softmax(dest,SRC) for some SRC tensor. + Then let f(SRC) == dot(gradient_input,dest) Then this function computes + the gradient of f() with respect to SRC and assigns it to grad. + - This function supports in-place operation, i.e. having + is_same_object(grad, gradient_input)==true + !*/ + + // ------------------------------------------------------------------------------------ + + void softmax_all ( + tensor& dest, + const tensor& src + ); + + void softmax_all_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + + // ------------------------------------------------------------------------------------ + + void sigmoid ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - for all valid i: + - #dest.host()[i] == 1/(1+std::exp(-src.host()[i])) + - This function supports in-place operation, i.e. having + is_same_object(dest, src)==true + !*/ + + void sigmoid_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! + requires + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + - is_same_object(grad,dest) == false + ensures + - Recalling that dest is the output of sigmoid(dest,SRC) for some SRC tensor, + let f(SRC) == dot(gradient_input,dest) + - Then this function computes the gradient of f() with respect to SRC and + assigns it to grad. + - This function supports in-place operation, i.e. having + is_same_object(grad, gradient_input)==true + !*/ + + // ------------------------------------------------------------------------------------ + + void relu ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - for all valid i: + - #dest.host()[i] == std::max(0,src.host()[i]) + - This function supports in-place operation, i.e. having + is_same_object(dest, src)==true + !*/ + + void relu_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! + requires + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + - is_same_object(grad,dest) == false + ensures + - Recalling that dest is the output of relu(dest,SRC) for some SRC tensor, + let f(SRC) == dot(gradient_input,dest) + - Then this function computes the gradient of f() with respect to SRC and + assigns it to grad. + - This function supports in-place operation, i.e. having + is_same_object(grad, gradient_input)==true + !*/ + + // ------------------------------------------------------------------------------------ + + void tanh ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - for all valid i: + - #dest.host()[i] == std::tanh(src.host()[i]) + - This function supports in-place operation, i.e. having + is_same_object(dest, src)==true + !*/ + + void tanh_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! 
+            requires
+                - have_same_dimensions(dest,gradient_input) == true
+                - have_same_dimensions(dest,grad) == true
+                - is_same_object(grad,dest) == false
+            ensures
+                - Recalling that dest is the output of tanh(dest,SRC) for some SRC tensor,
+                  let f(SRC) == dot(gradient_input,dest)
+                - Then this function computes the gradient of f() with respect to SRC and
+                  assigns it to grad.
+                - This function supports in-place operation, i.e. having
+                  is_same_object(grad, gradient_input)==true
+        !*/
+
+
+
+    // ------------------------------------------------------------------------------------
+
+    }
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuDNN_H_
+
diff --git a/ml/dlib/dlib/dnn/curand_dlibapi.cpp b/ml/dlib/dlib/dnn/curand_dlibapi.cpp
new file mode 100644
index 000000000..67828e664
--- /dev/null
+++ b/ml/dlib/dlib/dnn/curand_dlibapi.cpp
@@ -0,0 +1,113 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuRAND_CPP_
+#define DLIB_DNN_CuRAND_CPP_
+
+#ifdef DLIB_USE_CUDA
+
+#include "curand_dlibapi.h"
+#include <curand.h>
+#include "../string.h"
+
+static const char* curand_get_error_string(curandStatus_t s)
+{
+    switch(s)
+    {
+        case CURAND_STATUS_NOT_INITIALIZED:
+            return "CUDA Runtime API initialization failed.";
+        case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+            return "The requested length must be a multiple of two.";
+        default:
+            return "A call to cuRAND failed.";
+    }
+}
+
+// Check the return value of a call to the cuRAND runtime for an error condition.
+#define CHECK_CURAND(call)                                                     \
+do{                                                                            \
+    const curandStatus_t error = call;                                         \
+    if (error != CURAND_STATUS_SUCCESS)                                        \
+    {                                                                          \
+        std::ostringstream sout;                                               \
+        sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
+        sout << "code: " << error << ", reason: " << curand_get_error_string(error);\
+        throw dlib::curand_error(sout.str());                                  \
+    }                                                                          \
+}while(false)
+
+namespace dlib
+{
+    namespace cuda
+    {
+
+    // ----------------------------------------------------------------------------------------
+
+        curand_generator::
+        curand_generator(
+            unsigned long long seed
+        ) : handle(nullptr)
+        {
+            curandGenerator_t gen;
+            CHECK_CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
+            handle = gen;
+
+            CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(gen, seed));
+        }
+
+        curand_generator::
+        ~curand_generator()
+        {
+            if (handle)
+            {
+                curandDestroyGenerator((curandGenerator_t)handle);
+            }
+        }
+
+        void curand_generator::
+        fill_gaussian (
+            tensor& data,
+            float mean,
+            float stddev
+        )
+        {
+            if (data.size() == 0)
+                return;
+
+            CHECK_CURAND(curandGenerateNormal((curandGenerator_t)handle,
+                                        data.device(),
+                                        data.size(),
+                                        mean,
+                                        stddev));
+        }
+
+        void curand_generator::
+        fill_uniform (
+            tensor& data
+        )
+        {
+            if (data.size() == 0)
+                return;
+
+            CHECK_CURAND(curandGenerateUniform((curandGenerator_t)handle, data.device(), data.size()));
+        }
+
+        void curand_generator::
+        fill (
+            cuda_data_ptr<unsigned int>& data
+        )
+        {
+            if (data.size() == 0)
+                return;
+
+            CHECK_CURAND(curandGenerate((curandGenerator_t)handle, data, data.size()));
+        }
+
+    // -----------------------------------------------------------------------------------
+
+    }
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuRAND_CPP_
+
diff --git a/ml/dlib/dlib/dnn/curand_dlibapi.h b/ml/dlib/dlib/dnn/curand_dlibapi.h
new file mode 100644
index 000000000..cd51fecee
--- /dev/null
+++ b/ml/dlib/dlib/dnn/curand_dlibapi.h
@@ -0,0 +1,75 @@
+// Copyright (C) 2015 Davis E.
King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_CuRAND_H_ +#define DLIB_DNN_CuRAND_H_ + +#ifdef DLIB_USE_CUDA + +#include "tensor.h" +#include "cuda_errors.h" +#include "cuda_data_ptr.h" + +namespace dlib +{ + namespace cuda + { + + // ----------------------------------------------------------------------------------- + + class curand_generator + { + public: + // not copyable + curand_generator(const curand_generator&) = delete; + curand_generator& operator=(const curand_generator&) = delete; + + curand_generator() : curand_generator(0) {} + curand_generator(unsigned long long seed); + ~curand_generator(); + + void fill ( + cuda_data_ptr<unsigned int>& data + ); + /*! + ensures + - Fills data with random 32-bit unsigned integers. + !*/ + + void fill_gaussian ( + tensor& data, + float mean = 0, + float stddev = 1 + ); + /*! + requires + - data.size()%2 == 0 + - stddev >= 0 + ensures + - Fills data with random numbers drawn from a Gaussian distribution + with the given mean and standard deviation. + !*/ + + void fill_uniform ( + tensor& data + ); + /*! + ensures + - Fills data with uniform random numbers in the range (0.0, 1.0]. + !*/ + + private: + + void* handle; + }; + + // ----------------------------------------------------------------------------------- + + } +} + +#endif // DLIB_USE_CUDA + +#endif // DLIB_DNN_CuRAND_H_ + + + diff --git a/ml/dlib/dlib/dnn/cusolver_dlibapi.cu b/ml/dlib/dlib/dnn/cusolver_dlibapi.cu new file mode 100644 index 000000000..942613134 --- /dev/null +++ b/ml/dlib/dlib/dnn/cusolver_dlibapi.cu @@ -0,0 +1,204 @@ +// Copyright (C) 2017 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_CuSOLVER_CU_ +#define DLIB_DNN_CuSOLVER_CU_ + +#ifdef DLIB_USE_CUDA + +#include "cusolver_dlibapi.h" +#include <cublas_v2.h> +#include <cusolverDn.h> +#include "cuda_utils.h" + +// ---------------------------------------------------------------------------------------- + +static const char* cusolver_get_error_string(cusolverStatus_t s) +{ + switch(s) + { + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUDA Runtime API initialization failed."; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUDA Resources could not be allocated."; + default: + return "A call to cuSolver failed"; + } +} + +// Check the return value of a call to the cuSolver runtime for an error condition. +#define CHECK_CUSOLVER(call) \ +do{ \ + const cusolverStatus_t error = call; \ + if (error != CUSOLVER_STATUS_SUCCESS) \ + { \ + std::ostringstream sout; \ + sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". 
";\ + sout << "code: " << error << ", reason: " << cusolver_get_error_string(error);\ + throw dlib::cusolver_error(sout.str()); \ + } \ +}while(false) + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + +namespace dlib +{ + namespace cuda + { + + // ----------------------------------------------------------------------------------- + + class cusolver_context + { + public: + // not copyable + cusolver_context(const cusolver_context&) = delete; + cusolver_context& operator=(const cusolver_context&) = delete; + + cusolver_context() + { + handles.resize(16); + } + ~cusolver_context() + { + for (auto h : handles) + { + if (h) + cusolverDnDestroy(h); + } + } + + cusolverDnHandle_t get_handle ( + ) + { + int new_device_id; + CHECK_CUDA(cudaGetDevice(&new_device_id)); + // make room for more devices if needed + if (new_device_id >= (long)handles.size()) + handles.resize(new_device_id+16); + + // If we don't have a handle already for this device then make one + if (!handles[new_device_id]) + CHECK_CUSOLVER(cusolverDnCreate(&handles[new_device_id])); + + // Finally, return the handle for the current device + return handles[new_device_id]; + } + + private: + + std::vector<cusolverDnHandle_t> handles; + }; + + static cusolverDnHandle_t context() + { + thread_local cusolver_context c; + return c.get_handle(); + } + + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + // ------------------------------------------------------------------------------------ + + __global__ void _cuda_set_to_identity_matrix(float* m, size_t nr) + { + for (auto j : grid_stride_range(0, nr*nr)) + { + if (j%(nr+1) == 0) + m[j] = 1; + else + m[j] = 0; + } + } + + void set_to_identity_matrix ( + tensor& m + ) + { + DLIB_CASSERT(m.size() == m.num_samples()*m.num_samples()); + launch_kernel(_cuda_set_to_identity_matrix, max_jobs(m.size()), m.device(), m.num_samples()); + } + + // ------------------------------------------------------------------------------------ + + inv::~inv() + { + sync_if_needed(); + } + + // ------------------------------------------------------------------------------------ + + void inv:: + operator() ( + const tensor& m_, + resizable_tensor& out + ) + { + DLIB_CASSERT(m_.size() == m_.num_samples()*m_.num_samples(), "Input matrix must be square if you want to invert it."); + m = m_; + + out.copy_size(m); + set_to_identity_matrix(out); + + const int nc = m.num_samples(); + int Lwork; + CHECK_CUSOLVER(cusolverDnSgetrf_bufferSize(context(), nc , nc, m.device(), nc, &Lwork)); + + if (Lwork > (int)workspace.size()) + { + sync_if_needed(); + workspace = cuda_data_ptr<float>(Lwork); + } + if (nc > (int)Ipiv.size()) + { + sync_if_needed(); + Ipiv = cuda_data_ptr<int>(nc); + } + if (info.size() != 1) + { + info = cuda_data_ptr<int>(1); + } + + CHECK_CUSOLVER(cusolverDnSgetrf(context(), nc, nc, m.device(), nc, workspace, Ipiv, info)); + CHECK_CUSOLVER(cusolverDnSgetrs(context(), CUBLAS_OP_N, nc, nc, m.device(), nc, Ipiv, out.device(), nc, info)); + did_work_lately = true; + } + + // ------------------------------------------------------------------------------------ + + int inv:: + get_last_status( + ) + { + std::vector<int> linfo; + memcpy(linfo, info); + if (linfo.size() != 0) + return linfo[0]; + else + return 0; + } + + // 
------------------------------------------------------------------------------------ + + void inv:: + sync_if_needed() + { + if (did_work_lately) + { + did_work_lately = false; + // make sure we wait until any previous kernel launches have finished + // before we do something like deallocate the GPU memory. + cudaDeviceSynchronize(); + } + } + + // ------------------------------------------------------------------------------------ + + } +} + +#endif // DLIB_USE_CUDA + +#endif // DLIB_DNN_CuSOLVER_CU_ + + diff --git a/ml/dlib/dlib/dnn/cusolver_dlibapi.h b/ml/dlib/dlib/dnn/cusolver_dlibapi.h new file mode 100644 index 000000000..e5c77c151 --- /dev/null +++ b/ml/dlib/dlib/dnn/cusolver_dlibapi.h @@ -0,0 +1,75 @@ +// Copyright (C) 2017 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNN_CuSOLVER_H_ +#define DLIB_DNN_CuSOLVER_H_ + +#ifdef DLIB_USE_CUDA + +#include "tensor.h" +#include "cuda_errors.h" +#include "cuda_data_ptr.h" +#include "../noncopyable.h" + +namespace dlib +{ + namespace cuda + { + + // ----------------------------------------------------------------------------------- + + class inv : noncopyable + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a functor for doing matrix inversion on the GPU. The only + reason it's an object is to avoid the reallocation of some GPU memory + blocks if you want to do a bunch of matrix inversions in a row. + !*/ + + public: + + inv() = default; + ~inv(); + + void operator() ( + const tensor& m, + resizable_tensor& out + ); + /*! + requires + - m.size() == m.num_samples()*m.num_samples() + (i.e. mat(m) must be a square matrix) + ensures + - out == inv(mat(m)); + !*/ + + int get_last_status( + ); + /*! + ensures + - returns 0 if the last matrix inversion was successful and != 0 + otherwise. + !*/ + + private: + + void sync_if_needed(); + + bool did_work_lately = false; + resizable_tensor m; + cuda_data_ptr<float> workspace; + cuda_data_ptr<int> Ipiv; + cuda_data_ptr<int> info; + }; + + // ------------------------------------------------------------------------------------ + + } +} + +#endif // DLIB_USE_CUDA + +#endif // DLIB_DNN_CuSOLVER_H_ + + + diff --git a/ml/dlib/dlib/dnn/gpu_data.cpp b/ml/dlib/dlib/dnn/gpu_data.cpp new file mode 100644 index 000000000..6e7cec6be --- /dev/null +++ b/ml/dlib/dlib/dnn/gpu_data.cpp @@ -0,0 +1,228 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_GPU_DaTA_CPP_ +#define DLIB_GPU_DaTA_CPP_ + +// Only things that require CUDA are declared in this cpp file. Everything else is in the +// gpu_data.h header so that it can operate as "header-only" code when using just the CPU. 
+#ifdef DLIB_USE_CUDA + +#include "gpu_data.h" +#include <iostream> +#include "cuda_utils.h" +#include <cstring> + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + void memcpy ( + gpu_data& dest, + const gpu_data& src + ) + { + DLIB_CASSERT(dest.size() == src.size()); + if (src.size() == 0 || &dest == &src) + return; + + memcpy(dest,0, src, 0, src.size()); + } + + void memcpy ( + gpu_data& dest, + size_t dest_offset, + const gpu_data& src, + size_t src_offset, + size_t num + ) + { + DLIB_CASSERT(dest_offset + num <= dest.size()); + DLIB_CASSERT(src_offset + num <= src.size()); + if (num == 0) + return; + + // if there is aliasing + if (&dest == &src && std::max(dest_offset, src_offset) < std::min(dest_offset,src_offset)+num) + { + // if they perfectly alias each other then there is nothing to do + if (dest_offset == src_offset) + return; + else + std::memmove(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num); + } + else + { + // if we write to the entire thing then we can use device_write_only() + if (dest_offset == 0 && num == dest.size()) + { + // copy the memory efficiently based on which copy is current in each object. + if (src.device_ready()) + CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice)); + else + CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice)); + } + else + { + // copy the memory efficiently based on which copy is current in each object. + if (dest.device_ready() && src.device_ready()) + CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice)); + else if (!dest.device_ready() && src.device_ready()) + CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToHost)); + else if (dest.device_ready() && !src.device_ready()) + CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice)); + else + CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToHost)); + } + } + } +// ---------------------------------------------------------------------------------------- + + void gpu_data:: + wait_for_transfer_to_finish() const + { + if (have_active_transfer) + { + CHECK_CUDA(cudaStreamSynchronize((cudaStream_t)cuda_stream.get())); + have_active_transfer = false; + // Check for errors. These calls to cudaGetLastError() are what help us find + // out if our kernel launches have been failing. + CHECK_CUDA(cudaGetLastError()); + } + } + + void gpu_data:: + copy_to_device() const + { + // We want transfers to the device to always be concurrent with any device + // computation. So we use our non-default stream to do the transfer. + async_copy_to_device(); + wait_for_transfer_to_finish(); + } + + void gpu_data:: + copy_to_host() const + { + if (!host_current) + { + wait_for_transfer_to_finish(); + CHECK_CUDA(cudaMemcpy(data_host.get(), data_device.get(), data_size*sizeof(float), cudaMemcpyDeviceToHost)); + host_current = true; + // At this point we know our RAM block isn't in use because cudaMemcpy() + // implicitly syncs with the device. + device_in_use = false; + // Check for errors. These calls to cudaGetLastError() are what help us find + // out if our kernel launches have been failing. 
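+            // (Kernel launches are asynchronous, so a failed launch only becomes
+            // visible through a later error query such as this one.)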
+ CHECK_CUDA(cudaGetLastError()); + } + } + + void gpu_data:: + async_copy_to_device() const + { + if (!device_current) + { + if (device_in_use) + { + // Wait for any possible CUDA kernels that might be using our memory block to + // complete before we overwrite the memory. + CHECK_CUDA(cudaStreamSynchronize(0)); + device_in_use = false; + } + CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get())); + have_active_transfer = true; + device_current = true; + } + } + + void gpu_data:: + set_size( + size_t new_size + ) + { + if (new_size == 0) + { + if (device_in_use) + { + // Wait for any possible CUDA kernels that might be using our memory block to + // complete before we free the memory. + CHECK_CUDA(cudaStreamSynchronize(0)); + device_in_use = false; + } + wait_for_transfer_to_finish(); + data_size = 0; + host_current = true; + device_current = true; + device_in_use = false; + data_host.reset(); + data_device.reset(); + } + else if (new_size != data_size) + { + if (device_in_use) + { + // Wait for any possible CUDA kernels that might be using our memory block to + // complete before we free the memory. + CHECK_CUDA(cudaStreamSynchronize(0)); + device_in_use = false; + } + wait_for_transfer_to_finish(); + data_size = new_size; + host_current = true; + device_current = true; + device_in_use = false; + + try + { + CHECK_CUDA(cudaGetDevice(&the_device_id)); + + // free memory blocks before we allocate new ones. + data_host.reset(); + data_device.reset(); + + void* data; + CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float))); + // Note that we don't throw exceptions since the free calls are invariably + // called in destructors. They also shouldn't fail anyway unless someone + // is resetting the GPU card in the middle of their program. + data_host.reset((float*)data, [](float* ptr){ + auto err = cudaFreeHost(ptr); + if(err!=cudaSuccess) + std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl; + }); + + CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float))); + data_device.reset((float*)data, [](float* ptr){ + auto err = cudaFree(ptr); + if(err!=cudaSuccess) + std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl; + }); + + if (!cuda_stream) + { + cudaStream_t cstream; + CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking)); + cuda_stream.reset(cstream, [](void* ptr){ + auto err = cudaStreamDestroy((cudaStream_t)ptr); + if(err!=cudaSuccess) + std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl; + }); + } + + } + catch(...) + { + set_size(0); + throw; + } + } + } + +// ---------------------------------------------------------------------------------------- +} + +#endif // DLIB_USE_CUDA + +#endif // DLIB_GPU_DaTA_CPP_ + diff --git a/ml/dlib/dlib/dnn/gpu_data.h b/ml/dlib/dlib/dnn/gpu_data.h new file mode 100644 index 000000000..022a05f71 --- /dev/null +++ b/ml/dlib/dlib/dnn/gpu_data.h @@ -0,0 +1,266 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_GPU_DaTA_H_ +#define DLIB_GPU_DaTA_H_ + +#include "gpu_data_abstract.h" +#include <memory> +#include <cstring> +#include "cuda_errors.h" +#include "../serialize.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class gpu_data + { + /*! 
+ CONVENTION + - if (size() != 0) then + - data_host == a pointer to size() floats in CPU memory. + - if (data_device) then + - data_device == a pointer to size() floats in device memory. + + - if (there might be an active async transfer from host to device) then + - have_active_transfer == true + + - We use the host_current and device_current bools to keep track of which + copy of the data (or both) are most current. e.g. if the CPU has + modified the data and it hasn't been copied to the device yet then + host_current==true and device_current==false. + + Similarly, we use device_in_use==true to indicate that device() has been + called and no operation to wait for all CUDA kernel completion has been + executed. So if device_in_use==true then there might be a CUDA kernel + executing that is using the device memory block contained in this object. + + !*/ + public: + + gpu_data( + ) : data_size(0), host_current(true), device_current(true),have_active_transfer(false),device_in_use(false), the_device_id(0) + { + } + + // Not copyable + gpu_data(const gpu_data&) = delete; + gpu_data& operator=(const gpu_data&) = delete; + + // but is movable + gpu_data(gpu_data&& item) : gpu_data() { swap(item); } + gpu_data& operator=(gpu_data&& item) { swap(item); return *this; } + + int device_id() const { return the_device_id; } + +#ifdef DLIB_USE_CUDA + void async_copy_to_device() const; + void set_size(size_t new_size); +#else + // Note that calls to host() or device() will block until any async transfers are complete. + void async_copy_to_device() const{} + + void set_size(size_t new_size) + { + if (new_size == 0) + { + data_size = 0; + host_current = true; + device_current = true; + device_in_use = false; + data_host.reset(); + data_device.reset(); + } + else if (new_size != data_size) + { + data_size = new_size; + host_current = true; + device_current = true; + device_in_use = false; + data_host.reset(new float[new_size], std::default_delete<float[]>()); + data_device.reset(); + } + } +#endif + + const float* host() const + { + copy_to_host(); + return data_host.get(); + } + + float* host() + { + copy_to_host(); + device_current = false; + return data_host.get(); + } + + float* host_write_only() + { + host_current = true; + device_current = false; + return data_host.get(); + } + + const float* device() const + { +#ifndef DLIB_USE_CUDA + DLIB_CASSERT(false, "CUDA NOT ENABLED"); +#endif + copy_to_device(); + device_in_use = true; + return data_device.get(); + } + + float* device() + { +#ifndef DLIB_USE_CUDA + DLIB_CASSERT(false, "CUDA NOT ENABLED"); +#endif + copy_to_device(); + host_current = false; + device_in_use = true; + return data_device.get(); + } + + float* device_write_only() + { +#ifndef DLIB_USE_CUDA + DLIB_CASSERT(false, "CUDA NOT ENABLED"); +#endif + wait_for_transfer_to_finish(); + host_current = false; + device_current = true; + device_in_use = true; + return data_device.get(); + } + + bool host_ready ( + ) const { return host_current; } + + bool device_ready ( + ) const { return device_current && !have_active_transfer; } + + size_t size() const { return data_size; } + + void swap (gpu_data& item) + { + std::swap(data_size, item.data_size); + std::swap(host_current, item.host_current); + std::swap(device_current, item.device_current); + std::swap(have_active_transfer, item.have_active_transfer); + std::swap(data_host, item.data_host); + std::swap(data_device, item.data_device); + std::swap(cuda_stream, item.cuda_stream); + std::swap(the_device_id, item.the_device_id); + } + + private: + 
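+        // Host/device synchronization helpers.  CUDA builds define these in
+        // gpu_data.cpp; CPU-only builds make them no-ops below since there is
+        // only ever a host copy of the data.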
+#ifdef DLIB_USE_CUDA + void copy_to_device() const; + void copy_to_host() const; + void wait_for_transfer_to_finish() const; +#else + void copy_to_device() const{} + void copy_to_host() const{} + void wait_for_transfer_to_finish() const{} +#endif + + + size_t data_size; + mutable bool host_current; + mutable bool device_current; + mutable bool have_active_transfer; + mutable bool device_in_use; + + std::shared_ptr<float> data_host; + std::shared_ptr<float> data_device; + std::shared_ptr<void> cuda_stream; + int the_device_id; + }; + + inline void serialize(const gpu_data& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.size(), out); + auto data = item.host(); + for (size_t i = 0; i < item.size(); ++i) + serialize(data[i], out); + } + + inline void deserialize(gpu_data& item, std::istream& in) + { + int version; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::gpu_data."); + size_t s; + deserialize(s, in); + item.set_size(s); + auto data = item.host(); + for (size_t i = 0; i < item.size(); ++i) + deserialize(data[i], in); + } + +#ifdef DLIB_USE_CUDA + void memcpy (gpu_data& dest, const gpu_data& src); + + void memcpy ( + gpu_data& dest, + size_t dest_offset, + const gpu_data& src, + size_t src_offset, + size_t num + ); + +#else + + inline void memcpy (gpu_data& dest, const gpu_data& src) + { + DLIB_CASSERT(dest.size() == src.size()); + if (src.size() == 0 || &dest == &src) + return; + std::memcpy(dest.host_write_only(), src.host(), sizeof(float)*src.size()); + } + + inline void memcpy ( + gpu_data& dest, + size_t dest_offset, + const gpu_data& src, + size_t src_offset, + size_t num + ) + { + DLIB_CASSERT(dest_offset + num <= dest.size()); + DLIB_CASSERT(src_offset + num <= src.size()); + if (num == 0) + return; + if (&dest == &src && std::max(dest_offset, src_offset) < std::min(dest_offset,src_offset)+num) + { + // if they perfectly alias each other then there is nothing to do + if (dest_offset == src_offset) + return; + else + std::memmove(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num); + } + else + { + // if we write to the entire thing then we can use host_write_only() + if (dest_offset == 0 && num == dest.size()) + std::memcpy(dest.host_write_only(), src.host()+src_offset, sizeof(float)*num); + else + std::memcpy(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num); + } + } +#endif + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_GPU_DaTA_H_ + diff --git a/ml/dlib/dlib/dnn/gpu_data_abstract.h b/ml/dlib/dlib/dnn/gpu_data_abstract.h new file mode 100644 index 000000000..f2423dee1 --- /dev/null +++ b/ml/dlib/dlib/dnn/gpu_data_abstract.h @@ -0,0 +1,266 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_GPU_DaTA_ABSTRACT_H_ +#ifdef DLIB_GPU_DaTA_ABSTRACT_H_ + +#include "cuda_errors.h" +#include "../serialize.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class gpu_data + { + /*! + WHAT THIS OBJECT REPRESENTS + This object is a block of size() floats, all stored contiguously in memory. + Importantly, it keeps two copies of the floats, one on the host CPU side + and another on the GPU device side. It automatically performs the necessary + host/device transfers to keep these two copies of the data in sync. 
+ + All transfers to the device happen asynchronously with respect to the + default CUDA stream so that CUDA kernel computations can overlap with data + transfers. However, any transfers from the device to the host happen + synchronously in the default CUDA stream. Therefore, you should perform + all your CUDA kernel launches on the default stream so that transfers back + to the host do not happen before the relevant computations have completed. + + If DLIB_USE_CUDA is not #defined then this object will not use CUDA at all. + Instead, it will simply store one host side memory block of floats. + + THREAD SAFETY + Instances of this object are not thread-safe. So don't touch one from + multiple threads at the same time. + !*/ + public: + + gpu_data( + ); + /*! + ensures + - #size() == 0 + - #host() == nullptr + - #device() == nullptr + - #host_ready() == true + - #device_ready() == true + - #device_id() == 0 + !*/ + + // This object is not copyable, however, it is movable. + gpu_data(const gpu_data&) = delete; + gpu_data& operator=(const gpu_data&) = delete; + gpu_data(gpu_data&& item); + gpu_data& operator=(gpu_data&& item); + + int device_id( + ) const; + /*! + ensures + - returns the ID of the CUDA device that allocated this memory. I.e. the + number returned by cudaGetDevice() when the memory was allocated. + - If CUDA is not being used then this function always returns 0. + !*/ + + void async_copy_to_device( + ); + /*! + ensures + - if (!device_ready()) then + - Begins asynchronously copying host data to the device once it is safe + to do so. I.e. This function will wait until any previously + scheduled CUDA kernels, which are using the device() memory block, + have completed before transferring the new data to the device. + - A call to device() that happens before the transfer completes will + block until the transfer is complete. That is, it is safe to call + async_copy_to_device() and then immediately call device(). + !*/ + + void set_size( + size_t new_size + ); + /*! + ensures + - #size() == new_size + !*/ + + bool host_ready ( + ) const; + /*! + ensures + - returns true if and only if the host's copy of the data is current. The + host's data is current if there aren't any modifications to the data + which were made on the device side that have yet to be copied to the + host. + !*/ + + bool device_ready ( + ) const; + /*! + ensures + - returns true if and only if the device's copy of the data is current. + The device's data is current if there aren't any modifications to the + data which were made on the host side that have yet to be copied to the + device. + !*/ + + const float* host( + ) const; + /*! + ensures + - returns a pointer to the host memory block of size() contiguous float + values or nullptr if size()==0. + - if (!host_ready()) then + - copies the data from the device to the host, while this is happening + the call to host() blocks. + - #host_ready() == true + !*/ + + float* host( + ); + /*! + ensures + - returns a pointer to the host memory block of size() contiguous float + values or nullptr if size()==0. + - if (!host_ready()) then + - copies the data from the device to the host, while this is happening + the call to host() blocks. + - #host_ready() == true + - #device_ready() == false + I.e. Marks the device side data as out of date so that the next call to + device() will perform a host to device transfer. If you want to begin + the transfer immediately then you can call async_copy_to_device() after + calling host(). + !*/ + + float* host_write_only( + ); + /*! 
+ ensures + - This function returns the same pointer as host(), except that it never + performs a device to host memory copy. Instead, it immediately marks the + device side data as out of date, effectively discarding it. Therefore, + the values in the data pointed to by host_write_only() are undefined and + you should only call host_write_only() if you are going to assign to + every memory location in the returned memory block. + - #host_ready() == true + - #device_ready() == false + !*/ + + const float* device( + ) const; + /*! + requires + - DLIB_USE_CUDA is #defined + ensures + - returns a pointer to the device memory block of size() contiguous float + values or nullptr if size()==0. + - if (!device_ready()) then + - copies the data from the host to the device, while this is happening + the call to device() blocks. + - #device_ready() == true + !*/ + + float* device( + ); + /*! + requires + - DLIB_USE_CUDA is #defined + ensures + - returns a pointer to the device memory block of size() contiguous float + values or nullptr if size()==0. + - if (!device_ready()) then + - copies the data from the host to the device, while this is happening + the call to device() blocks. + - #host_ready() == false + - #device_ready() == true + !*/ + + float* device_write_only( + ); + /*! + requires + - DLIB_USE_CUDA is #defined + ensures + - This function returns the same pointer as device(), except that it never + performs a host to device memory copy. Instead, it immediately marks the + host side data as out of date, effectively discarding it. Therefore, the + values in the data pointed to by device_write_only() are undefined and + you should only call device_write_only() if you are going to assign to + every memory location in the returned memory block. + - #host_ready() == false + - #device_ready() == true + !*/ + + + size_t size( + ) const; + /*! + ensures + - returns the number of floats contained in this object. + !*/ + + void swap ( + gpu_data& item + ); + /*! + ensures + - swaps the state of *this and item + !*/ + + }; + + void serialize(const gpu_data& item, std::ostream& out); + void deserialize(gpu_data& item, std::istream& in); + /*! + provides serialization support + !*/ + + void memcpy ( + gpu_data& dest, + const gpu_data& src + ); + /*! + requires + - dest.size() == src.size() + ensures + - Copies the data in src to dest. If the device data is current (i.e. + device_ready()==true) on both src and dest then the copy will happen entirely + on the device side. + - It doesn't matter what GPU device is selected by cudaSetDevice(). You can + always copy gpu_data objects to and from each other regardless. + - This function blocks until the copy has completed. + !*/ + + void memcpy ( + gpu_data& dest, + size_t dest_offset, + const gpu_data& src, + size_t src_offset, + size_t num + ); + /*! + requires + - dest_offset + num <= dest.size() + - src_offset + num <= src.size() + ensures + - Copies the data in src to dest, but only copies data in the range + [src.host()+src_offset, src.host()+src_offset+num) to + [dest.host()+dest_offset, dest.host()+dest_offset+num). Therefore, it is + just like the above memcpy() except that you can specify some subset of data + in a gpu_data object to be copied. + - Like the above version of memcpy(), the copy will happen in the most + efficient way, automatically using the appropriate type of host/device + transfers based on where data is currently resident. + - It doesn't matter what GPU device is selected by cudaSetDevice(). 
You can + always copy gpu_data objects to and from each other regardless. + - This function blocks until the copy has completed. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_GPU_DaTA_ABSTRACT_H_ + diff --git a/ml/dlib/dlib/dnn/input.h b/ml/dlib/dlib/dnn/input.h new file mode 100644 index 000000000..3b5c954e6 --- /dev/null +++ b/ml/dlib/dlib/dnn/input.h @@ -0,0 +1,808 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNn_INPUT_H_ +#define DLIB_DNn_INPUT_H_ + +#include "input_abstract.h" +#include "../matrix.h" +#include "../array2d.h" +#include "../pixel.h" +#include "../image_processing.h" +#include <sstream> +#include <array> +#include "tensor_tools.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template <typename T> + class input + { + const static bool always_false = sizeof(T)!=sizeof(T); + static_assert(always_false, "Unsupported type given to input<>. input<> only supports " + "dlib::matrix and dlib::array2d objects."); + }; + +// ---------------------------------------------------------------------------------------- + + template <size_t NR, size_t NC=NR> + class input_rgb_image_sized; + + class input_rgb_image + { + public: + typedef matrix<rgb_pixel> input_type; + + input_rgb_image ( + ) : + avg_red(122.782), + avg_green(117.001), + avg_blue(104.298) + { + } + + input_rgb_image ( + float avg_red_, + float avg_green_, + float avg_blue_ + ) : avg_red(avg_red_), avg_green(avg_green_), avg_blue(avg_blue_) + {} + + template <size_t NR, size_t NC> + inline input_rgb_image ( + const input_rgb_image_sized<NR,NC>& item + ); + + float get_avg_red() const { return avg_red; } + float get_avg_green() const { return avg_green; } + float get_avg_blue() const { return avg_blue; } + + bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); } + drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; } + drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + DLIB_CASSERT(std::distance(ibegin,iend) > 0); + const auto nr = ibegin->nr(); + const auto nc = ibegin->nc(); + // make sure all the input matrices have the same dimensions + for (auto i = ibegin; i != iend; ++i) + { + DLIB_CASSERT(i->nr()==nr && i->nc()==nc, + "\t input_rgb_image::to_tensor()" + << "\n\t All matrices given to to_tensor() must have the same dimensions." + << "\n\t nr: " << nr + << "\n\t nc: " << nc + << "\n\t i->nr(): " << i->nr() + << "\n\t i->nc(): " << i->nc() + ); + } + + + // initialize data to the right size to contain the stuff in the iterator range. 
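+            // The tensor uses a planar (sample, channel, row, column) layout, so the
+            // red, green, and blue values of one pixel end up nr*nc floats apart.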
+ data.set_size(std::distance(ibegin,iend), 3, nr, nc); + + + const size_t offset = nr*nc; + auto ptr = data.host(); + for (auto i = ibegin; i != iend; ++i) + { + for (long r = 0; r < nr; ++r) + { + for (long c = 0; c < nc; ++c) + { + rgb_pixel temp = (*i)(r,c); + auto p = ptr++; + *p = (temp.red-avg_red)/256.0; + p += offset; + *p = (temp.green-avg_green)/256.0; + p += offset; + *p = (temp.blue-avg_blue)/256.0; + p += offset; + } + } + ptr += offset*(data.k()-1); + } + + } + + friend void serialize(const input_rgb_image& item, std::ostream& out) + { + serialize("input_rgb_image", out); + serialize(item.avg_red, out); + serialize(item.avg_green, out); + serialize(item.avg_blue, out); + } + + friend void deserialize(input_rgb_image& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "input_rgb_image" && version != "input_rgb_image_sized") + throw serialization_error("Unexpected version found while deserializing dlib::input_rgb_image."); + deserialize(item.avg_red, in); + deserialize(item.avg_green, in); + deserialize(item.avg_blue, in); + + // read and discard the sizes if this was really a sized input layer. + if (version == "input_rgb_image_sized") + { + size_t nr, nc; + deserialize(nr, in); + deserialize(nc, in); + } + } + + friend std::ostream& operator<<(std::ostream& out, const input_rgb_image& item) + { + out << "input_rgb_image("<<item.avg_red<<","<<item.avg_green<<","<<item.avg_blue<<")"; + return out; + } + + friend void to_xml(const input_rgb_image& item, std::ostream& out) + { + out << "<input_rgb_image r='"<<item.avg_red<<"' g='"<<item.avg_green<<"' b='"<<item.avg_blue<<"'/>"; + } + + private: + float avg_red; + float avg_green; + float avg_blue; + }; + +// ---------------------------------------------------------------------------------------- + + template <size_t NR, size_t NC> + class input_rgb_image_sized + { + public: + static_assert(NR != 0 && NC != 0, "The input image can't be empty."); + + typedef matrix<rgb_pixel> input_type; + + input_rgb_image_sized ( + ) : + avg_red(122.782), + avg_green(117.001), + avg_blue(104.298) + { + } + + input_rgb_image_sized ( + const input_rgb_image& item + ) : avg_red(item.get_avg_red()), + avg_green(item.get_avg_green()), + avg_blue(item.get_avg_blue()) + {} + + input_rgb_image_sized ( + float avg_red_, + float avg_green_, + float avg_blue_ + ) : avg_red(avg_red_), avg_green(avg_green_), avg_blue(avg_blue_) + {} + + float get_avg_red() const { return avg_red; } + float get_avg_green() const { return avg_green; } + float get_avg_blue() const { return avg_blue; } + + bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); } + drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; } + drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + DLIB_CASSERT(std::distance(ibegin,iend) > 0); + // make sure all input images have the correct size + for (auto i = ibegin; i != iend; ++i) + { + DLIB_CASSERT(i->nr()==NR && i->nc()==NC, + "\t input_rgb_image_sized::to_tensor()" + << "\n\t All input images must have "<<NR<<" rows and "<<NC<< " columns, but we got one with "<<i->nr()<<" rows and "<<i->nc()<<" columns." 
+ ); + } + + + // initialize data to the right size to contain the stuff in the iterator range. + data.set_size(std::distance(ibegin,iend), 3, NR, NC); + + + const size_t offset = NR*NC; + auto ptr = data.host(); + for (auto i = ibegin; i != iend; ++i) + { + for (size_t r = 0; r < NR; ++r) + { + for (size_t c = 0; c < NC; ++c) + { + rgb_pixel temp = (*i)(r,c); + auto p = ptr++; + *p = (temp.red-avg_red)/256.0; + p += offset; + *p = (temp.green-avg_green)/256.0; + p += offset; + *p = (temp.blue-avg_blue)/256.0; + p += offset; + } + } + ptr += offset*(data.k()-1); + } + + } + + friend void serialize(const input_rgb_image_sized& item, std::ostream& out) + { + serialize("input_rgb_image_sized", out); + serialize(item.avg_red, out); + serialize(item.avg_green, out); + serialize(item.avg_blue, out); + serialize(NR, out); + serialize(NC, out); + } + + friend void deserialize(input_rgb_image_sized& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "input_rgb_image_sized") + throw serialization_error("Unexpected version found while deserializing dlib::input_rgb_image_sized."); + deserialize(item.avg_red, in); + deserialize(item.avg_green, in); + deserialize(item.avg_blue, in); + size_t nr, nc; + deserialize(nr, in); + deserialize(nc, in); + if (nr != NR || nc != NC) + { + std::ostringstream sout; + sout << "Wrong image dimensions found while deserializing dlib::input_rgb_image_sized.\n"; + sout << "Expected "<<NR<<" rows and "<<NC<< " columns, but found "<<nr<<" rows and "<<nc<<" columns."; + throw serialization_error(sout.str()); + } + } + + friend std::ostream& operator<<(std::ostream& out, const input_rgb_image_sized& item) + { + out << "input_rgb_image_sized("<<item.avg_red<<","<<item.avg_green<<","<<item.avg_blue<<") nr="<<NR<<" nc="<<NC; + return out; + } + + friend void to_xml(const input_rgb_image_sized& item, std::ostream& out) + { + out << "<input_rgb_image_sized r='"<<item.avg_red<<"' g='"<<item.avg_green<<"' b='"<<item.avg_blue<<"' nr='"<<NR<<"' nc='"<<NC<<"'/>"; + } + + private: + float avg_red; + float avg_green; + float avg_blue; + }; + +// ---------------------------------------------------------------------------------------- + + template <size_t NR, size_t NC> + input_rgb_image:: + input_rgb_image ( + const input_rgb_image_sized<NR,NC>& item + ) : avg_red(item.get_avg_red()), + avg_green(item.get_avg_green()), + avg_blue(item.get_avg_blue()) + {} + +// ---------------------------------------------------------------------------------------- + + template <typename T, long NR, long NC, typename MM, typename L> + class input<matrix<T,NR,NC,MM,L>> + { + public: + typedef matrix<T,NR,NC,MM,L> input_type; + + input() {} + input(const input&) {} + + template <typename mm> + input(const input<array2d<T,mm>>&) {} + + bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); } + drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; } + drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + DLIB_CASSERT(std::distance(ibegin,iend) > 0); + const auto nr = ibegin->nr(); + const auto nc = ibegin->nc(); + // make sure all the input matrices have the same dimensions + for (auto i = ibegin; i != iend; ++i) + { + DLIB_CASSERT(i->nr()==nr && 
i->nc()==nc, + "\t input::to_tensor()" + << "\n\t All matrices given to to_tensor() must have the same dimensions." + << "\n\t nr: " << nr + << "\n\t nc: " << nc + << "\n\t i->nr(): " << i->nr() + << "\n\t i->nc(): " << i->nc() + ); + } + + + // initialize data to the right size to contain the stuff in the iterator range. + data.set_size(std::distance(ibegin,iend), pixel_traits<T>::num, nr, nc); + + typedef typename pixel_traits<T>::basic_pixel_type bptype; + + const size_t offset = nr*nc; + auto ptr = data.host(); + for (auto i = ibegin; i != iend; ++i) + { + for (long r = 0; r < nr; ++r) + { + for (long c = 0; c < nc; ++c) + { + auto temp = pixel_to_vector<float>((*i)(r,c)); + auto p = ptr++; + for (long j = 0; j < temp.size(); ++j) + { + if (is_same_type<bptype,unsigned char>::value) + *p = temp(j)/256.0; + else + *p = temp(j); + p += offset; + } + } + } + ptr += offset*(data.k()-1); + } + + } + + friend void serialize(const input& /*item*/, std::ostream& out) + { + serialize("input<matrix>", out); + } + + friend void deserialize(input& /*item*/, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "input<matrix>") + throw serialization_error("Unexpected version found while deserializing dlib::input."); + } + + friend std::ostream& operator<<(std::ostream& out, const input& /*item*/) + { + out << "input<matrix>"; + return out; + } + + friend void to_xml(const input& /*item*/, std::ostream& out) + { + out << "<input/>"; + } + }; + +// ---------------------------------------------------------------------------------------- + + template <typename T, long NR, long NC, typename MM, typename L, size_t K> + class input<std::array<matrix<T,NR,NC,MM,L>,K>> + { + public: + typedef std::array<matrix<T,NR,NC,MM,L>,K> input_type; + + input() {} + input(const input&) {} + + bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); } + drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; } + drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + DLIB_CASSERT(std::distance(ibegin,iend) > 0); + DLIB_CASSERT(ibegin->size() != 0, "When using std::array<matrix> inputs you can't give 0 sized arrays."); + const auto nr = (*ibegin)[0].nr(); + const auto nc = (*ibegin)[0].nc(); + // make sure all the input matrices have the same dimensions + for (auto i = ibegin; i != iend; ++i) + { + for (size_t k = 0; k < K; ++k) + { + const auto& arr = *i; + DLIB_CASSERT(arr[k].nr()==nr && arr[k].nc()==nc, + "\t input::to_tensor()" + << "\n\t When using std::array<matrix> as input, all matrices in a batch must have the same dimensions." + << "\n\t nr: " << nr + << "\n\t nc: " << nc + << "\n\t k: " << k + << "\n\t arr[k].nr(): " << arr[k].nr() + << "\n\t arr[k].nc(): " << arr[k].nc() + ); + } + } + + + // initialize data to the right size to contain the stuff in the iterator range. 
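+            // Each element of the std::array becomes one channel plane, so k == K here.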
+ data.set_size(std::distance(ibegin,iend), K, nr, nc); + + auto ptr = data.host(); + for (auto i = ibegin; i != iend; ++i) + { + for (size_t k = 0; k < K; ++k) + { + for (long r = 0; r < nr; ++r) + { + for (long c = 0; c < nc; ++c) + { + if (is_same_type<T,unsigned char>::value) + *ptr++ = (*i)[k](r,c)/256.0; + else + *ptr++ = (*i)[k](r,c); + } + } + } + } + + } + + friend void serialize(const input& /*item*/, std::ostream& out) + { + serialize("input<array<matrix>>", out); + } + + friend void deserialize(input& /*item*/, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "input<array<matrix>>") + throw serialization_error("Unexpected version found while deserializing dlib::input<array<matrix>>."); + } + + friend std::ostream& operator<<(std::ostream& out, const input& /*item*/) + { + out << "input<array<matrix>>"; + return out; + } + + friend void to_xml(const input& /*item*/, std::ostream& out) + { + out << "<input/>"; + } + }; + +// ---------------------------------------------------------------------------------------- + + template <typename T, typename MM> + class input<array2d<T,MM>> + { + public: + typedef array2d<T,MM> input_type; + + input() {} + input(const input&) {} + + template <long NR, long NC, typename mm, typename L> + input(const input<matrix<T,NR,NC,mm,L>>&) {} + + bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); } + drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; } + drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + DLIB_CASSERT(std::distance(ibegin,iend) > 0); + const auto nr = ibegin->nr(); + const auto nc = ibegin->nc(); + // make sure all the input matrices have the same dimensions + for (auto i = ibegin; i != iend; ++i) + { + DLIB_CASSERT(i->nr()==nr && i->nc()==nc, + "\t input::to_tensor()" + << "\n\t All array2d objects given to to_tensor() must have the same dimensions." + << "\n\t nr: " << nr + << "\n\t nc: " << nc + << "\n\t i->nr(): " << i->nr() + << "\n\t i->nc(): " << i->nc() + ); + } + + + // initialize data to the right size to contain the stuff in the iterator range. 
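+            // pixel_traits<T>::num is the number of channels in the pixel type T
+            // (e.g. 3 for rgb_pixel, 1 for grayscale pixel types).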
+ data.set_size(std::distance(ibegin,iend), pixel_traits<T>::num, nr, nc); + typedef typename pixel_traits<T>::basic_pixel_type bptype; + + const size_t offset = nr*nc; + auto ptr = data.host(); + for (auto i = ibegin; i != iend; ++i) + { + for (long r = 0; r < nr; ++r) + { + for (long c = 0; c < nc; ++c) + { + auto temp = pixel_to_vector<float>((*i)[r][c]); + auto p = ptr++; + for (long j = 0; j < temp.size(); ++j) + { + if (is_same_type<bptype,unsigned char>::value) + *p = temp(j)/256.0; + else + *p = temp(j); + p += offset; + } + } + } + ptr += offset*(data.k()-1); + } + + } + + friend void serialize(const input& item, std::ostream& out) + { + serialize("input<array2d>", out); + } + + friend void deserialize(input& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "input<array2d>") + throw serialization_error("Unexpected version found while deserializing dlib::input."); + } + friend std::ostream& operator<<(std::ostream& out, const input& item) + { + out << "input<array2d>"; + return out; + } + + friend void to_xml(const input& item, std::ostream& out) + { + out << "<input/>"; + } + }; + +// ---------------------------------------------------------------------------------------- + + template <typename PYRAMID_TYPE> + class input_rgb_image_pyramid + { + public: + typedef matrix<rgb_pixel> input_type; + typedef PYRAMID_TYPE pyramid_type; + + input_rgb_image_pyramid ( + ) : + avg_red(122.782), + avg_green(117.001), + avg_blue(104.298) + { + } + + input_rgb_image_pyramid ( + float avg_red_, + float avg_green_, + float avg_blue_ + ) : avg_red(avg_red_), avg_green(avg_green_), avg_blue(avg_blue_) + {} + + float get_avg_red() const { return avg_red; } + float get_avg_green() const { return avg_green; } + float get_avg_blue() const { return avg_blue; } + + unsigned long get_pyramid_padding () const { return pyramid_padding; } + void set_pyramid_padding (unsigned long value) { pyramid_padding = value; } + + unsigned long get_pyramid_outer_padding () const { return pyramid_outer_padding; } + void set_pyramid_outer_padding (unsigned long value) { pyramid_outer_padding = value; } + + bool image_contained_point ( + const tensor& data, + const point& p + ) const + { + auto&& rects = any_cast<std::vector<rectangle>>(data.annotation()); + DLIB_CASSERT(rects.size() > 0); + return rects[0].contains(p+rects[0].tl_corner()); + } + + drectangle tensor_space_to_image_space ( + const tensor& data, + drectangle r + ) const + { + auto&& rects = any_cast<std::vector<rectangle>>(data.annotation()); + return tiled_pyramid_to_image<pyramid_type>(rects, r); + } + + drectangle image_space_to_tensor_space ( + const tensor& data, + double scale, + drectangle r + ) const + { + DLIB_CASSERT(0 < scale && scale <= 1 , "scale: "<< scale); + auto&& rects = any_cast<std::vector<rectangle>>(data.annotation()); + return image_to_tiled_pyramid<pyramid_type>(rects, scale, r); + } + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const + { + DLIB_CASSERT(std::distance(ibegin,iend) > 0); + auto nr = ibegin->nr(); + auto nc = ibegin->nc(); + // make sure all the input matrices have the same dimensions + for (auto i = ibegin; i != iend; ++i) + { + DLIB_CASSERT(i->nr()==nr && i->nc()==nc, + "\t input_rgb_image_pyramid::to_tensor()" + << "\n\t All matrices given to to_tensor() must have the same dimensions." 
+ << "\n\t nr: " << nr + << "\n\t nc: " << nc + << "\n\t i->nr(): " << i->nr() + << "\n\t i->nc(): " << i->nc() + ); + } + + long NR, NC; + pyramid_type pyr; + auto& rects = data.annotation().get<std::vector<rectangle>>(); + impl::compute_tiled_image_pyramid_details(pyr, nr, nc, pyramid_padding, pyramid_outer_padding, rects, NR, NC); + + // initialize data to the right size to contain the stuff in the iterator range. + data.set_size(std::distance(ibegin,iend), 3, NR, NC); + + // We need to zero the image before doing the pyramid, since the pyramid + // creation code doesn't write to all parts of the image. We also take + // care to avoid triggering any device to hosts copies. + auto ptr = data.host_write_only(); + for (size_t i = 0; i < data.size(); ++i) + ptr[i] = 0; + + if (rects.size() == 0) + return; + + // copy the first raw image into the top part of the tiled pyramid. We need to + // do this for each of the input images/samples in the tensor. + for (auto i = ibegin; i != iend; ++i) + { + auto& img = *i; + ptr += rects[0].top()*data.nc(); + for (long r = 0; r < img.nr(); ++r) + { + auto p = ptr+rects[0].left(); + for (long c = 0; c < img.nc(); ++c) + p[c] = (img(r,c).red-avg_red)/256.0; + ptr += data.nc(); + } + ptr += data.nc()*(data.nr()-rects[0].bottom()-1); + + ptr += rects[0].top()*data.nc(); + for (long r = 0; r < img.nr(); ++r) + { + auto p = ptr+rects[0].left(); + for (long c = 0; c < img.nc(); ++c) + p[c] = (img(r,c).green-avg_green)/256.0; + ptr += data.nc(); + } + ptr += data.nc()*(data.nr()-rects[0].bottom()-1); + + ptr += rects[0].top()*data.nc(); + for (long r = 0; r < img.nr(); ++r) + { + auto p = ptr+rects[0].left(); + for (long c = 0; c < img.nc(); ++c) + p[c] = (img(r,c).blue-avg_blue)/256.0; + ptr += data.nc(); + } + ptr += data.nc()*(data.nr()-rects[0].bottom()-1); + } + + // now build the image pyramid into data. This does the same thing as + // create_tiled_pyramid(), except we use the GPU if one is available. 
+ for (size_t i = 1; i < rects.size(); ++i) + { + alias_tensor src(data.num_samples(),data.k(),rects[i-1].height(),rects[i-1].width()); + alias_tensor dest(data.num_samples(),data.k(),rects[i].height(),rects[i].width()); + + auto asrc = src(data, data.nc()*rects[i-1].top() + rects[i-1].left()); + auto adest = dest(data, data.nc()*rects[i].top() + rects[i].left()); + + tt::resize_bilinear(adest, data.nc(), data.nr()*data.nc(), + asrc, data.nc(), data.nr()*data.nc()); + } + } + + friend void serialize(const input_rgb_image_pyramid& item, std::ostream& out) + { + serialize("input_rgb_image_pyramid2", out); + serialize(item.avg_red, out); + serialize(item.avg_green, out); + serialize(item.avg_blue, out); + serialize(item.pyramid_padding, out); + serialize(item.pyramid_outer_padding, out); + } + + friend void deserialize(input_rgb_image_pyramid& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "input_rgb_image_pyramid" && version != "input_rgb_image_pyramid2") + throw serialization_error("Unexpected version found while deserializing dlib::input_rgb_image_pyramid."); + deserialize(item.avg_red, in); + deserialize(item.avg_green, in); + deserialize(item.avg_blue, in); + if (version == "input_rgb_image_pyramid2") + { + deserialize(item.pyramid_padding, in); + deserialize(item.pyramid_outer_padding, in); + } + else + { + item.pyramid_padding = 10; + item.pyramid_outer_padding = 11; + } + } + + friend std::ostream& operator<<(std::ostream& out, const input_rgb_image_pyramid& item) + { + out << "input_rgb_image_pyramid("<<item.avg_red<<","<<item.avg_green<<","<<item.avg_blue<<")"; + out << " pyramid_padding="<<item.pyramid_padding; + out << " pyramid_outer_padding="<<item.pyramid_outer_padding; + return out; + } + + friend void to_xml(const input_rgb_image_pyramid& item, std::ostream& out) + { + out << "<input_rgb_image_pyramid r='"<<item.avg_red<<"' g='"<<item.avg_green + <<"' b='"<<item.avg_blue + <<"' pyramid_padding='"<<item.pyramid_padding + <<"' pyramid_outer_padding='"<<item.pyramid_outer_padding + <<"'/>"; + } + + private: + float avg_red; + float avg_green; + float avg_blue; + unsigned long pyramid_padding = 10; + unsigned long pyramid_outer_padding = 11; + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_INPUT_H_ + diff --git a/ml/dlib/dlib/dnn/input_abstract.h b/ml/dlib/dlib/dnn/input_abstract.h new file mode 100644 index 000000000..7130efb17 --- /dev/null +++ b/ml/dlib/dlib/dnn/input_abstract.h @@ -0,0 +1,467 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_DNn_INPUT_ABSTRACT_H_ +#ifdef DLIB_DNn_INPUT_ABSTRACT_H_ + +#include "../matrix.h" +#include "../pixel.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class EXAMPLE_INPUT_LAYER + { + /*! + WHAT THIS OBJECT REPRESENTS + Each deep neural network model in dlib begins with an input layer. The job + of the input layer is to convert an input_type into a tensor. Nothing more + and nothing less. + + Note that there is no dlib::EXAMPLE_INPUT_LAYER type. It is shown here + purely to document the interface that an input layer object must implement. + If you are using some kind of image or matrix object as your input_type + then you can use the provided dlib::input layer defined below. Otherwise, + you need to define your own custom input layer. 
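+
+                For illustration only, a bare-bones custom input layer that feeds
+                std::vector<float> samples into a network could follow this sketch
+                (vector_input is a hypothetical name, not a dlib type, and the
+                required serialization and printing functions are omitted):
+
+                    class vector_input
+                    {
+                    public:
+                        typedef std::vector<float> input_type;
+
+                        template <typename forward_iterator>
+                        void to_tensor (
+                            forward_iterator ibegin,
+                            forward_iterator iend,
+                            resizable_tensor& data
+                        ) const
+                        {
+                            // One sample per input vector, laid out as k()==dim
+                            // channels with nr()==nc()==1.
+                            const long dim = ibegin->size();
+                            data.set_size(std::distance(ibegin,iend), dim, 1, 1);
+                            float* ptr = data.host();
+                            for (auto i = ibegin; i != iend; ++i)
+                                ptr = std::copy(i->begin(), i->end(), ptr);
+                        }
+                    };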
+ + THREAD SAFETY + to_tensor() must be thread safe. That is, multiple threads must be able to + make calls to to_tensor() on a single instance of this object at the same + time. + !*/ + public: + + EXAMPLE_INPUT_LAYER( + ); + /*! + ensures + - Default constructs this object. This function is not required to do + anything in particular but it must exist, that is, it is required that + layer objects be default constructable. + !*/ + + EXAMPLE_INPUT_LAYER ( + const EXAMPLE_INPUT_LAYER& item + ); + /*! + ensures + - EXAMPLE_INPUT_LAYER objects are copy constructable + !*/ + + EXAMPLE_INPUT_LAYER( + const some_other_input_layer_type& item + ); + /*! + ensures + - Constructs this object from item. This form of constructor is optional + but it allows you to provide a conversion from one input layer type to + another. For example, the following code is valid only if my_input_layer2 can + be constructed from my_input_layer1: + relu<fc<relu<fc<my_input_layer1>>>> my_dnn1; + relu<fc<relu<fc<my_input_layer2>>>> my_dnn2(my_dnn1); + This kind of pattern is useful if you want to use one type of input layer + during training but a different type of layer during testing since it + allows you to easily convert between related deep neural network types. + !*/ + + typedef whatever_type_to_tensor_expects input_type; + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const; + /*! + requires + - [ibegin, iend) is an iterator range over input_type objects. + - std::distance(ibegin,iend) > 0 + ensures + - Converts the iterator range into a tensor and stores it into #data. + - #data.num_samples()%distance(ibegin,iend) == 0. + Normally you would have #data.num_samples() == distance(ibegin,iend) but + you can also expand the output by some integer factor so long as the loss + you use can deal with it correctly. + - The data in the ith sample of #data corresponds to the input_type object + *(ibegin+i/sample_expansion_factor). + where sample_expansion_factor==#data.num_samples()/distance(ibegin,iend). + !*/ + }; + + std::ostream& operator<<(std::ostream& out, const EXAMPLE_INPUT_LAYER& item); + /*! + print a string describing this layer. + !*/ + + void to_xml(const EXAMPLE_INPUT_LAYER& item, std::ostream& out); + /*! + This function is optional, but required if you want to print your networks with + net_to_xml(). Therefore, to_xml() prints a layer as XML. + !*/ + + void serialize(const EXAMPLE_INPUT_LAYER& item, std::ostream& out); + void deserialize(EXAMPLE_INPUT_LAYER& item, std::istream& in); + /*! + provides serialization support + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename T + > + class input + { + /*! + REQUIREMENTS ON T + One of the following must be true: + - T is a matrix or array2d object and it must contain some kind of + pixel type. I.e. pixel_traits<T::type> must be defined. + - T is a std::array<matrix<U>> where U is any built in scalar type like + float, double, or unsigned char. + + WHAT THIS OBJECT REPRESENTS + This is a basic input layer that simply copies images into a tensor. + !*/ + + public: + typedef T input_type; + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const; + /*! + requires + - [ibegin, iend) is an iterator range over input_type objects. 
+ - std::distance(ibegin,iend) > 0 + - The input range should contain image objects that all have the same + dimensions. + ensures + - Converts the iterator range into a tensor and stores it into #data. In + particular, if the input images have R rows, C columns, and K channels + (where K is given by pixel_traits::num or std::array::size() if + std::array inputs are used) then we will have: + - #data.num_samples() == std::distance(ibegin,iend) + - #data.nr() == R + - #data.nc() == C + - #data.k() == K + For example, a matrix<float,3,3> would turn into a tensor with 3 rows, 3 + columns, and k()==1. Or a matrix<rgb_pixel,4,5> would turn into a tensor + with 4 rows, 5 columns, and k()==3 (since rgb_pixels have 3 channels). + Or a std::array<matrix<float,3,3>,5> would turn into a tensor with 3 rows + and columns, and k()==5 channels. + - If the input data contains pixels of type unsigned char, rgb_pixel, or + other pixel types with a basic_pixel_type of unsigned char then each + value written to the output tensor is first divided by 256.0 so that the + resulting outputs are all in the range [0,1]. + !*/ + + // Provided for compatibility with input_rgb_image_pyramid's interface + bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); } + drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; } + drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; } + }; + +// ---------------------------------------------------------------------------------------- + + class input_rgb_image + { + /*! + WHAT THIS OBJECT REPRESENTS + This input layer works with RGB images of type matrix<rgb_pixel>. It is + very similar to the dlib::input layer except that it allows you to subtract + the average color value from each color channel when converting an image to + a tensor. + !*/ + public: + typedef matrix<rgb_pixel> input_type; + + input_rgb_image ( + ); + /*! + ensures + - #get_avg_red() == 122.782 + - #get_avg_green() == 117.001 + - #get_avg_blue() == 104.298 + !*/ + + input_rgb_image ( + float avg_red, + float avg_green, + float avg_blue + ); + /*! + ensures + - #get_avg_red() == avg_red + - #get_avg_green() == avg_green + - #get_avg_blue() == avg_blue + !*/ + + float get_avg_red( + ) const; + /*! + ensures + - returns the value subtracted from the red color channel. + !*/ + + float get_avg_green( + ) const; + /*! + ensures + - returns the value subtracted from the green color channel. + !*/ + + float get_avg_blue( + ) const; + /*! + ensures + - returns the value subtracted from the blue color channel. + !*/ + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const; + /*! + requires + - [ibegin, iend) is an iterator range over input_type objects. + - std::distance(ibegin,iend) > 0 + - The input range should contain images that all have the same + dimensions. + ensures + - Converts the iterator range into a tensor and stores it into #data. In + particular, if the input images have R rows, C columns then we will have: + - #data.num_samples() == std::distance(ibegin,iend) + - #data.nr() == R + - #data.nc() == C + - #data.k() == 3 + Moreover, each color channel is normalized by having its average value + subtracted (according to get_avg_red(), get_avg_green(), or + get_avg_blue()) and then is divided by 256.0. 
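+                    For example, with the default averages a pure red input pixel
+                    (255,0,0) becomes roughly ((255-122.782)/256.0, (0-117.001)/256.0,
+                    (0-104.298)/256.0) == (0.516, -0.457, -0.407) in the red, green,
+                    and blue channels of #data.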
+ !*/ + + + // Provided for compatibility with input_rgb_image_pyramid's interface + bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); } + drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; } + drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; } + }; + +// ---------------------------------------------------------------------------------------- + + template <size_t NR, size_t NC=NR> + class input_rgb_image_sized + { + /*! + WHAT THIS OBJECT REPRESENTS + This layer has an interface and behavior identical to input_rgb_image + except that it requires input images to have NR rows and NC columns. This + is checked by a DLIB_CASSERT inside to_tensor(). + + You can also convert between input_rgb_image and input_rgb_image_sized by + copy construction or assignment. + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename PYRAMID_TYPE + > + class input_rgb_image_pyramid + { + /*! + REQUIREMENTS ON PYRAMID_TYPE + PYRAMID_TYPE must be an instance of the dlib::pyramid_down template. + + WHAT THIS OBJECT REPRESENTS + This input layer works with RGB images of type matrix<rgb_pixel>. It is + identical to input_rgb_image except that it outputs a tensor containing a + tiled image pyramid of each input image rather than a simple copy of each + image. The tiled image pyramid is created using create_tiled_pyramid(). + !*/ + + public: + + typedef matrix<rgb_pixel> input_type; + typedef PYRAMID_TYPE pyramid_type; + + input_rgb_image_pyramid ( + ); + /*! + ensures + - #get_avg_red() == 122.782 + - #get_avg_green() == 117.001 + - #get_avg_blue() == 104.298 + - #get_pyramid_padding() == 10 + - #get_pyramid_outer_padding() == 11 + !*/ + + input_rgb_image_pyramid ( + float avg_red, + float avg_green, + float avg_blue + ); + /*! + ensures + - #get_avg_red() == avg_red + - #get_avg_green() == avg_green + - #get_avg_blue() == avg_blue + - #get_pyramid_padding() == 10 + - #get_pyramid_outer_padding() == 11 + !*/ + + float get_avg_red( + ) const; + /*! + ensures + - returns the value subtracted from the red color channel. + !*/ + + float get_avg_green( + ) const; + /*! + ensures + - returns the value subtracted from the green color channel. + !*/ + + float get_avg_blue( + ) const; + /*! + ensures + - returns the value subtracted from the blue color channel. + !*/ + + unsigned long get_pyramid_padding ( + ) const; + /*! + ensures + - When this object creates a pyramid it will call create_tiled_pyramid() and + set create_tiled_pyramid's pyramid_padding parameter to get_pyramid_padding(). + !*/ + void set_pyramid_padding ( + unsigned long value + ); + /*! + ensures + - #get_pyramid_padding() == value + !*/ + + unsigned long get_pyramid_outer_padding ( + ) const; + /*! + ensures + - When this object creates a pyramid it will call create_tiled_pyramid() + and set create_tiled_pyramid's pyramid_outer_padding parameter to + get_pyramid_outer_padding(). + !*/ + void set_pyramid_outer_padding ( + unsigned long value + ); + /*! + ensures + - #get_pyramid_outer_padding() == value + !*/ + + template <typename forward_iterator> + void to_tensor ( + forward_iterator ibegin, + forward_iterator iend, + resizable_tensor& data + ) const; + /*! + requires + - [ibegin, iend) is an iterator range over input_type objects. 
+ - std::distance(ibegin,iend) > 0 + - The input range should contain images that all have the same + dimensions. + ensures + - Converts the iterator range into a tensor and stores it into #data. In + particular, we will have: + - #data.num_samples() == std::distance(ibegin,iend) + - #data.k() == 3 + - Each sample in #data contains a tiled image pyramid of the + corresponding input image. The tiled pyramid is created by + create_tiled_pyramid(). + Moreover, each color channel is normalized by having its average value + subtracted (according to get_avg_red(), get_avg_green(), or + get_avg_blue()) and then is divided by 256.0. + !*/ + + bool image_contained_point ( + const tensor& data, + const point& p + ) const; + /*! + requires + - data is a tensor that was produced by this->to_tensor() + ensures + - Since data is a tensor that is built from a bunch of identically sized + images, we can ask if those images were big enough to contain the point + p. This function returns the answer to that question. + !*/ + + drectangle image_space_to_tensor_space ( + const tensor& data, + double scale, + drectangle r + ) const; + /*! + requires + - data is a tensor that was produced by this->to_tensor() + - 0 < scale <= 1 + ensures + - This function maps from to_tensor()'s input image space to its output + tensor space. Therefore, given that data is a tensor produced by + to_tensor(), image_space_to_tensor_space() allows you to ask for the + rectangle in data that corresponds to a rectangle in the original image + space. + + Note that since the output tensor contains an image pyramid, there are + multiple points in the output tensor that correspond to any input + location. So you must also specify a scale so we know what level of the + pyramid is needed. So given a rectangle r in an input image, you can + ask, what rectangle in data corresponds to r when things are scale times + smaller? That rectangle is returned by this function. + - A scale of 1 means we don't move anywhere in the pyramid scale space relative + to the input image while smaller values of scale mean we move down the + pyramid. + !*/ + + drectangle tensor_space_to_image_space ( + const tensor& data, + drectangle r + ) const; + /*! + requires + - data is a tensor that was produced by this->to_tensor() + ensures + - This function maps from to_tensor()'s output tensor space to its input + image space. Therefore, given that data is a tensor produced by + to_tensor(), tensor_space_to_image_space() allows you to ask for the + rectangle in the input image that corresponds to a rectangle in data. + - It should be noted that this function isn't always an inverse of + image_space_to_tensor_space(). This is because you can ask + image_space_to_tensor_space() for the coordinates of points outside the input + image and they will be mapped to somewhere that doesn't have an inverse. + But for points actually inside the input image this function performs an + approximate inverse mapping. I.e. when image_contained_point(data,center(r))==true + there is an approximate inverse. + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_INPUT_ABSTRACT_H_ + diff --git a/ml/dlib/dlib/dnn/layers.h b/ml/dlib/dlib/dnn/layers.h new file mode 100644 index 000000000..91436f635 --- /dev/null +++ b/ml/dlib/dlib/dnn/layers.h @@ -0,0 +1,3244 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
+#ifndef DLIB_DNn_LAYERS_H_ +#define DLIB_DNn_LAYERS_H_ + +#include "layers_abstract.h" +#include "tensor.h" +#include "core.h" +#include <iostream> +#include <string> +#include "../rand.h" +#include "../string.h" +#include "tensor_tools.h" +#include "../vectorstream.h" +#include "utilities.h" +#include <sstream> + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + struct num_con_outputs + { + num_con_outputs(unsigned long n) : num_outputs(n) {} + unsigned long num_outputs; + }; + + template < + long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y = _stride_y!=1? 0 : _nr/2, + int _padding_x = _stride_x!=1? 0 : _nc/2 + > + class con_ + { + public: + + static_assert(_num_filters > 0, "The number of filters must be > 0"); + static_assert(_nr >= 0, "The number of rows in a filter must be >= 0"); + static_assert(_nc >= 0, "The number of columns in a filter must be >= 0"); + static_assert(_stride_y > 0, "The filter stride must be > 0"); + static_assert(_stride_x > 0, "The filter stride must be > 0"); + static_assert(_nr==0 || (0 <= _padding_y && _padding_y < _nr), "The padding must be smaller than the filter size."); + static_assert(_nc==0 || (0 <= _padding_x && _padding_x < _nc), "The padding must be smaller than the filter size."); + static_assert(_nr!=0 || 0 == _padding_y, "If _nr==0 then the padding must be set to 0 as well."); + static_assert(_nc!=0 || 0 == _padding_x, "If _nc==0 then the padding must be set to 0 as well."); + + con_( + num_con_outputs o + ) : + learning_rate_multiplier(1), + weight_decay_multiplier(1), + bias_learning_rate_multiplier(1), + bias_weight_decay_multiplier(0), + num_filters_(o.num_outputs), + padding_y_(_padding_y), + padding_x_(_padding_x) + { + DLIB_CASSERT(num_filters_ > 0); + } + + con_() : con_(num_con_outputs(_num_filters)) {} + + long num_filters() const { return num_filters_; } + long nr() const + { + if (_nr==0) + return filters.nr(); + else + return _nr; + } + long nc() const + { + if (_nc==0) + return filters.nc(); + else + return _nc; + } + long stride_y() const { return _stride_y; } + long stride_x() const { return _stride_x; } + long padding_y() const { return padding_y_; } + long padding_x() const { return padding_x_; } + + void set_num_filters(long num) + { + DLIB_CASSERT(num > 0); + if (num != num_filters_) + { + DLIB_CASSERT(get_layer_params().size() == 0, + "You can't change the number of filters in con_ if the parameter tensor has already been allocated."); + num_filters_ = num; + } + } + + double get_learning_rate_multiplier () const { return learning_rate_multiplier; } + double get_weight_decay_multiplier () const { return weight_decay_multiplier; } + void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; } + void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; } + + double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; } + double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; } + void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; } + void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; } + + inline dpoint map_input_to_output ( + dpoint p + ) const + { + p.x() = (p.x()+padding_x()-nc()/2)/stride_x(); + p.y() = (p.y()+padding_y()-nr()/2)/stride_y(); + return p; + } + + inline dpoint map_output_to_input ( + dpoint p + ) const + { + p.x() = 
p.x()*stride_x() - padding_x() + nc()/2; + p.y() = p.y()*stride_y() - padding_y() + nr()/2; + return p; + } + + con_ ( + const con_& item + ) : + params(item.params), + filters(item.filters), + biases(item.biases), + learning_rate_multiplier(item.learning_rate_multiplier), + weight_decay_multiplier(item.weight_decay_multiplier), + bias_learning_rate_multiplier(item.bias_learning_rate_multiplier), + bias_weight_decay_multiplier(item.bias_weight_decay_multiplier), + num_filters_(item.num_filters_), + padding_y_(item.padding_y_), + padding_x_(item.padding_x_) + { + // this->conv is non-copyable and basically stateless, so we have to write our + // own copy to avoid trying to copy it and getting an error. + } + + con_& operator= ( + const con_& item + ) + { + if (this == &item) + return *this; + + // this->conv is non-copyable and basically stateless, so we have to write our + // own copy to avoid trying to copy it and getting an error. + params = item.params; + filters = item.filters; + biases = item.biases; + padding_y_ = item.padding_y_; + padding_x_ = item.padding_x_; + learning_rate_multiplier = item.learning_rate_multiplier; + weight_decay_multiplier = item.weight_decay_multiplier; + bias_learning_rate_multiplier = item.bias_learning_rate_multiplier; + bias_weight_decay_multiplier = item.bias_weight_decay_multiplier; + num_filters_ = item.num_filters_; + return *this; + } + + template <typename SUBNET> + void setup (const SUBNET& sub) + { + const long filt_nr = _nr!=0 ? _nr : sub.get_output().nr(); + const long filt_nc = _nc!=0 ? _nc : sub.get_output().nc(); + + long num_inputs = filt_nr*filt_nc*sub.get_output().k(); + long num_outputs = num_filters_; + // allocate params for the filters and also for the filter bias values. + params.set_size(num_inputs*num_filters_ + num_filters_); + + dlib::rand rnd(std::rand()); + randomize_parameters(params, num_inputs+num_outputs, rnd); + + filters = alias_tensor(num_filters_, sub.get_output().k(), filt_nr, filt_nc); + biases = alias_tensor(1,num_filters_); + + // set the initial bias values to zero + biases(params,filters.size()) = 0; + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + conv.setup(sub.get_output(), + filters(params,0), + _stride_y, + _stride_x, + padding_y_, + padding_x_); + conv(false, output, + sub.get_output(), + filters(params,0)); + + tt::add(1,output,1,biases(params,filters.size())); + } + + template <typename SUBNET> + void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad) + { + conv.get_gradient_for_data (true, gradient_input, filters(params,0), sub.get_gradient_input()); + // no point computing the parameter gradients if they won't be used.
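+            // When it is non-zero we also accumulate the filter gradient (computed
+            // from the layer's input and gradient_input) and the bias gradient, which
+            // is gradient_input summed over everything except the channel dimension.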
+ if (learning_rate_multiplier != 0) + { + auto filt = filters(params_grad,0); + conv.get_gradient_for_filters (false, gradient_input, sub.get_output(), filt); + auto b = biases(params_grad, filters.size()); + tt::assign_conv_bias_gradient(b, gradient_input); + } + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const con_& item, std::ostream& out) + { + serialize("con_4", out); + serialize(item.params, out); + serialize(item.num_filters_, out); + serialize(_nr, out); + serialize(_nc, out); + serialize(_stride_y, out); + serialize(_stride_x, out); + serialize(item.padding_y_, out); + serialize(item.padding_x_, out); + serialize(item.filters, out); + serialize(item.biases, out); + serialize(item.learning_rate_multiplier, out); + serialize(item.weight_decay_multiplier, out); + serialize(item.bias_learning_rate_multiplier, out); + serialize(item.bias_weight_decay_multiplier, out); + } + + friend void deserialize(con_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + long nr; + long nc; + int stride_y; + int stride_x; + if (version == "con_4") + { + deserialize(item.params, in); + deserialize(item.num_filters_, in); + deserialize(nr, in); + deserialize(nc, in); + deserialize(stride_y, in); + deserialize(stride_x, in); + deserialize(item.padding_y_, in); + deserialize(item.padding_x_, in); + deserialize(item.filters, in); + deserialize(item.biases, in); + deserialize(item.learning_rate_multiplier, in); + deserialize(item.weight_decay_multiplier, in); + deserialize(item.bias_learning_rate_multiplier, in); + deserialize(item.bias_weight_decay_multiplier, in); + if (item.padding_y_ != _padding_y) throw serialization_error("Wrong padding_y found while deserializing dlib::con_"); + if (item.padding_x_ != _padding_x) throw serialization_error("Wrong padding_x found while deserializing dlib::con_"); + if (nr != _nr) throw serialization_error("Wrong nr found while deserializing dlib::con_"); + if (nc != _nc) throw serialization_error("Wrong nc found while deserializing dlib::con_"); + if (stride_y != _stride_y) throw serialization_error("Wrong stride_y found while deserializing dlib::con_"); + if (stride_x != _stride_x) throw serialization_error("Wrong stride_x found while deserializing dlib::con_"); + } + else + { + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::con_."); + } + } + + + friend std::ostream& operator<<(std::ostream& out, const con_& item) + { + out << "con\t (" + << "num_filters="<<item.num_filters_ + << ", nr="<<item.nr() + << ", nc="<<item.nc() + << ", stride_y="<<_stride_y + << ", stride_x="<<_stride_x + << ", padding_y="<<item.padding_y_ + << ", padding_x="<<item.padding_x_ + << ")"; + out << " learning_rate_mult="<<item.learning_rate_multiplier; + out << " weight_decay_mult="<<item.weight_decay_multiplier; + out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier; + out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier; + return out; + } + + friend void to_xml(const con_& item, std::ostream& out) + { + out << "<con" + << " num_filters='"<<item.num_filters_<<"'" + << " nr='"<<item.nr()<<"'" + << " nc='"<<item.nc()<<"'" + << " stride_y='"<<_stride_y<<"'" + << " stride_x='"<<_stride_x<<"'" + << " padding_y='"<<item.padding_y_<<"'" + << " padding_x='"<<item.padding_x_<<"'" + << " learning_rate_mult='"<<item.learning_rate_multiplier<<"'" + << " 
weight_decay_mult='"<<item.weight_decay_multiplier<<"'" + << " bias_learning_rate_mult='"<<item.bias_learning_rate_multiplier<<"'" + << " bias_weight_decay_mult='"<<item.bias_weight_decay_multiplier<<"'>\n"; + out << mat(item.params); + out << "</con>"; + } + + private: + + resizable_tensor params; + alias_tensor filters, biases; + + tt::tensor_conv conv; + double learning_rate_multiplier; + double weight_decay_multiplier; + double bias_learning_rate_multiplier; + double bias_weight_decay_multiplier; + long num_filters_; + + // These are here only because older versions of con (which you might encounter + // serialized to disk) used different padding settings. + int padding_y_; + int padding_x_; + + }; + + template < + long num_filters, + long nr, + long nc, + int stride_y, + int stride_x, + typename SUBNET + > + using con = add_layer<con_<num_filters,nr,nc,stride_y,stride_x>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y = _stride_y!=1? 0 : _nr/2, + int _padding_x = _stride_x!=1? 0 : _nc/2 + > + class cont_ + { + public: + + static_assert(_num_filters > 0, "The number of filters must be > 0"); + static_assert(_nr > 0, "The number of rows in a filter must be > 0"); + static_assert(_nc > 0, "The number of columns in a filter must be > 0"); + static_assert(_stride_y > 0, "The filter stride must be > 0"); + static_assert(_stride_x > 0, "The filter stride must be > 0"); + static_assert(0 <= _padding_y && _padding_y < _nr, "The padding must be smaller than the filter size."); + static_assert(0 <= _padding_x && _padding_x < _nc, "The padding must be smaller than the filter size."); + + cont_( + num_con_outputs o + ) : + learning_rate_multiplier(1), + weight_decay_multiplier(1), + bias_learning_rate_multiplier(1), + bias_weight_decay_multiplier(0), + num_filters_(o.num_outputs), + padding_y_(_padding_y), + padding_x_(_padding_x) + { + DLIB_CASSERT(num_filters_ > 0); + } + + cont_() : cont_(num_con_outputs(_num_filters)) {} + + long num_filters() const { return num_filters_; } + long nr() const { return _nr; } + long nc() const { return _nc; } + long stride_y() const { return _stride_y; } + long stride_x() const { return _stride_x; } + long padding_y() const { return padding_y_; } + long padding_x() const { return padding_x_; } + + void set_num_filters(long num) + { + DLIB_CASSERT(num > 0); + if (num != num_filters_) + { + DLIB_CASSERT(get_layer_params().size() == 0, + "You can't change the number of filters in cont_ if the parameter tensor has already been allocated."); + num_filters_ = num; + } + } + + double get_learning_rate_multiplier () const { return learning_rate_multiplier; } + double get_weight_decay_multiplier () const { return weight_decay_multiplier; } + void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; } + void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; } + + double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; } + double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; } + void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; } + void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; } + + inline dpoint map_output_to_input ( + dpoint p + ) const + { + p.x() = (p.x()+padding_x()-nc()/2)/stride_x(); + p.y() = 
(p.y()+padding_y()-nr()/2)/stride_y(); + return p; + } + + inline dpoint map_input_to_output ( + dpoint p + ) const + { + p.x() = p.x()*stride_x() - padding_x() + nc()/2; + p.y() = p.y()*stride_y() - padding_y() + nr()/2; + return p; + } + + cont_ ( + const cont_& item + ) : + params(item.params), + filters(item.filters), + biases(item.biases), + learning_rate_multiplier(item.learning_rate_multiplier), + weight_decay_multiplier(item.weight_decay_multiplier), + bias_learning_rate_multiplier(item.bias_learning_rate_multiplier), + bias_weight_decay_multiplier(item.bias_weight_decay_multiplier), + num_filters_(item.num_filters_), + padding_y_(item.padding_y_), + padding_x_(item.padding_x_) + { + // this->conv is non-copyable and basically stateless, so we have to write our + // own copy to avoid trying to copy it and getting an error. + } + + cont_& operator= ( + const cont_& item + ) + { + if (this == &item) + return *this; + + // this->conv is non-copyable and basically stateless, so we have to write our + // own copy to avoid trying to copy it and getting an error. + params = item.params; + filters = item.filters; + biases = item.biases; + padding_y_ = item.padding_y_; + padding_x_ = item.padding_x_; + learning_rate_multiplier = item.learning_rate_multiplier; + weight_decay_multiplier = item.weight_decay_multiplier; + bias_learning_rate_multiplier = item.bias_learning_rate_multiplier; + bias_weight_decay_multiplier = item.bias_weight_decay_multiplier; + num_filters_ = item.num_filters_; + return *this; + } + + template <typename SUBNET> + void setup (const SUBNET& sub) + { + long num_inputs = _nr*_nc*sub.get_output().k(); + long num_outputs = num_filters_; + // allocate params for the filters and also for the filter bias values. + params.set_size(num_inputs*num_filters_ + num_filters_); + + dlib::rand rnd(std::rand()); + randomize_parameters(params, num_inputs+num_outputs, rnd); + + filters = alias_tensor(sub.get_output().k(), num_filters_, _nr, _nc); + biases = alias_tensor(1,num_filters_); + + // set the initial bias values to zero + biases(params,filters.size()) = 0; + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + auto filt = filters(params,0); + unsigned int gnr = _stride_y * (sub.get_output().nr() - 1) + filt.nr() - 2 * padding_y_; + unsigned int gnc = _stride_x * (sub.get_output().nc() - 1) + filt.nc() - 2 * padding_x_; + unsigned int gnsamps = sub.get_output().num_samples(); + unsigned int gk = filt.k(); + output.set_size(gnsamps,gk,gnr,gnc); + conv.setup(output,filt,_stride_y,_stride_x,padding_y_,padding_x_); + conv.get_gradient_for_data(false, sub.get_output(),filt,output); + tt::add(1,output,1,biases(params,filters.size())); + } + + template <typename SUBNET> + void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad) + { + auto filt = filters(params,0); + conv(true, sub.get_gradient_input(),gradient_input, filt); + // no point computing the parameter gradients if they won't be used. 
+ if (learning_rate_multiplier != 0) + { + auto filt = filters(params_grad,0); + conv.get_gradient_for_filters (false, sub.get_output(),gradient_input, filt); + auto b = biases(params_grad, filters.size()); + tt::assign_conv_bias_gradient(b, gradient_input); + } + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const cont_& item, std::ostream& out) + { + serialize("cont_1", out); + serialize(item.params, out); + serialize(item.num_filters_, out); + serialize(_nr, out); + serialize(_nc, out); + serialize(_stride_y, out); + serialize(_stride_x, out); + serialize(item.padding_y_, out); + serialize(item.padding_x_, out); + serialize(item.filters, out); + serialize(item.biases, out); + serialize(item.learning_rate_multiplier, out); + serialize(item.weight_decay_multiplier, out); + serialize(item.bias_learning_rate_multiplier, out); + serialize(item.bias_weight_decay_multiplier, out); + } + + friend void deserialize(cont_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + long nr; + long nc; + int stride_y; + int stride_x; + if (version == "cont_1") + { + deserialize(item.params, in); + deserialize(item.num_filters_, in); + deserialize(nr, in); + deserialize(nc, in); + deserialize(stride_y, in); + deserialize(stride_x, in); + deserialize(item.padding_y_, in); + deserialize(item.padding_x_, in); + deserialize(item.filters, in); + deserialize(item.biases, in); + deserialize(item.learning_rate_multiplier, in); + deserialize(item.weight_decay_multiplier, in); + deserialize(item.bias_learning_rate_multiplier, in); + deserialize(item.bias_weight_decay_multiplier, in); + if (item.padding_y_ != _padding_y) throw serialization_error("Wrong padding_y found while deserializing dlib::cont_"); + if (item.padding_x_ != _padding_x) throw serialization_error("Wrong padding_x found while deserializing dlib::cont_"); + if (nr != _nr) throw serialization_error("Wrong nr found while deserializing dlib::cont_"); + if (nc != _nc) throw serialization_error("Wrong nc found while deserializing dlib::cont_"); + if (stride_y != _stride_y) throw serialization_error("Wrong stride_y found while deserializing dlib::cont_"); + if (stride_x != _stride_x) throw serialization_error("Wrong stride_x found while deserializing dlib::cont_"); + } + else + { + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::cont_."); + } + } + + + friend std::ostream& operator<<(std::ostream& out, const cont_& item) + { + out << "cont\t (" + << "num_filters="<<item.num_filters_ + << ", nr="<<_nr + << ", nc="<<_nc + << ", stride_y="<<_stride_y + << ", stride_x="<<_stride_x + << ", padding_y="<<item.padding_y_ + << ", padding_x="<<item.padding_x_ + << ")"; + out << " learning_rate_mult="<<item.learning_rate_multiplier; + out << " weight_decay_mult="<<item.weight_decay_multiplier; + out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier; + out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier; + return out; + } + + friend void to_xml(const cont_& item, std::ostream& out) + { + out << "<cont" + << " num_filters='"<<item.num_filters_<<"'" + << " nr='"<<_nr<<"'" + << " nc='"<<_nc<<"'" + << " stride_y='"<<_stride_y<<"'" + << " stride_x='"<<_stride_x<<"'" + << " padding_y='"<<item.padding_y_<<"'" + << " padding_x='"<<item.padding_x_<<"'" + << " learning_rate_mult='"<<item.learning_rate_multiplier<<"'" + << " weight_decay_mult='"<<item.weight_decay_multiplier<<"'" + << " 
bias_learning_rate_mult='"<<item.bias_learning_rate_multiplier<<"'" + << " bias_weight_decay_mult='"<<item.bias_weight_decay_multiplier<<"'>\n"; + out << mat(item.params); + out << "</cont>"; + } + + private: + + resizable_tensor params; + alias_tensor filters, biases; + + tt::tensor_conv conv; + double learning_rate_multiplier; + double weight_decay_multiplier; + double bias_learning_rate_multiplier; + double bias_weight_decay_multiplier; + long num_filters_; + + int padding_y_; + int padding_x_; + + }; + + template < + long num_filters, + long nr, + long nc, + int stride_y, + int stride_x, + typename SUBNET + > + using cont = add_layer<cont_<num_filters,nr,nc,stride_y,stride_x>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + int scale_y, + int scale_x + > + class upsample_ + { + public: + static_assert(scale_y >= 1, "upsampling scale factor can't be less than 1."); + static_assert(scale_x >= 1, "upsampling scale factor can't be less than 1."); + + upsample_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + output.set_size( + sub.get_output().num_samples(), + sub.get_output().k(), + scale_y*sub.get_output().nr(), + scale_x*sub.get_output().nc()); + tt::resize_bilinear(output, sub.get_output()); + } + + template <typename SUBNET> + void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + tt::resize_bilinear_gradient(sub.get_gradient_input(), gradient_input); + } + + inline dpoint map_input_to_output (dpoint p) const + { + p.x() = p.x()*scale_x; + p.y() = p.y()*scale_y; + return p; + } + inline dpoint map_output_to_input (dpoint p) const + { + p.x() = p.x()/scale_x; + p.y() = p.y()/scale_y; + return p; + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const upsample_& , std::ostream& out) + { + serialize("upsample_", out); + serialize(scale_y, out); + serialize(scale_x, out); + } + + friend void deserialize(upsample_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "upsample_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::upsample_."); + + int _scale_y; + int _scale_x; + deserialize(_scale_y, in); + deserialize(_scale_x, in); + if (_scale_y != scale_y || _scale_x != scale_x) + throw serialization_error("Wrong scale found while deserializing dlib::upsample_"); + } + + friend std::ostream& operator<<(std::ostream& out, const upsample_& ) + { + out << "upsample\t (" + << "scale_y="<<scale_y + << ", scale_x="<<scale_x + << ")"; + return out; + } + + friend void to_xml(const upsample_& /*item*/, std::ostream& out) + { + out << "<upsample" + << " scale_y='"<<scale_y<<"'" + << " scale_x='"<<scale_x<<"'/>\n"; + } + + private: + resizable_tensor params; + }; + + template < + int scale, + typename SUBNET + > + using upsample = add_layer<upsample_<scale,scale>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y = _stride_y!=1? 0 : _nr/2, + int _padding_x = _stride_x!=1? 
0 : _nc/2 + > + class max_pool_ + { + static_assert(_nr >= 0, "The number of rows in a filter must be >= 0"); + static_assert(_nc >= 0, "The number of columns in a filter must be >= 0"); + static_assert(_stride_y > 0, "The filter stride must be > 0"); + static_assert(_stride_x > 0, "The filter stride must be > 0"); + static_assert(0 <= _padding_y && ((_nr==0 && _padding_y == 0) || (_nr!=0 && _padding_y < _nr)), + "The padding must be smaller than the filter size, unless the filter size is 0."); + static_assert(0 <= _padding_x && ((_nc==0 && _padding_x == 0) || (_nc!=0 && _padding_x < _nc)), + "The padding must be smaller than the filter size, unless the filter size is 0."); + public: + + + max_pool_( + ) : + padding_y_(_padding_y), + padding_x_(_padding_x) + {} + + long nr() const { return _nr; } + long nc() const { return _nc; } + long stride_y() const { return _stride_y; } + long stride_x() const { return _stride_x; } + long padding_y() const { return padding_y_; } + long padding_x() const { return padding_x_; } + + inline dpoint map_input_to_output ( + dpoint p + ) const + { + p.x() = (p.x()+padding_x()-nc()/2)/stride_x(); + p.y() = (p.y()+padding_y()-nr()/2)/stride_y(); + return p; + } + + inline dpoint map_output_to_input ( + dpoint p + ) const + { + p.x() = p.x()*stride_x() - padding_x() + nc()/2; + p.y() = p.y()*stride_y() - padding_y() + nr()/2; + return p; + } + + max_pool_ ( + const max_pool_& item + ) : + padding_y_(item.padding_y_), + padding_x_(item.padding_x_) + { + // this->mp is non-copyable so we have to write our own copy to avoid trying to + // copy it and getting an error. + } + + max_pool_& operator= ( + const max_pool_& item + ) + { + if (this == &item) + return *this; + + padding_y_ = item.padding_y_; + padding_x_ = item.padding_x_; + + // this->mp is non-copyable so we have to write our own copy to avoid trying to + // copy it and getting an error.
+ return *this; + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + mp.setup_max_pooling(_nr!=0?_nr:sub.get_output().nr(), + _nc!=0?_nc:sub.get_output().nc(), + _stride_y, _stride_x, padding_y_, padding_x_); + + mp(output, sub.get_output()); + } + + template <typename SUBNET> + void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + mp.setup_max_pooling(_nr!=0?_nr:sub.get_output().nr(), + _nc!=0?_nc:sub.get_output().nc(), + _stride_y, _stride_x, padding_y_, padding_x_); + + mp.get_gradient(gradient_input, computed_output, sub.get_output(), sub.get_gradient_input()); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const max_pool_& item, std::ostream& out) + { + serialize("max_pool_2", out); + serialize(_nr, out); + serialize(_nc, out); + serialize(_stride_y, out); + serialize(_stride_x, out); + serialize(item.padding_y_, out); + serialize(item.padding_x_, out); + } + + friend void deserialize(max_pool_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + long nr; + long nc; + int stride_y; + int stride_x; + if (version == "max_pool_2") + { + deserialize(nr, in); + deserialize(nc, in); + deserialize(stride_y, in); + deserialize(stride_x, in); + deserialize(item.padding_y_, in); + deserialize(item.padding_x_, in); + } + else + { + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::max_pool_."); + } + + if (item.padding_y_ != _padding_y) throw serialization_error("Wrong padding_y found while deserializing dlib::max_pool_"); + if (item.padding_x_ != _padding_x) throw serialization_error("Wrong padding_x found while deserializing dlib::max_pool_"); + if (_nr != nr) throw serialization_error("Wrong nr found while deserializing dlib::max_pool_"); + if (_nc != nc) throw serialization_error("Wrong nc found while deserializing dlib::max_pool_"); + if (_stride_y != stride_y) throw serialization_error("Wrong stride_y found while deserializing dlib::max_pool_"); + if (_stride_x != stride_x) throw serialization_error("Wrong stride_x found while deserializing dlib::max_pool_"); + } + + friend std::ostream& operator<<(std::ostream& out, const max_pool_& item) + { + out << "max_pool (" + << "nr="<<_nr + << ", nc="<<_nc + << ", stride_y="<<_stride_y + << ", stride_x="<<_stride_x + << ", padding_y="<<item.padding_y_ + << ", padding_x="<<item.padding_x_ + << ")"; + return out; + } + + friend void to_xml(const max_pool_& item, std::ostream& out) + { + out << "<max_pool" + << " nr='"<<_nr<<"'" + << " nc='"<<_nc<<"'" + << " stride_y='"<<_stride_y<<"'" + << " stride_x='"<<_stride_x<<"'" + << " padding_y='"<<item.padding_y_<<"'" + << " padding_x='"<<item.padding_x_<<"'" + << "/>\n"; + } + + + private: + + + tt::pooling mp; + resizable_tensor params; + + int padding_y_; + int padding_x_; + }; + + template < + long nr, + long nc, + int stride_y, + int stride_x, + typename SUBNET + > + using max_pool = add_layer<max_pool_<nr,nc,stride_y,stride_x>, SUBNET>; + + template < + typename SUBNET + > + using max_pool_everything = add_layer<max_pool_<0,0,1,1>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y = _stride_y!=1? 
0 : _nr/2, + int _padding_x = _stride_x!=1? 0 : _nc/2 + > + class avg_pool_ + { + public: + static_assert(_nr >= 0, "The number of rows in a filter must be >= 0"); + static_assert(_nc >= 0, "The number of columns in a filter must be >= 0"); + static_assert(_stride_y > 0, "The filter stride must be > 0"); + static_assert(_stride_x > 0, "The filter stride must be > 0"); + static_assert(0 <= _padding_y && ((_nr==0 && _padding_y == 0) || (_nr!=0 && _padding_y < _nr)), + "The padding must be smaller than the filter size, unless the filter size is 0."); + static_assert(0 <= _padding_x && ((_nc==0 && _padding_x == 0) || (_nc!=0 && _padding_x < _nc)), + "The padding must be smaller than the filter size, unless the filter size is 0."); + + avg_pool_( + ) : + padding_y_(_padding_y), + padding_x_(_padding_x) + {} + + long nr() const { return _nr; } + long nc() const { return _nc; } + long stride_y() const { return _stride_y; } + long stride_x() const { return _stride_x; } + long padding_y() const { return padding_y_; } + long padding_x() const { return padding_x_; } + + inline dpoint map_input_to_output ( + dpoint p + ) const + { + p.x() = (p.x()+padding_x()-nc()/2)/stride_x(); + p.y() = (p.y()+padding_y()-nr()/2)/stride_y(); + return p; + } + + inline dpoint map_output_to_input ( + dpoint p + ) const + { + p.x() = p.x()*stride_x() - padding_x() + nc()/2; + p.y() = p.y()*stride_y() - padding_y() + nr()/2; + return p; + } + + avg_pool_ ( + const avg_pool_& item + ) : + padding_y_(item.padding_y_), + padding_x_(item.padding_x_) + { + // this->ap is non-copyable so we have to write our own copy to avoid trying to + // copy it and getting an error. + } + + avg_pool_& operator= ( + const avg_pool_& item + ) + { + if (this == &item) + return *this; + + padding_y_ = item.padding_y_; + padding_x_ = item.padding_x_; + + // this->ap is non-copyable so we have to write our own copy to avoid trying to + // copy it and getting an error.
+ return *this; + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + ap.setup_avg_pooling(_nr!=0?_nr:sub.get_output().nr(), + _nc!=0?_nc:sub.get_output().nc(), + _stride_y, _stride_x, padding_y_, padding_x_); + + ap(output, sub.get_output()); + } + + template <typename SUBNET> + void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + ap.setup_avg_pooling(_nr!=0?_nr:sub.get_output().nr(), + _nc!=0?_nc:sub.get_output().nc(), + _stride_y, _stride_x, padding_y_, padding_x_); + + ap.get_gradient(gradient_input, computed_output, sub.get_output(), sub.get_gradient_input()); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const avg_pool_& item, std::ostream& out) + { + serialize("avg_pool_2", out); + serialize(_nr, out); + serialize(_nc, out); + serialize(_stride_y, out); + serialize(_stride_x, out); + serialize(item.padding_y_, out); + serialize(item.padding_x_, out); + } + + friend void deserialize(avg_pool_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + + long nr; + long nc; + int stride_y; + int stride_x; + if (version == "avg_pool_2") + { + deserialize(nr, in); + deserialize(nc, in); + deserialize(stride_y, in); + deserialize(stride_x, in); + deserialize(item.padding_y_, in); + deserialize(item.padding_x_, in); + } + else + { + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::avg_pool_."); + } + + if (item.padding_y_ != _padding_y) throw serialization_error("Wrong padding_y found while deserializing dlib::avg_pool_"); + if (item.padding_x_ != _padding_x) throw serialization_error("Wrong padding_x found while deserializing dlib::avg_pool_"); + if (_nr != nr) throw serialization_error("Wrong nr found while deserializing dlib::avg_pool_"); + if (_nc != nc) throw serialization_error("Wrong nc found while deserializing dlib::avg_pool_"); + if (_stride_y != stride_y) throw serialization_error("Wrong stride_y found while deserializing dlib::avg_pool_"); + if (_stride_x != stride_x) throw serialization_error("Wrong stride_x found while deserializing dlib::avg_pool_"); + } + + friend std::ostream& operator<<(std::ostream& out, const avg_pool_& item) + { + out << "avg_pool (" + << "nr="<<_nr + << ", nc="<<_nc + << ", stride_y="<<_stride_y + << ", stride_x="<<_stride_x + << ", padding_y="<<item.padding_y_ + << ", padding_x="<<item.padding_x_ + << ")"; + return out; + } + + friend void to_xml(const avg_pool_& item, std::ostream& out) + { + out << "<avg_pool" + << " nr='"<<_nr<<"'" + << " nc='"<<_nc<<"'" + << " stride_y='"<<_stride_y<<"'" + << " stride_x='"<<_stride_x<<"'" + << " padding_y='"<<item.padding_y_<<"'" + << " padding_x='"<<item.padding_x_<<"'" + << "/>\n"; + } + private: + + tt::pooling ap; + resizable_tensor params; + + int padding_y_; + int padding_x_; + }; + + template < + long nr, + long nc, + int stride_y, + int stride_x, + typename SUBNET + > + using avg_pool = add_layer<avg_pool_<nr,nc,stride_y,stride_x>, SUBNET>; + + template < + typename SUBNET + > + using avg_pool_everything = add_layer<avg_pool_<0,0,1,1>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + enum layer_mode + { + CONV_MODE = 0, + FC_MODE = 1 + }; + + const double DEFAULT_BATCH_NORM_EPS = 0.0001; + + template < + 
layer_mode mode + > + class bn_ + { + public: + explicit bn_( + unsigned long window_size, + double eps_ = DEFAULT_BATCH_NORM_EPS + ) : + num_updates(0), + running_stats_window_size(window_size), + learning_rate_multiplier(1), + weight_decay_multiplier(0), + bias_learning_rate_multiplier(1), + bias_weight_decay_multiplier(1), + eps(eps_) + { + DLIB_CASSERT(window_size > 0, "The batch normalization running stats window size can't be 0."); + } + + bn_() : bn_(100) {} + + layer_mode get_mode() const { return mode; } + unsigned long get_running_stats_window_size () const { return running_stats_window_size; } + void set_running_stats_window_size (unsigned long new_window_size ) + { + DLIB_CASSERT(new_window_size > 0, "The batch normalization running stats window size can't be 0."); + running_stats_window_size = new_window_size; + } + double get_eps() const { return eps; } + + double get_learning_rate_multiplier () const { return learning_rate_multiplier; } + double get_weight_decay_multiplier () const { return weight_decay_multiplier; } + void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; } + void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; } + + double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; } + double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; } + void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; } + void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + + template <typename SUBNET> + void setup (const SUBNET& sub) + { + if (mode == FC_MODE) + { + gamma = alias_tensor(1, + sub.get_output().k(), + sub.get_output().nr(), + sub.get_output().nc()); + } + else + { + gamma = alias_tensor(1, sub.get_output().k()); + } + beta = gamma; + + params.set_size(gamma.size()+beta.size()); + + gamma(params,0) = 1; + beta(params,gamma.size()) = 0; + + running_means.copy_size(gamma(params,0)); + running_variances.copy_size(gamma(params,0)); + running_means = 0; + running_variances = 1; + num_updates = 0; + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + auto g = gamma(params,0); + auto b = beta(params,gamma.size()); + if (sub.get_output().num_samples() > 1) + { + const double decay = 1.0 - num_updates/(num_updates+1.0); + ++num_updates; + if (num_updates > running_stats_window_size) + num_updates = running_stats_window_size; + + if (mode == FC_MODE) + tt::batch_normalize(eps, output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b); + else + tt::batch_normalize_conv(eps, output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b); + } + else // we are running in testing mode so we just linearly scale the input tensor. 
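+            // That is, batch_normalize_inference() computes, element-wise,
+            //   output = gamma*(input - running_mean)/sqrt(running_variance + eps) + beta
+            // using the running statistics accumulated during training rather than
+            // statistics of the current batch.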
+ { + if (mode == FC_MODE) + tt::batch_normalize_inference(eps, output, sub.get_output(), g, b, running_means, running_variances); + else + tt::batch_normalize_conv_inference(eps, output, sub.get_output(), g, b, running_means, running_variances); + } + } + + template <typename SUBNET> + void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad) + { + auto g = gamma(params,0); + auto g_grad = gamma(params_grad, 0); + auto b_grad = beta(params_grad, gamma.size()); + if (mode == FC_MODE) + tt::batch_normalize_gradient(eps, gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad ); + else + tt::batch_normalize_conv_gradient(eps, gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad ); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const bn_& item, std::ostream& out) + { + if (mode == CONV_MODE) + serialize("bn_con2", out); + else // if FC_MODE + serialize("bn_fc2", out); + serialize(item.params, out); + serialize(item.gamma, out); + serialize(item.beta, out); + serialize(item.means, out); + serialize(item.invstds, out); + serialize(item.running_means, out); + serialize(item.running_variances, out); + serialize(item.num_updates, out); + serialize(item.running_stats_window_size, out); + serialize(item.learning_rate_multiplier, out); + serialize(item.weight_decay_multiplier, out); + serialize(item.bias_learning_rate_multiplier, out); + serialize(item.bias_weight_decay_multiplier, out); + serialize(item.eps, out); + } + + friend void deserialize(bn_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (mode == CONV_MODE) + { + if (version != "bn_con2") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::bn_."); + } + else // must be in FC_MODE + { + if (version != "bn_fc2") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::bn_."); + } + + deserialize(item.params, in); + deserialize(item.gamma, in); + deserialize(item.beta, in); + deserialize(item.means, in); + deserialize(item.invstds, in); + deserialize(item.running_means, in); + deserialize(item.running_variances, in); + deserialize(item.num_updates, in); + deserialize(item.running_stats_window_size, in); + deserialize(item.learning_rate_multiplier, in); + deserialize(item.weight_decay_multiplier, in); + deserialize(item.bias_learning_rate_multiplier, in); + deserialize(item.bias_weight_decay_multiplier, in); + deserialize(item.eps, in); + } + + friend std::ostream& operator<<(std::ostream& out, const bn_& item) + { + if (mode == CONV_MODE) + out << "bn_con "; + else + out << "bn_fc "; + out << " eps="<<item.eps; + out << " running_stats_window_size="<<item.running_stats_window_size; + out << " learning_rate_mult="<<item.learning_rate_multiplier; + out << " weight_decay_mult="<<item.weight_decay_multiplier; + out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier; + out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier; + return out; + } + + friend void to_xml(const bn_& item, std::ostream& out) + { + if (mode==CONV_MODE) + out << "<bn_con"; + else + out << "<bn_fc"; + + out << " eps='"<<item.eps<<"'"; + out << " running_stats_window_size='"<<item.running_stats_window_size<<"'"; + out << " learning_rate_mult='"<<item.learning_rate_multiplier<<"'"; + out << " 
weight_decay_mult='"<<item.weight_decay_multiplier<<"'"; + out << " bias_learning_rate_mult='"<<item.bias_learning_rate_multiplier<<"'"; + out << " bias_weight_decay_mult='"<<item.bias_weight_decay_multiplier<<"'"; + out << ">\n"; + + out << mat(item.params); + + if (mode==CONV_MODE) + out << "</bn_con>\n"; + else + out << "</bn_fc>\n"; + } + + private: + + friend class affine_; + + resizable_tensor params; + alias_tensor gamma, beta; + resizable_tensor means, running_means; + resizable_tensor invstds, running_variances; + unsigned long num_updates; + unsigned long running_stats_window_size; + double learning_rate_multiplier; + double weight_decay_multiplier; + double bias_learning_rate_multiplier; + double bias_weight_decay_multiplier; + double eps; + }; + + template <typename SUBNET> + using bn_con = add_layer<bn_<CONV_MODE>, SUBNET>; + template <typename SUBNET> + using bn_fc = add_layer<bn_<FC_MODE>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + class visitor_bn_running_stats_window_size + { + public: + + visitor_bn_running_stats_window_size(unsigned long new_window_size_) : new_window_size(new_window_size_) {} + + template <typename T> + void set_window_size(T&) const + { + // ignore other layer detail types + } + + template < layer_mode mode > + void set_window_size(bn_<mode>& l) const + { + l.set_running_stats_window_size(new_window_size); + } + + template<typename input_layer_type> + void operator()(size_t , input_layer_type& ) const + { + // ignore other layers + } + + template <typename T, typename U, typename E> + void operator()(size_t , add_layer<T,U,E>& l) const + { + set_window_size(l.layer_details()); + } + + private: + + unsigned long new_window_size; + }; + } + + template <typename net_type> + void set_all_bn_running_stats_window_sizes ( + net_type& net, + unsigned long new_window_size + ) + { + visit_layers(net, impl::visitor_bn_running_stats_window_size(new_window_size)); + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + enum fc_bias_mode + { + FC_HAS_BIAS = 0, + FC_NO_BIAS = 1 + }; + + struct num_fc_outputs + { + num_fc_outputs(unsigned long n) : num_outputs(n) {} + unsigned long num_outputs; + }; + + template < + unsigned long num_outputs_, + fc_bias_mode bias_mode + > + class fc_ + { + static_assert(num_outputs_ > 0, "The number of outputs from a fc_ layer must be > 0"); + + public: + fc_(num_fc_outputs o) : num_outputs(o.num_outputs), num_inputs(0), + learning_rate_multiplier(1), + weight_decay_multiplier(1), + bias_learning_rate_multiplier(1), + bias_weight_decay_multiplier(0) + {} + + fc_() : fc_(num_fc_outputs(num_outputs_)) {} + + double get_learning_rate_multiplier () const { return learning_rate_multiplier; } + double get_weight_decay_multiplier () const { return weight_decay_multiplier; } + void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; } + void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; } + + double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; } + double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; } + void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; } + void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; } + + unsigned 
long get_num_outputs (
+        ) const { return num_outputs; }
+
+        void set_num_outputs(long num)
+        {
+            DLIB_CASSERT(num > 0);
+            if (num != (long)num_outputs)
+            {
+                DLIB_CASSERT(get_layer_params().size() == 0,
+                    "You can't change the number of outputs in fc_ if the parameter tensor has already been allocated.");
+                num_outputs = num;
+            }
+        }
+
+        fc_bias_mode get_bias_mode (
+        ) const { return bias_mode; }
+
+        template <typename SUBNET>
+        void setup (const SUBNET& sub)
+        {
+            num_inputs = sub.get_output().nr()*sub.get_output().nc()*sub.get_output().k();
+            if (bias_mode == FC_HAS_BIAS)
+                params.set_size(num_inputs+1, num_outputs);
+            else
+                params.set_size(num_inputs, num_outputs);
+
+            dlib::rand rnd(std::rand());
+            randomize_parameters(params, num_inputs+num_outputs, rnd);
+
+            weights = alias_tensor(num_inputs, num_outputs);
+
+            if (bias_mode == FC_HAS_BIAS)
+            {
+                biases = alias_tensor(1,num_outputs);
+                // set the initial bias values to zero
+                biases(params,weights.size()) = 0;
+            }
+        }
+
+        template <typename SUBNET>
+        void forward(const SUBNET& sub, resizable_tensor& output)
+        {
+            DLIB_CASSERT((long)num_inputs == sub.get_output().nr()*sub.get_output().nc()*sub.get_output().k(),
+                "The size of the input tensor to this fc layer doesn't match the size the fc layer was trained with.");
+            output.set_size(sub.get_output().num_samples(), num_outputs);
+
+            auto w = weights(params, 0);
+            tt::gemm(0,output, 1,sub.get_output(),false, w,false);
+            if (bias_mode == FC_HAS_BIAS)
+            {
+                auto b = biases(params, weights.size());
+                tt::add(1,output,1,b);
+            }
+        }
+
+        template <typename SUBNET>
+        void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
+        {
+            // no point computing the parameter gradients if they won't be used.
+            if (learning_rate_multiplier != 0)
+            {
+                // compute the gradient of the weight parameters.
+                auto pw = weights(params_grad, 0);
+                tt::gemm(0,pw, 1,sub.get_output(),true, gradient_input,false);
+
+                if (bias_mode == FC_HAS_BIAS)
+                {
+                    // compute the gradient of the bias parameters.
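+                    // Since each output row is computed as input*w + b, the gradient
+                    // of the loss with respect to b is just gradient_input summed
+                    // over the sample dimension, which is what
+                    // tt::assign_bias_gradient() computes below.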
+ auto pb = biases(params_grad, weights.size()); + tt::assign_bias_gradient(pb, gradient_input); + } + } + + // compute the gradient for the data + auto w = weights(params, 0); + tt::gemm(1,sub.get_gradient_input(), 1,gradient_input,false, w,true); + } + + alias_tensor_instance get_weights() + { + return weights(params, 0); + } + + alias_tensor_const_instance get_weights() const + { + return weights(params, 0); + } + + alias_tensor_instance get_biases() + { + static_assert(bias_mode == FC_HAS_BIAS, "This fc_ layer doesn't have a bias vector " + "to be retrieved, as per template parameter 'bias_mode'."); + return biases(params, weights.size()); + } + + alias_tensor_const_instance get_biases() const + { + static_assert(bias_mode == FC_HAS_BIAS, "This fc_ layer doesn't have a bias vector " + "to be retrieved, as per template parameter 'bias_mode'."); + return biases(params, weights.size()); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const fc_& item, std::ostream& out) + { + serialize("fc_2", out); + serialize(item.num_outputs, out); + serialize(item.num_inputs, out); + serialize(item.params, out); + serialize(item.weights, out); + serialize(item.biases, out); + serialize((int)bias_mode, out); + serialize(item.learning_rate_multiplier, out); + serialize(item.weight_decay_multiplier, out); + serialize(item.bias_learning_rate_multiplier, out); + serialize(item.bias_weight_decay_multiplier, out); + } + + friend void deserialize(fc_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "fc_2") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::fc_."); + + deserialize(item.num_outputs, in); + deserialize(item.num_inputs, in); + deserialize(item.params, in); + deserialize(item.weights, in); + deserialize(item.biases, in); + int bmode = 0; + deserialize(bmode, in); + if (bias_mode != (fc_bias_mode)bmode) throw serialization_error("Wrong fc_bias_mode found while deserializing dlib::fc_"); + deserialize(item.learning_rate_multiplier, in); + deserialize(item.weight_decay_multiplier, in); + deserialize(item.bias_learning_rate_multiplier, in); + deserialize(item.bias_weight_decay_multiplier, in); + } + + friend std::ostream& operator<<(std::ostream& out, const fc_& item) + { + if (bias_mode == FC_HAS_BIAS) + { + out << "fc\t (" + << "num_outputs="<<item.num_outputs + << ")"; + out << " learning_rate_mult="<<item.learning_rate_multiplier; + out << " weight_decay_mult="<<item.weight_decay_multiplier; + out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier; + out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier; + } + else + { + out << "fc_no_bias (" + << "num_outputs="<<item.num_outputs + << ")"; + out << " learning_rate_mult="<<item.learning_rate_multiplier; + out << " weight_decay_mult="<<item.weight_decay_multiplier; + } + return out; + } + + friend void to_xml(const fc_& item, std::ostream& out) + { + if (bias_mode==FC_HAS_BIAS) + { + out << "<fc" + << " num_outputs='"<<item.num_outputs<<"'" + << " learning_rate_mult='"<<item.learning_rate_multiplier<<"'" + << " weight_decay_mult='"<<item.weight_decay_multiplier<<"'" + << " bias_learning_rate_mult='"<<item.bias_learning_rate_multiplier<<"'" + << " bias_weight_decay_mult='"<<item.bias_weight_decay_multiplier<<"'"; + out << ">\n"; + out << mat(item.params); + out << "</fc>\n"; + } + else + { + out << "<fc_no_bias" + << " 
num_outputs='"<<item.num_outputs<<"'" + << " learning_rate_mult='"<<item.learning_rate_multiplier<<"'" + << " weight_decay_mult='"<<item.weight_decay_multiplier<<"'"; + out << ">\n"; + out << mat(item.params); + out << "</fc_no_bias>\n"; + } + } + + private: + + unsigned long num_outputs; + unsigned long num_inputs; + resizable_tensor params; + alias_tensor weights, biases; + double learning_rate_multiplier; + double weight_decay_multiplier; + double bias_learning_rate_multiplier; + double bias_weight_decay_multiplier; + }; + + template < + unsigned long num_outputs, + typename SUBNET + > + using fc = add_layer<fc_<num_outputs,FC_HAS_BIAS>, SUBNET>; + + template < + unsigned long num_outputs, + typename SUBNET + > + using fc_no_bias = add_layer<fc_<num_outputs,FC_NO_BIAS>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class dropout_ + { + public: + explicit dropout_( + float drop_rate_ = 0.5 + ) : + drop_rate(drop_rate_), + rnd(std::rand()) + { + DLIB_CASSERT(0 <= drop_rate && drop_rate <= 1); + } + + // We have to add a copy constructor and assignment operator because the rnd object + // is non-copyable. + dropout_( + const dropout_& item + ) : drop_rate(item.drop_rate), mask(item.mask), rnd(std::rand()) + {} + + dropout_& operator= ( + const dropout_& item + ) + { + if (this == &item) + return *this; + + drop_rate = item.drop_rate; + mask = item.mask; + return *this; + } + + float get_drop_rate ( + ) const { return drop_rate; } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + void forward_inplace(const tensor& input, tensor& output) + { + // create a random mask and use it to filter the data + mask.copy_size(input); + rnd.fill_uniform(mask); + tt::threshold(mask, drop_rate); + tt::multiply(false, output, input, mask); + } + + void backward_inplace( + const tensor& gradient_input, + tensor& data_grad, + tensor& /*params_grad*/ + ) + { + if (is_same_object(gradient_input, data_grad)) + tt::multiply(false, data_grad, mask, gradient_input); + else + tt::multiply(true, data_grad, mask, gradient_input); + } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const dropout_& item, std::ostream& out) + { + serialize("dropout_", out); + serialize(item.drop_rate, out); + serialize(item.mask, out); + } + + friend void deserialize(dropout_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "dropout_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::dropout_."); + deserialize(item.drop_rate, in); + deserialize(item.mask, in); + } + + void clean( + ) + { + mask.clear(); + } + + friend std::ostream& operator<<(std::ostream& out, const dropout_& item) + { + out << "dropout\t (" + << "drop_rate="<<item.drop_rate + << ")"; + return out; + } + + friend void to_xml(const dropout_& item, std::ostream& out) + { + out << "<dropout" + << " drop_rate='"<<item.drop_rate<<"'"; + out << "/>\n"; + } + + private: + float drop_rate; + resizable_tensor mask; + + tt::tensor_rand rnd; + resizable_tensor params; // unused + }; + + + template <typename SUBNET> + using dropout = add_layer<dropout_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class 
multiply_ + { + public: + explicit multiply_( + float val_ = 0.5 + ) : + val(val_) + { + } + + multiply_ ( + const dropout_& item + ) : val(1-item.get_drop_rate()) {} + + float get_multiply_value ( + ) const { return val; } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + void forward_inplace(const tensor& input, tensor& output) + { + tt::affine_transform(output, input, val); + } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + void backward_inplace( + const tensor& gradient_input, + tensor& data_grad, + tensor& /*params_grad*/ + ) + { + if (is_same_object(gradient_input, data_grad)) + tt::affine_transform(data_grad, gradient_input, val); + else + tt::affine_transform(data_grad, data_grad, gradient_input, 1, val); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const multiply_& item, std::ostream& out) + { + serialize("multiply_", out); + serialize(item.val, out); + } + + friend void deserialize(multiply_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version == "dropout_") + { + // Since we can build a multiply_ from a dropout_ we check if that's what + // is in the stream and if so then just convert it right here. + unserialize sin(version, in); + dropout_ temp; + deserialize(temp, sin); + item = temp; + return; + } + + if (version != "multiply_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::multiply_."); + deserialize(item.val, in); + } + + friend std::ostream& operator<<(std::ostream& out, const multiply_& item) + { + out << "multiply (" + << "val="<<item.val + << ")"; + return out; + } + + friend void to_xml(const multiply_& item, std::ostream& out) + { + out << "<multiply" + << " val='"<<item.val<<"'"; + out << "/>\n"; + } + private: + float val; + resizable_tensor params; // unused + }; + + template <typename SUBNET> + using multiply = add_layer<multiply_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class affine_ + { + public: + affine_( + ) : mode(FC_MODE) + { + } + + affine_( + layer_mode mode_ + ) : mode(mode_) + { + } + + template < + layer_mode bnmode + > + affine_( + const bn_<bnmode>& item + ) + { + gamma = item.gamma; + beta = item.beta; + mode = bnmode; + + params.copy_size(item.params); + + auto g = gamma(params,0); + auto b = beta(params,gamma.size()); + + resizable_tensor temp(item.params); + auto sg = gamma(temp,0); + auto sb = beta(temp,gamma.size()); + + g = pointwise_multiply(mat(sg), 1.0f/sqrt(mat(item.running_variances)+item.get_eps())); + b = mat(sb) - pointwise_multiply(mat(g), mat(item.running_means)); + } + + layer_mode get_mode() const { return mode; } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + template <typename SUBNET> + void setup (const SUBNET& sub) + { + if (mode == FC_MODE) + { + gamma = alias_tensor(1, + sub.get_output().k(), + sub.get_output().nr(), + sub.get_output().nc()); + } + else + { + gamma = alias_tensor(1, sub.get_output().k()); + } + beta = gamma; + + params.set_size(gamma.size()+beta.size()); + + gamma(params,0) = 1; + beta(params,gamma.size()) = 0; + } + + void forward_inplace(const tensor& input, tensor& output) + { + auto g = gamma(params,0); + auto b = 
beta(params,gamma.size()); + if (mode == FC_MODE) + tt::affine_transform(output, input, g, b); + else + tt::affine_transform_conv(output, input, g, b); + } + + void backward_inplace( + const tensor& gradient_input, + tensor& data_grad, + tensor& /*params_grad*/ + ) + { + auto g = gamma(params,0); + auto b = beta(params,gamma.size()); + + // We are computing the gradient of dot(gradient_input, computed_output*g + b) + if (mode == FC_MODE) + { + if (is_same_object(gradient_input, data_grad)) + tt::multiply(false, data_grad, gradient_input, g); + else + tt::multiply(true, data_grad, gradient_input, g); + } + else + { + if (is_same_object(gradient_input, data_grad)) + tt::multiply_conv(false, data_grad, gradient_input, g); + else + tt::multiply_conv(true, data_grad, gradient_input, g); + } + } + + const tensor& get_layer_params() const { return empty_params; } + tensor& get_layer_params() { return empty_params; } + + friend void serialize(const affine_& item, std::ostream& out) + { + serialize("affine_", out); + serialize(item.params, out); + serialize(item.gamma, out); + serialize(item.beta, out); + serialize((int)item.mode, out); + } + + friend void deserialize(affine_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version == "bn_con2") + { + // Since we can build an affine_ from a bn_ we check if that's what is in + // the stream and if so then just convert it right here. + unserialize sin(version, in); + bn_<CONV_MODE> temp; + deserialize(temp, sin); + item = temp; + return; + } + else if (version == "bn_fc2") + { + // Since we can build an affine_ from a bn_ we check if that's what is in + // the stream and if so then just convert it right here. + unserialize sin(version, in); + bn_<FC_MODE> temp; + deserialize(temp, sin); + item = temp; + return; + } + + if (version != "affine_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::affine_."); + deserialize(item.params, in); + deserialize(item.gamma, in); + deserialize(item.beta, in); + int mode; + deserialize(mode, in); + item.mode = (layer_mode)mode; + } + + friend std::ostream& operator<<(std::ostream& out, const affine_& ) + { + out << "affine"; + return out; + } + + friend void to_xml(const affine_& item, std::ostream& out) + { + if (item.mode==CONV_MODE) + out << "<affine_con>\n"; + else + out << "<affine_fc>\n"; + + out << mat(item.params); + + if (item.mode==CONV_MODE) + out << "</affine_con>\n"; + else + out << "</affine_fc>\n"; + } + + private: + resizable_tensor params, empty_params; + alias_tensor gamma, beta; + layer_mode mode; + }; + + template <typename SUBNET> + using affine = add_layer<affine_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + template<typename> class tag + > + class add_prev_ + { + public: + const static unsigned long id = tag_id<tag>::id; + + add_prev_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + auto&& t1 = sub.get_output(); + auto&& t2 = layer<tag>(sub).get_output(); + output.set_size(std::max(t1.num_samples(),t2.num_samples()), + std::max(t1.k(),t2.k()), + std::max(t1.nr(),t2.nr()), + std::max(t1.nc(),t2.nc())); + tt::add(output, t1, t2); + } + + template <typename SUBNET> + void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + // The gradient just flows backwards to the two layers 
that forward() added + // together. + tt::add(sub.get_gradient_input(), sub.get_gradient_input(), gradient_input); + tt::add(layer<tag>(sub).get_gradient_input(), layer<tag>(sub).get_gradient_input(), gradient_input); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + friend void serialize(const add_prev_& , std::ostream& out) + { + serialize("add_prev_", out); + } + + friend void deserialize(add_prev_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "add_prev_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::add_prev_."); + } + + friend std::ostream& operator<<(std::ostream& out, const add_prev_& item) + { + out << "add_prev"<<id; + return out; + } + + friend void to_xml(const add_prev_& item, std::ostream& out) + { + out << "<add_prev tag='"<<id<<"'/>\n"; + } + + private: + resizable_tensor params; + }; + + template < + template<typename> class tag, + typename SUBNET + > + using add_prev = add_layer<add_prev_<tag>, SUBNET>; + + template <typename SUBNET> using add_prev1 = add_prev<tag1, SUBNET>; + template <typename SUBNET> using add_prev2 = add_prev<tag2, SUBNET>; + template <typename SUBNET> using add_prev3 = add_prev<tag3, SUBNET>; + template <typename SUBNET> using add_prev4 = add_prev<tag4, SUBNET>; + template <typename SUBNET> using add_prev5 = add_prev<tag5, SUBNET>; + template <typename SUBNET> using add_prev6 = add_prev<tag6, SUBNET>; + template <typename SUBNET> using add_prev7 = add_prev<tag7, SUBNET>; + template <typename SUBNET> using add_prev8 = add_prev<tag8, SUBNET>; + template <typename SUBNET> using add_prev9 = add_prev<tag9, SUBNET>; + template <typename SUBNET> using add_prev10 = add_prev<tag10, SUBNET>; + + using add_prev1_ = add_prev_<tag1>; + using add_prev2_ = add_prev_<tag2>; + using add_prev3_ = add_prev_<tag3>; + using add_prev4_ = add_prev_<tag4>; + using add_prev5_ = add_prev_<tag5>; + using add_prev6_ = add_prev_<tag6>; + using add_prev7_ = add_prev_<tag7>; + using add_prev8_ = add_prev_<tag8>; + using add_prev9_ = add_prev_<tag9>; + using add_prev10_ = add_prev_<tag10>; + +// ---------------------------------------------------------------------------------------- + + template < + template<typename> class tag + > + class mult_prev_ + { + public: + const static unsigned long id = tag_id<tag>::id; + + mult_prev_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + auto&& t1 = sub.get_output(); + auto&& t2 = layer<tag>(sub).get_output(); + output.set_size(std::max(t1.num_samples(),t2.num_samples()), + std::max(t1.k(),t2.k()), + std::max(t1.nr(),t2.nr()), + std::max(t1.nc(),t2.nc())); + tt::multiply_zero_padded(false, output, t1, t2); + } + + template <typename SUBNET> + void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + auto&& t1 = sub.get_output(); + auto&& t2 = layer<tag>(sub).get_output(); + // The gradient just flows backwards to the two layers that forward() + // multiplied together. 
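+            // By the product rule, the gradient with respect to t1 is t2 times
+            // gradient_input, and the gradient with respect to t2 is t1 times
+            // gradient_input.  The leading true flag makes both accumulate into the
+            // existing gradient tensors rather than overwrite them.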
+ tt::multiply_zero_padded(true, sub.get_gradient_input(), t2, gradient_input); + tt::multiply_zero_padded(true, layer<tag>(sub).get_gradient_input(), t1, gradient_input); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const mult_prev_& , std::ostream& out) + { + serialize("mult_prev_", out); + } + + friend void deserialize(mult_prev_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "mult_prev_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::mult_prev_."); + } + + friend std::ostream& operator<<(std::ostream& out, const mult_prev_& item) + { + out << "mult_prev"<<id; + return out; + } + + friend void to_xml(const mult_prev_& item, std::ostream& out) + { + out << "<mult_prev tag='"<<id<<"'/>\n"; + } + + private: + resizable_tensor params; + }; + + template < + template<typename> class tag, + typename SUBNET + > + using mult_prev = add_layer<mult_prev_<tag>, SUBNET>; + + template <typename SUBNET> using mult_prev1 = mult_prev<tag1, SUBNET>; + template <typename SUBNET> using mult_prev2 = mult_prev<tag2, SUBNET>; + template <typename SUBNET> using mult_prev3 = mult_prev<tag3, SUBNET>; + template <typename SUBNET> using mult_prev4 = mult_prev<tag4, SUBNET>; + template <typename SUBNET> using mult_prev5 = mult_prev<tag5, SUBNET>; + template <typename SUBNET> using mult_prev6 = mult_prev<tag6, SUBNET>; + template <typename SUBNET> using mult_prev7 = mult_prev<tag7, SUBNET>; + template <typename SUBNET> using mult_prev8 = mult_prev<tag8, SUBNET>; + template <typename SUBNET> using mult_prev9 = mult_prev<tag9, SUBNET>; + template <typename SUBNET> using mult_prev10 = mult_prev<tag10, SUBNET>; + + using mult_prev1_ = mult_prev_<tag1>; + using mult_prev2_ = mult_prev_<tag2>; + using mult_prev3_ = mult_prev_<tag3>; + using mult_prev4_ = mult_prev_<tag4>; + using mult_prev5_ = mult_prev_<tag5>; + using mult_prev6_ = mult_prev_<tag6>; + using mult_prev7_ = mult_prev_<tag7>; + using mult_prev8_ = mult_prev_<tag8>; + using mult_prev9_ = mult_prev_<tag9>; + using mult_prev10_ = mult_prev_<tag10>; + +// ---------------------------------------------------------------------------------------- + + template < + template<typename> class tag + > + class scale_ + { + public: + const static unsigned long id = tag_id<tag>::id; + + scale_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + auto&& scales = sub.get_output(); + auto&& src = layer<tag>(sub).get_output(); + DLIB_CASSERT(scales.num_samples() == src.num_samples() && + scales.k() == src.k() && + scales.nr() == 1 && + scales.nc() == 1, + "scales.k(): " << scales.k() << + "\nsrc.k(): " << src.k() + ); + + output.copy_size(src); + tt::scale_channels(false, output, src, scales); + } + + template <typename SUBNET> + void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + auto&& scales = sub.get_output(); + auto&& src = layer<tag>(sub).get_output(); + // The gradient just flows backwards to the two layers that forward() + // read from. 
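+            // The gradient for src scales each channel of gradient_input by the
+            // corresponding per-channel scale value, while the gradient for each
+            // scale is the dot product of that channel of src with the matching
+            // channel of gradient_input (computed below via the reshaped aliases).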
+ tt::scale_channels(true, layer<tag>(sub).get_gradient_input(), gradient_input, scales); + + if (reshape_src.num_samples() != src.num_samples()) + { + reshape_scales = alias_tensor(src.num_samples()*src.k()); + reshape_src = alias_tensor(src.num_samples()*src.k(),src.nr()*src.nc()); + } + + auto&& scales_grad = sub.get_gradient_input(); + auto sgrad = reshape_scales(scales_grad); + tt::dot_prods(true, sgrad, reshape_src(src), reshape_src(gradient_input)); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const scale_& item, std::ostream& out) + { + serialize("scale_", out); + serialize(item.reshape_scales, out); + serialize(item.reshape_src, out); + } + + friend void deserialize(scale_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "scale_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::scale_."); + deserialize(item.reshape_scales, in); + deserialize(item.reshape_src, in); + } + + friend std::ostream& operator<<(std::ostream& out, const scale_& item) + { + out << "scale"<<id; + return out; + } + + friend void to_xml(const scale_& item, std::ostream& out) + { + out << "<scale tag='"<<id<<"'/>\n"; + } + + private: + alias_tensor reshape_scales; + alias_tensor reshape_src; + resizable_tensor params; + }; + + template < + template<typename> class tag, + typename SUBNET + > + using scale = add_layer<scale_<tag>, SUBNET>; + + template <typename SUBNET> using scale1 = scale<tag1, SUBNET>; + template <typename SUBNET> using scale2 = scale<tag2, SUBNET>; + template <typename SUBNET> using scale3 = scale<tag3, SUBNET>; + template <typename SUBNET> using scale4 = scale<tag4, SUBNET>; + template <typename SUBNET> using scale5 = scale<tag5, SUBNET>; + template <typename SUBNET> using scale6 = scale<tag6, SUBNET>; + template <typename SUBNET> using scale7 = scale<tag7, SUBNET>; + template <typename SUBNET> using scale8 = scale<tag8, SUBNET>; + template <typename SUBNET> using scale9 = scale<tag9, SUBNET>; + template <typename SUBNET> using scale10 = scale<tag10, SUBNET>; + + using scale1_ = scale_<tag1>; + using scale2_ = scale_<tag2>; + using scale3_ = scale_<tag3>; + using scale4_ = scale_<tag4>; + using scale5_ = scale_<tag5>; + using scale6_ = scale_<tag6>; + using scale7_ = scale_<tag7>; + using scale8_ = scale_<tag8>; + using scale9_ = scale_<tag9>; + using scale10_ = scale_<tag10>; + +// ---------------------------------------------------------------------------------------- + + class relu_ + { + public: + relu_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + void forward_inplace(const tensor& input, tensor& output) + { + tt::relu(output, input); + } + + void backward_inplace( + const tensor& computed_output, + const tensor& gradient_input, + tensor& data_grad, + tensor& + ) + { + tt::relu_gradient(data_grad, computed_output, gradient_input); + } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const relu_& , std::ostream& out) + { + serialize("relu_", out); + } + + friend void deserialize(relu_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "relu_") + throw serialization_error("Unexpected 
version '"+version+"' found while deserializing dlib::relu_."); + } + + friend std::ostream& operator<<(std::ostream& out, const relu_& ) + { + out << "relu"; + return out; + } + + friend void to_xml(const relu_& /*item*/, std::ostream& out) + { + out << "<relu/>\n"; + } + + private: + resizable_tensor params; + }; + + + template <typename SUBNET> + using relu = add_layer<relu_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class prelu_ + { + public: + explicit prelu_( + float initial_param_value_ = 0.25 + ) : initial_param_value(initial_param_value_) + { + } + + float get_initial_param_value ( + ) const { return initial_param_value; } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + params.set_size(1); + params = initial_param_value; + } + + template <typename SUBNET> + void forward( + const SUBNET& sub, + resizable_tensor& data_output + ) + { + data_output.copy_size(sub.get_output()); + tt::prelu(data_output, sub.get_output(), params); + } + + template <typename SUBNET> + void backward( + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ) + { + tt::prelu_gradient(sub.get_gradient_input(), sub.get_output(), + gradient_input, params, params_grad); + } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const prelu_& item, std::ostream& out) + { + serialize("prelu_", out); + serialize(item.params, out); + serialize(item.initial_param_value, out); + } + + friend void deserialize(prelu_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "prelu_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::prelu_."); + deserialize(item.params, in); + deserialize(item.initial_param_value, in); + } + + friend std::ostream& operator<<(std::ostream& out, const prelu_& item) + { + out << "prelu\t (" + << "initial_param_value="<<item.initial_param_value + << ")"; + return out; + } + + friend void to_xml(const prelu_& item, std::ostream& out) + { + out << "<prelu initial_param_value='"<<item.initial_param_value<<"'>\n"; + out << mat(item.params); + out << "</prelu>\n"; + } + + private: + resizable_tensor params; + float initial_param_value; + }; + + template <typename SUBNET> + using prelu = add_layer<prelu_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class sig_ + { + public: + sig_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + void forward_inplace(const tensor& input, tensor& output) + { + tt::sigmoid(output, input); + } + + void backward_inplace( + const tensor& computed_output, + const tensor& gradient_input, + tensor& data_grad, + tensor& + ) + { + tt::sigmoid_gradient(data_grad, computed_output, gradient_input); + } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const sig_& , std::ostream& out) + { + serialize("sig_", out); + } + + friend void deserialize(sig_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if 
(version != "sig_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::sig_."); + } + + friend std::ostream& operator<<(std::ostream& out, const sig_& ) + { + out << "sig"; + return out; + } + + friend void to_xml(const sig_& /*item*/, std::ostream& out) + { + out << "<sig/>\n"; + } + + + private: + resizable_tensor params; + }; + + + template <typename SUBNET> + using sig = add_layer<sig_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class htan_ + { + public: + htan_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + inline dpoint map_input_to_output (const dpoint& p) const { return p; } + inline dpoint map_output_to_input (const dpoint& p) const { return p; } + + void forward_inplace(const tensor& input, tensor& output) + { + tt::tanh(output, input); + } + + void backward_inplace( + const tensor& computed_output, + const tensor& gradient_input, + tensor& data_grad, + tensor& + ) + { + tt::tanh_gradient(data_grad, computed_output, gradient_input); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const htan_& , std::ostream& out) + { + serialize("htan_", out); + } + + friend void deserialize(htan_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "htan_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::htan_."); + } + + friend std::ostream& operator<<(std::ostream& out, const htan_& ) + { + out << "htan"; + return out; + } + + friend void to_xml(const htan_& /*item*/, std::ostream& out) + { + out << "<htan/>\n"; + } + + + private: + resizable_tensor params; + }; + + + template <typename SUBNET> + using htan = add_layer<htan_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class softmax_ + { + public: + softmax_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + void forward_inplace(const tensor& input, tensor& output) + { + tt::softmax(output, input); + } + + void backward_inplace( + const tensor& computed_output, + const tensor& gradient_input, + tensor& data_grad, + tensor& + ) + { + tt::softmax_gradient(data_grad, computed_output, gradient_input); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const softmax_& , std::ostream& out) + { + serialize("softmax_", out); + } + + friend void deserialize(softmax_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "softmax_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::softmax_."); + } + + friend std::ostream& operator<<(std::ostream& out, const softmax_& ) + { + out << "softmax"; + return out; + } + + friend void to_xml(const softmax_& /*item*/, std::ostream& out) + { + out << "<softmax/>\n"; + } + + private: + resizable_tensor params; + }; + + template <typename SUBNET> + using softmax = add_layer<softmax_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class softmax_all_ + { + public: + softmax_all_() + { + } + + template <typename SUBNET> + void setup (const SUBNET& /*sub*/) + { + } + + void forward_inplace(const tensor& input, tensor& output) + { + tt::softmax_all(output, input); + } 
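+
+        // Note that, unlike softmax_ above, which normalizes across the k channels
+        // at each spatial location, softmax_all_ normalizes over all the elements
+        // of each sample, so each sample's entire output sums to 1.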
+ + void backward_inplace( + const tensor& computed_output, + const tensor& gradient_input, + tensor& data_grad, + tensor& + ) + { + tt::softmax_all_gradient(data_grad, computed_output, gradient_input); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const softmax_all_& , std::ostream& out) + { + serialize("softmax_all_", out); + } + + friend void deserialize(softmax_all_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "softmax_all_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::softmax_all_."); + } + + friend std::ostream& operator<<(std::ostream& out, const softmax_all_& ) + { + out << "softmax_all"; + return out; + } + + friend void to_xml(const softmax_all_& /*item*/, std::ostream& out) + { + out << "<softmax_all/>\n"; + } + + private: + resizable_tensor params; + }; + + template <typename SUBNET> + using softmax_all = add_layer<softmax_all_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <template<typename> class TAG_TYPE, template<typename> class... TAG_TYPES> + struct concat_helper_impl{ + + constexpr static size_t tag_count() {return 1 + concat_helper_impl<TAG_TYPES...>::tag_count();} + static void list_tags(std::ostream& out) + { + out << tag_id<TAG_TYPE>::id << (tag_count() > 1 ? "," : ""); + concat_helper_impl<TAG_TYPES...>::list_tags(out); + } + + template<typename SUBNET> + static void resize_out(resizable_tensor& out, const SUBNET& sub, long sum_k) + { + auto& t = layer<TAG_TYPE>(sub).get_output(); + concat_helper_impl<TAG_TYPES...>::resize_out(out, sub, sum_k + t.k()); + } + template<typename SUBNET> + static void concat(tensor& out, const SUBNET& sub, size_t k_offset) + { + auto& t = layer<TAG_TYPE>(sub).get_output(); + tt::copy_tensor(false, out, k_offset, t, 0, t.k()); + k_offset += t.k(); + concat_helper_impl<TAG_TYPES...>::concat(out, sub, k_offset); + } + template<typename SUBNET> + static void split(const tensor& input, SUBNET& sub, size_t k_offset) + { + auto& t = layer<TAG_TYPE>(sub).get_gradient_input(); + tt::copy_tensor(true, t, 0, input, k_offset, t.k()); + k_offset += t.k(); + concat_helper_impl<TAG_TYPES...>::split(input, sub, k_offset); + } + }; + template <template<typename> class TAG_TYPE> + struct concat_helper_impl<TAG_TYPE>{ + constexpr static size_t tag_count() {return 1;} + static void list_tags(std::ostream& out) + { + out << tag_id<TAG_TYPE>::id; + } + + template<typename SUBNET> + static void resize_out(resizable_tensor& out, const SUBNET& sub, long sum_k) + { + auto& t = layer<TAG_TYPE>(sub).get_output(); + out.set_size(t.num_samples(), t.k() + sum_k, t.nr(), t.nc()); + } + template<typename SUBNET> + static void concat(tensor& out, const SUBNET& sub, size_t k_offset) + { + auto& t = layer<TAG_TYPE>(sub).get_output(); + tt::copy_tensor(false, out, k_offset, t, 0, t.k()); + } + template<typename SUBNET> + static void split(const tensor& input, SUBNET& sub, size_t k_offset) + { + auto& t = layer<TAG_TYPE>(sub).get_gradient_input(); + tt::copy_tensor(true, t, 0, input, k_offset, t.k()); + } + }; + } + // concat layer + template< + template<typename> class... 
TAG_TYPES
+        >
+    class concat_
+    {
+        static void list_tags(std::ostream& out) { impl::concat_helper_impl<TAG_TYPES...>::list_tags(out);};
+
+    public:
+        constexpr static size_t tag_count() {return impl::concat_helper_impl<TAG_TYPES...>::tag_count();};
+
+        template <typename SUBNET>
+        void setup (const SUBNET&)
+        {
+            // do nothing
+        }
+        template <typename SUBNET>
+        void forward(const SUBNET& sub, resizable_tensor& output)
+        {
+            // The total depth of the result is the sum of the depths from all the tagged layers.
+            impl::concat_helper_impl<TAG_TYPES...>::resize_out(output, sub, 0);
+
+            // Copy the output of each tagged layer into its own part of the result.
+            impl::concat_helper_impl<TAG_TYPES...>::concat(output, sub, 0);
+        }
+
+        template <typename SUBNET>
+        void backward(const tensor& gradient_input, SUBNET& sub, tensor&)
+        {
+            // The gradient is split into parts, one for each tagged layer.
+            impl::concat_helper_impl<TAG_TYPES...>::split(gradient_input, sub, 0);
+        }
+
+        dpoint map_input_to_output(dpoint p) const { return p; }
+        dpoint map_output_to_input(dpoint p) const { return p; }
+
+        const tensor& get_layer_params() const { return params; }
+        tensor& get_layer_params() { return params; }
+
+        friend void serialize(const concat_& item, std::ostream& out)
+        {
+            serialize("concat_", out);
+            size_t count = tag_count();
+            serialize(count, out);
+        }
+
+        friend void deserialize(concat_& item, std::istream& in)
+        {
+            std::string version;
+            deserialize(version, in);
+            if (version != "concat_")
+                throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::concat_.");
+            size_t count_tags;
+            deserialize(count_tags, in);
+            if (count_tags != tag_count())
+                throw serialization_error("Invalid count of tags "+ std::to_string(count_tags) +", expecting " +
+                                          std::to_string(tag_count()) +
+                                          " found while deserializing dlib::concat_.");
+        }
+
+        friend std::ostream& operator<<(std::ostream& out, const concat_& item)
+        {
+            out << "concat\t (";
+            list_tags(out);
+            out << ")";
+            return out;
+        }
+
+        friend void to_xml(const concat_& item, std::ostream& out)
+        {
+            out << "<concat tags='";
+            list_tags(out);
+            out << "'/>\n";
+        }
+
+    private:
+        resizable_tensor params; // unused
+    };
+
+
+    // concat layer definitions
+    template <template<typename> class TAG1,
+              template<typename> class TAG2,
+              typename SUBNET>
+    using concat2 = add_layer<concat_<TAG1, TAG2>, SUBNET>;
+
+    template <template<typename> class TAG1,
+              template<typename> class TAG2,
+              template<typename> class TAG3,
+              typename SUBNET>
+    using concat3 = add_layer<concat_<TAG1, TAG2, TAG3>, SUBNET>;
+
+    template <template<typename> class TAG1,
+              template<typename> class TAG2,
+              template<typename> class TAG3,
+              template<typename> class TAG4,
+              typename SUBNET>
+    using concat4 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4>, SUBNET>;
+
+    template <template<typename> class TAG1,
+              template<typename> class TAG2,
+              template<typename> class TAG3,
+              template<typename> class TAG4,
+              template<typename> class TAG5,
+              typename SUBNET>
+    using concat5 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4, TAG5>, SUBNET>;
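+
+    // As a usage sketch (illustrative only, assuming the con layer defined earlier
+    // in this file and the tag/skip layers from core.h): tag two branches that both
+    // read from a common tagged input, then stack their outputs along k, e.g.
+    //
+    //    using block = concat2<tag1, tag2,
+    //                  tag1<con<16,3,3,1,1,
+    //                  skip3<tag2<con<16,5,5,1,1,
+    //                  tag3<input<matrix<float>>>>>>>>>;
+    //
+    // runs 3x3 and 5x5 convolutions over the same input and produces a tensor with
+    // 16+16 == 32 channels.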
+    // The inception layer uses tags internally.  If the user also uses tags,
+    // conflicts are possible.  To avoid them, here are new tags reserved
+    // specifically for the inception layers.
+    template <typename SUBNET> using itag0  = add_tag_layer< 1000 + 0, SUBNET>;
+    template <typename SUBNET> using itag1  = add_tag_layer< 1000 + 1, SUBNET>;
+    template <typename SUBNET> using itag2  = add_tag_layer< 1000 + 2, SUBNET>;
+    template <typename SUBNET> using itag3  = add_tag_layer< 1000 + 3, SUBNET>;
+    template <typename SUBNET> using itag4  = add_tag_layer< 1000 + 4, SUBNET>;
+    template <typename SUBNET> using itag5  = add_tag_layer< 1000 + 5, SUBNET>;
+    // skip to the inception input
+    template <typename SUBNET> using iskip  = add_skip_layer< itag0, SUBNET>;
+
+    // here are some templates to be used for creating inception layer groups
+    template <template<typename>class B1,
+              template<typename>class B2,
+              typename SUBNET>
+    using inception2 = concat2<itag1, itag2, itag1<B1<iskip< itag2<B2< itag0<SUBNET>>>>>>>;
+
+    template <template<typename>class B1,
+              template<typename>class B2,
+              template<typename>class B3,
+              typename SUBNET>
+    using inception3 = concat3<itag1, itag2, itag3, itag1<B1<iskip< itag2<B2<iskip< itag3<B3< itag0<SUBNET>>>>>>>>>>;
+
+    template <template<typename>class B1,
+              template<typename>class B2,
+              template<typename>class B3,
+              template<typename>class B4,
+              typename SUBNET>
+    using inception4 = concat4<itag1, itag2, itag3, itag4,
+                itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip< itag4<B4< itag0<SUBNET>>>>>>>>>>>>>;
+
+    template <template<typename>class B1,
+              template<typename>class B2,
+              template<typename>class B3,
+              template<typename>class B4,
+              template<typename>class B5,
+              typename SUBNET>
+    using inception5 = concat5<itag1, itag2, itag3, itag4, itag5,
+                itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip< itag4<B4<iskip< itag5<B5< itag0<SUBNET>>>>>>>>>>>>>>>>;
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    const double DEFAULT_L2_NORM_EPS = 1e-5;
+
+    class l2normalize_
+    {
+    public:
+        explicit l2normalize_(
+            double eps_ = DEFAULT_L2_NORM_EPS
+        ) :
+            eps(eps_)
+        {
+        }
+
+        double get_eps() const { return eps; }
+
+        template <typename SUBNET>
+        void setup (const SUBNET& /*sub*/)
+        {
+        }
+
+        void forward_inplace(const tensor& input, tensor& output)
+        {
+            tt::inverse_norms(norm, input, eps);
+            tt::scale_rows(output, input, norm);
+        }
+
+        void backward_inplace(
+            const tensor& computed_output,
+            const tensor& gradient_input,
+            tensor& data_grad,
+            tensor& /*params_grad*/
+        )
+        {
+            if (is_same_object(gradient_input, data_grad))
+            {
+                tt::dot_prods(temp, gradient_input, computed_output);
+                tt::scale_rows2(0, data_grad, gradient_input, computed_output, temp, norm);
+            }
+            else
+            {
+                tt::dot_prods(temp, gradient_input, computed_output);
+                tt::scale_rows2(1, data_grad, gradient_input, computed_output, temp, norm);
+            }
+        }
+
+        const tensor& get_layer_params() const { return params; }
+        tensor& get_layer_params() { return params; }
+
+        friend void serialize(const l2normalize_& item, std::ostream& out)
+        {
+            serialize("l2normalize_", out);
+            serialize(item.eps, out);
+        }
+
+        friend void deserialize(l2normalize_& item, std::istream& in)
+        {
+            std::string version;
+            deserialize(version, in);
+            if (version != "l2normalize_")
+                throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::l2normalize_.");
+            deserialize(item.eps, in);
+        }
+
+        friend std::ostream& operator<<(std::ostream& out, const
l2normalize_& item) + { + out << "l2normalize"; + out << " eps="<<item.eps; + return out; + } + + friend void to_xml(const l2normalize_& item, std::ostream& out) + { + out << "<l2normalize"; + out << " eps='"<<item.eps<<"'"; + out << "/>\n"; + } + private: + double eps; + + resizable_tensor params; // unused + // Here only to avoid reallocation and as a cache between forward/backward + // functions. + resizable_tensor norm; + resizable_tensor temp; + }; + + template <typename SUBNET> + using l2normalize = add_layer<l2normalize_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + long _offset, + long _k, + long _nr, + long _nc + > + class extract_ + { + static_assert(_offset >= 0, "The offset must be >= 0."); + static_assert(_k > 0, "The number of channels must be > 0."); + static_assert(_nr > 0, "The number of rows must be > 0."); + static_assert(_nc > 0, "The number of columns must be > 0."); + public: + extract_( + ) + { + } + + template <typename SUBNET> + void setup (const SUBNET& sub) + { + DLIB_CASSERT((long)sub.get_output().size() >= sub.get_output().num_samples()*(_offset+_k*_nr*_nc), + "The tensor we are trying to extract from the input tensor is too big to fit into the input tensor."); + + aout = alias_tensor(sub.get_output().num_samples(), _k*_nr*_nc); + ain = alias_tensor(sub.get_output().num_samples(), sub.get_output().size()/sub.get_output().num_samples()); + } + + template <typename SUBNET> + void forward(const SUBNET& sub, resizable_tensor& output) + { + if (aout.num_samples() != sub.get_output().num_samples()) + { + aout = alias_tensor(sub.get_output().num_samples(), _k*_nr*_nc); + ain = alias_tensor(sub.get_output().num_samples(), sub.get_output().size()/sub.get_output().num_samples()); + } + + output.set_size(sub.get_output().num_samples(), _k, _nr, _nc); + auto out = aout(output,0); + auto in = ain(sub.get_output(),0); + tt::copy_tensor(false, out, 0, in, _offset, _k*_nr*_nc); + } + + template <typename SUBNET> + void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) + { + auto out = ain(sub.get_gradient_input(),0); + auto in = aout(gradient_input,0); + tt::copy_tensor(true, out, _offset, in, 0, _k*_nr*_nc); + } + + const tensor& get_layer_params() const { return params; } + tensor& get_layer_params() { return params; } + + friend void serialize(const extract_& item, std::ostream& out) + { + serialize("extract_", out); + serialize(_offset, out); + serialize(_k, out); + serialize(_nr, out); + serialize(_nc, out); + } + + friend void deserialize(extract_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "extract_") + throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::extract_."); + + long offset; + long k; + long nr; + long nc; + deserialize(offset, in); + deserialize(k, in); + deserialize(nr, in); + deserialize(nc, in); + + if (offset != _offset) throw serialization_error("Wrong offset found while deserializing dlib::extract_"); + if (k != _k) throw serialization_error("Wrong k found while deserializing dlib::extract_"); + if (nr != _nr) throw serialization_error("Wrong nr found while deserializing dlib::extract_"); + if (nc != _nc) throw serialization_error("Wrong nc found while deserializing dlib::extract_"); + } + + friend std::ostream& operator<<(std::ostream& out, const extract_& item) + { + out << "extract\t (" + << "offset="<<_offset + << ", k="<<_k + << ", nr="<<_nr + << ", 
nc="<<_nc + << ")"; + return out; + } + + friend void to_xml(const extract_& item, std::ostream& out) + { + out << "<extract"; + out << " offset='"<<_offset<<"'"; + out << " k='"<<_k<<"'"; + out << " nr='"<<_nr<<"'"; + out << " nc='"<<_nc<<"'"; + out << "/>\n"; + } + private: + alias_tensor aout, ain; + + resizable_tensor params; // unused + }; + + template < + long offset, + long k, + long nr, + long nc, + typename SUBNET + > + using extract = add_layer<extract_<offset,k,nr,nc>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_LAYERS_H_ + + diff --git a/ml/dlib/dlib/dnn/layers_abstract.h b/ml/dlib/dlib/dnn/layers_abstract.h new file mode 100644 index 000000000..f07025ff8 --- /dev/null +++ b/ml/dlib/dlib/dnn/layers_abstract.h @@ -0,0 +1,2631 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_DNn_LAYERS_ABSTRACT_H_ +#ifdef DLIB_DNn_LAYERS_ABSTRACT_H_ + +#include "tensor_abstract.h" +#include "core_abstract.h" + + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class SUBNET + { + /*! + WHAT THIS OBJECT REPRESENTS + This object represents a deep neural network. In particular, it is + the simplified interface through which layer objects interact with their + subnetworks. A layer's two important tasks are to (1) take outputs from its + subnetwork and forward propagate them through itself and (2) to backwards + propagate an error gradient through itself and onto its subnetwork. + The idea of a subnetwork is illustrated in the following diagram: + + +---------------------------------------------------------+ + | loss <-- layer1 <-- layer2 <-- ... <-- layern <-- input | + +---------------------------------------------------------+ + ^ ^ + \__ subnetwork for layer1 __/ + + Therefore, by "subnetwork" we mean the part of the network closer to the + input. + + Note that there is no dlib::SUBNET type. It is shown here purely to + document the interface layer objects expect to see when they interact + with a network. + !*/ + + public: + // You aren't allowed to copy subnetworks from inside a layer. + SUBNET(const SUBNET&) = delete; + SUBNET& operator=(const SUBNET&) = delete; + + const tensor& get_output( + ) const; + /*! + ensures + - returns the output of this subnetwork. This is the data that the next + layer in the network will take as input. + - have_same_dimensions(#get_gradient_input(), get_output()) == true + !*/ + + tensor& get_gradient_input( + ); + /*! + ensures + - returns the error gradient for this subnetwork. That is, this is the + error gradient that this network will use to update itself. Therefore, + when performing back propagation, layers that sit on top of this + subnetwork write their back propagated error gradients into + get_gradient_input(). Or to put it another way, during back propagation, + layers take the contents of their get_gradient_input() and back propagate + it through themselves and store the results into their subnetwork's + get_gradient_input(). + !*/ + + const NEXT_SUBNET& subnet( + ) const; + /*! + ensures + - returns the subnetwork of *this network. With respect to the diagram + above, if *this was layer1 then subnet() would return the network that + begins with layer2. + !*/ + + NEXT_SUBNET& subnet( + ); + /*! + ensures + - returns the subnetwork of *this network. 
With respect to the diagram + above, if *this was layer1 then subnet() would return the network that + begins with layer2. + !*/ + + const layer_details_type& layer_details( + ) const; + /*! + ensures + - returns the layer_details_type instance that defines the behavior of the + layer at the top of this network. I.e. returns the layer details that + defines the behavior of the layer nearest to the network output rather + than the input layer. For computational layers, this is the object + implementing the EXAMPLE_COMPUTATIONAL_LAYER_ interface that defines the + layer's behavior. + !*/ + + unsigned int sample_expansion_factor ( + ) const; + /*! + ensures + - When to_tensor() is invoked on this network's input layer it converts N + input objects into M samples, all stored inside a resizable_tensor. It + is always the case that M is some integer multiple of N. + sample_expansion_factor() returns the value of this multiplier. To be + very specific, it is always true that M==I*N where I is some integer. + This integer I is what is returned by sample_expansion_factor(). + + It should be noted that computational layers likely do not care about the + sample expansion factor. It is only really of concern inside a loss + layer where you need to know its value so that tensor samples can be + matched against truth objects. Moreover, in most cases the sample + expansion factor is 1. + !*/ + + }; + +// ---------------------------------------------------------------------------------------- + + class EXAMPLE_COMPUTATIONAL_LAYER_ + { + /*! + WHAT THIS OBJECT REPRESENTS + Each computational layer in a deep neural network can be thought of as a + function, f(data,parameters), that takes in a data tensor, some parameters, + and produces an output tensor. You create an entire deep network by + composing these functions. Importantly, you are able to use a wide range + of different functions to accommodate the task you are trying to + accomplish. Therefore, dlib includes a number of common layer types but if + you want to define your own then you simply implement a class with the same + interface as EXAMPLE_COMPUTATIONAL_LAYER_. + + Note that there is no dlib::EXAMPLE_COMPUTATIONAL_LAYER_ type. It is shown + here purely to document the interface that a layer object must implement. + + The central work of defining a layer is implementing the forward and backward + methods. When you do this you have four options: + - Implement the forward() and backward() methods according to the + specification shown below. Do not implement forward_inplace() and + backward_inplace(). + - Implement the forward() and backward() methods according to the + specification shown below, except exclude the computed_output + parameter from backward(). Doing this will allow dlib to make some + layers execute in-place and therefore run a little faster and use + less memory. Do not implement forward_inplace() and + backward_inplace(). + - Implement the forward_inplace() and backward_inplace() methods + according to the specification shown below. Do not implement + forward() and backward(). These in-place methods allow some types of + layers to be implemented more efficiently. + - Implement the forward_inplace() and backward_inplace() methods + according to the specification shown below, except exclude the + computed_output parameter from backward_inplace(). Doing this will + allow dlib to make some layers execute in-place and therefore run a + little faster and use less memory. Do not implement forward() and + backward(). 
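+
+            For illustration, a minimal layer taking the third option might look
+            like the sketch below.  (my_scale_ is a made up name, not a dlib layer,
+            and serialization, printing, and to_xml are omitted for brevity.)
+
+                class my_scale_
+                {
+                public:
+                    template <typename SUBNET> void setup (const SUBNET&) {}
+
+                    void forward_inplace(const tensor& input, tensor& output)
+                    {
+                        // output = 2*input, computed elementwise.
+                        tt::affine_transform(output, input, 2);
+                    }
+
+                    void backward_inplace(const tensor& computed_output,
+                        const tensor& gradient_input, tensor& data_grad, tensor& params_grad)
+                    {
+                        // computed_output and params_grad are unused since the layer
+                        // has no parameters.  The derivative of 2*x is 2, so assign
+                        // when running in-place, otherwise accumulate into data_grad,
+                        // mirroring the in-place layers in layers.h.
+                        if (is_same_object(gradient_input, data_grad))
+                            tt::affine_transform(data_grad, gradient_input, 2);
+                        else
+                            tt::affine_transform(data_grad, data_grad, gradient_input, 1, 2);
+                    }
+
+                    const tensor& get_layer_params() const { return params; }
+                    tensor& get_layer_params() { return params; }
+                private:
+                    resizable_tensor params;  // this layer has no learnable parameters
+                };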
+ + + It should also be noted that layers may define additional layer specific + fields and the solvers can use these fields as they see fit. For example, + some layers define get_learning_rate_multiplier() and + get_weight_decay_multiplier() methods. The solvers that come with dlib + look at these methods, if they exist, and adjust the learning rate or + weight decay for that layer according to the multiplier. Therefore, you + can add these methods to your layer types if you want, or even define new + fields and new solvers that use those fields in some way. + !*/ + + public: + + EXAMPLE_COMPUTATIONAL_LAYER_( + ); + /*! + ensures + - Default constructs this object. This function is not required to do + anything in particular but it must exist, that is, it is required that + layer objects be default constructable. + !*/ + + EXAMPLE_COMPUTATIONAL_LAYER_ ( + const EXAMPLE_COMPUTATIONAL_LAYER_& item + ); + /*! + ensures + - EXAMPLE_COMPUTATIONAL_LAYER_ objects are copy constructable + !*/ + + EXAMPLE_COMPUTATIONAL_LAYER_( + const some_other_layer_type& item + ); + /*! + ensures + - Constructs this object from item. This form of constructor is optional + but it allows you to provide a conversion from one layer type to another. + For example, the following code is valid only if my_layer2 can be + constructed from my_layer1: + relu<fc<my_layer1<fc<input<matrix<float>>>>>> my_dnn1; + relu<fc<my_layer2<fc<input<matrix<float>>>>>> my_dnn2(my_dnn1); + This kind of pattern is useful if you want to use one type of layer + during training but a different type of layer during testing since it + allows you to easily convert between related deep neural network types. + + Additionally, if you provide a constructor to build a layer from another + layer type you should also write your layer's deserialize() routine such + that it can read that other layer's serialized data in addition to your + own serialized data. + !*/ + + template <typename SUBNET> + void setup ( + const SUBNET& sub + ); + /*! + requires + - SUBNET implements the SUBNET interface defined at the top of this file. + ensures + - performs any necessary initial memory allocations and/or sets parameters + to their initial values prior to learning. Therefore, calling setup + destroys any previously learned parameters. Also, typically setup() + would look at the dimensions of the outputs of sub and configure the + number of parameters in *this accordingly. + !*/ + + template <typename SUBNET> + void forward( + const SUBNET& sub, + resizable_tensor& data_output + ); + /*! + requires + - SUBNET implements the SUBNET interface defined at the top of this file. + - setup() has been called. + ensures + - Runs the output of the subnetwork through this layer and stores the + results into #data_output. In particular, forward() can use any of the + outputs in sub (e.g. sub.get_output(), sub.subnet().get_output(), etc.) + to compute whatever it wants. + !*/ + + template <typename SUBNET> + void backward( + const tensor& computed_output, // this parameter is optional + const tensor& gradient_input, + SUBNET& sub, + tensor& params_grad + ); + /*! + requires + - SUBNET implements the SUBNET interface defined at the top of this file. + - setup() has been called. + - computed_output is the tensor resulting from calling forward(sub,computed_output). + Moreover, this was the most recent call to forward(). This means that + forward() is allowed to cache intermediate results so they can be used + during the backward computation. 
+ - have_same_dimensions(gradient_input, computed_output) == true + - have_same_dimensions(sub.get_gradient_input(), sub.get_output()) == true + - have_same_dimensions(params_grad, get_layer_params()) == true + ensures + - This function outputs the gradients of this layer with respect to the + input data from sub and also with respect to this layer's parameters. + These gradients are stored into #sub and #params_grad, respectively. To be + precise, the gradients are taken of a function f(sub,get_layer_params()) + which is defined thusly: + - Recalling that computed_output is a function of both sub and get_layer_params(), + since it is the result of calling forward(sub,computed_output): + let f(sub,get_layer_params()) == dot(computed_output, gradient_input) + Then we define the following gradient vectors: + - PARAMETER_GRADIENT == gradient of f(sub,get_layer_params()) with + respect to get_layer_params(). + - for all valid I: + - DATA_GRADIENT_I == gradient of f(sub,get_layer_params()) with + respect to layer<I>(sub).get_output() (recall that forward() can + draw inputs from the immediate sub layer, sub.subnet(), or + any earlier layer. So you must consider the gradients with + respect to all inputs drawn from sub) + Finally, backward() outputs these gradients by performing: + - params_grad = PARAMETER_GRADIENT + - for all valid I: + - layer<I>(sub).get_gradient_input() += DATA_GRADIENT_I + !*/ + + void forward_inplace( + const tensor& data_input, + tensor& data_output + ); + /*! + requires + - have_same_dimensions(data_input,data_output) == true + - setup() has been called. + ensures + - Runs the data_input tensor through this layer and stores the output into + #data_output. + - This function supports in-place operation, i.e. having + is_same_object(data_input, data_output)==true + !*/ + + void backward_inplace( + const tensor& computed_output, // this parameter is optional + const tensor& gradient_input, + tensor& data_grad, + tensor& params_grad + ); + /*! + requires + - setup() has been called. + - computed_output is the tensor resulting from the most recent call to + forward_inplace(). This means that forward_inplace() is allowed to cache + intermediate results so they can be used during the backward computation. + - have_same_dimensions(gradient_input, data_grad) == true + - have_same_dimensions(gradient_input, computed_output) == true + - have_same_dimensions(params_grad, get_layer_params()) == true + ensures + - This function supports in-place operation, i.e. having + is_same_object(gradient_input, data_grad)==true + - This function outputs the gradients of this layer with respect to the + input data from a sublayer and also with respect to this layer's parameters. + These gradients are stored into #data_grad and #params_grad, respectively. To be + precise, the gradients are taken of a function f(data_input,get_layer_params()) + which is defined thusly: + - Recalling that computed_output is a function of both the input to + forward_inplace() and get_layer_params(), since it is the result of + calling forward_inplace(data_input,computed_output): + let f(data_input,get_layer_params()) == dot(computed_output, gradient_input) + Then we define the following gradient vectors: + - PARAMETER_GRADIENT == gradient of f(data_input,get_layer_params()) with + respect to get_layer_params(). + - DATA_GRADIENT == gradient of f(data_input,get_layer_params()) with respect + to data_input. 
+ Finally, backward_inplace() outputs these gradients by performing: + - params_grad = PARAMETER_GRADIENT + - if (is_same_object(gradient_input, data_grad)) then + - data_grad = DATA_GRADIENT + - else + - data_grad += DATA_GRADIENT + !*/ + + const tensor& get_layer_params( + ) const; + /*! + ensures + - returns the parameters that define the behavior of forward(). + !*/ + + tensor& get_layer_params( + ); + /*! + ensures + - returns the parameters that define the behavior of forward(). + !*/ + + + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + /*! + These two functions are optional. If provided, they should map between + (column,row) coordinates in input and output tensors of forward(). Providing + these functions allows you to use global utility functions like + input_tensor_to_output_tensor(). + !*/ + + void clean ( + ); + /*! + Implementing this function is optional. If you don't need it then you don't + have to provide a clean(). But if you do provide it then it must behave as + follows: + + ensures + - calling clean() causes this object to forget about everything except its + parameters. This is useful if your layer caches information between + forward and backward passes and you want to clean out that cache + information before saving the network to disk. + !*/ + + }; + + std::ostream& operator<<(std::ostream& out, const EXAMPLE_COMPUTATIONAL_LAYER_& item); + /*! + prints a string describing this layer. + !*/ + + void to_xml(const EXAMPLE_COMPUTATIONAL_LAYER_& item, std::ostream& out); + /*! + This function is optional, but required if you want to print your networks with + net_to_xml(). Therefore, to_xml() prints a layer as XML. + !*/ + + void serialize(const EXAMPLE_COMPUTATIONAL_LAYER_& item, std::ostream& out); + void deserialize(EXAMPLE_COMPUTATIONAL_LAYER_& item, std::istream& in); + /*! + provides serialization support + !*/ + + // For each layer you define, always define an add_layer template so that layers can be + // easily composed. Moreover, the convention is that the layer class ends with an _ + // while the add_layer template has the same name but without the trailing _. + template <typename SUBNET> + using EXAMPLE_COMPUTATIONAL_LAYER = add_layer<EXAMPLE_COMPUTATIONAL_LAYER_, SUBNET>; + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + enum fc_bias_mode + { + FC_HAS_BIAS = 0, + FC_NO_BIAS = 1 + }; + + struct num_fc_outputs + { + num_fc_outputs(unsigned long n) : num_outputs(n) {} + unsigned long num_outputs; + }; + + template < + unsigned long num_outputs, + fc_bias_mode bias_mode + > + class fc_ + { + /*! + REQUIREMENTS ON num_outputs + num_outputs > 0 + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a fully connected layer that + takes an input tensor and multiplies it by a weight matrix and outputs the + results. + + The dimensions of the tensors output by this layer are as follows (letting + IN be the input tensor and OUT the output tensor): + - OUT.num_samples() == IN.num_samples() + - OUT.k() == get_num_outputs() + - OUT.nr() == 1 + - OUT.nc() == 1 + !*/ + + public: + + fc_( + ); + /*!
+ ensures + - #get_num_outputs() == num_outputs + - #get_bias_mode() == bias_mode + - #get_learning_rate_multiplier() == 1 + - #get_weight_decay_multiplier() == 1 + - #get_bias_learning_rate_multiplier() == 1 + - #get_bias_weight_decay_multiplier() == 0 + !*/ + + fc_( + num_fc_outputs o + ); + /*! + ensures + - #get_num_outputs() == o.num_outputs + - #get_bias_mode() == bias_mode + - #get_learning_rate_multiplier() == 1 + - #get_weight_decay_multiplier() == 1 + - #get_bias_learning_rate_multiplier() == 1 + - #get_bias_weight_decay_multiplier() == 0 + !*/ + + unsigned long get_num_outputs ( + ) const; + /*! + ensures + - This layer outputs column vectors that contain get_num_outputs() + elements. That is, the output tensor T from forward() will be such that: + - T.num_samples() == however many samples were given to forward(). + - T.k() == get_num_outputs() + - The rest of the dimensions of T will be 1. + !*/ + + void set_num_outputs( + long num + ); + /*! + requires + - num > 0 + - get_layer_params().size() == 0 || get_num_outputs() == num + (i.e. You can't change the number of outputs in fc_ if the parameter + tensor has already been allocated.) + ensures + - #get_num_outputs() == num + !*/ + + fc_bias_mode get_bias_mode ( + ) const; + /*! + ensures + - returns the bias mode which determines if this layer includes bias terms. + That is, if the bias mode is FC_HAS_BIAS then a different constant scalar + is added to each of the outputs of this layer. + !*/ + + double get_learning_rate_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the learning rate used to optimize its parameters be + multiplied by get_learning_rate_multiplier(). + !*/ + + double get_weight_decay_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the weight decay used to optimize its parameters be + multiplied by get_weight_decay_multiplier(). + !*/ + + void set_learning_rate_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_learning_rate_multiplier() == val + !*/ + + void set_weight_decay_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_weight_decay_multiplier() == val + !*/ + + double get_bias_learning_rate_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the learning rate used to optimize its bias parameters be + multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier(). + !*/ + + double get_bias_weight_decay_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the weight decay used to optimize its bias parameters be + multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier(). + !*/ + + void set_bias_learning_rate_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_bias_learning_rate_multiplier() == val + !*/ + + void set_bias_weight_decay_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_bias_weight_decay_multiplier() == val + !*/ + + alias_tensor_const_instance get_weights( + ) const; + /*! + ensures + - returns an alias of get_layer_params(), containing the weights matrix of + the fully connected layer. + - #get_weights().num_samples() is the number of elements in input sample, + i.e. sublayer's output's k * nc * nr. 
+ - #get_weights().k() == #get_num_outputs() + - if get_bias_mode() == FC_HAS_BIAS: + - #get_layer_params().size() == (#get_weights().size() + #get_biases().size()) + - else: + - #get_layer_params().size() == #get_weights().size() + !*/ + + alias_tensor_instance get_weights( + ); + /*! + ensures + - returns an alias of get_layer_params(), containing the weights matrix of + the fully connected layer. + - #get_weights().num_samples() is the number of elements in input sample, + i.e. sublayer's output's k * nc * nr. + - #get_weights().k() == #get_num_outputs() + - if get_bias_mode() == FC_HAS_BIAS: + - #get_layer_params().size() == (#get_weights().size() + #get_biases().size()) + - else: + - #get_layer_params().size() == #get_weights().size() + !*/ + + alias_tensor_const_instance get_biases( + ) const; + /*! + requires + - #get_bias_mode() == FC_HAS_BIAS + ensures + - returns an alias of get_layer_params(), containing the bias vector of + the fully connected layer. + - #get_biases().num_samples() == 1 + - #get_biases().k() == #get_num_outputs() + - #get_layer_params().size() == (#get_weights().size() + #get_biases().size()) + !*/ + + alias_tensor_instance get_biases( + ); + /*! + requires + - #get_bias_mode() == FC_HAS_BIAS + ensures + - returns an alias of get_layer_params(), containing the bias vector of + the fully connected layer. + - #get_biases().num_samples() == 1 + - #get_biases().k() == #get_num_outputs() + - #get_layer_params().size() == (#get_weights().size() + #get_biases().size()) + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + + }; + + template < + unsigned long num_outputs, + typename SUBNET + > + using fc = add_layer<fc_<num_outputs,FC_HAS_BIAS>, SUBNET>; + + template < + unsigned long num_outputs, + typename SUBNET + > + using fc_no_bias = add_layer<fc_<num_outputs,FC_NO_BIAS>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + struct num_con_outputs + { + num_con_outputs(unsigned long n) : num_outputs(n) {} + unsigned long num_outputs; + }; + + template < + long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y = _stride_y!=1? 0 : _nr/2, + int _padding_x = _stride_x!=1? 0 : _nc/2 + > + class con_ + { + /*! + REQUIREMENTS ON TEMPLATE ARGUMENTS + - _num_filters > 0 + - _nr >= 0 + - _nc >= 0 + - _stride_y > 0 + - _stride_x > 0 + - _padding_y >= 0 + - _padding_x >= 0 + - Also, we require that: + - if (_nr == 0) then + - _padding_y == 0 + - else + - _padding_y < _nr + - if (_nc == 0) then + - _padding_x == 0 + - else + - _padding_x < _nc + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a convolution layer that takes an + input tensor (nominally representing an image) and convolves it with a set + of filters and then outputs the results.
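+
+            For example (an editorial illustration, not additional API), a tiny
+            network that applies 32 3x3 filters with stride 1 to an RGB image and
+            passes the result through a rectified linear unit could be declared as:
+                using simple_net = relu<con<32,3,3,1,1, input<matrix<rgb_pixel>>>>;
+            Since the stride is 1 the default padding is _nr/2 == 1, so this
+            particular layer preserves the spatial dimensions of its input.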
+ + The dimensions of the tensors output by this layer are as follows (letting + IN be the input tensor and OUT the output tensor): + - OUT.num_samples() == IN.num_samples() + - OUT.k() == num_filters() + - OUT.nr() == 1+(IN.nr() + 2*padding_y() - nr())/stride_y() + - OUT.nc() == 1+(IN.nc() + 2*padding_x() - nc())/stride_x() + + Note also that setting _nr or _nc to 0 has a special meaning of "set the + filter size equal to the input image size". Specifically, it means: + - if (_nr == 0) then + - nr() == IN.nr() + - OUT.nr() == 1 + - if (_nc == 0) then + - nc() == IN.nc() + - OUT.nc() == 1 + !*/ + + public: + con_( + ); + /*! + ensures + - #num_filters() == _num_filters + - #nr() == _nr + - #nc() == _nc + - #stride_y() == _stride_y + - #stride_x() == _stride_x + - #padding_y() == _padding_y + - #padding_x() == _padding_x + - #get_learning_rate_multiplier() == 1 + - #get_weight_decay_multiplier() == 1 + - #get_bias_learning_rate_multiplier() == 1 + - #get_bias_weight_decay_multiplier() == 0 + !*/ + + con_( + num_con_outputs o + ); + /*! + ensures + - #num_filters() == o.num_outputs + - #nr() == _nr + - #nc() == _nc + - #stride_y() == _stride_y + - #stride_x() == _stride_x + - #padding_y() == _padding_y + - #padding_x() == _padding_x + - #get_learning_rate_multiplier() == 1 + - #get_weight_decay_multiplier() == 1 + - #get_bias_learning_rate_multiplier() == 1 + - #get_bias_weight_decay_multiplier() == 0 + !*/ + + long num_filters( + ) const; + /*! + ensures + - returns the number of filters contained in this layer. The k dimension + of the output tensors produced by this layer will be equal to the number + of filters. + !*/ + + void set_num_filters( + long num + ); + /*! + requires + - num > 0 + - get_layer_params().size() == 0 || num_filters() == num + (i.e. You can't change the number of filters in con_ if the parameter + tensor has already been allocated.) + ensures + - #num_filters() == num + !*/ + + long nr( + ) const; + /*! + ensures + - returns the number of rows in the filters in this layer. Note that if + nr()==0 then it means the size of the filter is not yet assigned, but + once setup() is called nr() will be set to the input tensor's nr(). + Therefore, nr()==0 has the special interpretation of "be the same size as + the input tensor". + !*/ + + long nc( + ) const; + /*! + ensures + - returns the number of columns in the filters in this layer. Note that if + nc()==0 then it means the size of the filter is not yet assigned, but + once setup() is called nc() will be set to the input tensor's nc(). + Therefore, nc()==0 has the special interpretation of "be the same size as + the input tensor". + !*/ + + long stride_y( + ) const; + /*! + ensures + - returns the vertical stride used when convolving the filters over an + image. That is, each filter will be moved stride_y() pixels down at a + time when it moves over the image. + !*/ + + long stride_x( + ) const; + /*! + ensures + - returns the horizontal stride used when convolving the filters over an + image. That is, each filter will be moved stride_x() pixels right at a + time when it moves over the image. + !*/ + + long padding_y( + ) const; + /*! + ensures + - returns the number of pixels of zero padding added to the top and bottom + sides of the image. + !*/ + + long padding_x( + ) const; + /*! + ensures + - returns the number of pixels of zero padding added to the left and right + sides of the image. + !*/ + + double get_learning_rate_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. 
The interpretation is that this object is + requesting that the learning rate used to optimize its parameters be + multiplied by get_learning_rate_multiplier(). + !*/ + + double get_weight_decay_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the weight decay used to optimize its parameters be + multiplied by get_weight_decay_multiplier(). + !*/ + + void set_learning_rate_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_learning_rate_multiplier() == val + !*/ + + void set_weight_decay_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_weight_decay_multiplier() == val + !*/ + + double get_bias_learning_rate_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the learning rate used to optimize its bias parameters be + multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier(). + !*/ + + double get_bias_weight_decay_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the weight decay used to optimize its bias parameters be + multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier(). + !*/ + + void set_bias_learning_rate_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_bias_learning_rate_multiplier() == val + !*/ + + void set_bias_weight_decay_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_bias_weight_decay_multiplier() == val + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + + }; + + template < + long num_filters, + long nr, + long nc, + int stride_y, + int stride_x, + typename SUBNET + > + using con = add_layer<con_<num_filters,nr,nc,stride_y,stride_x>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y = _stride_y!=1? 0 : _nr/2, + int _padding_x = _stride_x!=1? 0 : _nc/2 + > + class cont_ + { + /*! + REQUIREMENTS ON TEMPLATE ARGUMENTS + All of them must be > 0. + Also, we require that: + - 0 <= _padding_y && _padding_y < _nr + - 0 <= _padding_x && _padding_x < _nc + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a transposed convolution layer + that takes an input tensor and transpose convolves (sometimes called + "deconvolution") it with a set of filters and then outputs the results. + + This is essentially a convolutional layer that allows fractional strides. + Therefore, you can make output tensors that are larger than the input + tensors using this layer type. 
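+
+            For example (a worked instance of the formulas below, with K and SUBNET
+            standing for the desired filter count and the downstream network):
+            cont_<K,2,2,2,2> gets the default padding of 0 since its stride is not 1,
+            so an input plane with NR rows becomes an output plane with
+            2*(NR-1) + 2 - 0 == 2*NR rows, and likewise for columns. That is,
+                using upsampler = cont<K,2,2,2,2, SUBNET>;
+            exactly doubles the spatial dimensions of its input, which is a common
+            way to upsample inside a network.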
+ + + The dimensions of the tensors output by this layer are as follows (letting + IN be the input tensor and OUT the output tensor): + - OUT.num_samples() == IN.num_samples() + - OUT.k() == num_filters() + - OUT.nr() == stride_y()*(IN.nr()-1) + nr() - 2*padding_y() + - OUT.nc() == stride_x()*(IN.nc()-1) + nc() - 2*padding_x() + !*/ + + public: + cont_( + ); + /*! + ensures + - #num_filters() == _num_filters + - #nr() == _nr + - #nc() == _nc + - #stride_y() == _stride_y + - #stride_x() == _stride_x + - #padding_y() == _padding_y + - #padding_x() == _padding_x + - #get_learning_rate_multiplier() == 1 + - #get_weight_decay_multiplier() == 1 + - #get_bias_learning_rate_multiplier() == 1 + - #get_bias_weight_decay_multiplier() == 0 + !*/ + + cont_( + num_con_outputs o + ); + /*! + ensures + - #num_filters() == o.num_outputs + - #nr() == _nr + - #nc() == _nc + - #stride_y() == _stride_y + - #stride_x() == _stride_x + - #padding_y() == _padding_y + - #padding_x() == _padding_x + - #get_learning_rate_multiplier() == 1 + - #get_weight_decay_multiplier() == 1 + - #get_bias_learning_rate_multiplier() == 1 + - #get_bias_weight_decay_multiplier() == 0 + !*/ + + long num_filters( + ) const; + /*! + ensures + - returns the number of filters contained in this layer. The k dimension + of the output tensors produced by this layer will be equal to the number + of filters. + !*/ + + void set_num_filters( + long num + ); + /*! + requires + - num > 0 + - get_layer_params().size() == 0 || num_filters() == num + (i.e. You can't change the number of filters in cont_ if the parameter + tensor has already been allocated.) + ensures + - #num_filters() == num + !*/ + + long nr( + ) const; + /*! + ensures + - returns the number of rows in the filters in this layer. + !*/ + + long nc( + ) const; + /*! + ensures + - returns the number of columns in the filters in this layer. + !*/ + + long stride_y( + ) const; + /*! + ensures + - returns the vertical stride used when convolving the filters over an + image. That is, each filter will be moved 1.0/stride_y() pixels down at + a time when it moves over the image. + !*/ + + long stride_x( + ) const; + /*! + ensures + - returns the horizontal stride used when convolving the filters over an + image. That is, each filter will be moved 1.0/stride_x() pixels right at + a time when it moves over the image. + !*/ + + long padding_y( + ) const; + /*! + ensures + - returns the number of pixels of zero padding added to the top and bottom + sides of the image. + !*/ + + long padding_x( + ) const; + /*! + ensures + - returns the number of pixels of zero padding added to the left and right + sides of the image. + !*/ + + double get_learning_rate_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the learning rate used to optimize its parameters be + multiplied by get_learning_rate_multiplier(). + !*/ + + double get_weight_decay_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the weight decay used to optimize its parameters be + multiplied by get_weight_decay_multiplier(). + !*/ + + void set_learning_rate_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_learning_rate_multiplier() == val + !*/ + + void set_weight_decay_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_weight_decay_multiplier() == val + !*/ + + double get_bias_learning_rate_multiplier( + ) const; + /*! 
+ ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the learning rate used to optimize its bias parameters be + multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier(). + !*/ + + double get_bias_weight_decay_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the weight decay used to optimize its bias parameters be + multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier(). + !*/ + + void set_bias_learning_rate_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_bias_learning_rate_multiplier() == val + !*/ + + void set_bias_weight_decay_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_bias_weight_decay_multiplier() == val + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + + }; + + template < + long num_filters, + long nr, + long nc, + int stride_y, + int stride_x, + typename SUBNET + > + using cont = add_layer<cont_<num_filters,nr,nc,stride_y,stride_x>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + int scale_y, + int scale_x + > + class upsample_ + { + /*! + REQUIREMENTS ON TEMPLATE ARGUMENTS + All of them must be >= 1. + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it allows you to upsample a layer using + bilinear interpolation. To be very specific, it upsamples each of the + channels in an input tensor. Therefore, if IN is the input tensor to this + layer and OUT the output tensor, then we will have: + - OUT.num_samples() == IN.num_samples() + - OUT.k() == IN.k() + - OUT.nr() == IN.nr()*scale_y + - OUT.nc() == IN.nc()*scale_x + - for all valid i,k: image_plane(OUT,i,k) is a copy of + image_plane(IN,i,k) that has been bilinearly interpolated to fit into + the shape of image_plane(OUT,i,k). + !*/ + public: + + upsample_( + ); + /*! + ensures + - This object has no state, so the constructor does nothing, aside from + providing default constructability. + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + template < + int scale, + typename SUBNET + > + using upsample = add_layer<upsample_<scale,scale>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class dropout_ + { + /*!
+ WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a dropout layer. Therefore, it + passes its inputs through the stochastic function f(x) which outputs either + 0 or x. The probability of 0 being output is given by the drop_rate + argument to this object's constructor. + + Note that, after you finish training a network with dropout, it is a good + idea to replace each dropout_ layer with a multiply_ layer because the + multiply_ layer is faster and deterministic. + !*/ + + public: + + explicit dropout_( + float drop_rate = 0.5 + ); + /*! + requires + - 0 <= drop_rate <= 1 + ensures + - #get_drop_rate() == drop_rate + !*/ + + float get_drop_rate ( + ) const; + /*! + ensures + - returns the probability that an individual input value to this layer will + be replaced with 0. + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + template <typename SUBNET> + using dropout = add_layer<dropout_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class multiply_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a basic layer that just + multiplies its input tensor with a constant value and returns the result. + It therefore has no learnable parameters. + !*/ + + public: + explicit multiply_( + float val = 0.5 + ); + /*! + ensures + - #get_multiply_value() == val + !*/ + + multiply_ ( + const dropout_& item + ); + /*! + ensures + - #get_multiply_value() == 1-item.get_drop_rate() + (i.e. We construct the multiply_ layer so that it is essentially a + deterministic version of the given dropout_ layer) + !*/ + + float get_multiply_value ( + ) const; + /*! + ensures + - this layer simply multiplies its input tensor by get_multiply_value() and + produces the result as output. + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + template <typename SUBNET> + using multiply = add_layer<multiply_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + enum layer_mode + { + CONV_MODE = 0, // convolutional mode + FC_MODE = 1 // fully connected mode + }; + + const double DEFAULT_BATCH_NORM_EPS = 0.0001; + + template < + layer_mode mode + > + class bn_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. 
In particular, it defines a batch normalization layer that + implements the method described in the paper: + Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift by Sergey Ioffe and Christian Szegedy + + In particular, this layer produces output tensors with the same + dimensionality as the input tensors, except that the mean and variances of + the elements have been standardized to 0 and 1 respectively. + + It should also be noted that when tensors with a num_samples() dimension of + 1 are passed to this layer it doesn't perform batch normalization. + Instead, it runs in "inference mode" where the learned linear normalizing + transformation is used to transform the tensor. + + Finally, after you finish training a batch normalized network, it is a good + idea to replace each bn_ layer with an affine_ layer because the affine_ + layer is faster and will never surprise you by performing batch + normalization on tensors that have a num_samples() dimension > 1. This allows + you to run large mini-batches of samples through your final network without + batch normalization executing at all. + !*/ + + public: + bn_( + ); + /*! + ensures + - #get_mode() == mode + - #get_running_stats_window_size() == 100 + - #get_learning_rate_multiplier() == 1 + - #get_weight_decay_multiplier() == 0 + - #get_bias_learning_rate_multiplier() == 1 + - #get_bias_weight_decay_multiplier() == 1 + - #get_eps() == tt::DEFAULT_BATCH_NORM_EPS + !*/ + + explicit bn_( + unsigned long window_size, + double eps = tt::DEFAULT_BATCH_NORM_EPS + ); + /*! + requires + - eps > 0 + - window_size > 0 + ensures + - #get_mode() == mode + - #get_running_stats_window_size() == window_size + - #get_learning_rate_multiplier() == 1 + - #get_weight_decay_multiplier() == 0 + - #get_bias_learning_rate_multiplier() == 1 + - #get_bias_weight_decay_multiplier() == 1 + - #get_eps() == eps + !*/ + + layer_mode get_mode( + ) const; + /*! + ensures + - returns the mode of this layer, either CONV_MODE or FC_MODE. + If the mode is FC_MODE then the normalization is applied across the + samples in a tensor (i.e. k()*nr()*nc() different things will be + normalized). Otherwise, normalization is applied across everything + except for the k() dimension, resulting in there being only k() + normalization equations that are applied spatially over the tensor. + + Therefore, if you are putting batch normalization after a fully connected + layer you should use FC_MODE. Otherwise, if you are putting batch + normalization after a convolutional layer you should use CONV_MODE. + !*/ + + double get_eps( + ) const; + /*! + ensures + - When doing batch normalization, we are dividing by the standard + deviation. This epsilon value returned by this function is added to the + variance to prevent the division from dividing by zero. + !*/ + + unsigned long get_running_stats_window_size ( + ) const; + /*! + ensures + - Just as recommended in the batch normalization paper, this object keeps a + running average of the mean and standard deviations of the features. + These averages are used during "inference mode" so you can run a single + object through a batch normalized network. They are also what is used to + initialize an affine_ layer that is constructed from a bn_ layer. This + function returns the effective number of recent samples used to compute + the running average. + !*/ + + void set_running_stats_window_size ( + unsigned long new_window_size + ); + /*! 
+ requires + - new_window_size > 0 + ensures + - #get_running_stats_window_size() == new_window_size + !*/ + + double get_learning_rate_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the learning rate used to optimize its parameters be + multiplied by get_learning_rate_multiplier(). + !*/ + + double get_weight_decay_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the weight decay used to optimize its parameters be + multiplied by get_weight_decay_multiplier(). + !*/ + + void set_learning_rate_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_learning_rate_multiplier() == val + !*/ + + void set_weight_decay_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_weight_decay_multiplier() == val + !*/ + + double get_bias_learning_rate_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the learning rate used to optimize its bias parameters be + multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier(). + !*/ + + double get_bias_weight_decay_multiplier( + ) const; + /*! + ensures + - returns a multiplier number. The interpretation is that this object is + requesting that the weight decay used to optimize its bias parameters be + multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier(). + !*/ + + void set_bias_learning_rate_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_bias_learning_rate_multiplier() == val + !*/ + + void set_bias_weight_decay_multiplier( + double val + ); + /*! + requires + - val >= 0 + ensures + - #get_bias_weight_decay_multiplier() == val + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + template <typename SUBNET> + using bn_con = add_layer<bn_<CONV_MODE>, SUBNET>; + template <typename SUBNET> + using bn_fc = add_layer<bn_<FC_MODE>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template <typename net_type> + void set_all_bn_running_stats_window_sizes ( + const net_type& net, + unsigned long new_window_size + ); + /*! + requires + - new_window_size > 0 + - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or + add_tag_layer. + ensures + - Sets the get_running_stats_window_size() field of all bn_ layers in net to + new_window_size. + !*/ + +// ---------------------------------------------------------------------------------------- + + class affine_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it applies a simple pointwise linear + transformation to an input tensor. You can think of it as having two + parameter tensors, A and B. 
If the input tensor is called INPUT then the + output of this layer is: + A*INPUT+B + where all operations are performed element wise and each sample in the + INPUT tensor is processed separately. + + Moreover, this object has two modes that affect the dimensionalities of A + and B and how they are applied to compute A*INPUT+B. If + get_mode()==FC_MODE then A and B each have the same dimensionality as the + input tensor, except their num_samples() dimensions are 1. If + get_mode()==CONV_MODE then A and B have all their dimensions set to 1 + except for k(), which is equal to INPUT.k(). + + In either case, the computation of A*INPUT+B is performed pointwise over all + the elements of INPUT using either: + OUTPUT(n,k,r,c) == A(1,k,r,c)*INPUT(n,k,r,c)+B(1,k,r,c) + or + OUTPUT(n,k,r,c) == A(1,k,1,1)*INPUT(n,k,r,c)+B(1,k,1,1) + as appropriate. + + + Finally, note that the parameters of this layer are not learnable and + therefore not modified during network updates. Instead, the layer will + perform the identity transformation unless it is initialized with a bn_ + layer, in which case it will perform whatever transformation the bn_ layer + has learned. + !*/ + + public: + + affine_( + ); + /*! + ensures + - #get_mode() == FC_MODE + !*/ + + affine_( + layer_mode mode + ); + /*! + ensures + - #get_mode() == mode + !*/ + + template < + layer_mode mode + > + affine_( + const bn_<mode>& layer + ); + /*! + ensures + - Constructs affine_ so that it performs the same transformation as the + supplied batch normalization layer. You would want to do this after you + finish training a network with bn_ layers because the affine_ layer will + execute faster. + - #get_mode() == layer.get_mode() + !*/ + + layer_mode get_mode( + ) const; + /*! + ensures + - returns the mode of this layer, either CONV_MODE or FC_MODE. + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the + EXAMPLE_COMPUTATIONAL_LAYER_ interface. Also note that get_layer_params() + always returns an empty tensor since there are no learnable parameters in this + object. + !*/ + + }; + + template <typename SUBNET> + using affine = add_layer<affine_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y = _stride_y!=1? 0 : _nr/2, + int _padding_x = _stride_x!=1? 0 : _nc/2 + > + class max_pool_ + { + /*! + REQUIREMENTS ON TEMPLATE ARGUMENTS + - _nr >= 0 + - _nc >= 0 + - _stride_y > 0 + - _stride_x > 0 + - _padding_y >= 0 + - _padding_x >= 0 + - if (_nr != 0) then + - _padding_y < _nr + - else + - _padding_y == 0 + - if (_nc != 0) then + - _padding_x < _nc + - else + - _padding_x == 0 + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a max pooling layer that takes an + input tensor and downsamples it. It does this by sliding a window over the + images in an input tensor and outputting, for each channel, the maximum + element within the window.
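+
+            For example (a worked instance of the formulas below): max_pool_<2,2,2,2>
+            gets the default padding of 0, so a 32x32 input plane becomes a
+            1+(32-2)/2 == 16 row by 16 column output plane. That is,
+                using downsampler = max_pool<2,2,2,2, SUBNET>;
+            is the usual "halve the spatial dimensions" pooling layer (SUBNET again
+            standing for the downstream network).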
+ + If _nr == 0 then it means the filter size covers all the rows in the input + tensor, similarly for the _nc parameter. To be precise, if we call the + input tensor IN and the output tensor OUT, then OUT is defined as follows: + - let FILT_NR == (nr()==0) ? IN.nr() : nr() + - let FILT_NC == (nc()==0) ? IN.nc() : nc() + - OUT.num_samples() == IN.num_samples() + - OUT.k() == IN.k() + - OUT.nr() == 1+(IN.nr() + 2*padding_y() - FILT_NR)/stride_y() + - OUT.nc() == 1+(IN.nc() + 2*padding_x() - FILT_NC)/stride_x() + - for all valid s, k, r, and c: + - image_plane(OUT,s,k)(r,c) == max(subm_clipped(image_plane(IN,s,k), + centered_rect(c*stride_x() + FILT_NC/2 - padding_x(), + r*stride_y() + FILT_NR/2 - padding_y(), + FILT_NC, + FILT_NR))) + !*/ + + public: + + max_pool_ ( + ); + /*! + ensures + - #nr() == _nr + - #nc() == _nc + - #stride_y() == _stride_y + - #stride_x() == _stride_x + - #padding_y() == _padding_y + - #padding_x() == _padding_x + !*/ + + long nr( + ) const; + /*! + ensures + - returns the number of rows in the pooling window or 0 if the window size + is "the entire input tensor". + !*/ + + long nc( + ) const; + /*! + ensures + - returns the number of columns in the pooling window or 0 if the window size + is "the entire input tensor". + !*/ + + long stride_y( + ) const; + /*! + ensures + - returns the vertical stride used when scanning the max pooling window + over an image. That is, each window will be moved stride_y() pixels down + at a time when it moves over the image. + !*/ + + long stride_x( + ) const; + /*! + ensures + - returns the horizontal stride used when scanning the max pooling window + over an image. That is, each window will be moved stride_x() pixels right + at a time when it moves over the image. + !*/ + + long padding_y( + ) const; + /*! + ensures + - returns the number of pixels of zero padding added to the top and bottom + sides of the image. + !*/ + + long padding_x( + ) const; + /*! + ensures + - returns the number of pixels of zero padding added to the left and right + sides of the image. + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ + interface. Note that this layer doesn't have any parameters, so the tensor + returned by get_layer_params() is always empty. + !*/ + }; + + template < + long nr, + long nc, + int stride_y, + int stride_x, + typename SUBNET + > + using max_pool = add_layer<max_pool_<nr,nc,stride_y,stride_x>, SUBNET>; + + template < + typename SUBNET + > + using max_pool_everything = add_layer<max_pool_<0,0,1,1>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y = _stride_y!=1? 0 : _nr/2, + int _padding_x = _stride_x!=1? 0 : _nc/2 + > + class avg_pool_ + { + /*!
+ REQUIREMENTS ON TEMPLATE ARGUMENTS + - _nr >= 0 + - _nc >= 0 + - _stride_y > 0 + - _stride_x > 0 + - _padding_y >= 0 + - _padding_x >= 0 + - if (_nr != 0) then + - _padding_y < _nr + - else + - _padding_y == 0 + - if (_nc != 0) then + - _padding_x < _nc + - else + - _padding_x == 0 + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines an average pooling layer that + takes an input tensor and downsamples it. It does this by sliding a window + over the images in an input tensor and outputting, for each channel, the + average element within the window. + + If _nr == 0 then it means the filter size covers all the rows in the input + tensor, similarly for the _nc parameter. To be precise, if we call the + input tensor IN and the output tensor OUT, then OUT is defined as follows: + - let FILT_NR == (nr()==0) ? IN.nr() : nr() + - let FILT_NC == (nc()==0) ? IN.nc() : nc() + - OUT.num_samples() == IN.num_samples() + - OUT.k() == IN.k() + - OUT.nr() == 1+(IN.nr() + 2*padding_y() - FILT_NR)/stride_y() + - OUT.nc() == 1+(IN.nc() + 2*padding_x() - FILT_NC)/stride_x() + - for all valid s, k, r, and c: + - image_plane(OUT,s,k)(r,c) == mean(subm_clipped(image_plane(IN,s,k), + centered_rect(c*stride_x() + FILT_NC/2 - padding_x(), + r*stride_y() + FILT_NR/2 - padding_y(), + FILT_NC, + FILT_NR))) + !*/ + + public: + + avg_pool_ ( + ); + /*! + ensures + - #nr() == _nr + - #nc() == _nc + - #stride_y() == _stride_y + - #stride_x() == _stride_x + - #padding_y() == _padding_y + - #padding_x() == _padding_x + !*/ + + long nr( + ) const; + /*! + ensures + - returns the number of rows in the pooling window or 0 if the window size + is "the entire input tensor". + !*/ + + long nc( + ) const; + /*! + ensures + - returns the number of columns in the pooling window or 0 if the window size + is "the entire input tensor". + !*/ + + long stride_y( + ) const; + /*! + ensures + - returns the vertical stride used when scanning the pooling window + over an image. That is, each window will be moved stride_y() pixels down + at a time when it moves over the image. + !*/ + + long stride_x( + ) const; + /*! + ensures + - returns the horizontal stride used when scanning the pooling window + over an image. That is, each window will be moved stride_x() pixels right + at a time when it moves over the image. + !*/ + + long padding_y( + ) const; + /*! + ensures + - returns the number of pixels of zero padding added to the top and bottom + sides of the image. + !*/ + + long padding_x( + ) const; + /*! + ensures + - returns the number of pixels of zero padding added to the left and right + sides of the image. + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ + interface. Note that this layer doesn't have any parameters, so the tensor + returned by get_layer_params() is always empty.
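+
+            As with max_pool_, here is a worked instance of the output size formulas
+            above (an editorial example): avg_pool_<2,2,2,2> gets the default padding
+            of 0 and maps a 32x32 input plane to a 1+(32-2)/2 == 16 row by 16 column
+            output plane, replacing each 2x2 window with its average.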
+ !*/ + + }; + + template < + long nr, + long nc, + int stride_y, + int stride_x, + typename SUBNET + > + using avg_pool = add_layer<avg_pool_<nr,nc,stride_y,stride_x>, SUBNET>; + + template < + typename SUBNET + > + using avg_pool_everything = add_layer<avg_pool_<0,0,1,1>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class relu_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a rectified linear layer. + Therefore, it passes its inputs through the function + f(x)=max(x,0) + where f() is applied pointwise across the input tensor. + !*/ + + public: + + relu_( + ); + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ + interface. Note that this layer doesn't have any parameters, so the tensor + returned by get_layer_params() is always empty. + !*/ + }; + + template <typename SUBNET> + using relu = add_layer<relu_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class prelu_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a parametric rectified linear + layer. Therefore, it passes its inputs through the function + f(x) = x>0 ? x : p*x + where f() is applied pointwise across the input tensor and p is a scalar + parameter learned by this layer. + + + This is the layer type introduced in the paper: + He, Kaiming, et al. "Delving deep into rectifiers: Surpassing + human-level performance on imagenet classification." Proceedings of the + IEEE International Conference on Computer Vision. 2015. + !*/ + + public: + + explicit prelu_( + float initial_param_value = 0.25 + ); + /*! + ensures + - The p parameter will be initialized with initial_param_value. + - #get_initial_param_value() == initial_param_value. + !*/ + + float get_initial_param_value ( + ) const; + /*! + ensures + - returns the initial value of the prelu parameter. + !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + template <typename SUBNET> + using prelu = add_layer<prelu_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class sig_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a sigmoid layer. 
Therefore, it + passes its inputs through the function + f(x)=1/(1+exp(-x)) + where f() is applied pointwise across the input tensor. + !*/ + + public: + + sig_( + ); + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ + interface. Note that this layer doesn't have any parameters, so the tensor + returned by get_layer_params() is always empty. + !*/ + }; + + template <typename SUBNET> + using sig = add_layer<sig_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class htan_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a hyperbolic tangent layer. + Therefore, it passes its inputs through the function + f(x)=std::tanh(x) + where f() is applied pointwise across the input tensor. + !*/ + + public: + + htan_( + ); + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ + interface. Note that this layer doesn't have any parameters, so the tensor + returned by get_layer_params() is always empty. + !*/ + }; + + template <typename SUBNET> + using htan = add_layer<htan_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class softmax_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a softmax layer. To be precise, + we define the softmax function s(x) as: + s(x) == exp(x)/sum(exp(x)) + where x is a vector. Then this layer treats its input tensor as a + collection of multi-channel images and applies s() to each spatial location + in each image. In each application, the tensor::k() channel elements at + each position are input to s() and then replaced by the outputs of s(). + + This means that, for example, if you collapsed each output image to a 1 + channel image by adding the channels then you would end up with images + where each pixel value was 1. This is because the sum of the outputs of + s() will always be equal to 1. + !*/ + + public: + + softmax_( + ); + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ + interface. Note that this layer doesn't have any parameters, so the tensor + returned by get_layer_params() is always empty. 
+ !*/ + }; + + template <typename SUBNET> + using softmax = add_layer<softmax_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class softmax_all_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, it defines a softmax layer. To be precise, + we define the softmax function s(x) as: + s(x) == exp(x)/sum(exp(x)) + where x is a vector. Then this layer treats its input tensor as a + collection of tensor::num_samples() vectors and applies s() to each vector + in the tensor. Therefore, there are logically tensor::num_samples() + invocations of s(). + !*/ + + public: + + softmax_all_( + ); + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ + interface. Note that this layer doesn't have any parameters, so the tensor + returned by get_layer_params() is always empty. + !*/ + }; + + template <typename SUBNET> + using softmax_all = add_layer<softmax_all_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + template < + template<typename> class tag + > + class add_prev_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. This layer simply adds the output of two previous layers. + In particular, it adds the tensor from its immediate predecessor layer, + sub.get_output(), with the tensor from a deeper layer, + layer<tag>(sub).get_output(). + + Therefore, you supply a tag via add_prev_'s template argument that tells it + what layer to add to the output of the previous layer. The result of this + addition is output by add_prev_. Finally, the addition happens pointwise + according to 4D tensor arithmetic. If the dimensions don't match then + missing elements are presumed to be equal to 0. Moreover, each dimension + of the output tensor is equal to the maximum dimension of either of the + inputs. That is, if the tensors A and B are being added to produce C then: + - C.num_samples() == max(A.num_samples(), B.num_samples()) + - C.k() == max(A.k(), B.k()) + - C.nr() == max(A.nr(), B.nr()) + - C.nc() == max(A.nc(), B.nc()) + !*/ + + public: + add_prev_( + ); + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + + template < + template<typename> class tag, + typename SUBNET + > + using add_prev = add_layer<add_prev_<tag>, SUBNET>; + + // Here we add some convenient aliases for using add_prev_ with the tag layers. 
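+    // For example (an illustrative sketch, not an alias defined in dlib), a
+    // ResNet-style residual unit can be built with add_prev1 by tagging the
+    // block's input and adding it back after two convolutions:
+    //
+    //    template <long N, typename SUBNET>
+    //    using residual_unit = relu<add_prev1<con<N,3,3,1,1,relu<con<N,3,3,1,1,tag1<SUBNET>>>>>>;
+    //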
+ template <typename SUBNET> using add_prev1 = add_prev<tag1, SUBNET>; + template <typename SUBNET> using add_prev2 = add_prev<tag2, SUBNET>; + template <typename SUBNET> using add_prev3 = add_prev<tag3, SUBNET>; + template <typename SUBNET> using add_prev4 = add_prev<tag4, SUBNET>; + template <typename SUBNET> using add_prev5 = add_prev<tag5, SUBNET>; + template <typename SUBNET> using add_prev6 = add_prev<tag6, SUBNET>; + template <typename SUBNET> using add_prev7 = add_prev<tag7, SUBNET>; + template <typename SUBNET> using add_prev8 = add_prev<tag8, SUBNET>; + template <typename SUBNET> using add_prev9 = add_prev<tag9, SUBNET>; + template <typename SUBNET> using add_prev10 = add_prev<tag10, SUBNET>; + using add_prev1_ = add_prev_<tag1>; + using add_prev2_ = add_prev_<tag2>; + using add_prev3_ = add_prev_<tag3>; + using add_prev4_ = add_prev_<tag4>; + using add_prev5_ = add_prev_<tag5>; + using add_prev6_ = add_prev_<tag6>; + using add_prev7_ = add_prev_<tag7>; + using add_prev8_ = add_prev_<tag8>; + using add_prev9_ = add_prev_<tag9>; + using add_prev10_ = add_prev_<tag10>; + +// ---------------------------------------------------------------------------------------- + + template < + template<typename> class tag + > + class mult_prev_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. This layer simply multiplies the output of two previous + layers. In particular, it multiplies the tensor from its immediate + predecessor layer, sub.get_output(), with the tensor from a deeper layer, + layer<tag>(sub).get_output(). + + Therefore, you supply a tag via mult_prev_'s template argument that tells + it what layer to multiply with the output of the previous layer. The + result of this multiplication is output by mult_prev_. Finally, the + multiplication happens pointwise according to 4D tensor arithmetic. If the + dimensions don't match then missing elements are presumed to be equal to 0. + Moreover, each dimension of the output tensor is equal to the maximum + dimension of either of the inputs. That is, if the tensors A and B are + being multiplied to produce C then: + - C.num_samples() == max(A.num_samples(), B.num_samples()) + - C.k() == max(A.k(), B.k()) + - C.nr() == max(A.nr(), B.nr()) + - C.nc() == max(A.nc(), B.nc()) + !*/ + + public: + mult_prev_( + ); + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + + template < + template<typename> class tag, + typename SUBNET + > + using mult_prev = add_layer<mult_prev_<tag>, SUBNET>; + + // Here we add some convenient aliases for using mult_prev_ with the tag layers. 
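+    // For example (an illustrative sketch, not an alias defined in dlib), a
+    // sigmoid-gated unit that pointwise multiplies a learned gate with the
+    // tagged input could be written as:
+    //
+    //    template <typename SUBNET>
+    //    using gated_unit = mult_prev1<sig<con<16,3,3,1,1,tag1<SUBNET>>>>;
+    //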
+ template <typename SUBNET> using mult_prev1 = mult_prev<tag1, SUBNET>; + template <typename SUBNET> using mult_prev2 = mult_prev<tag2, SUBNET>; + template <typename SUBNET> using mult_prev3 = mult_prev<tag3, SUBNET>; + template <typename SUBNET> using mult_prev4 = mult_prev<tag4, SUBNET>; + template <typename SUBNET> using mult_prev5 = mult_prev<tag5, SUBNET>; + template <typename SUBNET> using mult_prev6 = mult_prev<tag6, SUBNET>; + template <typename SUBNET> using mult_prev7 = mult_prev<tag7, SUBNET>; + template <typename SUBNET> using mult_prev8 = mult_prev<tag8, SUBNET>; + template <typename SUBNET> using mult_prev9 = mult_prev<tag9, SUBNET>; + template <typename SUBNET> using mult_prev10 = mult_prev<tag10, SUBNET>; + using mult_prev1_ = mult_prev_<tag1>; + using mult_prev2_ = mult_prev_<tag2>; + using mult_prev3_ = mult_prev_<tag3>; + using mult_prev4_ = mult_prev_<tag4>; + using mult_prev5_ = mult_prev_<tag5>; + using mult_prev6_ = mult_prev_<tag6>; + using mult_prev7_ = mult_prev_<tag7>; + using mult_prev8_ = mult_prev_<tag8>; + using mult_prev9_ = mult_prev_<tag9>; + using mult_prev10_ = mult_prev_<tag10>; + +// ---------------------------------------------------------------------------------------- + + template < + template<typename> class tag + > + class scale_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. This layer scales the output channels of the tagged layer + by multiplying it with the output of the previous layer. To be specific: + - Let INPUT == layer<tag>(sub).get_output() + - Let SCALES == sub.get_output() + - This layer takes INPUT and SCALES as input. + - The output of this layer has the same dimensions as INPUT. + - This layer requires: + - SCALES.num_samples() == INPUT.num_samples() + - SCALES.k() == INPUT.k() + - SCALES.nr() == 1 + - SCALES.nc() == 1 + - The output tensor is produced by pointwise multiplying SCALES with + INPUT at each spatial location. Therefore, if OUT is the output of + this layer then we would have: + OUT(n,k,r,c) == INPUT(n,k,r,c)*SCALES(n,k) + !*/ + + public: + scale_( + ); + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + + template < + template<typename> class tag, + typename SUBNET + > + using scale = add_layer<scale_<tag>, SUBNET>; + + // Here we add some convenient aliases for using scale_ with the tag layers. 
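+    // For example (an illustrative sketch, not an alias defined in dlib), a
+    // squeeze-and-excitation style block for a 64 channel input could compute
+    // the per-channel SCALES tensor with a global average pool followed by a
+    // sigmoid-activated fc layer, which yields the required nr()==nc()==1 shape:
+    //
+    //    template <typename SUBNET>
+    //    using se_block = scale1<sig<fc<64,avg_pool_everything<tag1<SUBNET>>>>>;
+    //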
+ template <typename SUBNET> using scale1 = scale<tag1, SUBNET>; + template <typename SUBNET> using scale2 = scale<tag2, SUBNET>; + template <typename SUBNET> using scale3 = scale<tag3, SUBNET>; + template <typename SUBNET> using scale4 = scale<tag4, SUBNET>; + template <typename SUBNET> using scale5 = scale<tag5, SUBNET>; + template <typename SUBNET> using scale6 = scale<tag6, SUBNET>; + template <typename SUBNET> using scale7 = scale<tag7, SUBNET>; + template <typename SUBNET> using scale8 = scale<tag8, SUBNET>; + template <typename SUBNET> using scale9 = scale<tag9, SUBNET>; + template <typename SUBNET> using scale10 = scale<tag10, SUBNET>; + using scale1_ = scale_<tag1>; + using scale2_ = scale_<tag2>; + using scale3_ = scale_<tag3>; + using scale4_ = scale_<tag4>; + using scale5_ = scale_<tag5>; + using scale6_ = scale_<tag6>; + using scale7_ = scale_<tag7>; + using scale8_ = scale_<tag8>; + using scale9_ = scale_<tag9>; + using scale10_ = scale_<tag10>; + +// ---------------------------------------------------------------------------------------- + + template< + template<typename> class... TAG_TYPES + > + class concat_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. This layer simply concatenates the output of tagged layers. + Importantly, each input layer must have the same dimensions (i.e. + num_samples, nr, and nc) except for the k channel, which may vary. This is + because the concatenation happens along the k dimension. That is, the + output of this network is a tensor, OUT, that is the concatenation of the + tensors: + for each (tag in TAG_TYPES) + layer<tag>(subnet).get_output() + Therefore, out.num_samples(), out.nr(), and out.nc() match the dimensions + of the input tensors while OUT.k() is the sum of the input layer's k() + dimensions. + !*/ + + public: + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + dpoint map_input_to_output(dpoint p) const; + dpoint map_output_to_input(dpoint p) const; + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + + // concat layer definitions + template <template<typename> class TAG1, + template<typename> class TAG2, + typename SUBNET> + using concat2 = add_layer<concat_<TAG1, TAG2>, SUBNET>; + + template <template<typename> class TAG1, + template<typename> class TAG2, + template<typename> class TAG3, + typename SUBNET> + using concat3 = add_layer<concat_<TAG1, TAG2, TAG3>, SUBNET>; + + template <template<typename> class TAG1, + template<typename> class TAG2, + template<typename> class TAG3, + template<typename> class TAG4, + typename SUBNET> + using concat4 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4>, SUBNET>; + + template <template<typename> class TAG1, + template<typename> class TAG2, + template<typename> class TAG3, + template<typename> class TAG4, + template<typename> class TAG5, + typename SUBNET> + using concat5 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4, TAG5>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + /*!A inception layer definitions !*/ + + // Now define inception layer tag types. 
These layer aliases allow creating + // the networks described in the paper: + // Szegedy, Christian, et al. "Going deeper with convolutions." Proceedings of + // the IEEE Conference on Computer Vision and Pattern Recognition. 2015. + // See the dnn_inception_ex.cpp example for a complete example of their use. Note also + // that we use tag ID numbers >= 1000 to avoid conflict with user's tag layers. + template <typename SUBNET> using itag0 = add_tag_layer< 1000 + 0, SUBNET>; + template <typename SUBNET> using itag1 = add_tag_layer< 1000 + 1, SUBNET>; + template <typename SUBNET> using itag2 = add_tag_layer< 1000 + 2, SUBNET>; + template <typename SUBNET> using itag3 = add_tag_layer< 1000 + 3, SUBNET>; + template <typename SUBNET> using itag4 = add_tag_layer< 1000 + 4, SUBNET>; + template <typename SUBNET> using itag5 = add_tag_layer< 1000 + 5, SUBNET>; + // skip to inception input + template <typename SUBNET> using iskip = add_skip_layer< itag0, SUBNET>; + + // here are some templates to be used for creating inception layer groups + template <template<typename>class B1, + template<typename>class B2, + typename SUBNET> + using inception2 = concat2<itag1, itag2, itag1<B1<iskip< itag2<B2< itag0<SUBNET>>>>>>>; + + template <template<typename>class B1, + template<typename>class B2, + template<typename>class B3, + typename SUBNET> + using inception3 = concat3<itag1, itag2, itag3, itag1<B1<iskip< itag2<B2<iskip< itag3<B3< itag0<SUBNET>>>>>>>>>>; + + template <template<typename>class B1, + template<typename>class B2, + template<typename>class B3, + template<typename>class B4, + typename SUBNET> + using inception4 = concat4<itag1, itag2, itag3, itag4, + itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip< itag4<B4< itag0<SUBNET>>>>>>>>>>>>>; + + template <template<typename>class B1, + template<typename>class B2, + template<typename>class B3, + template<typename>class B4, + template<typename>class B5, + typename SUBNET> + using inception5 = concat5<itag1, itag2, itag3, itag4, itag5, + itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip< itag4<B4<iskip< itag5<B5< itag0<SUBNET>>>>>>>>>>>>>>>>; + +// ---------------------------------------------------------------------------------------- + + const double DEFAULT_L2_NORM_EPS = 1e-5; + + class l2normalize_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. It takes tensors as input and L2 normalizes them. In particular, + it has the following properties: + - The output tensors from this layer have the same dimensions as the + input tensors. + - If you think of each input tensor as a set of tensor::num_samples() + vectors, then the output tensor contains the same vectors except they + have been length normalized so that their L2 norms are all 1. I.e. + for each vector v we will have ||v||==1. + !*/ + + public: + + explicit l2normalize_( + double eps = tt::DEFAULT_L2_NORM_EPS + ); + /*! + requires + - eps > 0 + ensures + - #get_eps() == eps + !*/ + + double get_eps( + ) const; + /*! + ensures + - When we normalize a vector we divide it by its L2 norm. However, the + get_eps() value is added to the squared norm prior to division to avoid + ever dividing by zero. 
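+                In other words, each input vector v is mapped to
+                    v/sqrt(dot(v,v) + get_eps())
+                so the output is well defined even for an all-zero v.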
+ !*/ + + template <typename SUBNET> void setup (const SUBNET& sub); + void forward_inplace(const tensor& input, tensor& output); + void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad); + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + +// ---------------------------------------------------------------------------------------- + + template < + long _offset, + long _k, + long _nr, + long _nc + > + class extract_ + { + /*! + REQUIREMENTS ON TEMPLATE ARGUMENTS + - 0 <= _offset + - 0 < _k + - 0 < _nr + - 0 < _nc + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface + defined above. In particular, the output of this layer is simply a copy of + the input tensor. However, you can configure the extract layer to output + only some subset of the input tensor and also to reshape it. Therefore, + the dimensions of the tensor output by this layer are as follows (letting + IN be the input tensor and OUT the output tensor): + - OUT.num_samples() == IN.num_samples() + - OUT.k() == _k + - OUT.nr() == _nr + - OUT.nc() == _nc + + So the output will always have the same number of samples as the input, but + within each sample (the k,nr,nc part) we will copy only a subset of the + values. Moreover, the _offset parameter controls which part of each sample + we take. To be very precise, we will have: + - let IN_SIZE = IN.k()*IN.nr()*IN.nc() + - let OUT_SIZE = _k*_nr*_nc + - for i in range[0,IN.num_samples()) and j in range[0,OUT_SIZE): + - OUT.host()[i*OUT_SIZE+j] == IN.host()[i*IN_SIZE+_offset+j] + + + Finally, all this means that the input tensor to this layer must have a big + enough size to accommodate taking a _k*_nr*_nc slice from each of its + samples. + !*/ + + public: + + template <typename SUBNET> void setup (const SUBNET& sub); + template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output); + template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad); + const tensor& get_layer_params() const; + tensor& get_layer_params(); + /*! + These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface. + !*/ + }; + + template < + long offset, + long k, + long nr, + long nc, + typename SUBNET + > + using extract = add_layer<extract_<offset,k,nr,nc>, SUBNET>; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_LAYERS_ABSTRACT_H_ + diff --git a/ml/dlib/dlib/dnn/loss.h b/ml/dlib/dlib/dnn/loss.h new file mode 100644 index 000000000..1b09b85c3 --- /dev/null +++ b/ml/dlib/dlib/dnn/loss.h @@ -0,0 +1,2870 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
+#ifndef DLIB_DNn_LOSS_H_ +#define DLIB_DNn_LOSS_H_ + +#include "loss_abstract.h" +#include "core.h" +#include "../matrix.h" +#include "tensor_tools.h" +#include "../geometry.h" +#include "../image_processing/box_overlap_testing.h" +#include "../image_processing/full_object_detection.h" +#include "../svm/ranking_tools.h" +#include <sstream> +#include <map> + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class loss_binary_hinge_ + { + public: + + typedef float training_label_type; + typedef float output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + + const tensor& output_tensor = sub.get_output(); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + *iter++ = out_data[i]; + } + } + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + + // The loss we output is the average loss over the mini-batch. 
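+            // For a sample with label y in {+1,-1} and network output f, the hinge
+            // loss is max(0, 1-y*f), whose derivative with respect to f is -y when
+            // 1-y*f > 0 and 0 otherwise. The loop below accumulates exactly that,
+            // with everything multiplied by the 1/num_samples scale.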
+ const double scale = 1.0/output_tensor.num_samples(); + double loss = 0; + const float* out_data = output_tensor.host(); + float* g = grad.host_write_only(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + const float y = *truth++; + DLIB_CASSERT(y == +1 || y == -1, "y: " << y); + const float temp = 1-y*out_data[i]; + if (temp > 0) + { + loss += scale*temp; + g[i] = -scale*y; + } + else + { + g[i] = 0; + } + } + return loss; + } + + friend void serialize(const loss_binary_hinge_& , std::ostream& out) + { + serialize("loss_binary_hinge_", out); + } + + friend void deserialize(loss_binary_hinge_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_binary_hinge_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_binary_hinge_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_binary_hinge_& ) + { + out << "loss_binary_hinge"; + return out; + } + + friend void to_xml(const loss_binary_hinge_& /*item*/, std::ostream& out) + { + out << "<loss_binary_hinge/>"; + } + + }; + + template <typename SUBNET> + using loss_binary_hinge = add_loss_layer<loss_binary_hinge_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_binary_log_ + { + public: + + typedef float training_label_type; + typedef float output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + + const tensor& output_tensor = sub.get_output(); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + *iter++ = out_data[i]; + } + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(grad.nr() == 1 && + grad.nc() == 1 && + grad.k() == 1); + + tt::sigmoid(grad, output_tensor); + + // The loss we output is the average loss over the mini-batch. 
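+            // For label y in {+1,-1} and output f this is the logistic loss
+            // log(1+exp(-y*f)). Writing s = sigmoid(f) (already stored in g by
+            // tt::sigmoid() above), the gradient with respect to f is s-1 when
+            // y=+1 and s when y=-1, which is what the loop below writes into g.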
+ const double scale = 1.0/output_tensor.num_samples(); + double loss = 0; + float* g = grad.host(); + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + const float y = *truth++; + DLIB_CASSERT(y == +1 || y == -1, "y: " << y); + float temp; + if (y > 0) + { + temp = log1pexp(-out_data[i]); + loss += scale*temp; + g[i] = scale*(g[i]-1); + } + else + { + temp = -(-out_data[i]-log1pexp(-out_data[i])); + loss += scale*temp; + g[i] = scale*g[i]; + } + } + return loss; + } + + friend void serialize(const loss_binary_log_& , std::ostream& out) + { + serialize("loss_binary_log_", out); + } + + friend void deserialize(loss_binary_log_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_binary_log_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_binary_log_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_binary_log_& ) + { + out << "loss_binary_log"; + return out; + } + + friend void to_xml(const loss_binary_log_& /*item*/, std::ostream& out) + { + out << "<loss_binary_log/>"; + } + + }; + + template <typename T> + T safe_log(T input, T epsilon = 1e-10) + { + // Prevent trying to calculate the logarithm of a very small number (let alone zero) + return std::log(std::max(input, epsilon)); + } + + template <typename SUBNET> + using loss_binary_log = add_loss_layer<loss_binary_log_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_multiclass_log_ + { + public: + + typedef unsigned long training_label_type; + typedef unsigned long output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + const tensor& output_tensor = sub.get_output(); + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 ); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + + // Note that output_tensor.k() should match the number of labels. + + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + // The index of the largest output for this sample is the label. + *iter++ = index_of_max(rowm(mat(output_tensor),i)); + } + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1); + DLIB_CASSERT(grad.nr() == 1 && + grad.nc() == 1); + + tt::softmax(grad, output_tensor); + + // The loss we output is the average loss over the mini-batch. + const double scale = 1.0/output_tensor.num_samples(); + double loss = 0; + float* g = grad.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + const long y = (long)*truth++; + // The network must produce a number of outputs that is equal to the number + // of labels when using this type of loss. 
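+                // This is standard softmax cross-entropy: tt::softmax() above turned
+                // g into class probabilities p, the per-sample loss is -log(p[y]) for
+                // true class y, and the gradient with respect to the logits is
+                // p[k] - (k==y ? 1 : 0), all scaled by 1/num_samples below.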
+ DLIB_CASSERT(y < output_tensor.k(), "y: " << y << ", output_tensor.k(): " << output_tensor.k()); + for (long k = 0; k < output_tensor.k(); ++k) + { + const unsigned long idx = i*output_tensor.k()+k; + if (k == y) + { + loss += scale*-safe_log(g[idx]); + g[idx] = scale*(g[idx]-1); + } + else + { + g[idx] = scale*g[idx]; + } + } + } + return loss; + } + + friend void serialize(const loss_multiclass_log_& , std::ostream& out) + { + serialize("loss_multiclass_log_", out); + } + + friend void deserialize(loss_multiclass_log_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_multiclass_log_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_multiclass_log_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_multiclass_log_& ) + { + out << "loss_multiclass_log"; + return out; + } + + friend void to_xml(const loss_multiclass_log_& /*item*/, std::ostream& out) + { + out << "<loss_multiclass_log/>"; + } + + }; + + template <typename SUBNET> + using loss_multiclass_log = add_loss_layer<loss_multiclass_log_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_multimulticlass_log_ + { + + public: + + loss_multimulticlass_log_ () = default; + + loss_multimulticlass_log_ ( + const std::map<std::string,std::vector<std::string>>& labels + ) + { + for (auto& l : labels) + { + possible_labels[l.first] = std::make_shared<decltype(l.second)>(l.second); + DLIB_CASSERT(l.second.size() >= 2, "Each classifier must have at least two possible labels."); + + for (size_t i = 0; i < l.second.size(); ++i) + { + label_idx_lookup[l.first][l.second[i]] = i; + ++total_num_labels; + } + } + } + + unsigned long number_of_labels() const { return total_num_labels; } + + unsigned long number_of_classifiers() const { return possible_labels.size(); } + + std::map<std::string,std::vector<std::string>> get_labels ( + ) const + { + std::map<std::string,std::vector<std::string>> info; + for (auto& i : possible_labels) + { + for (auto& label : *i.second) + info[i.first].emplace_back(label); + } + return info; + } + + class classifier_output + { + + public: + classifier_output() = default; + + size_t num_classes() const { return class_probs.size(); } + + double probability_of_class ( + size_t i + ) const + { + DLIB_CASSERT(i < num_classes()); + return class_probs(i); + } + + const std::string& label( + size_t i + ) const + { + DLIB_CASSERT(i < num_classes()); + return (*_labels)[i]; + } + + operator std::string( + ) const + { + DLIB_CASSERT(num_classes() != 0); + return (*_labels)[index_of_max(class_probs)]; + } + + friend std::ostream& operator<< (std::ostream& out, const classifier_output& item) + { + DLIB_ASSERT(item.num_classes() != 0); + out << static_cast<std::string>(item); + return out; + } + + private: + + friend class loss_multimulticlass_log_; + + template <typename EXP> + classifier_output( + const matrix_exp<EXP>& class_probs, + const std::shared_ptr<std::vector<std::string>>& _labels + ) : + class_probs(class_probs), + _labels(_labels) + { + } + + matrix<float,1,0> class_probs; + std::shared_ptr<std::vector<std::string>> _labels; + }; + + typedef std::map<std::string,std::string> training_label_type; + typedef std::map<std::string,classifier_output> output_label_type; + + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter_begin + ) const + { + 
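+            // Each classifier owns a contiguous block of channels in the output
+            // tensor. Below we copy each classifier's block into scratch, softmax
+            // it, and give every sample a classifier_output holding the resulting
+            // per-label probabilities.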
const tensor& output_tensor = sub.get_output(); + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 ); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + DLIB_CASSERT(number_of_labels() != 0, "You must give the loss_multimulticlass_log_'s constructor label data before you can use it!"); + DLIB_CASSERT(output_tensor.k() == (long)number_of_labels(), "The output tensor must have " << number_of_labels() << " channels."); + + + long k_offset = 0; + for (auto& l : possible_labels) + { + auto iter = iter_begin; + const std::string& classifier_name = l.first; + const auto& labels = (*l.second); + scratch.set_size(output_tensor.num_samples(), labels.size()); + tt::copy_tensor(false, scratch, 0, output_tensor, k_offset, labels.size()); + + tt::softmax(scratch, scratch); + + for (long i = 0; i < scratch.num_samples(); ++i) + (*iter++)[classifier_name] = classifier_output(rowm(mat(scratch),i), l.second); + + k_offset += labels.size(); + } + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth_begin, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1); + DLIB_CASSERT(grad.nr() == 1 && + grad.nc() == 1); + DLIB_CASSERT(number_of_labels() != 0, "You must give the loss_multimulticlass_log_'s constructor label data before you can use it!"); + DLIB_CASSERT(output_tensor.k() == (long)number_of_labels(), "The output tensor must have " << number_of_labels() << " channels."); + + // The loss we output is the average loss over the mini-batch. 
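+            // Conceptually this is number_of_classifiers() independent softmax
+            // cross-entropy losses, one per classifier, each computed over that
+            // classifier's own slice of channels and summed into one loss value.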
+ const double scale = 1.0/output_tensor.num_samples(); + double loss = 0; + long k_offset = 0; + for (auto& l : label_idx_lookup) + { + const std::string& classifier_name = l.first; + const auto& int_labels = l.second; + scratch.set_size(output_tensor.num_samples(), int_labels.size()); + tt::copy_tensor(false, scratch, 0, output_tensor, k_offset, int_labels.size()); + + tt::softmax(scratch, scratch); + + + auto truth = truth_begin; + float* g = scratch.host(); + for (long i = 0; i < scratch.num_samples(); ++i) + { + const long y = int_labels.at(truth->at(classifier_name)); + ++truth; + + for (long k = 0; k < scratch.k(); ++k) + { + const unsigned long idx = i*scratch.k()+k; + if (k == y) + { + loss += scale*-std::log(g[idx]); + g[idx] = scale*(g[idx]-1); + } + else + { + g[idx] = scale*g[idx]; + } + } + } + + tt::copy_tensor(false, grad, k_offset, scratch, 0, int_labels.size()); + + k_offset += int_labels.size(); + } + return loss; + } + + + friend void serialize(const loss_multimulticlass_log_& item, std::ostream& out) + { + serialize("loss_multimulticlass_log_", out); + serialize(item.get_labels(), out); + } + + friend void deserialize(loss_multimulticlass_log_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_multimulticlass_log_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_multimulticlass_log_."); + + std::map<std::string,std::vector<std::string>> info; + deserialize(info, in); + item = loss_multimulticlass_log_(info); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_multimulticlass_log_& item) + { + out << "loss_multimulticlass_log, labels={"; + for (auto i = item.possible_labels.begin(); i != item.possible_labels.end(); ) + { + auto& category = i->first; + auto& labels = *(i->second); + out << category << ":("; + for (size_t j = 0; j < labels.size(); ++j) + { + out << labels[j]; + if (j+1 < labels.size()) + out << ","; + } + + out << ")"; + if (++i != item.possible_labels.end()) + out << ", "; + } + out << "}"; + return out; + } + + friend void to_xml(const loss_multimulticlass_log_& item, std::ostream& out) + { + out << "<loss_multimulticlass_log>\n"; + out << item; + out << "\n</loss_multimulticlass_log>"; + } + + private: + + std::map<std::string,std::shared_ptr<std::vector<std::string>>> possible_labels; + unsigned long total_num_labels = 0; + + // We make it true that: possible_labels[classifier][label_idx_lookup[classifier][label]] == label + std::map<std::string, std::map<std::string,long>> label_idx_lookup; + + + // Scratch doesn't logically contribute to the state of this object. It's just + // temporary scratch space used by this class. 
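+        // It is marked mutable so the const methods to_label() and
+        // compute_loss_value_and_gradient() can reuse the buffer across calls
+        // instead of reallocating it every time.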
+ mutable resizable_tensor scratch; + + + }; + + template <typename SUBNET> + using loss_multimulticlass_log = add_loss_layer<loss_multimulticlass_log_, SUBNET>; + + inline bool operator== (const std::string& lhs, const loss_multimulticlass_log_::classifier_output& rhs) + { return lhs == static_cast<const std::string&>(rhs); } + inline bool operator== (const loss_multimulticlass_log_::classifier_output& lhs, const std::string& rhs) + { return rhs == static_cast<const std::string&>(lhs); } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + enum class use_image_pyramid : uint8_t + { + no, + yes + }; + + struct mmod_options + { + public: + + struct detector_window_details + { + detector_window_details() = default; + detector_window_details(unsigned long w, unsigned long h) : width(w), height(h) {} + detector_window_details(unsigned long w, unsigned long h, const std::string& l) : width(w), height(h), label(l) {} + + unsigned long width = 0; + unsigned long height = 0; + std::string label; + + friend inline void serialize(const detector_window_details& item, std::ostream& out) + { + int version = 2; + serialize(version, out); + serialize(item.width, out); + serialize(item.height, out); + serialize(item.label, out); + } + + friend inline void deserialize(detector_window_details& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1 && version != 2) + throw serialization_error("Unexpected version found while deserializing dlib::mmod_options::detector_window_details"); + deserialize(item.width, in); + deserialize(item.height, in); + if (version == 2) + deserialize(item.label, in); + } + + }; + + mmod_options() = default; + + std::vector<detector_window_details> detector_windows; + double loss_per_false_alarm = 1; + double loss_per_missed_target = 1; + double truth_match_iou_threshold = 0.5; + test_box_overlap overlaps_nms = test_box_overlap(0.4); + test_box_overlap overlaps_ignore; + + use_image_pyramid assume_image_pyramid = use_image_pyramid::yes; + + mmod_options ( + const std::vector<std::vector<mmod_rect>>& boxes, + const unsigned long target_size, // We want the length of the longest dimension of the detector window to be this. + const unsigned long min_target_size, // But we require that the smallest dimension of the detector window be at least this big. + const double min_detector_window_overlap_iou = 0.75 + ) + { + DLIB_CASSERT(0 < min_target_size && min_target_size <= target_size); + DLIB_CASSERT(0.5 < min_detector_window_overlap_iou && min_detector_window_overlap_iou < 1); + + // Figure out what detector windows we will need. 
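+            // For each label we find a small set of aspect ratios that covers the
+            // training boxes, then size a window for each ratio so its longest side
+            // is target_size, growing the window if its short side would otherwise
+            // drop below min_target_size. E.g. with target_size=40 and
+            // min_target_size=10, an aspect ratio of 0.5 yields a 20x40 window,
+            // while a ratio of 0.2 gets bumped up to a 10x50 window.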
+ for (auto& label : get_labels(boxes)) + { + for (auto ratio : find_covering_aspect_ratios(boxes, test_box_overlap(min_detector_window_overlap_iou), label)) + { + double detector_width; + double detector_height; + if (ratio < 1) + { + detector_height = target_size; + detector_width = ratio*target_size; + if (detector_width < min_target_size) + { + detector_height = min_target_size/ratio; + detector_width = min_target_size; + } + } + else + { + detector_width = target_size; + detector_height = target_size/ratio; + if (detector_height < min_target_size) + { + detector_width = min_target_size*ratio; + detector_height = min_target_size; + } + } + + detector_window_details p((unsigned long)std::round(detector_width), (unsigned long)std::round(detector_height), label); + detector_windows.push_back(p); + } + } + + DLIB_CASSERT(detector_windows.size() != 0, "You can't call mmod_options's constructor with a set of boxes that is empty (or only contains ignored boxes)."); + + set_overlap_nms(boxes); + } + + mmod_options( + use_image_pyramid assume_image_pyramid, + const std::vector<std::vector<mmod_rect>>& boxes, + const double min_detector_window_overlap_iou = 0.75 + ) + : assume_image_pyramid(assume_image_pyramid) + { + DLIB_CASSERT(assume_image_pyramid == use_image_pyramid::no); + DLIB_CASSERT(0.5 < min_detector_window_overlap_iou && min_detector_window_overlap_iou < 1); + + // Figure out what detector windows we will need. + for (auto& label : get_labels(boxes)) + { + for (auto rectangle : find_covering_rectangles(boxes, test_box_overlap(min_detector_window_overlap_iou), label)) + { + detector_windows.push_back(detector_window_details(rectangle.width(), rectangle.height(), label)); + } + } + + DLIB_CASSERT(detector_windows.size() != 0, "You can't call mmod_options's constructor with a set of boxes that is empty (or only contains ignored boxes)."); + + set_overlap_nms(boxes); + } + + private: + + void set_overlap_nms(const std::vector<std::vector<mmod_rect>>& boxes) + { + // Convert from mmod_rect to rectangle so we can call + // find_tight_overlap_tester(). + std::vector<std::vector<rectangle>> temp; + for (auto&& bi : boxes) + { + std::vector<rectangle> rtemp; + for (auto&& b : bi) + { + if (b.ignore) + continue; + rtemp.push_back(b.rect); + } + temp.push_back(std::move(rtemp)); + } + overlaps_nms = find_tight_overlap_tester(temp); + // Relax the non-max-suppression a little so that it doesn't accidentally make + // it impossible for the detector to output boxes matching the training data. + // This could be a problem with the tightest possible nms test since there is + // some small variability in how boxes get positioned between the training data + // and the coordinate system used by the detector when it runs. So relaxing it + // here takes care of that. 
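+            // advance_toward_1() nudges each threshold 10% of the way toward 1, so
+            // e.g. an IoU threshold of 0.40 is relaxed to 0.46, and 0.90 to 0.91.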
+            auto iou_thresh             = advance_toward_1(overlaps_nms.get_iou_thresh());
+            auto percent_covered_thresh = advance_toward_1(overlaps_nms.get_percent_covered_thresh());
+            overlaps_nms = test_box_overlap(iou_thresh, percent_covered_thresh);
+        }
+
+        static double advance_toward_1 (
+            double val
+        )
+        {
+            if (val < 1)
+                val += (1-val)*0.1;
+            return val;
+        }
+
+        static size_t count_overlaps (
+            const std::vector<rectangle>& rects,
+            const test_box_overlap& overlaps,
+            const rectangle& ref_box
+        )
+        {
+            size_t cnt = 0;
+            for (auto& b : rects)
+            {
+                if (overlaps(b, ref_box))
+                    ++cnt;
+            }
+            return cnt;
+        }
+
+        static std::vector<rectangle> find_rectangles_overlapping_all_others (
+            std::vector<rectangle> rects,
+            const test_box_overlap& overlaps
+        )
+        {
+            std::vector<rectangle> exemplars;
+            dlib::rand rnd;
+
+            while(rects.size() > 0)
+            {
+                // Pick boxes at random and see if they overlap a lot of other boxes. We will try
+                // 500 different boxes each iteration and select whichever hits the most others to
+                // add to our exemplar set.
+                rectangle best_ref_box;
+                size_t best_cnt = 0;
+                for (int iter = 0; iter < 500; ++iter)
+                {
+                    rectangle ref_box = rects[rnd.get_random_64bit_number()%rects.size()];
+                    size_t cnt = count_overlaps(rects, overlaps, ref_box);
+                    if (cnt >= best_cnt)
+                    {
+                        best_cnt = cnt;
+                        best_ref_box = ref_box;
+                    }
+                }
+
+                // Now remove all the boxes the new ref box hit, so we don't consider them again.
+                for (size_t i = 0; i < rects.size(); ++i)
+                {
+                    if (overlaps(rects[i], best_ref_box))
+                    {
+                        // remove box from rects so we don't hit it again later
+                        swap(rects[i], rects.back());
+                        rects.pop_back();
+                        --i;
+                    }
+                }
+
+                exemplars.push_back(best_ref_box);
+            }
+
+            return exemplars;
+        }
+
+        static std::set<std::string> get_labels (
+            const std::vector<std::vector<mmod_rect>>& rects
+        )
+        {
+            std::set<std::string> labels;
+            for (auto& rr : rects)
+            {
+                for (auto& r : rr)
+                    labels.insert(r.label);
+            }
+            return labels;
+        }
+
+        static std::vector<double> find_covering_aspect_ratios (
+            const std::vector<std::vector<mmod_rect>>& rects,
+            const test_box_overlap& overlaps,
+            const std::string& label
+        )
+        {
+            std::vector<rectangle> boxes;
+            // Make sure all the boxes have the same size and position, so that the only thing our
+            // checks for overlap will care about is aspect ratio (i.e. scale and x,y position are
+            // ignored).
+            for (auto& bb : rects)
+            {
+                for (auto&& b : bb)
+                {
+                    if (!b.ignore && b.label == label)
+                        boxes.push_back(move_rect(set_rect_area(b.rect,400*400), point(0,0)));
+                }
+            }
+
+            std::vector<double> ratios;
+            for (auto r : find_rectangles_overlapping_all_others(boxes, overlaps))
+                ratios.push_back(r.width()/(double)r.height());
+            return ratios;
+        }
+
+        static std::vector<dlib::rectangle> find_covering_rectangles (
+            const std::vector<std::vector<mmod_rect>>& rects,
+            const test_box_overlap& overlaps,
+            const std::string& label
+        )
+        {
+            std::vector<rectangle> boxes;
+            // Make sure all the boxes have the same position, so that we only check for
+            // width and height.
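+            // (rectangle(w,h) anchors the box at the origin, so position drops out
+            // of the overlap test and only the shapes get compared.)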
+ for (auto& bb : rects) + { + for (auto&& b : bb) + { + if (!b.ignore && b.label == label) + boxes.push_back(rectangle(b.rect.width(), b.rect.height())); + } + } + + return find_rectangles_overlapping_all_others(boxes, overlaps); + } + }; + + inline void serialize(const mmod_options& item, std::ostream& out) + { + int version = 3; + + serialize(version, out); + serialize(item.detector_windows, out); + serialize(item.loss_per_false_alarm, out); + serialize(item.loss_per_missed_target, out); + serialize(item.truth_match_iou_threshold, out); + serialize(item.overlaps_nms, out); + serialize(item.overlaps_ignore, out); + serialize(static_cast<uint8_t>(item.assume_image_pyramid), out); + } + + inline void deserialize(mmod_options& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 3 && version != 2 && version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::mmod_options"); + if (version == 1) + { + unsigned long width; + unsigned long height; + deserialize(width, in); + deserialize(height, in); + item.detector_windows = {mmod_options::detector_window_details(width, height)}; + } + else + { + deserialize(item.detector_windows, in); + } + deserialize(item.loss_per_false_alarm, in); + deserialize(item.loss_per_missed_target, in); + deserialize(item.truth_match_iou_threshold, in); + deserialize(item.overlaps_nms, in); + deserialize(item.overlaps_ignore, in); + item.assume_image_pyramid = use_image_pyramid::yes; + if (version >= 3) + { + uint8_t assume_image_pyramid = 0; + deserialize(assume_image_pyramid, in); + item.assume_image_pyramid = static_cast<use_image_pyramid>(assume_image_pyramid); + } + } + +// ---------------------------------------------------------------------------------------- + + class loss_mmod_ + { + struct intermediate_detection + { + intermediate_detection() = default; + + intermediate_detection( + rectangle rect_ + ) : rect(rect_) {} + + intermediate_detection( + rectangle rect_, + double detection_confidence_, + size_t tensor_offset_, + long channel + ) : rect(rect_), detection_confidence(detection_confidence_), tensor_offset(tensor_offset_), tensor_channel(channel) {} + + rectangle rect; + double detection_confidence = 0; + size_t tensor_offset = 0; + long tensor_channel = 0; + + bool operator<(const intermediate_detection& item) const { return detection_confidence < item.detection_confidence; } + }; + + public: + + typedef std::vector<mmod_rect> training_label_type; + typedef std::vector<mmod_rect> output_label_type; + + loss_mmod_() {} + + loss_mmod_(mmod_options options_) : options(options_) {} + + const mmod_options& get_options ( + ) const { return options; } + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter, + double adjust_threshold = 0 + ) const + { + const tensor& output_tensor = sub.get_output(); + DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(sub.sample_expansion_factor() == 1, sub.sample_expansion_factor()); + + std::vector<intermediate_detection> dets_accum; + output_label_type final_dets; + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + tensor_to_dets(input_tensor, output_tensor, i, dets_accum, adjust_threshold, sub); + + // Do non-max suppression + final_dets.clear(); + for (unsigned long i = 0; i < dets_accum.size(); ++i) + { + if 
(overlaps_any_box_nms(final_dets, dets_accum[i].rect)) + continue; + + final_dets.push_back(mmod_rect(dets_accum[i].rect, + dets_accum[i].detection_confidence, + options.detector_windows[dets_accum[i].tensor_channel].label)); + } + + *iter++ = std::move(final_dets); + } + } + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size()); + + double det_thresh_speed_adjust = 0; + + + // we will scale the loss so that it doesn't get really huge + const double scale = 1.0/output_tensor.size(); + double loss = 0; + + float* g = grad.host_write_only(); + for (size_t i = 0; i < grad.size(); ++i) + g[i] = 0; + + const float* out_data = output_tensor.host(); + + std::vector<size_t> truth_idxs; truth_idxs.reserve(truth->size()); + std::vector<intermediate_detection> dets; + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + tensor_to_dets(input_tensor, output_tensor, i, dets, -options.loss_per_false_alarm + det_thresh_speed_adjust, sub); + + const unsigned long max_num_dets = 50 + truth->size()*5; + // Prevent calls to tensor_to_dets() from running for a really long time + // due to the production of an obscene number of detections. + const unsigned long max_num_initial_dets = max_num_dets*100; + if (dets.size() >= max_num_initial_dets) + { + det_thresh_speed_adjust = std::max(det_thresh_speed_adjust,dets[max_num_initial_dets].detection_confidence + options.loss_per_false_alarm); + } + + + // The loss will measure the number of incorrect detections. A detection is + // incorrect if it doesn't hit a truth rectangle or if it is a duplicate detection + // on a truth rectangle. + loss += truth->size()*options.loss_per_missed_target; + for (auto&& x : *truth) + { + if (!x.ignore) + { + size_t k; + point p; + if(image_rect_to_feat_coord(p, input_tensor, x, x.label, sub, k, options.assume_image_pyramid)) + { + // Ignore boxes that can't be detected by the CNN. + loss -= options.loss_per_missed_target; + continue; + } + const size_t idx = (k*output_tensor.nr() + p.y())*output_tensor.nc() + p.x(); + loss -= out_data[idx]; + // compute gradient + g[idx] = -scale; + truth_idxs.push_back(idx); + } + else + { + // This box was ignored so shouldn't have been counted in the loss. + loss -= options.loss_per_missed_target; + truth_idxs.push_back(0); + } + } + + // Measure the loss augmented score for the detections which hit a truth rect. + std::vector<double> truth_score_hits(truth->size(), 0); + + // keep track of which truth boxes we have hit so far. + std::vector<bool> hit_truth_table(truth->size(), false); + + std::vector<intermediate_detection> final_dets; + // The point of this loop is to fill out the truth_score_hits array. 
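+                // Every detection that hits a truth box adds its score to that box's
+                // entry, and any hit after the first also adds loss_per_false_alarm
+                // since a duplicate detection counts as a false alarm. These totals
+                // let the next stage decide, per truth box, whether keeping its
+                // detections raises the loss-augmented objective more than treating
+                // the box as missed.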
+ for (unsigned long i = 0; i < dets.size() && final_dets.size() < max_num_dets; ++i) + { + if (overlaps_any_box_nms(final_dets, dets[i].rect)) + continue; + + const auto& det_label = options.detector_windows[dets[i].tensor_channel].label; + + const std::pair<double,unsigned int> hittruth = find_best_match(*truth, dets[i].rect, det_label); + + final_dets.push_back(dets[i].rect); + + const double truth_match = hittruth.first; + // if hit truth rect + if (truth_match > options.truth_match_iou_threshold) + { + // if this is the first time we have seen a detect which hit (*truth)[hittruth.second] + const double score = dets[i].detection_confidence; + if (hit_truth_table[hittruth.second] == false) + { + hit_truth_table[hittruth.second] = true; + truth_score_hits[hittruth.second] += score; + } + else + { + truth_score_hits[hittruth.second] += score + options.loss_per_false_alarm; + } + } + } + + // Check if any of the truth boxes are unobtainable because the NMS is + // killing them. If so, automatically set those unobtainable boxes to + // ignore and print a warning message to the user. + for (size_t i = 0; i < hit_truth_table.size(); ++i) + { + if (!hit_truth_table[i] && !(*truth)[i].ignore) + { + // So we didn't hit this truth box. Is that because there is + // another, different truth box, that overlaps it according to NMS? + const std::pair<double,unsigned int> hittruth = find_best_match(*truth, (*truth)[i], i); + if (hittruth.second == i || (*truth)[hittruth.second].ignore) + continue; + rectangle best_matching_truth_box = (*truth)[hittruth.second]; + if (options.overlaps_nms(best_matching_truth_box, (*truth)[i])) + { + const size_t idx = truth_idxs[i]; + // We are ignoring this box so we shouldn't have counted it in the + // loss in the first place. So we subtract out the loss values we + // added for it in the code above. + loss -= options.loss_per_missed_target-out_data[idx]; + g[idx] = 0; + std::cout << "Warning, ignoring object. We encountered a truth rectangle located at " << (*truth)[i].rect; + std::cout << " that is suppressed by non-max-suppression "; + std::cout << "because it is overlapped by another truth rectangle located at " << best_matching_truth_box + << " (IoU:"<< box_intersection_over_union(best_matching_truth_box,(*truth)[i]) <<", Percent covered:" + << box_percent_covered(best_matching_truth_box,(*truth)[i]) << ")." << std::endl; + } + } + } + + hit_truth_table.assign(hit_truth_table.size(), false); + final_dets.clear(); + + + // Now figure out which detections jointly maximize the loss and detection score sum. We + // need to take into account the fact that allowing a true detection in the output, while + // initially reducing the loss, may allow us to increase the loss later with many duplicate + // detections. 
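+                // Concretely, a truth box's detections are kept only when
+                // truth_score_hits for it exceeds loss_per_missed_target, i.e. only
+                // when the score mass on the box outweighs the penalty for missing
+                // it. Otherwise the maximizing labeling treats the box as missed and
+                // its detections are suppressed.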
+ for (unsigned long i = 0; i < dets.size() && final_dets.size() < max_num_dets; ++i) + { + if (overlaps_any_box_nms(final_dets, dets[i].rect)) + continue; + + const auto& det_label = options.detector_windows[dets[i].tensor_channel].label; + + const std::pair<double,unsigned int> hittruth = find_best_match(*truth, dets[i].rect, det_label); + + const double truth_match = hittruth.first; + if (truth_match > options.truth_match_iou_threshold) + { + if (truth_score_hits[hittruth.second] > options.loss_per_missed_target) + { + if (!hit_truth_table[hittruth.second]) + { + hit_truth_table[hittruth.second] = true; + final_dets.push_back(dets[i]); + loss -= options.loss_per_missed_target; + } + else + { + final_dets.push_back(dets[i]); + loss += options.loss_per_false_alarm; + } + } + } + else if (!overlaps_ignore_box(*truth, dets[i].rect)) + { + // didn't hit anything + final_dets.push_back(dets[i]); + loss += options.loss_per_false_alarm; + } + } + + for (auto&& x : final_dets) + { + loss += out_data[x.tensor_offset]; + g[x.tensor_offset] += scale; + } + + ++truth; + g += output_tensor.k()*output_tensor.nr()*output_tensor.nc(); + out_data += output_tensor.k()*output_tensor.nr()*output_tensor.nc(); + } // END for (long i = 0; i < output_tensor.num_samples(); ++i) + + + // Here we scale the loss so that it's roughly equal to the number of mistakes + // in an image. Note that this scaling is different than the scaling we + // applied to the gradient but it doesn't matter since the loss value isn't + // used to update parameters. It's used only for display and to check if we + // have converged. So it doesn't matter that they are scaled differently and + // this way the loss that is displayed is readily interpretable to the user. + return loss/output_tensor.num_samples(); + } + + + friend void serialize(const loss_mmod_& item, std::ostream& out) + { + serialize("loss_mmod_", out); + serialize(item.options, out); + } + + friend void deserialize(loss_mmod_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_mmod_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_mmod_."); + deserialize(item.options, in); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_mmod_& item) + { + out << "loss_mmod\t ("; + + out << "detector_windows:("; + auto& opts = item.options; + for (size_t i = 0; i < opts.detector_windows.size(); ++i) + { + out << opts.detector_windows[i].width << "x" << opts.detector_windows[i].height; + if (i+1 < opts.detector_windows.size()) + out << ","; + } + out << ")"; + out << ", loss per FA:" << opts.loss_per_false_alarm; + out << ", loss per miss:" << opts.loss_per_missed_target; + out << ", truth match IOU thresh:" << opts.truth_match_iou_threshold; + out << ", overlaps_nms:("<<opts.overlaps_nms.get_iou_thresh()<<","<<opts.overlaps_nms.get_percent_covered_thresh()<<")"; + out << ", overlaps_ignore:("<<opts.overlaps_ignore.get_iou_thresh()<<","<<opts.overlaps_ignore.get_percent_covered_thresh()<<")"; + + out << ")"; + return out; + } + + friend void to_xml(const loss_mmod_& /*item*/, std::ostream& out) + { + // TODO, add options fields + out << "<loss_mmod/>"; + } + + private: + + template <typename net_type> + void tensor_to_dets ( + const tensor& input_tensor, + const tensor& output_tensor, + long i, + std::vector<intermediate_detection>& dets_accum, + double adjust_threshold, + const net_type& net + ) const + { + DLIB_CASSERT(net.sample_expansion_factor() == 
1,net.sample_expansion_factor()); + DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size()); + const float* out_data = output_tensor.host() + output_tensor.k()*output_tensor.nr()*output_tensor.nc()*i; + // scan the final layer and output the positive scoring locations + dets_accum.clear(); + for (long k = 0; k < output_tensor.k(); ++k) + { + for (long r = 0; r < output_tensor.nr(); ++r) + { + for (long c = 0; c < output_tensor.nc(); ++c) + { + double score = out_data[(k*output_tensor.nr() + r)*output_tensor.nc() + c]; + if (score > adjust_threshold) + { + dpoint p = output_tensor_to_input_tensor(net, point(c,r)); + drectangle rect = centered_drect(p, options.detector_windows[k].width, options.detector_windows[k].height); + rect = input_layer(net).tensor_space_to_image_space(input_tensor,rect); + + dets_accum.push_back(intermediate_detection(rect, score, (k*output_tensor.nr() + r)*output_tensor.nc() + c, k)); + } + } + } + } + std::sort(dets_accum.rbegin(), dets_accum.rend()); + } + + size_t find_best_detection_window ( + rectangle rect, + const std::string& label, + use_image_pyramid assume_image_pyramid + ) const + { + if (assume_image_pyramid == use_image_pyramid::yes) + { + rect = move_rect(set_rect_area(rect, 400*400), point(0,0)); + } + else + { + rect = rectangle(rect.width(), rect.height()); + } + + // Figure out which detection window in options.detector_windows is most similar to rect + // (in terms of aspect ratio, if assume_image_pyramid == use_image_pyramid::yes). + size_t best_i = 0; + double best_ratio_diff = -std::numeric_limits<double>::infinity(); + for (size_t i = 0; i < options.detector_windows.size(); ++i) + { + if (options.detector_windows[i].label != label) + continue; + + rectangle det_window; + + if (options.assume_image_pyramid == use_image_pyramid::yes) + { + det_window = centered_rect(point(0,0), options.detector_windows[i].width, options.detector_windows[i].height); + det_window = move_rect(set_rect_area(det_window, 400*400), point(0,0)); + } + else + { + det_window = rectangle(options.detector_windows[i].width, options.detector_windows[i].height); + } + + double iou = box_intersection_over_union(rect, det_window); + if (iou > best_ratio_diff) + { + best_ratio_diff = iou; + best_i = i; + } + } + return best_i; + } + + template <typename net_type> + bool image_rect_to_feat_coord ( + point& tensor_p, + const tensor& input_tensor, + const rectangle& rect, + const std::string& label, + const net_type& net, + size_t& det_idx, + use_image_pyramid assume_image_pyramid + ) const + { + using namespace std; + if (!input_layer(net).image_contained_point(input_tensor,center(rect))) + { + std::ostringstream sout; + sout << "Encountered a truth rectangle located at " << rect << " that is outside the image." << endl; + sout << "The center of each truth rectangle must be within the image." << endl; + throw impossible_labeling_error(sout.str()); + } + + det_idx = find_best_detection_window(rect,label,assume_image_pyramid); + + double scale = 1.0; + if (options.assume_image_pyramid == use_image_pyramid::yes) + { + // Compute the scale we need to be at to get from rect to our detection window. + // Note that we compute the scale as the max of two numbers. It doesn't + // actually matter which one we pick, because if they are very different then + // it means the box can't be matched by the sliding window. But picking the + // max causes the right error message to be selected in the logic below. 
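+ // For instance (illustrative numbers only): a 200x100 truth box matched
+ // against a 50x50 detector window gives scale = max(50/200, 50/100) = 0.5,
+ // meaning the box is matchable at the pyramid level that halves the image.
+ // The IoU test further below then decides whether the aspect ratios agree
+ // well enough for the match to be usable.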
+ scale = std::max(options.detector_windows[det_idx].width/(double)rect.width(), options.detector_windows[det_idx].height/(double)rect.height());
+ }
+ else
+ {
+ // We don't want invariance to scale.
+ scale = 1.0;
+ }
+
+ const rectangle mapped_rect = input_layer(net).image_space_to_tensor_space(input_tensor, std::min(1.0,scale), rect);
+
+ // compute the detection window that we would use at this position.
+ tensor_p = center(mapped_rect);
+ rectangle det_window = centered_rect(tensor_p, options.detector_windows[det_idx].width,options.detector_windows[det_idx].height);
+ det_window = input_layer(net).tensor_space_to_image_space(input_tensor, det_window);
+
+ // make sure the rect can actually be represented by the image pyramid we are
+ // using.
+ if (box_intersection_over_union(rect, det_window) <= options.truth_match_iou_threshold)
+ {
+ std::cout << "Warning, ignoring object. We encountered a truth rectangle with a width and height of " << rect.width() << " and " << rect.height() << ". ";
+ std::cout << "The image pyramid and sliding windows can't output a rectangle of this shape. ";
+ const double detector_area = options.detector_windows[det_idx].width*options.detector_windows[det_idx].height;
+ if (mapped_rect.area()/detector_area <= options.truth_match_iou_threshold)
+ {
+ std::cout << "This is because the rectangle is smaller than the best matching detection window, which has a width ";
+ std::cout << "and height of " << options.detector_windows[det_idx].width << " and " << options.detector_windows[det_idx].height << "." << std::endl;
+ }
+ else
+ {
+ std::cout << "This is either because (1) the final layer's features have too large a stride across the image, limiting the possible locations the sliding window can search, ";
+ std::cout << "or (2) because the rectangle's aspect ratio is too different from that of the best matching detection window, ";
+ std::cout << "which has a width and height of " << options.detector_windows[det_idx].width << " and " << options.detector_windows[det_idx].height << "." << std::endl;
+ }
+ return true;
+ }
+
+ // now map through the CNN to the output layer.
+ tensor_p = input_tensor_to_output_tensor(net,tensor_p);
+
+ const tensor& output_tensor = net.get_output();
+ if (!get_rect(output_tensor).contains(tensor_p))
+ {
+ std::cout << "Warning, ignoring object. We encountered a truth rectangle located at " << rect << " that is too close to the edge ";
+ std::cout << "of the image to be captured by the CNN features."
<< std::endl; + return true; + } + + return false; + } + + + bool overlaps_ignore_box ( + const std::vector<mmod_rect>& boxes, + const rectangle& rect + ) const + { + for (auto&& b : boxes) + { + if (b.ignore && options.overlaps_ignore(b, rect)) + return true; + } + return false; + } + + std::pair<double,unsigned int> find_best_match( + const std::vector<mmod_rect>& boxes, + const rectangle& rect, + const std::string& label + ) const + { + double match = 0; + unsigned int best_idx = 0; + for (unsigned long i = 0; i < boxes.size(); ++i) + { + if (boxes[i].ignore || boxes[i].label != label) + continue; + + const double new_match = box_intersection_over_union(rect, boxes[i]); + if (new_match > match) + { + match = new_match; + best_idx = i; + } + } + + return std::make_pair(match,best_idx); + } + + std::pair<double,unsigned int> find_best_match( + const std::vector<mmod_rect>& boxes, + const rectangle& rect, + const size_t excluded_idx + ) const + { + double match = 0; + unsigned int best_idx = 0; + for (unsigned long i = 0; i < boxes.size(); ++i) + { + if (boxes[i].ignore || excluded_idx == i) + continue; + + const double new_match = box_intersection_over_union(rect, boxes[i]); + if (new_match > match) + { + match = new_match; + best_idx = i; + } + } + + return std::make_pair(match,best_idx); + } + + template <typename T> + inline bool overlaps_any_box_nms ( + const std::vector<T>& rects, + const rectangle& rect + ) const + { + for (auto&& r : rects) + { + if (options.overlaps_nms(r.rect, rect)) + return true; + } + return false; + } + + + mmod_options options; + + }; + + template <typename SUBNET> + using loss_mmod = add_loss_layer<loss_mmod_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_metric_ + { + public: + + typedef unsigned long training_label_type; + typedef matrix<float,0,1> output_label_type; + + loss_metric_() = default; + + loss_metric_( + float margin_, + float dist_thresh_ + ) : margin(margin_), dist_thresh(dist_thresh_) + { + DLIB_CASSERT(margin_ > 0); + DLIB_CASSERT(dist_thresh_ > 0); + } + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + const tensor& output_tensor = sub.get_output(); + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1); + + const float* p = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + *iter = mat(p,output_tensor.k(),1); + + ++iter; + p += output_tensor.k(); + } + } + + + float get_margin() const { return margin; } + float get_distance_threshold() const { return dist_thresh; } + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == 
output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1); + DLIB_CASSERT(grad.nr() == 1 && + grad.nc() == 1); + + + + temp.set_size(output_tensor.num_samples(), output_tensor.num_samples()); + grad_mul.copy_size(temp); + + tt::gemm(0, temp, 1, output_tensor, false, output_tensor, true); + + + std::vector<double> temp_threshs; + const float* d = temp.host(); + double loss = 0; + double num_pos_samps = 0.0001; + double num_neg_samps = 0.0001; + for (long r = 0; r < temp.num_samples(); ++r) + { + auto xx = d[r*temp.num_samples() + r]; + const auto x_label = *(truth + r); + for (long c = r+1; c < temp.num_samples(); ++c) + { + const auto y_label = *(truth + c); + if (x_label == y_label) + { + ++num_pos_samps; + } + else + { + ++num_neg_samps; + + // Figure out what distance threshold, when applied to the negative pairs, + // causes there to be an equal number of positive and negative pairs. + auto yy = d[c*temp.num_samples() + c]; + auto xy = d[r*temp.num_samples() + c]; + // compute the distance between x and y samples. + auto d2 = xx + yy - 2*xy; + if (d2 < 0) + d2 = 0; + temp_threshs.push_back(d2); + } + } + } + // The whole objective function is multiplied by this to scale the loss + // relative to the number of things in the mini-batch. + const double scale = 0.5/num_pos_samps; + DLIB_CASSERT(num_pos_samps>=1, "Make sure each mini-batch contains both positive pairs and negative pairs"); + DLIB_CASSERT(num_neg_samps>=1, "Make sure each mini-batch contains both positive pairs and negative pairs"); + + std::sort(temp_threshs.begin(), temp_threshs.end()); + const float neg_thresh = std::sqrt(temp_threshs[std::min(num_pos_samps,num_neg_samps)-1]); + + // loop over all the pairs of training samples and compute the loss and + // gradients. Note that we only use the hardest negative pairs and that in + // particular we pick the number of negative pairs equal to the number of + // positive pairs so everything is balanced. + float* gm = grad_mul.host(); + for (long r = 0; r < temp.num_samples(); ++r) + { + gm[r*temp.num_samples() + r] = 0; + const auto x_label = *(truth + r); + auto xx = d[r*temp.num_samples() + r]; + for (long c = 0; c < temp.num_samples(); ++c) + { + if (r==c) + continue; + const auto y_label = *(truth + c); + auto yy = d[c*temp.num_samples() + c]; + auto xy = d[r*temp.num_samples() + c]; + + // compute the distance between x and y samples. + auto d2 = xx + yy - 2*xy; + if (d2 <= 0) + d2 = 0; + else + d2 = std::sqrt(d2); + + // It should be noted that the derivative of length(x-y) with respect + // to the x vector is the unit vector (x-y)/length(x-y). If you stare + // at the code below long enough you will see that it's just an + // application of this formula. + + if (x_label == y_label) + { + // Things with the same label should have distances < dist_thresh between + // them. If not then we experience non-zero loss. + if (d2 < dist_thresh-margin) + { + gm[r*temp.num_samples() + c] = 0; + } + else + { + loss += scale*(d2 - (dist_thresh-margin)); + gm[r*temp.num_samples() + r] += scale/d2; + gm[r*temp.num_samples() + c] = -scale/d2; + } + } + else + { + // Things with different labels should have distances > dist_thresh between + // them. If not then we experience non-zero loss. 
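+ // Concretely, a negative pair only incurs loss if it is among the
+ // hardest negatives (d2 <= neg_thresh) and violates the margin
+ // (d2 <= dist_thresh+margin), in which case the hinge term
+ // (dist_thresh+margin) - d2 is added, scaled by 0.5/num_pos_samps.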
+ if (d2 > dist_thresh+margin || d2 > neg_thresh) + { + gm[r*temp.num_samples() + c] = 0; + } + else + { + loss += scale*((dist_thresh+margin) - d2); + // don't divide by zero (or a really small number) + d2 = std::max(d2, 0.001f); + gm[r*temp.num_samples() + r] -= scale/d2; + gm[r*temp.num_samples() + c] = scale/d2; + } + } + } + } + + + tt::gemm(0, grad, 1, grad_mul, false, output_tensor, false); + + return loss; + } + + friend void serialize(const loss_metric_& item, std::ostream& out) + { + serialize("loss_metric_2", out); + serialize(item.margin, out); + serialize(item.dist_thresh, out); + } + + friend void deserialize(loss_metric_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version == "loss_metric_") + { + // These values used to be hard coded, so for this version of the metric + // learning loss we just use these values. + item.margin = 0.1; + item.dist_thresh = 0.75; + return; + } + else if (version == "loss_metric_2") + { + deserialize(item.margin, in); + deserialize(item.dist_thresh, in); + } + else + { + throw serialization_error("Unexpected version found while deserializing dlib::loss_metric_. Instead found " + version); + } + } + + friend std::ostream& operator<<(std::ostream& out, const loss_metric_& item ) + { + out << "loss_metric (margin="<<item.margin<<", distance_threshold="<<item.dist_thresh<<")"; + return out; + } + + friend void to_xml(const loss_metric_& item, std::ostream& out) + { + out << "<loss_metric margin='"<<item.margin<<"' distance_threshold='"<<item.dist_thresh<<"'/>"; + } + + private: + float margin = 0.04; + float dist_thresh = 0.6; + + + // These variables are only here to avoid being reallocated over and over in + // compute_loss_value_and_gradient() + mutable resizable_tensor temp, grad_mul; + + }; + + template <typename SUBNET> + using loss_metric = add_loss_layer<loss_metric_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_ranking_ + { + public: + + typedef float training_label_type; // nominally +1/-1 + typedef float output_label_type; // ranking score + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + + const tensor& output_tensor = sub.get_output(); + + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + *iter++ = out_data[i]; + } + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(grad.nr() == 1 && + grad.nc() == 1 && + grad.k() == 1); + + + std::vector<double> 
rel_scores; + std::vector<double> nonrel_scores; + std::vector<long> rel_idx, nonrel_idx; + + const float* out_data = output_tensor.host(); + float* g = grad.host_write_only(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + const float y = *truth++; + if (y > 0) + { + rel_scores.push_back(out_data[i]-y); + rel_idx.push_back(i); + } + else if (y < 0) + { + nonrel_scores.push_back(out_data[i]-y); + nonrel_idx.push_back(i); + } + else + { + g[i] = 0; + } + } + + + std::vector<unsigned long> rel_counts; + std::vector<unsigned long> nonrel_counts; + count_ranking_inversions(rel_scores, nonrel_scores, rel_counts, nonrel_counts); + const unsigned long total_pairs = rel_scores.size()*nonrel_scores.size(); + DLIB_CASSERT(total_pairs > 0, "You can't give a ranking mini-batch that contains only one class. Both classes must be represented."); + const double scale = 1.0/total_pairs; + + + double loss = 0; + for (unsigned long k = 0; k < rel_counts.size(); ++k) + { + loss -= rel_counts[k]*rel_scores[k]; + g[rel_idx[k]] = -1.0*rel_counts[k]*scale; + } + + for (unsigned long k = 0; k < nonrel_counts.size(); ++k) + { + loss += nonrel_counts[k]*nonrel_scores[k]; + g[nonrel_idx[k]] = nonrel_counts[k]*scale; + } + + return loss*scale; + } + + friend void serialize(const loss_ranking_& , std::ostream& out) + { + serialize("loss_ranking_", out); + } + + friend void deserialize(loss_ranking_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_ranking_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_ranking_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_ranking_& ) + { + out << "loss_ranking"; + return out; + } + + friend void to_xml(const loss_ranking_& /*item*/, std::ostream& out) + { + out << "<loss_ranking/>"; + } + + }; + + template <typename SUBNET> + using loss_ranking = add_loss_layer<loss_ranking_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_mean_squared_ + { + public: + + typedef float training_label_type; + typedef float output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + + const tensor& output_tensor = sub.get_output(); + + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + *iter++ = out_data[i]; + } + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(grad.nr() == 1 && + grad.nc() == 1 && + grad.k() == 1); + + 
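+ // The computation below is plain mean squared error: each sample
+ // contributes scale*(y - out)^2 to the loss and the gradient written is
+ // -scale*(y - out). (The factor of 2 from differentiating the square is
+ // omitted; that only rescales the gradient, not the reported loss.)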
// The loss we output is the average loss over the mini-batch. + const double scale = 1.0/output_tensor.num_samples(); + double loss = 0; + float* g = grad.host_write_only(); + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + const float y = *truth++; + const float temp1 = y - out_data[i]; + const float temp2 = scale*temp1; + loss += temp2*temp1; + g[i] = -temp2; + + } + return loss; + } + + friend void serialize(const loss_mean_squared_& , std::ostream& out) + { + serialize("loss_mean_squared_", out); + } + + friend void deserialize(loss_mean_squared_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_mean_squared_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_mean_squared_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_mean_squared_& ) + { + out << "loss_mean_squared"; + return out; + } + + friend void to_xml(const loss_mean_squared_& /*item*/, std::ostream& out) + { + out << "<loss_mean_squared/>"; + } + + }; + + template <typename SUBNET> + using loss_mean_squared = add_loss_layer<loss_mean_squared_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_epsilon_insensitive_ + { + public: + + typedef float training_label_type; + typedef float output_label_type; + + loss_epsilon_insensitive_() = default; + loss_epsilon_insensitive_(double eps) : eps(eps) + { + DLIB_CASSERT(eps >= 0, "You can't set a negative error epsilon."); + } + + double get_epsilon () const { return eps; } + void set_epsilon(double e) + { + DLIB_CASSERT(e >= 0, "You can't set a negative error epsilon."); + eps = e; + } + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + + const tensor& output_tensor = sub.get_output(); + + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + *iter++ = out_data[i]; + } + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1 && + output_tensor.k() == 1); + DLIB_CASSERT(grad.nr() == 1 && + grad.nc() == 1 && + grad.k() == 1); + + // The loss we output is the average loss over the mini-batch. 
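+ // The per-sample loss is max(0, |out - y| - eps): errors inside the
+ // eps-tube cost nothing, while outside it the loss grows linearly with
+ // gradient +/-1 (times the mini-batch scaling below).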
+ const double scale = 1.0/output_tensor.num_samples(); + double loss = 0; + float* g = grad.host_write_only(); + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + const float y = *truth++; + const float err = out_data[i]-y; + if (err > eps) + { + loss += scale*(err-eps); + g[i] = scale; + } + else if (err < -eps) + { + loss += scale*(eps-err); + g[i] = -scale; + } + } + return loss; + } + + friend void serialize(const loss_epsilon_insensitive_& item, std::ostream& out) + { + serialize("loss_epsilon_insensitive_", out); + serialize(item.eps, out); + } + + friend void deserialize(loss_epsilon_insensitive_& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_epsilon_insensitive_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_epsilon_insensitive_."); + deserialize(item.eps, in); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_epsilon_insensitive_& item) + { + out << "loss_epsilon_insensitive epsilon: " << item.eps; + return out; + } + + friend void to_xml(const loss_epsilon_insensitive_& item, std::ostream& out) + { + out << "<loss_epsilon_insensitive_ epsilon='" << item.eps << "'/>"; + } + + private: + double eps = 1; + + }; + + template <typename SUBNET> + using loss_epsilon_insensitive = add_loss_layer<loss_epsilon_insensitive_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_mean_squared_multioutput_ + { + public: + + typedef matrix<float> training_label_type; + typedef matrix<float> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + + const tensor& output_tensor = sub.get_output(); + + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1) + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + *iter++ = mat(out_data, output_tensor.k(), 1); + out_data += output_tensor.k(); + } + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.nr() == 1 && + output_tensor.nc() == 1); + DLIB_CASSERT(grad.nr() == 1 && + grad.nc() == 1); + DLIB_CASSERT(grad.k() == output_tensor.k()); + const long k = output_tensor.k(); + for (long idx = 0; idx < output_tensor.num_samples(); ++idx) + { + const_label_iterator truth_matrix_ptr = (truth + idx); + DLIB_CASSERT((*truth_matrix_ptr).nr() == k && + (*truth_matrix_ptr).nc() == 1); + } + + // The loss we output is the average loss over the mini-batch. 
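+ // This is the same squared-error computation as loss_mean_squared_,
+ // except that it is summed over all k outputs of each sample, with each
+ // truth label given as a k x 1 matrix.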
+ const double scale = 1.0/output_tensor.num_samples(); + double loss = 0; + float* g = grad.host_write_only(); + const float* out_data = output_tensor.host(); + matrix<float> ytrue; + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + ytrue = *truth++; + for (long j = 0; j < output_tensor.k(); ++j) + { + const float y = ytrue(j, 0); + const float temp1 = y - *out_data++; + const float temp2 = scale*temp1; + loss += temp2*temp1; + *g = -temp2; + ++g; + } + + } + return loss; + } + + friend void serialize(const loss_mean_squared_multioutput_& , std::ostream& out) + { + serialize("loss_mean_squared_multioutput_", out); + } + + friend void deserialize(loss_mean_squared_multioutput_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_mean_squared_multioutput_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_mean_squared_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_mean_squared_multioutput_& ) + { + out << "loss_mean_squared_multioutput"; + return out; + } + + friend void to_xml(const loss_mean_squared_multioutput_& /*item*/, std::ostream& out) + { + out << "<loss_mean_squared_multioutput/>"; + } + + }; + + template <typename SUBNET> + using loss_mean_squared_multioutput = add_loss_layer<loss_mean_squared_multioutput_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_multiclass_log_per_pixel_ + { + public: + + // In semantic segmentation, if you don't know the ground-truth of some pixel, + // set the label of that pixel to this value. When you do so, the pixel will be + // ignored when computing gradients. + static const uint16_t label_to_ignore = std::numeric_limits<uint16_t>::max(); + + + // In semantic segmentation, 65535 classes ought to be enough for anybody. + typedef matrix<uint16_t> training_label_type; + typedef matrix<uint16_t> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + static void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) + { + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + + const tensor& output_tensor = sub.get_output(); + + DLIB_CASSERT(output_tensor.k() >= 1); // Note that output_tensor.k() should match the number of labels. + DLIB_CASSERT(output_tensor.k() < std::numeric_limits<uint16_t>::max()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const float* const out_data = output_tensor.host(); + + // The index of the largest output for each element is the label. + const auto find_label = [&](long sample, long r, long c) + { + uint16_t label = 0; + float max_value = out_data[tensor_index(output_tensor, sample, 0, r, c)]; + for (long k = 1; k < output_tensor.k(); ++k) + { + const float value = out_data[tensor_index(output_tensor, sample, k, r, c)]; + if (value > max_value) + { + label = static_cast<uint16_t>(k); + max_value = value; + } + } + return label; + }; + + for (long i = 0; i < output_tensor.num_samples(); ++i, ++iter) + { + iter->set_size(output_tensor.nr(), output_tensor.nc()); + for (long r = 0; r < output_tensor.nr(); ++r) + { + for (long c = 0; c < output_tensor.nc(); ++c) + { + // The index of the largest output for this element is the label. 
+ iter->operator()(r, c) = find_label(i, r, c); + } + } + } + } + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.k() >= 1); + DLIB_CASSERT(output_tensor.k() < std::numeric_limits<uint16_t>::max()); + DLIB_CASSERT(output_tensor.nr() == grad.nr() && + output_tensor.nc() == grad.nc() && + output_tensor.k() == grad.k()); + for (long idx = 0; idx < output_tensor.num_samples(); ++idx) + { + const_label_iterator truth_matrix_ptr = (truth + idx); + DLIB_CASSERT(truth_matrix_ptr->nr() == output_tensor.nr() && + truth_matrix_ptr->nc() == output_tensor.nc(), + "truth size = " << truth_matrix_ptr->nr() << " x " << truth_matrix_ptr->nc() << ", " + "output size = " << output_tensor.nr() << " x " << output_tensor.nc()); + } + + tt::softmax(grad, output_tensor); + + // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output. + const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc()); + double loss = 0; + float* const g = grad.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth) + { + for (long r = 0; r < output_tensor.nr(); ++r) + { + for (long c = 0; c < output_tensor.nc(); ++c) + { + const uint16_t y = truth->operator()(r, c); + // The network must produce a number of outputs that is equal to the number + // of labels when using this type of loss. 
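+ // (That requirement is enforced by the assert just below.) After the
+ // softmax above, g holds the class probabilities p_k for each pixel;
+ // each labeled pixel then contributes -log(p_y) to the loss, and the
+ // gradient takes the usual softmax cross-entropy form p_k - [k == y],
+ // multiplied by scale.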
+ DLIB_CASSERT(static_cast<long>(y) < output_tensor.k() || y == label_to_ignore, + "y: " << y << ", output_tensor.k(): " << output_tensor.k()); + for (long k = 0; k < output_tensor.k(); ++k) + { + const size_t idx = tensor_index(output_tensor, i, k, r, c); + if (k == y) + { + loss += scale*-safe_log(g[idx]); + g[idx] = scale*(g[idx] - 1); + } + else if (y == label_to_ignore) + { + g[idx] = 0.f; + } + else + { + g[idx] = scale*g[idx]; + } + } + } + } + } + return loss; + } + + friend void serialize(const loss_multiclass_log_per_pixel_& , std::ostream& out) + { + serialize("loss_multiclass_log_per_pixel_", out); + } + + friend void deserialize(loss_multiclass_log_per_pixel_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_multiclass_log_per_pixel_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_multiclass_log_per_pixel_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_multiclass_log_per_pixel_& ) + { + out << "loss_multiclass_log_per_pixel"; + return out; + } + + friend void to_xml(const loss_multiclass_log_per_pixel_& /*item*/, std::ostream& out) + { + out << "<loss_multiclass_log_per_pixel/>"; + } + + private: + static size_t tensor_index(const tensor& t, long sample, long k, long row, long column) + { + // See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38 + return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column; + } + + }; + + template <typename SUBNET> + using loss_multiclass_log_per_pixel = add_loss_layer<loss_multiclass_log_per_pixel_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_multiclass_log_per_pixel_weighted_ + { + public: + + struct weighted_label + { + weighted_label() + {} + + weighted_label(uint16_t label, float weight = 1.f) + : label(label), weight(weight) + {} + + // In semantic segmentation, 65536 classes ought to be enough for anybody. 
+ uint16_t label = 0; + float weight = 1.f; + }; + + typedef matrix<weighted_label> training_label_type; + typedef matrix<uint16_t> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + static void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) + { + loss_multiclass_log_per_pixel_::to_label(input_tensor, sub, iter); + } + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.k() >= 1); + DLIB_CASSERT(output_tensor.k() < std::numeric_limits<uint16_t>::max()); + DLIB_CASSERT(output_tensor.nr() == grad.nr() && + output_tensor.nc() == grad.nc() && + output_tensor.k() == grad.k()); + for (long idx = 0; idx < output_tensor.num_samples(); ++idx) + { + const_label_iterator truth_matrix_ptr = (truth + idx); + DLIB_CASSERT(truth_matrix_ptr->nr() == output_tensor.nr() && + truth_matrix_ptr->nc() == output_tensor.nc(), + "truth size = " << truth_matrix_ptr->nr() << " x " << truth_matrix_ptr->nc() << ", " + "output size = " << output_tensor.nr() << " x " << output_tensor.nc()); + } + + tt::softmax(grad, output_tensor); + + // The loss we output is the weighted average loss over the mini-batch, and also over each element of the matrix output. + const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc()); + double loss = 0; + float* const g = grad.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth) + { + for (long r = 0; r < output_tensor.nr(); ++r) + { + for (long c = 0; c < output_tensor.nc(); ++c) + { + const weighted_label& weighted_label = truth->operator()(r, c); + const uint16_t y = weighted_label.label; + const float weight = weighted_label.weight; + // The network must produce a number of outputs that is equal to the number + // of labels when using this type of loss. 
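+ // (Enforced by the assert just below, where a weight of 0 exempts the
+ // pixel.) The computation mirrors loss_multiclass_log_per_pixel_, except
+ // that every pixel's loss and gradient are multiplied by its weight, so
+ // weight == 0 behaves like label_to_ignore in the unweighted version.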
+ DLIB_CASSERT(static_cast<long>(y) < output_tensor.k() || weight == 0.f, + "y: " << y << ", output_tensor.k(): " << output_tensor.k()); + for (long k = 0; k < output_tensor.k(); ++k) + { + const size_t idx = tensor_index(output_tensor, i, k, r, c); + if (k == y) + { + loss += weight*scale*-safe_log(g[idx]); + g[idx] = weight*scale*(g[idx] - 1); + } + else + { + g[idx] = weight*scale*g[idx]; + } + } + } + } + } + return loss; + } + + friend void serialize(const loss_multiclass_log_per_pixel_weighted_& , std::ostream& out) + { + serialize("loss_multiclass_log_per_pixel_weighted_", out); + } + + friend void deserialize(loss_multiclass_log_per_pixel_weighted_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_multiclass_log_per_pixel_weighted_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_multiclass_log_per_pixel_weighted_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_multiclass_log_per_pixel_weighted_& ) + { + out << "loss_multiclass_log_per_pixel_weighted"; + return out; + } + + friend void to_xml(const loss_multiclass_log_per_pixel_weighted_& /*item*/, std::ostream& out) + { + out << "<loss_multiclass_log_per_pixel_weighted/>"; + } + + private: + static size_t tensor_index(const tensor& t, long sample, long k, long row, long column) + { + // See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38 + return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column; + } + + }; + + template <typename SUBNET> + using loss_multiclass_log_per_pixel_weighted = add_loss_layer<loss_multiclass_log_per_pixel_weighted_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_mean_squared_per_pixel_ + { + public: + + typedef matrix<float> training_label_type; + typedef matrix<float> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + + const tensor& output_tensor = sub.get_output(); + + DLIB_CASSERT(output_tensor.k() == 1, "output k = " << output_tensor.k()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i, ++iter) + { + iter->set_size(output_tensor.nr(), output_tensor.nc()); + for (long r = 0; r < output_tensor.nr(); ++r) + { + for (long c = 0; c < output_tensor.nc(); ++c) + { + iter->operator()(r, c) = out_data[tensor_index(output_tensor, i, 0, r, c)]; + } + } + } + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples() % sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + DLIB_CASSERT(output_tensor.k() >= 1); + DLIB_CASSERT(output_tensor.k() < std::numeric_limits<uint16_t>::max()); + DLIB_CASSERT(output_tensor.nr() == grad.nr() && + 
output_tensor.nc() == grad.nc() && + output_tensor.k() == grad.k()); + for (long idx = 0; idx < output_tensor.num_samples(); ++idx) + { + const_label_iterator truth_matrix_ptr = (truth + idx); + DLIB_CASSERT(truth_matrix_ptr->nr() == output_tensor.nr() && + truth_matrix_ptr->nc() == output_tensor.nc(), + "truth size = " << truth_matrix_ptr->nr() << " x " << truth_matrix_ptr->nc() << ", " + "output size = " << output_tensor.nr() << " x " << output_tensor.nc()); + } + + // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output. + const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc()); + double loss = 0; + float* const g = grad.host(); + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth) + { + for (long r = 0; r < output_tensor.nr(); ++r) + { + for (long c = 0; c < output_tensor.nc(); ++c) + { + const float y = truth->operator()(r, c); + const size_t idx = tensor_index(output_tensor, i, 0, r, c); + const float temp1 = y - out_data[idx]; + const float temp2 = scale*temp1; + loss += temp2*temp1; + g[idx] = -temp2; + } + } + } + return loss; + } + + friend void serialize(const loss_mean_squared_per_pixel_& , std::ostream& out) + { + serialize("loss_mean_squared_per_pixel_", out); + } + + friend void deserialize(loss_mean_squared_per_pixel_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_mean_squared_per_pixel_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_mean_squared_per_pixel_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_mean_squared_per_pixel_& ) + { + out << "loss_mean_squared_per_pixel"; + return out; + } + + friend void to_xml(const loss_mean_squared_per_pixel_& /*item*/, std::ostream& out) + { + out << "<loss_mean_squared_per_pixel/>"; + } + + private: + static size_t tensor_index(const tensor& t, long sample, long k, long row, long column) + { + // See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38 + return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column; + } + }; + + template <typename SUBNET> + using loss_mean_squared_per_pixel = add_loss_layer<loss_mean_squared_per_pixel_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_dot_ + { + public: + + typedef matrix<float,0,1> training_label_type; + typedef matrix<float,0,1> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const + { + const tensor& output_tensor = sub.get_output(); + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + for (long i = 0; i < output_tensor.num_samples(); ++i) + *iter++ = trans(rowm(mat(output_tensor),i)); + } + + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const + { + const tensor& output_tensor = sub.get_output(); + tensor& grad = sub.get_gradient_input(); + + DLIB_CASSERT(sub.sample_expansion_factor() == 1); + 
DLIB_CASSERT(input_tensor.num_samples() != 0); + DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0); + DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples()); + DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples()); + + const long network_output_dims = output_tensor.size()/output_tensor.num_samples(); + + + // The loss we output is the average loss over the mini-batch. + const double scale = 1.0/output_tensor.num_samples(); + double loss = 0; + float* g = grad.host(); + const float* out_data = output_tensor.host(); + for (long i = 0; i < output_tensor.num_samples(); ++i) + { + DLIB_CASSERT(truth->size() == network_output_dims, "The network must output a vector with the same dimensionality as the training labels. " + << "\ntruth->size(): " << truth->size() + << "\nnetwork_output_dims: " << network_output_dims); + + const float* t = &(*truth++)(0); + + for (long j = 0; j < network_output_dims; ++j) + { + g[j] = -t[j]*scale; + loss -= out_data[j]*t[j]; + } + + g += network_output_dims; + out_data += network_output_dims; + } + return loss*scale; + } + + friend void serialize(const loss_dot_& , std::ostream& out) + { + serialize("loss_dot_", out); + } + + friend void deserialize(loss_dot_& , std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "loss_dot_") + throw serialization_error("Unexpected version found while deserializing dlib::loss_dot_."); + } + + friend std::ostream& operator<<(std::ostream& out, const loss_dot_& ) + { + out << "loss_dot"; + return out; + } + + friend void to_xml(const loss_dot_& /*item*/, std::ostream& out) + { + out << "<loss_dot/>"; + } + + }; + + template <typename SUBNET> + using loss_dot = add_loss_layer<loss_dot_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_LOSS_H_ + diff --git a/ml/dlib/dlib/dnn/loss_abstract.h b/ml/dlib/dlib/dnn/loss_abstract.h new file mode 100644 index 000000000..0dd043677 --- /dev/null +++ b/ml/dlib/dlib/dnn/loss_abstract.h @@ -0,0 +1,1542 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_DNn_LOSS_ABSTRACT_H_ +#ifdef DLIB_DNn_LOSS_ABSTRACT_H_ + +#include "core_abstract.h" +#include "../image_processing/full_object_detection_abstract.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class EXAMPLE_LOSS_LAYER_ + { + /*! + WHAT THIS OBJECT REPRESENTS + A loss layer is the final layer in a deep neural network. It computes the + task loss. That is, it computes a number that tells us how well the + network is performing on some task, such as predicting a binary label. + + You can use one of the loss layers that comes with dlib (defined below). + But importantly, you are able to define your own loss layers to suit your + needs. You do this by creating a class that defines an interface matching + the one described by this EXAMPLE_LOSS_LAYER_ class. Note that there is no + dlib::EXAMPLE_LOSS_LAYER_ type. It is shown here purely to document the + interface that a loss layer must implement. + + A loss layer can optionally provide a to_label() method that converts the + output of a network into a user defined type. If to_label() is not + provided then the operator() methods of add_loss_layer will not be + available, but otherwise everything will function as normal. 
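+
+ For a rough sense of the shape of such a class, a minimal supervised
+ loss might be declared like this (a sketch only, not a real dlib type;
+ method bodies elided):
+ class my_loss_
+ {
+ public:
+ typedef float training_label_type;
+ typedef float output_label_type;
+ template <typename SUB_TYPE, typename label_iterator>
+ void to_label(const tensor&, const SUB_TYPE&, label_iterator) const;
+ template <typename const_label_iterator, typename SUBNET>
+ double compute_loss_value_and_gradient(const tensor&, const_label_iterator, SUBNET&) const;
+ };
+ template <typename SUBNET>
+ using my_loss = add_loss_layer<my_loss_, SUBNET>;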
+
+ Finally, note that there are two broad flavors of loss layer, supervised
+ and unsupervised. The EXAMPLE_LOSS_LAYER_ as shown here is a supervised
+ layer. To make an unsupervised loss you simply leave out the
+ training_label_type typedef and the truth iterator argument to
+ compute_loss_value_and_gradient().
+ !*/
+
+ public:
+
+ // In most cases training_label_type and output_label_type will be the same type.
+ typedef whatever_type_you_use_for_training_labels training_label_type;
+ typedef whatever_type_you_use_for_output_labels output_label_type;
+
+ EXAMPLE_LOSS_LAYER_ (
+ );
+ /*!
+ ensures
+ - EXAMPLE_LOSS_LAYER_ objects are default constructible.
+ !*/
+
+ EXAMPLE_LOSS_LAYER_ (
+ const EXAMPLE_LOSS_LAYER_& item
+ );
+ /*!
+ ensures
+ - EXAMPLE_LOSS_LAYER_ objects are copy constructible.
+ !*/
+
+ // Implementing to_label() is optional.
+ template <
+ typename SUB_TYPE,
+ typename label_iterator
+ >
+ void to_label (
+ const tensor& input_tensor,
+ const SUB_TYPE& sub,
+ label_iterator iter
+ ) const;
+ /*!
+ requires
+ - SUBNET implements the SUBNET interface defined at the top of
+ layers_abstract.h.
+ - input_tensor was given as input to the network sub and the outputs are
+ now visible in layer<i>(sub).get_output(), for all valid i.
+ - input_tensor.num_samples() > 0
+ - input_tensor.num_samples()%sub.sample_expansion_factor() == 0.
+ - iter == an iterator pointing to the beginning of a range of
+ input_tensor.num_samples()/sub.sample_expansion_factor() elements. Moreover,
+ they must be output_label_type elements.
+ ensures
+ - Converts the output of the provided network to output_label_type objects and
+ stores the results into the range indicated by iter. In particular, for
+ all valid i, it will be the case that:
+ *(iter+i/sub.sample_expansion_factor()) is populated based on the output of
+ sub and corresponds to the ith sample in input_tensor.
+ !*/
+
+ template <
+ typename const_label_iterator,
+ typename SUBNET
+ >
+ double compute_loss_value_and_gradient (
+ const tensor& input_tensor,
+ const_label_iterator truth,
+ SUBNET& sub
+ ) const;
+ /*!
+ requires
+ - SUBNET implements the SUBNET interface defined at the top of
+ layers_abstract.h.
+ - input_tensor was given as input to the network sub and the outputs are
+ now visible in layer<i>(sub).get_output(), for all valid i.
+ - input_tensor.num_samples() > 0
+ - input_tensor.num_samples()%sub.sample_expansion_factor() == 0.
+ - for all valid i:
+ - layer<i>(sub).get_gradient_input() has the same dimensions as
+ layer<i>(sub).get_output().
+ - layer<i>(sub).get_gradient_input() contains all zeros (i.e.
+ initially, all input gradients are 0).
+ - truth == an iterator pointing to the beginning of a range of
+ input_tensor.num_samples()/sub.sample_expansion_factor() elements. Moreover,
+ they must be training_label_type elements.
+ - for all valid i:
+ - *(truth+i/sub.sample_expansion_factor()) is the label of the ith sample in
+ input_tensor.
+ ensures
+ - This function computes a loss function that describes how well the output
+ of sub matches the expected labels given by truth. Let's write the loss
+ function as L(input_tensor, truth, sub).
+ - Then compute_loss_value_and_gradient() computes the gradient of L() with
+ respect to the outputs in sub.
Specifically, compute_loss_value_and_gradient() + assigns the gradients into sub by performing the following tensor + assignments, for all valid i: + - layer<i>(sub).get_gradient_input() = the gradient of + L(input_tensor,truth,sub) with respect to layer<i>(sub).get_output(). + Note that, since get_gradient_input() is zero initialized, you don't + have to write gradient information to layers that have a zero + loss gradient. + - returns L(input_tensor,truth,sub) + !*/ + }; + + std::ostream& operator<<(std::ostream& out, const EXAMPLE_LOSS_LAYER_& item); + /*! + print a string describing this layer. + !*/ + + void to_xml(const EXAMPLE_LOSS_LAYER_& item, std::ostream& out); + /*! + This function is optional, but required if you want to print your networks with + net_to_xml(). Therefore, to_xml() prints a layer as XML. + !*/ + + void serialize(const EXAMPLE_LOSS_LAYER_& item, std::ostream& out); + void deserialize(EXAMPLE_LOSS_LAYER_& item, std::istream& in); + /*! + provides serialization support + !*/ + + // For each loss layer you define, always define an add_loss_layer template so that + // layers can be easily composed. Moreover, the convention is that the layer class + // ends with an _ while the add_loss_layer template has the same name but without the + // trailing _. + template <typename SUBNET> + using EXAMPLE_LOSS_LAYER = add_loss_layer<EXAMPLE_LOSS_LAYER_, SUBNET>; + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + class loss_binary_hinge_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements the hinge loss, which is + appropriate for binary classification problems. Therefore, the possible + labels when using this loss are +1 and -1. Moreover, it will cause the + network to produce outputs > 0 when predicting a member of the +1 class and + values < 0 otherwise. + !*/ + public: + + typedef float training_label_type; + typedef float output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except + it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().k() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + and the output label is the raw score for each classified object. If the score + is > 0 then the classifier is predicting the +1 class, otherwise it is + predicting the -1 class. + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().k() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + - all values pointed to by truth are +1 or -1. 
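+ Note that, as is standard for a hinge loss, each sample then contributes
+ max(0, 1 - y*score) to the (averaged) loss, where score is the network
+ output for that sample.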
+ !*/ + + }; + + template <typename SUBNET> + using loss_binary_hinge = add_loss_layer<loss_binary_hinge_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_binary_log_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements the log loss, which is + appropriate for binary classification problems. Therefore, the possible + labels when using this loss are +1 and -1. Moreover, it will cause the + network to produce outputs > 0 when predicting a member of the +1 class and + values < 0 otherwise. + + To be more specific, this object contains a sigmoid layer followed by a + cross-entropy layer. + !*/ + public: + + typedef float training_label_type; + typedef float output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except + it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().k() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + and the output label is the raw score for each classified object. If the score + is > 0 then the classifier is predicting the +1 class, otherwise it is + predicting the -1 class. + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().k() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + - all values pointed to by truth are +1 or -1. + !*/ + + }; + + template <typename SUBNET> + using loss_binary_log = add_loss_layer<loss_binary_log_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_multiclass_log_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements the multiclass logistic + regression loss (e.g. negative log-likelihood loss), which is appropriate + for multiclass classification problems. This means that the possible + labels when using this loss are integers >= 0. + + Moreover, if after training you were to replace the loss layer of the + network with a softmax layer, the network outputs would give the + probabilities of each class assignment. That is, if you have K classes + then the network should output tensors with the tensor::k()'th dimension + equal to K. Applying softmax to these K values gives the probabilities of + each class. The index into that K dimensional vector with the highest + probability is the predicted class label. + !*/ + + public: + + typedef unsigned long training_label_type; + typedef unsigned long output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! 
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except + it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + and the output label is the predicted class for each classified object. The number + of possible output classes is sub.get_output().k(). + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + - all values pointed to by truth are < sub.get_output().k() + !*/ + + }; + + template <typename SUBNET> + using loss_multiclass_log = add_loss_layer<loss_multiclass_log_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_multimulticlass_log_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements a collection of + multiclass classifiers. An example will make its use clear. So suppose, + for example, that you want to make something that takes a picture of a + vehicle and answers the following questions: + - What type of vehicle is it? A sedan or a truck? + - What color is it? red, green, blue, gray, or black? + You need two separate multi-class classifiers to do this. One to decide + the type of vehicle, and another to decide the color. The + loss_multimulticlass_log_ allows you to pack these two classifiers into one + neural network. This means that when you use the network to process an + image it will output 2 labels for each image, the type label and the color + label. + + To create a loss_multimulticlass_log_ for the above case you would + construct it as follows: + std::map<std::string,std::vector<std::string>> labels; + labels["type"] = {"sedan", "truck"}; + labels["color"] = {"red", "green", "blue", "gray", "black"}; + loss_multimulticlass_log_ myloss(labels); + Then you could use myloss with a network object and train it to do this + task. More generally, you can use any number of classifiers and labels + when using this object. Finally, each of the classifiers uses a standard + multi-class logistic regression loss. + !*/ + + public: + + loss_multimulticlass_log_( + ); + /*! + ensures + - #number_of_labels() == 0 + - #get_labels().size() == 0 + !*/ + + loss_multimulticlass_log_ ( + const std::map<std::string,std::vector<std::string>>& labels + ); + /*! + requires + - Each vector in labels must contain at least 2 strings. I.e. each + classifier must have at least two possible labels. + ensures + - #number_of_labels() == the total number of strings in all the + std::vectors in labels. + - #number_of_classifiers() == labels.size() + - #get_labels() == labels + !*/ + + unsigned long number_of_labels( + ) const; + /*! + ensures + - returns the total number of labels known to this loss. This is the count of + all the labels in each classifier. + !*/ + + unsigned long number_of_classifiers( + ) const; + /*! 
+ ensures + - returns the number of classifiers defined by this loss. + !*/ + + std::map<std::string,std::vector<std::string>> get_labels ( + ) const; + /*! + ensures + - returns the names of the classifiers and labels used by this loss. In + particular, if the returned object is L then: + - L[CLASS] == the set of labels used by the classifier CLASS. + - L.size() == number_of_classifiers() + - The count of strings in the vectors in L == number_of_labels() + !*/ + + class classifier_output + { + /*! + WHAT THIS OBJECT REPRESENTS + This object stores the predictions from one of the classifiers in + loss_multimulticlass_log_. It allows you to find out the most likely + string label predicted by that classifier, as well as get the class + conditional probability of any of the classes in the classifier. + !*/ + + public: + + classifier_output( + ); + /*! + ensures + - #num_classes() == 0 + !*/ + + size_t num_classes( + ) const; + /*! + ensures + - returns the number of possible classes output by this classifier. + !*/ + + double probability_of_class ( + size_t i + ) const; + /*! + requires + - i < num_classes() + ensures + - returns the probability that the true class has a label of label(i). + - The sum of probability_of_class(j) for j in the range [0, num_classes()) is always 1. + !*/ + + const std::string& label( + size_t i + ) const; + /*! + requires + - i < num_classes() + ensures + - returns the string label for the ith class. + !*/ + + operator std::string( + ) const; + /*! + requires + - num_classes() != 0 + ensures + - returns the string label for the most probable class. + !*/ + + friend std::ostream& operator<< (std::ostream& out, const classifier_output& item); + /*! + requires + - num_classes() != 0 + ensures + - prints the most probable class label to out. + !*/ + + }; + + // Both training_label_type and output_label_type should always have sizes equal to + // number_of_classifiers(). That is, the std::map should have an entry for every + // classifier known to this loss. + typedef std::map<std::string,std::string> training_label_type; + typedef std::map<std::string,classifier_output> output_label_type; + + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except + it has the additional calling requirements that: + - number_of_labels() != 0 + - sub.get_output().k() == number_of_labels() + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - number_of_labels() != 0 + - sub.get_output().k() == number_of_labels() + It should be noted that the last layer in your network should usually + be an fc layer. 
If so, you can satisfy this requirement of k() being + number_of_labels() by calling set_num_outputs() prior to training your + network like so: + your_network.subnet().layer_details().set_num_outputs(your_network.loss_details().number_of_labels()); + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + - All the std::maps pointed to by truth contain entries for all the + classifiers known to this loss. That is, it must be valid to call + truth[i][classifier] for any of the classifiers known to this loss. To + say this another way, all the training samples must contain labels for + each of the classifiers defined by this loss. + + To really belabor this, this also means that truth[i].size() == + get_labels().size() and that both truth[i] and get_labels() have the same + set of key strings. It also means that the value strings in truth[i] + must be strings known to the loss, i.e. they are valid labels according + to get_labels(). + !*/ + }; + + template <typename SUBNET> + using loss_multimulticlass_log = add_loss_layer<loss_multimulticlass_log_, SUBNET>; + + // Allow comparison between classifier_outputs and std::string to check if the + // predicted class is a particular string. + inline bool operator== (const std::string& lhs, const loss_multimulticlass_log_::classifier_output& rhs) + { return lhs == static_cast<const std::string&>(rhs); } + inline bool operator== (const loss_multimulticlass_log_::classifier_output& lhs, const std::string& rhs) + { return rhs == static_cast<const std::string&>(lhs); } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + enum class use_image_pyramid : uint8_t + { + no, + yes + }; + + struct mmod_options + { + /*! + WHAT THIS OBJECT REPRESENTS + This object contains all the parameters that control the behavior of loss_mmod_. + !*/ + + public: + + struct detector_window_details + { + detector_window_details() = default; + detector_window_details(unsigned long w, unsigned long h) : width(w), height(h) {} + detector_window_details(unsigned long w, unsigned long h, const std::string& l) : width(w), height(h), label(l) {} + + unsigned long width = 0; + unsigned long height = 0; + std::string label; + + friend inline void serialize(const detector_window_details& item, std::ostream& out); + friend inline void deserialize(detector_window_details& item, std::istream& in); + }; + + mmod_options() = default; + + // This kind of object detector is a sliding window detector. The detector_windows + // field determines how many sliding windows we will use and what the shape of each + // window is. It also determines the output label applied to each detection + // identified by each window. Since you will usually use the MMOD loss with an + // image pyramid, the detector sizes also determine the size of the smallest object + // you can detect. + std::vector<detector_window_details> detector_windows; + + // These parameters control how we penalize different kinds of mistakes. See + // Max-Margin Object Detection by Davis E. King (http://arxiv.org/abs/1502.00046) + // for further details. + double loss_per_false_alarm = 1; + double loss_per_missed_target = 1; + + // A detection must have an intersection-over-union value greater than this for us + // to consider it a match against a ground truth box. 
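+        // (As a reminder, the intersection-over-union of two boxes A and B is
+        // area(A intersect B)/area(A union B), a value in [0,1].  So the default of
+        // 0.5 below means a detection must cover at least half of the combined area
+        // of itself and a truth box before it can count as a match.)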
+        double truth_match_iou_threshold = 0.5;
+
+        // When doing non-max suppression, we use overlaps_nms to decide if a box overlaps
+        // an already output detection and should therefore be thrown out.
+        test_box_overlap overlaps_nms = test_box_overlap(0.4);
+
+        // Any mmod_rect in the training data that has its ignore field set to true defines
+        // an "ignore zone" in an image.  Any detection from that area is totally ignored
+        // by the optimizer.  Therefore, this overlaps_ignore field defines how we decide
+        // if a box falls into an ignore zone.  You use these ignore zones if there are
+        // objects in your dataset that you are unsure if you want to detect or otherwise
+        // don't care if the detector gets them or not.
+        test_box_overlap overlaps_ignore;
+
+        // Usually the detector would be scale-invariant, and used with an image pyramid.
+        // However, sometimes scale-invariance may not be desired.
+        use_image_pyramid assume_image_pyramid = use_image_pyramid::yes;
+
+        mmod_options (
+            const std::vector<std::vector<mmod_rect>>& boxes,
+            const unsigned long target_size,
+            const unsigned long min_target_size,
+            const double min_detector_window_overlap_iou = 0.75
+        );
+        /*!
+            requires
+                - 0 < min_target_size <= target_size
+                - 0.5 < min_detector_window_overlap_iou < 1
+            ensures
+                - #assume_image_pyramid == use_image_pyramid::yes
+                - This function should be used when scale-invariance is desired, and
+                  input_rgb_image_pyramid is therefore used as the input layer.
+                - This function tries to automatically set the MMOD options to reasonable
+                  values, assuming you have a training dataset of boxes.size() images, where
+                  the ith image contains objects boxes[i] you want to detect.
+                - The most important thing this function does is decide what detector
+                  windows should be used.  This is done by finding a set of detector
+                  windows that are sized such that:
+                    - When slid over an image pyramid, each box in boxes will have an
+                      intersection-over-union with one of the detector windows of at least
+                      min_detector_window_overlap_iou.  That is, we will make sure that
+                      each box in boxes could potentially be detected by one of the
+                      detector windows.  This essentially comes down to picking detector
+                      windows with aspect ratios similar to the aspect ratios in boxes.
+                      Note that we also make sure that each box can be detected by a window
+                      with the same label.  For example, if all the boxes had the same
+                      aspect ratio but there were 4 different labels used in boxes then
+                      there would be 4 resulting detector windows, one for each label.
+                    - The longest edge of each detector window is target_size pixels in
+                      length, unless the window's shortest side would be less than
+                      min_target_size pixels in length.  In this case the shortest side
+                      will be set to min_target_size length, and the other side sized to
+                      preserve the aspect ratio of the window.
+                  This means that target_size and min_target_size control the size of the
+                  detector windows, while the aspect ratios of the detector windows are
+                  automatically determined by the contents of boxes.  It should also be
+                  emphasized that the detector isn't going to be able to detect objects
+                  smaller than any of the detector windows.  So consider that when setting
+                  these sizes.
+                - This function will also set the overlaps_nms tester to the most
+                  restrictive tester that doesn't reject anything in boxes.
+        !*/
+
+        mmod_options (
+            use_image_pyramid assume_image_pyramid,
+            const std::vector<std::vector<mmod_rect>>& boxes,
+            const double min_detector_window_overlap_iou = 0.75
+        );
+        /*!
+            requires
+                - assume_image_pyramid == use_image_pyramid::no
+                - 0.5 < min_detector_window_overlap_iou < 1
+            ensures
+                - This function should be used when scale-invariance is not desired, and
+                  there is no intention to apply an image pyramid.
+                - This function tries to automatically set the MMOD options to reasonable
+                  values, assuming you have a training dataset of boxes.size() images, where
+                  the ith image contains objects boxes[i] you want to detect.
+                - The most important thing this function does is decide what detector
+                  windows should be used.  This is done by finding a set of detector
+                  windows that are sized such that:
+                    - When slid over an image, each box in boxes will have an
+                      intersection-over-union with one of the detector windows of at least
+                      min_detector_window_overlap_iou.  That is, we will make sure that
+                      each box in boxes could potentially be detected by one of the
+                      detector windows.
+                - This function will also set the overlaps_nms tester to the most
+                  restrictive tester that doesn't reject anything in boxes.
+        !*/
+    };
+
+    void serialize(const mmod_options& item, std::ostream& out);
+    void deserialize(mmod_options& item, std::istream& in);
+
+// ----------------------------------------------------------------------------------------
+
+    class loss_mmod_
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object implements the loss layer interface defined above by
+                EXAMPLE_LOSS_LAYER_.  In particular, it implements the Max Margin Object
+                Detection loss defined in the paper:
+                    Max-Margin Object Detection by Davis E. King (http://arxiv.org/abs/1502.00046).
+
+                This means you use this loss if you want to detect the locations of objects
+                in images.
+
+                It should also be noted that this loss layer requires an input layer that
+                defines the following functions:
+                    - image_contained_point()
+                    - tensor_space_to_image_space()
+                    - image_space_to_tensor_space()
+                A reference implementation of them and their definitions can be found in
+                the input_rgb_image_pyramid object, which is the recommended input layer to
+                be used with loss_mmod_.
+        !*/
+
+    public:
+
+        typedef std::vector<mmod_rect> training_label_type;
+        typedef std::vector<mmod_rect> output_label_type;
+
+        loss_mmod_(
+        );
+        /*!
+            ensures
+                - #get_options() == mmod_options()
+        !*/
+
+        loss_mmod_(
+            mmod_options options_
+        );
+        /*!
+            ensures
+                - #get_options() == options_
+        !*/
+
+        const mmod_options& get_options (
+        ) const;
+        /*!
+            ensures
+                - returns the options object that defines the general behavior of this loss layer.
+        !*/
+
+        template <
+            typename SUB_TYPE,
+            typename label_iterator
+            >
+        void to_label (
+            const tensor& input_tensor,
+            const SUB_TYPE& sub,
+            label_iterator iter,
+            double adjust_threshold = 0
+        ) const;
+        /*!
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+            it has the additional calling requirements that:
+                - sub.get_output().k() == 1
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+            Also, the output labels are std::vectors of mmod_rects where, for each mmod_rect R,
+            we have the following interpretations:
+                - R.rect == the location of an object in the image.
+                - R.detection_confidence == the score for the object, the bigger the score
+                  the more confident the detector is that an object is really there.  Only
+                  objects with a detection_confidence > adjust_threshold are output.  So if
+                  you want to output more objects (that are also of less confidence) you
+                  can call to_label() with a smaller value of adjust_threshold.
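+                  For instance, calling to_label() with adjust_threshold == -0.5
+                  would additionally return detections scoring in (-0.5, 0], which
+                  the default threshold of 0 discards.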
+ - R.ignore == false (this value is unused by to_label()). + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().k() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + Also, the loss value returned is roughly equal to the average number of + mistakes made per image. This is the sum of false alarms and missed + detections, weighted by the loss weights for these types of mistakes specified + in the mmod_options. + !*/ + }; + + template <typename SUBNET> + using loss_mmod = add_loss_layer<loss_mmod_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_metric_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it allows you to learn to map objects + into a vector space where objects sharing the same class label are close to + each other, while objects with different labels are far apart. + + To be specific, it optimizes the following loss function which considers + all pairs of objects in a mini-batch and computes a different loss depending + on their respective class labels. So if objects A1 and A2 in a mini-batch + share the same class label then their contribution to the loss is: + max(0, length(A1-A2)-get_distance_threshold() + get_margin()) + + While if A1 and B1 have different class labels then their contribution to + the loss function is: + max(0, get_distance_threshold()-length(A1-B1) + get_margin()) + + Therefore, this loss layer optimizes a version of the hinge loss. + Moreover, the loss is trying to make sure that all objects with the same + label are within get_distance_threshold() distance of each other. + Conversely, if two objects have different labels then they should be more + than get_distance_threshold() distance from each other in the learned + embedding. So this loss function gives you a natural decision boundary for + deciding if two objects are from the same class. + + Finally, the loss balances the number of negative pairs relative to the + number of positive pairs. Therefore, if there are N pairs that share the + same identity in a mini-batch then the algorithm will only include the N + worst non-matching pairs in the loss. That is, the algorithm performs hard + negative mining on the non-matching pairs. This is important since there + are in general way more non-matching pairs than matching pairs. So to + avoid imbalance in the loss this kind of hard negative mining is useful. + !*/ + public: + + typedef unsigned long training_label_type; + typedef matrix<float,0,1> output_label_type; + + loss_metric_( + ); + /*! + ensures + - #get_margin() == 0.04 + - #get_distance_threshold() == 0.6 + !*/ + + loss_metric_( + float margin, + float dist_thresh + ); + /*! + requires + - margin > 0 + - dist_thresh > 0 + ensures + - #get_margin() == margin + - #get_distance_threshold() == dist_thresh + !*/ + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! 
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+            it has the additional calling requirements that:
+                - sub.get_output().nr() == 1
+                - sub.get_output().nc() == 1
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+            This loss expects the network to produce a single vector (per sample) as
+            output.  This vector is the learned embedding.  Therefore, to_label() just
+            copies these output vectors from the network into the output label_iterators
+            given to this function, one for each sample in the input_tensor.
+        !*/
+
+        float get_margin() const;
+        /*!
+            ensures
+                - returns the margin value used by the loss function.  See the discussion
+                  in WHAT THIS OBJECT REPRESENTS for details.
+        !*/
+
+        float get_distance_threshold() const;
+        /*!
+            ensures
+                - returns the distance threshold value used by the loss function.  See the
+                  discussion in WHAT THIS OBJECT REPRESENTS for details.
+        !*/
+
+        template <
+            typename const_label_iterator,
+            typename SUBNET
+            >
+        double compute_loss_value_and_gradient (
+            const tensor& input_tensor,
+            const_label_iterator truth,
+            SUBNET& sub
+        ) const;
+        /*!
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+            except it has the additional calling requirements that:
+                - sub.get_output().nr() == 1
+                - sub.get_output().nc() == 1
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+        !*/
+
+    };
+
+    template <typename SUBNET>
+    using loss_metric = add_loss_layer<loss_metric_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+    class loss_ranking_
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object implements the loss layer interface defined above by
+                EXAMPLE_LOSS_LAYER_.  In particular, it implements the pairwise ranking
+                loss described in the paper:
+                    Optimizing Search Engines using Clickthrough Data by Thorsten Joachims
+
+                This is the same loss function used by the dlib::svm_rank_trainer object.
+                Therefore, it is generally appropriate when you have a two class problem
+                and you want to learn a function that ranks one class before the other.
+
+                So for example, suppose you have two classes of data.  Objects of type A
+                and objects of type B.  Moreover, suppose that you want to sort the objects
+                so that A objects always come before B objects.  This loss will help you
+                learn a function that assigns a real number to each object such that A
+                objects get a larger number assigned to them than B objects.  This lets you
+                then sort the objects according to the output of the neural network and
+                obtain the desired result of having A objects come before B objects.
+
+                The training labels should be positive values for objects you want to get
+                high scores and negative for objects that should get small scores.  So
+                relative to our A/B example, you would give A objects labels of +1 and B
+                objects labels of -1.  This should cause the learned network to give A
+                objects large positive values and B objects negative values.
+
+                Finally, the specific loss function is:
+                    For all pairs of positive vs negative training examples A_i and B_j respectively:
+                        sum_ij: max(0, B_j - A_i + margin_ij)
+                    where margin_ij = the label for A_i minus the label for B_j.  If you
+                always use +1 and -1 labels then the margin is always 2.  However, this
+                formulation allows you to give certain training samples different weight by
+                adjusting the training labels appropriately.
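+
+                For example, with the conventional +1/-1 labels a positive example
+                A_i scoring 0.3 and a negative example B_j scoring -0.1 contribute
+                max(0, -0.1 - 0.3 + 2) = 1.6 to the loss: the pair is already ranked
+                in the correct order, but it remains inside the margin and therefore
+                keeps generating gradient until the score gap exceeds 2.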
+        !*/
+
+    public:
+
+        typedef float training_label_type;
+        typedef float output_label_type;
+
+        template <
+            typename SUB_TYPE,
+            typename label_iterator
+            >
+        void to_label (
+            const tensor& input_tensor,
+            const SUB_TYPE& sub,
+            label_iterator iter
+        ) const;
+        /*!
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+            it has the additional calling requirements that:
+                - sub.get_output().nr() == 1
+                - sub.get_output().nc() == 1
+                - sub.get_output().k() == 1
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+            and the output label is the predicted ranking score.
+        !*/
+
+        template <
+            typename const_label_iterator,
+            typename SUBNET
+            >
+        double compute_loss_value_and_gradient (
+            const tensor& input_tensor,
+            const_label_iterator truth,
+            SUBNET& sub
+        ) const;
+        /*!
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+            except it has the additional calling requirements that:
+                - sub.get_output().nr() == 1
+                - sub.get_output().nc() == 1
+                - sub.get_output().k() == 1
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+        !*/
+
+    };
+
+    template <typename SUBNET>
+    using loss_ranking = add_loss_layer<loss_ranking_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+    class loss_epsilon_insensitive_
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object implements the loss layer interface defined above by
+                EXAMPLE_LOSS_LAYER_.  In particular, it implements the epsilon insensitive
+                loss, which is appropriate for regression problems.  Specifically, this
+                loss function is:
+                    loss(y1,y2) = abs(y1-y2)<epsilon ? 0 : abs(y1-y2)-epsilon
+
+                Therefore, the loss is basically just the abs() loss except there is a dead
+                zone around zero, causing the loss to not care about mistakes of magnitude
+                smaller than epsilon.
+        !*/
+    public:
+
+        typedef float training_label_type;
+        typedef float output_label_type;
+
+        loss_epsilon_insensitive_(
+        ) = default;
+        /*!
+            ensures
+                - #get_epsilon() == 1
+        !*/
+
+        loss_epsilon_insensitive_(
+            double eps
+        );
+        /*!
+            requires
+                - eps >= 0
+            ensures
+                - #get_epsilon() == eps
+        !*/
+
+        double get_epsilon (
+        ) const;
+        /*!
+            ensures
+                - returns the epsilon value used in the loss function.  Mistakes in the
+                  regressor smaller than get_epsilon() are ignored by the loss function.
+        !*/
+
+        void set_epsilon(
+            double eps
+        );
+        /*!
+            requires
+                - eps >= 0
+            ensures
+                - #get_epsilon() == eps
+        !*/
+
+        template <
+            typename SUB_TYPE,
+            typename label_iterator
+            >
+        void to_label (
+            const tensor& input_tensor,
+            const SUB_TYPE& sub,
+            label_iterator iter
+        ) const;
+        /*!
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+            it has the additional calling requirements that:
+                - sub.get_output().nr() == 1
+                - sub.get_output().nc() == 1
+                - sub.get_output().k() == 1
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+            and the output label is the predicted continuous variable.
+        !*/
+
+        template <
+            typename const_label_iterator,
+            typename SUBNET
+            >
+        double compute_loss_value_and_gradient (
+            const tensor& input_tensor,
+            const_label_iterator truth,
+            SUBNET& sub
+        ) const;
+        /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().k() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + !*/ + + }; + + template <typename SUBNET> + using loss_epsilon_insensitive = add_loss_layer<loss_epsilon_insensitive_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_mean_squared_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements the mean squared loss, which is + appropriate for regression problems. + !*/ + public: + + typedef float training_label_type; + typedef float output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except + it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().k() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + and the output label is the predicted continuous variable. + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().k() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + !*/ + + }; + + template <typename SUBNET> + using loss_mean_squared = add_loss_layer<loss_mean_squared_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_mean_squared_multioutput_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements the mean squared loss, + which is appropriate for regression problems. It is basically just like + loss_mean_squared_ except that it lets you define multiple outputs instead + of just 1. + !*/ + public: + + typedef matrix<float> training_label_type; + typedef matrix<float> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except + it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + and the output label is the predicted continuous variable. + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! 
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().nr() == 1 + - sub.get_output().nc() == 1 + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + - (*(truth + idx)).nc() == 1 for all idx such that 0 <= idx < sub.get_output().num_samples() + - (*(truth + idx)).nr() == sub.get_output().k() for all idx such that 0 <= idx < sub.get_output().num_samples() + !*/ + + }; + + template <typename SUBNET> + using loss_mean_squared_multioutput = add_loss_layer<loss_mean_squared_multioutput_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_multiclass_log_per_pixel_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements the multiclass logistic + regression loss (e.g. negative log-likelihood loss), which is appropriate + for multiclass classification problems. It is basically just like + loss_multiclass_log_ except that it lets you define matrix outputs instead + of scalar outputs. It should be useful, for example, in semantic + segmentation where we want to classify each pixel of an image. + !*/ + public: + + // In semantic segmentation, if you don't know the ground-truth of some pixel, + // set the label of that pixel to this value. When you do so, the pixel will be + // ignored when computing gradients. + static const uint16_t label_to_ignore = std::numeric_limits<uint16_t>::max(); + + // In semantic segmentation, 65535 classes ought to be enough for anybody. + typedef matrix<uint16_t> training_label_type; + typedef matrix<uint16_t> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except + it has the additional calling requirements that: + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + and the output label is the predicted class for each classified element. The number + of possible output classes is sub.get_output().k(). + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + - all values pointed to by truth are < sub.get_output().k() or are equal to label_to_ignore. + !*/ + + }; + + template <typename SUBNET> + using loss_multiclass_log_per_pixel = add_loss_layer<loss_multiclass_log_per_pixel_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_multiclass_log_per_pixel_weighted_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements the multiclass logistic + regression loss (e.g. negative log-likelihood loss), which is appropriate + for multiclass classification problems. 
It is basically just like + loss_multiclass_log_per_pixel_ except that it lets you define per-pixel + weights, which may be useful e.g. if you want to emphasize rare classes + while training. (If the classification problem is difficult, a flat weight + structure may lead the network to always predict the most common label, in + particular if the degree of imbalance is high. To emphasize a certain + class or classes, simply increase the weights of the corresponding pixels, + relative to the weights of the other pixels.) + + Note that if you set the weight to 0 whenever a pixel's label is equal to + loss_multiclass_log_per_pixel_::label_to_ignore, and to 1 otherwise, then + you essentially get loss_multiclass_log_per_pixel_ as a special case. + !*/ + public: + + struct weighted_label + { + /*! + WHAT THIS OBJECT REPRESENTS + This object represents the truth label of a single pixel, together with + an associated weight (the higher the weight, the more emphasis the + corresponding pixel is given during the training). + !*/ + + weighted_label(); + weighted_label(uint16_t label, float weight = 1.f); + + // The ground-truth label. In semantic segmentation, 65536 classes ought to be + // enough for anybody. + uint16_t label = 0; + + // The weight of the corresponding pixel. + float weight = 1.f; + }; + + typedef matrix<weighted_label> training_label_type; + typedef matrix<uint16_t> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except + it has the additional calling requirements that: + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + and the output label is the predicted class for each classified element. The number + of possible output classes is sub.get_output().k(). + !*/ + + template < + typename const_label_iterator, + typename SUBNET + > + double compute_loss_value_and_gradient ( + const tensor& input_tensor, + const_label_iterator truth, + SUBNET& sub + ) const; + /*! + This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + - all labels pointed to by truth are < sub.get_output().k(), or the corresponding weight + is zero. + !*/ + + }; + + template <typename SUBNET> + using loss_multiclass_log_per_pixel_weighted = add_loss_layer<loss_multiclass_log_per_pixel_weighted_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + + class loss_mean_squared_per_pixel_ + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the loss layer interface defined above by + EXAMPLE_LOSS_LAYER_. In particular, it implements the mean squared loss, + which is appropriate for regression problems. It is basically just like + loss_mean_squared_multioutput_ except that it lets you define matrix or + image outputs, instead of vector. + !*/ + public: + + typedef matrix<float> training_label_type; + typedef matrix<float> output_label_type; + + template < + typename SUB_TYPE, + typename label_iterator + > + void to_label ( + const tensor& input_tensor, + const SUB_TYPE& sub, + label_iterator iter + ) const; + /*! 
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+            it has the additional calling requirements that:
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+            and the output labels are the predicted continuous variables.
+        !*/
+
+        template <
+            typename const_label_iterator,
+            typename SUBNET
+            >
+        double compute_loss_value_and_gradient (
+            const tensor& input_tensor,
+            const_label_iterator truth,
+            SUBNET& sub
+        ) const;
+        /*!
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
+            except it has the additional calling requirements that:
+                - sub.get_output().k() == 1
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+                - for all idx such that 0 <= idx < sub.get_output().num_samples():
+                    - sub.get_output().nr() == (*(truth + idx)).nr()
+                    - sub.get_output().nc() == (*(truth + idx)).nc()
+        !*/
+    };
+
+    template <typename SUBNET>
+    using loss_mean_squared_per_pixel = add_loss_layer<loss_mean_squared_per_pixel_, SUBNET>;
+
+// ----------------------------------------------------------------------------------------
+
+    class loss_dot_
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object implements the loss layer interface defined above by
+                EXAMPLE_LOSS_LAYER_.  In particular, selecting this loss means you want to
+                maximize the dot product between the output of a network and a set of
+                training vectors.  The loss is therefore the negative dot product.  To be
+                very specific, if X is the output vector of a network and Y is a training
+                label (also a vector), then the loss for this training sample is: -dot(X,Y)
+        !*/
+
+    public:
+
+        typedef matrix<float,0,1> training_label_type;
+        typedef matrix<float,0,1> output_label_type;
+
+        template <
+            typename SUB_TYPE,
+            typename label_iterator
+            >
+        void to_label (
+            const tensor& input_tensor,
+            const SUB_TYPE& sub,
+            label_iterator iter
+        ) const;
+        /*!
+            This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
+            it has the additional calling requirements that:
+                - sub.get_output().num_samples() == input_tensor.num_samples()
+                - sub.sample_expansion_factor() == 1
+            and the output labels are simply the final network outputs stuffed into a
+            vector.  To be very specific, the output is the following for all valid i:
+                *(iter+i) == trans(rowm(mat(sub.get_output()),i))
+        !*/
+
+
+        template <
+            typename const_label_iterator,
+            typename SUBNET
+            >
+        double compute_loss_value_and_gradient (
+            const tensor& input_tensor,
+            const_label_iterator truth,
+            SUBNET& sub
+        ) const;
+        /*!
+ This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient() + except it has the additional calling requirements that: + - sub.get_output().num_samples() == input_tensor.num_samples() + - sub.sample_expansion_factor() == 1 + - Let NETWORK_OUTPUT_DIMS == sub.get_output().size()/sub.get_output().num_samples() + - for all idx such that 0 <= idx < sub.get_output().num_samples(): + - NETWORK_OUTPUT_DIMS == (*(truth + idx)).size() + !*/ + }; + + template <typename SUBNET> + using loss_dot = add_loss_layer<loss_dot_, SUBNET>; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_LOSS_ABSTRACT_H_ + diff --git a/ml/dlib/dlib/dnn/solvers.h b/ml/dlib/dlib/dnn/solvers.h new file mode 100644 index 000000000..204541a7e --- /dev/null +++ b/ml/dlib/dlib/dnn/solvers.h @@ -0,0 +1,405 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_DNn_SOLVERS_H_ +#define DLIB_DNn_SOLVERS_H_ + +#include "solvers_abstract.h" +#include "tensor.h" +#include <iostream> +#include "layers.h" + +namespace dlib +{ + class sgd + { + public: + + explicit sgd( + float weight_decay_, + float momentum_ = 0.9 + ) + { + weight_decay = weight_decay_; + momentum = momentum_; + } + + sgd( + ) : sgd(0.0005, 0.9) + { + } + + float get_momentum ( + ) const { return momentum; } + + float get_weight_decay ( + ) const { return weight_decay; } + + template <typename layer_type> + const tensor& operator() ( + const float learning_rate, + const layer_type& l, + const tensor& params_grad + ) + { + const tensor& params = l.get_layer_params(); + + DLIB_CASSERT(params.size() != 0); + if (v.size() == 0) + { + v.copy_size(params_grad); + v = 0; + } + + const double lr = learning_rate*get_learning_rate_multiplier(l); + const double wd = weight_decay*get_weight_decay_multiplier(l); + + //perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad); + tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr); + + return v; + } + + template <unsigned long N> + const tensor& operator() ( + const float learning_rate, + const fc_<N,FC_HAS_BIAS>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.get_num_outputs()); + return v; + } + + template < + long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y, + int _padding_x + > + const tensor& operator() ( + const float learning_rate, + const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters()); + return v; + } + + template < + long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y, + int _padding_x + > + const tensor& operator() ( + const float learning_rate, + const cont_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters()); + return v; + } + + template < layer_mode mode > + const tensor& operator() ( + const float learning_rate, + const bn_<mode>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size()/2); + return v; + } + + friend void serialize(const sgd& item, std::ostream& out) + { + serialize("sgd2", 
out); + serialize(item.v, out); + serialize(item.weight_decay, out); + serialize(item.momentum, out); + } + + friend void deserialize(sgd& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "sgd2") + throw serialization_error("Unexpected version found while deserializing dlib::sgd."); + deserialize(item.v, in); + deserialize(item.weight_decay, in); + deserialize(item.momentum, in); + } + + friend std::ostream& operator<< (std::ostream& out, const sgd& item) + { + out << "sgd: weight_decay="<<item.get_weight_decay() << ", momentum="<<item.get_momentum(); + return out; + } + + private: + + template <typename layer_type> + void update_considering_bias( + const float learning_rate, + const layer_type& l, + const tensor& params_grad, + unsigned long bias_offset + ) + { + const tensor& params = l.get_layer_params(); + + DLIB_CASSERT(params.size() != 0); + if (v.size() == 0) + { + v.copy_size(params_grad); + v = 0; + } + + double lr = learning_rate*get_learning_rate_multiplier(l); + double wd = weight_decay*get_weight_decay_multiplier(l); + + //perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad); + + if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1) + { + tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr); + } + else + { + + tt::affine_transform_range(0, bias_offset, v, v, params, params_grad, momentum, -wd*lr, -lr); + + // now update the biases but apply their multipliers + lr *= l.get_bias_learning_rate_multiplier(); + wd *= l.get_bias_weight_decay_multiplier(); + tt::affine_transform_range(bias_offset, v.size(), v, v, params, params_grad, momentum, -wd*lr, -lr); + } + } + + resizable_tensor v; + float weight_decay; + float momentum; + + }; + +// ---------------------------------------------------------------------------------------- + + class adam + { + public: + + adam( + float weight_decay_, + float momentum1_, + float momentum2_ + ) + { + weight_decay = weight_decay_; + momentum1 = momentum1_; + momentum2 = momentum2_; + t = 0; + } + + adam( + ) : adam(0.0005, 0.9, 0.999) + {} + + float get_momentum1 ( + ) const { return momentum1; } + + float get_momentum2 ( + ) const { return momentum2; } + + float get_weight_decay ( + ) const { return weight_decay; } + + template <typename layer_type> + const tensor& operator() ( + const float learning_rate, + const layer_type& l, + const tensor& params_grad + ) + { + const tensor& params = l.get_layer_params(); + DLIB_CASSERT(params.size() != 0); + if (v.size() == 0) + { + m.copy_size(params_grad); + m = 0; + v.copy_size(params_grad); + v = 0; + s.copy_size(params_grad); + } + + ++t; + + + tt::compute_adam_update(0, params.size(), s, m, v, t, + learning_rate*get_learning_rate_multiplier(l), + weight_decay*get_weight_decay_multiplier(l), + momentum1, momentum2, params, params_grad); + + return s; + } + + template <unsigned long N> + const tensor& operator() ( + const float learning_rate, + const fc_<N,FC_HAS_BIAS>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.get_num_outputs()); + return s; + } + + template < + long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y, + int _padding_x + > + const tensor& operator() ( + const float learning_rate, + const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, 
params_grad.size()-l.num_filters()); + return s; + } + + template < + long _num_filters, + long _nr, + long _nc, + int _stride_y, + int _stride_x, + int _padding_y, + int _padding_x + > + const tensor& operator() ( + const float learning_rate, + const cont_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters()); + return s; + } + + template < layer_mode mode > + const tensor& operator() ( + const float learning_rate, + const bn_<mode>& l, + const tensor& params_grad + ) + { + update_considering_bias(learning_rate, l, params_grad, params_grad.size()/2); + return s; + } + + + friend void serialize(const adam& item, std::ostream& out) + { + serialize("adam2", out); + serialize(item.m, out); + serialize(item.v, out); + serialize(item.s, out); + serialize(item.weight_decay, out); + serialize(item.momentum1, out); + serialize(item.momentum2, out); + serialize(item.t, out); + } + + friend void deserialize(adam& item, std::istream& in) + { + std::string version; + deserialize(version, in); + if (version != "adam2") + throw serialization_error("Unexpected version found while deserializing dlib::adam."); + deserialize(item.m, in); + deserialize(item.v, in); + deserialize(item.s, in); + deserialize(item.weight_decay, in); + deserialize(item.momentum1, in); + deserialize(item.momentum2, in); + deserialize(item.t, in); + } + + friend std::ostream& operator<< (std::ostream& out, const adam& item) + { + out << "adam: weight_decay="<<item.get_weight_decay() << ", momentum1="<<item.get_momentum1() << ", momentum2="<<item.get_momentum2(); + return out; + } + + private: + + template <typename layer_type> + void update_considering_bias( + const float learning_rate, + const layer_type& l, + const tensor& params_grad, + unsigned long bias_offset + ) + { + const tensor& params = l.get_layer_params(); + DLIB_CASSERT(params.size() != 0); + if (v.size() == 0) + { + m.copy_size(params_grad); + m = 0; + v.copy_size(params_grad); + v = 0; + s.copy_size(params_grad); + } + + + ++t; + + if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1) + { + tt::compute_adam_update(0, params.size(), s, m, v, t, + learning_rate*get_learning_rate_multiplier(l), + weight_decay*get_weight_decay_multiplier(l), + momentum1, momentum2, params, params_grad); + } + else + { + tt::compute_adam_update(0, bias_offset, s, m, v, t, + learning_rate*get_learning_rate_multiplier(l), + weight_decay*get_weight_decay_multiplier(l), + momentum1, momentum2, params, params_grad); + + tt::compute_adam_update(bias_offset, params.size(), s, m, v, t, + learning_rate*get_learning_rate_multiplier(l)*l.get_bias_learning_rate_multiplier(), + weight_decay*get_weight_decay_multiplier(l)*l.get_bias_weight_decay_multiplier(), + momentum1, momentum2, params, params_grad); + } + } + resizable_tensor m; + resizable_tensor v; + resizable_tensor s; + float weight_decay; + float momentum1; + float momentum2; + float t; + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_SOLVERS_H_ + diff --git a/ml/dlib/dlib/dnn/solvers_abstract.h b/ml/dlib/dlib/dnn/solvers_abstract.h new file mode 100644 index 000000000..d10ef163a --- /dev/null +++ b/ml/dlib/dlib/dnn/solvers_abstract.h @@ -0,0 +1,204 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
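+
+// A quick orientation (a sketch, not a declaration from this header): solvers are
+// rarely invoked directly.  They are normally handed to a dnn_trainer (see
+// trainer_abstract.h), which keeps one solver instance per layer and calls its
+// operator() after each mini-batch.  For example:
+//
+//     net_type net;
+//     dnn_trainer<net_type, sgd> trainer(net, sgd(0.0005, 0.9));
+//     trainer.set_learning_rate(0.1);
+//     trainer.train(samples, labels);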
+#undef DLIB_DNn_SOLVERS_ABSTRACT_H_ +#ifdef DLIB_DNn_SOLVERS_ABSTRACT_H_ + +#include "tensor_abstract.h" +#include <iostream> + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + class EXAMPLE_SOLVER + { + /*! + WHAT THIS OBJECT REPRESENTS + A solver defines the parameter update rule for a single layer in a deep + neural network. It takes a parameter gradient vector and the layer's + parameters and tells you how the parameters should be updated. + Importantly, each solver instance is used with only one layer in a network. + This allows us to define solvers that have per layer state, for example, a + solver may keep a momentum term and apply it to its update rule. + + Note that there is no dlib::EXAMPLE_SOLVER type. It is shown here purely + to document the interface a solver object must implement. + !*/ + + public: + + EXAMPLE_SOLVER( + ); + + template <typename layer_type> + const tensor& operator() ( + const float learning_rate, + const layer_type& l, + const tensor& params_grad + ) + /*! + requires + - l.get_layer_params().size() != 0 + - have_same_dimensions(l.get_layer_params(), params_grad) == true. + - When this function is invoked on a particular solver instance, it is + always supplied with the same layer instance, l. That is, the solver is + allowed to remember things from one invocation to another and to assume + that it is being serially applied to optimize the same layer's + parameters. + ensures + - Returns a step vector V that is intended to be used to update the + parameters by adding V to l.get_layer_params(). + - This function will use the given "learning rate" to compute V. How the + learning rate is used is solver dependent. But in general the learning + rate should be used to select the step size, i.e. to somehow determine + the magnitude of V. + !*/ + }; + + void serialize(const EXAMPLE_SOLVER& item, std::ostream& out); + void deserialize(EXAMPLE_SOLVER& item, std::istream& in); + /*! + provides serialization support + !*/ + + std::ostream& operator<< (std::ostream& out, const EXAMPLE_SOLVER& item); + /*! + Prints the solver's name and parameters to out. + !*/ + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + class sgd + { + /*! + WHAT THIS OBJECT REPRESENTS + This object implements the EXAMPLE_SOLVER interface defined above. It is a + basic stochastic gradient descent solver which uses momentum and weight + decay. In particular, it computes the update vector V according to: + V = momentum*V - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad; + Here V is a momentum term that is remembered by the solver from one + invocation of operator() to the next. + + + Note that the actual learning rate and weight decay used by the solver are + multiplied by the per layer multipliers. That is, the solver will call + get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and + multiply these values with the nominal learning rate and weight decay, + respectively, to determine the values it will use during each step. 
It is
+            also overloaded to allow additional learning rate multipliers to be applied
+            to fc_ and con_ bias parameters.
+        !*/
+    public:
+
+        sgd(
+        );
+        /*!
+            ensures
+                - #get_weight_decay() == 0.0005
+                - #get_momentum() == 0.9
+        !*/
+
+        explicit sgd(
+            float weight_decay,
+            float momentum = 0.9
+        );
+        /*!
+            requires
+                - weight_decay >= 0
+                - momentum >= 0
+            ensures
+                - #get_weight_decay() == weight_decay
+                - #get_momentum() == momentum
+        !*/
+
+        float get_weight_decay () const;
+        float get_momentum () const;
+    };
+
+    void serialize(const sgd& item, std::ostream& out);
+    void deserialize(sgd& item, std::istream& in);
+    /*!
+        provides serialization support
+    !*/
+
+    std::ostream& operator<< (std::ostream& out, const sgd& item);
+    /*!
+        Prints the solver's name and parameters to out.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    class adam
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object implements the EXAMPLE_SOLVER interface defined above.  In
+                particular, it implements the ADAM parameter update method described in the
+                paper:
+                    Kingma, Diederik P., and Jimmy Ba.  "Adam: A method for stochastic
+                    optimization."  International Conference on Learning Representations.  2015.
+
+                Note that the actual learning rate and weight decay used by the solver are
+                multiplied by the per layer multipliers.  That is, the solver will call
+                get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
+                multiply these values with the nominal learning rate and weight decay,
+                respectively, to determine the values it will use during each step.  It is
+                also overloaded to allow additional learning rate multipliers to be applied
+                to fc_ and con_ bias parameters.
+        !*/
+
+    public:
+
+        adam(
+        );
+        /*!
+            ensures
+                - #get_weight_decay() == 0.0005
+                - #get_momentum1() == 0.9
+                - #get_momentum2() == 0.999
+        !*/
+
+        adam(
+            float weight_decay,
+            float momentum1,
+            float momentum2
+        );
+        /*!
+            requires
+                - weight_decay >= 0
+                - 0 <= momentum1 < 1
+                - 0 <= momentum2 < 1
+            ensures
+                - #get_weight_decay() == weight_decay
+                - #get_momentum1() == momentum1
+                - #get_momentum2() == momentum2
+        !*/
+
+        float get_weight_decay () const;
+        float get_momentum1 () const;
+        float get_momentum2 () const;
+    };
+
+    void serialize(const adam& item, std::ostream& out);
+    void deserialize(adam& item, std::istream& in);
+    /*!
+        provides serialization support
+    !*/
+
+    std::ostream& operator<< (std::ostream& out, const adam& item);
+    /*!
+        Prints the solver's name and parameters to out.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_SOLVERS_ABSTRACT_H_
+
diff --git a/ml/dlib/dlib/dnn/tensor.h b/ml/dlib/dlib/dnn/tensor.h
new file mode 100644
index 000000000..8039fe666
--- /dev/null
+++ b/ml/dlib/dlib/dnn/tensor.h
@@ -0,0 +1,686 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
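+
+// A rough mental model (a sketch, assuming the resizable_tensor defined later in
+// this file): a dlib tensor is a 4-D array of floats with dimensions
+// (num_samples, k, nr, nc), and host()/device() expose synchronized CPU/GPU views
+// of the same underlying memory.  For example:
+//
+//     resizable_tensor t(2, 3, 4, 4);  // 2 samples, 3 channels, each 4x4
+//     t = 1.0f;                        // fill via tensor::operator=(float)
+//     t *= 0.5f;                       // scale in place
+//     const float* p = t.host();       // CPU-side view of all 2*3*4*4 floats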
+#ifndef DLIB_DNn_TENSOR_H_ +#define DLIB_DNn_TENSOR_H_ + +#include "tensor_abstract.h" +#include <cstring> +#include "../matrix.h" +#include "cudnn_dlibapi.h" +#include "gpu_data.h" +#include "../byte_orderer.h" +#include <memory> +#include "../any.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + class tensor; + namespace cuda + { + void set_tensor ( + tensor& t, + float value + ); + + void scale_tensor ( + tensor& t, + float value + ); + } + +// ---------------------------------------------------------------------------------------- + + class tensor + { + public: + + tensor ( + ) : + m_n(0), m_k(0), m_nr(0), m_nc(0), m_size(0) + { + } + + virtual ~tensor() {} + + long long num_samples() const { return m_n; } + long long k() const { return m_k; } + long long nr() const { return m_nr; } + long long nc() const { return m_nc; } + size_t size() const { return m_size; } + + typedef float* iterator; + typedef const float* const_iterator; + iterator begin() { return host(); } + const_iterator begin() const { return host(); } + iterator end() { return host()+size(); } + const_iterator end() const { return host()+size(); } + + void async_copy_to_device() const + { + data().async_copy_to_device(); + } + + virtual const float* host() const = 0; + virtual float* host() = 0; + virtual float* host_write_only() = 0; + virtual const float* device() const = 0; + virtual float* device() = 0; + virtual float* device_write_only() = 0; + + virtual const any& annotation() const = 0; + virtual any& annotation() = 0; + + int device_id() const { return data().device_id(); } + + tensor& operator= (float val) + { +#ifdef DLIB_USE_CUDA + // If you are using CUDA then presumably you will be mostly using tensors on + // the GPU. So unless you seem to be actively working with the host side's + // data then we do this initialization on the device side since this avoids a + // host to device transfer that would likely immediately follow. 
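+ // (Editor's note: device_ready() reports, in effect, whether the device
+ // already holds the current copy of this tensor's data; when it does, the
+ // assignment below runs as a CUDA kernel and the host buffer is never
+ // touched.)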
+ if (data().device_ready()) + { + cuda::set_tensor(*this, val); + return *this; + } +#endif + auto d = host_write_only(); + for (size_t i = 0; i < size(); ++i) + d[i] = val; + + return *this; + } + + tensor& operator*= (float val) + { +#ifdef DLIB_USE_CUDA + cuda::scale_tensor(*this, val); + return *this; +#else + for (auto& d : *this) + d *= val; + + return *this; +#endif + } + + tensor& operator/= (float val) + { + *this *= 1.0/val; + return *this; + } + + template <typename EXP> + tensor& operator= (const matrix_exp<EXP>& item) + { + DLIB_CASSERT(num_samples() == item.nr() && + nr()*nc()*k() == item.nc()); + static_assert((is_same_type<float, typename EXP::type>::value == true), + "To assign a matrix to a tensor the matrix must contain float values"); + + set_ptrm(host_write_only(), m_n, m_nr*m_nc*m_k) = item; + return *this; + } + + template <typename EXP> + tensor& operator+= (const matrix_exp<EXP>& item) + { + DLIB_CASSERT(num_samples() == item.nr() && + nr()*nc()*k() == item.nc()); + static_assert((is_same_type<float, typename EXP::type>::value == true), + "To assign a matrix to a tensor the matrix must contain float values"); + set_ptrm(host(), m_n, m_nr*m_nc*m_k) += item; + return *this; + } + + template <typename EXP> + tensor& operator-= (const matrix_exp<EXP>& item) + { + DLIB_CASSERT(num_samples() == item.nr() && + nr()*nc()*k() == item.nc()); + static_assert((is_same_type<float, typename EXP::type>::value == true), + "To assign a matrix to a tensor the matrix must contain float values"); + set_ptrm(host(), m_n, m_nr*m_nc*m_k) -= item; + return *this; + } + + template <typename EXP> + void set_sample ( + unsigned long long idx, + const matrix_exp<EXP>& item + ) + { + DLIB_CASSERT(idx < (unsigned long long)num_samples()); + DLIB_CASSERT(item.size() == nr()*nc()*k()); + static_assert((is_same_type<float, typename EXP::type>::value == true), + "To assign a matrix to a tensor the matrix must contain float values"); + set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) = item; + } + + + template <typename EXP> + void add_to_sample ( + unsigned long long idx, + const matrix_exp<EXP>& item + ) + { + DLIB_CASSERT(idx < (unsigned long long)num_samples()); + DLIB_CASSERT(item.size() == nr()*nc()*k()); + static_assert((is_same_type<float, typename EXP::type>::value == true), + "To assign a matrix to a tensor the matrix must contain float values"); + set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) += item; + } + + +#ifdef DLIB_USE_CUDA + virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor ( + ) const = 0; +#endif + + friend void memcpy ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(dest.size() == src.size()); + memcpy(dest.data(), dest.get_alias_offset(), + src.data(), src.get_alias_offset(), + src.size()); + } + + + protected: + + friend class alias_tensor; + + virtual gpu_data& data() = 0; + virtual const gpu_data& data() const = 0; + virtual size_t get_alias_offset() const { return 0; } // needed by alias_tensor. 
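+ // (Editor's note: the members below follow the num_samples() x k() x nr() x nc()
+ // convention used throughout dlib's DNN code; assuming the usual row-major
+ // layout, the element at (sample, channel, r, c) lives at
+ // host()[((sample*k() + channel)*nr() + r)*nc() + c].)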
+ + long long m_n; + long long m_k; + long long m_nr; + long long m_nc; + long long m_size; // always equal to m_n*m_k*m_nr*m_nc + }; + +// ---------------------------------------------------------------------------------------- + + inline bool is_vector ( + const tensor& t + ) + { + return t.size() == (size_t)t.num_samples() || + t.size() == (size_t)t.k() || + t.size() == (size_t)t.nr() || + t.size() == (size_t)t.nc(); + } + +// ---------------------------------------------------------------------------------------- + + inline const matrix_op<op_pointer_to_mat<float> > mat ( + const tensor& t, + long long nr, + long long nc + ) + { + DLIB_ASSERT(nr >= 0 && nc >= 0 , + "\tconst matrix_exp mat(tensor, nr, nc)" + << "\n\t nr and nc must be >= 0" + << "\n\t nr: " << nr + << "\n\t nc: " << nc + ); + DLIB_ASSERT(nr*nc == (long long)t.size() , + "\tconst matrix_exp mat(tensor, nr, nc)" + << "\n\t The sizes don't match up." + << "\n\t nr*nc: " << nr*nc + << "\n\t t.size(): " << t.size() + ); + typedef op_pointer_to_mat<float> op; + return matrix_op<op>(op(t.host(),nr,nc)); + } + + inline const matrix_op<op_pointer_to_mat<float> > mat ( + const tensor& t + ) + { + if (t.size() != 0) + return mat(t, t.num_samples(), t.size()/t.num_samples()); + else + return mat((float*)0,0,0); + } + + inline const matrix_op<op_pointer_to_mat<float> > image_plane ( + const tensor& t, + long long sample = 0, + long long k = 0 + ) + { + DLIB_ASSERT(0 <= sample && sample < t.num_samples() && + 0 <= k && k < t.k() && + t.size() != 0, + "\tconst matrix_exp image_plane(tensor,sample,k)" + << "\n\t Invalid arguments were given to this function." + << "\n\t sample: " << sample + << "\n\t k: " << k + << "\n\t t.num_samples(): " << t.num_samples() + << "\n\t t.k(): " << t.k() + << "\n\t t.size(): " << t.size() + ); + + + typedef op_pointer_to_mat<float> op; + return matrix_op<op>(op(t.host() + ((sample*t.k() + k)*t.nr())*t.nc(), + t.nr(), + t.nc())); + } + +// ---------------------------------------------------------------------------------------- + + inline bool have_same_dimensions ( + const tensor& a, + const tensor& b + ) + { + return a.num_samples() == b.num_samples() && + a.k() == b.k() && + a.nr() == b.nr() && + a.nc() == b.nc(); + } + +// ---------------------------------------------------------------------------------------- + + class resizable_tensor : public tensor + { + public: + resizable_tensor( + ) + {} + + template <typename EXP> + resizable_tensor( + const matrix_exp<EXP>& item + ) + { + set_size(item.nr(), item.nc()); + *this = item; + } + + explicit resizable_tensor( + long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1 + ) + { + DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0); + + set_size(n_,k_,nr_,nc_); + } + + resizable_tensor(const resizable_tensor& item) : _annotation(item.annotation()) + { + copy_size(item); + memcpy(*this, item); + } + resizable_tensor(const tensor& item) : _annotation(item.annotation()) + { + copy_size(item); + memcpy(*this, item); + } + + resizable_tensor(resizable_tensor&& item) { swap(item); } + resizable_tensor& operator=(resizable_tensor&& item) { swap(item); return *this; } + + virtual const float* host() const { return data_instance.host(); } + virtual float* host() { return data_instance.host(); } + virtual float* host_write_only() { return data_instance.host_write_only(); } + virtual const float* device() const { return data_instance.device(); } + virtual float* device() { return data_instance.device(); } + virtual float* device_write_only() { 
return data_instance.device_write_only(); } + + virtual const any& annotation() const { return _annotation; } + virtual any& annotation() { return _annotation; } + + void clear( + ) + { + set_size(0,0,0,0); + _annotation.clear(); + // free underlying memory + data_instance.set_size(0); + } + + void copy_size ( + const tensor& item + ) + { + set_size(item.num_samples(), item.k(), item.nr(), item.nc()); + } + + resizable_tensor& operator= (float val) + { + tensor::operator=(val); + return *this; + } + + template <typename EXP> + resizable_tensor& operator= ( + const matrix_exp<EXP>& item + ) + { + if (!(num_samples() == item.nr() && k()*nr()*nc() == item.nc())) + set_size(item.nr(), item.nc()); + tensor::operator=(item); + return *this; + } + + void set_size( + long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1 + ) + { + DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0); + + m_n = n_; + m_k = k_; + m_nr = nr_; + m_nc = nc_; + m_size = n_*k_*nr_*nc_; + if ((long long)data_instance.size() < m_size) + data_instance.set_size(m_size); +#ifdef DLIB_USE_CUDA + cudnn_descriptor.set_size(m_n,m_k,m_nr,m_nc); +#endif + } + + + resizable_tensor& operator= (const resizable_tensor& item) + { + resizable_tensor temp(item); + temp.swap(*this); + return *this; + } + + resizable_tensor& operator= (const tensor& item) + { + resizable_tensor temp(item); + temp.swap(*this); + return *this; + } + + + void swap(resizable_tensor& item) + { + std::swap(m_n, item.m_n); + std::swap(m_k, item.m_k); + std::swap(m_nr, item.m_nr); + std::swap(m_nc, item.m_nc); + std::swap(m_size, item.m_size); + std::swap(data_instance, item.data_instance); + std::swap(_annotation, item._annotation); +#ifdef DLIB_USE_CUDA + std::swap(cudnn_descriptor, item.cudnn_descriptor); +#endif + } + +#ifdef DLIB_USE_CUDA + virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor ( + ) const { return cudnn_descriptor; } +#endif + + private: + +#ifdef DLIB_USE_CUDA + cuda::tensor_descriptor cudnn_descriptor; +#endif + + gpu_data data_instance; + any _annotation; + virtual gpu_data& data() { return data_instance; } + virtual const gpu_data& data() const { return data_instance; } + }; + + inline void serialize(const tensor& item, std::ostream& out) + { + int version = 2; + serialize(version, out); + serialize(item.num_samples(), out); + serialize(item.k(), out); + serialize(item.nr(), out); + serialize(item.nc(), out); + byte_orderer bo; + auto sbuf = out.rdbuf(); + for (auto d : item) + { + // Write out our data as 4byte little endian IEEE floats rather than using + // dlib's default float serialization. We do this because it will result in + // more compact outputs. It's slightly less portable but it seems doubtful + // that any CUDA enabled platform isn't going to use IEEE floats. But if one + // does we can just update the serialization code here to handle it if such a + // platform is encountered. 
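+ // (Editor's note: host_to_little() is a no-op on little endian machines and
+ // byte-swaps d on big endian ones, so the stream format is identical either
+ // way.)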
+ bo.host_to_little(d); + static_assert(sizeof(d)==4, "This serialization code assumes we are writing 4 byte floats"); + sbuf->sputn((char*)&d, sizeof(d)); + } + } + + inline void deserialize(resizable_tensor& item, std::istream& in) + { + int version; + deserialize(version, in); + if (version != 2) + throw serialization_error("Unexpected version found while deserializing dlib::resizable_tensor."); + + long long num_samples=0, k=0, nr=0, nc=0; + deserialize(num_samples, in); + deserialize(k, in); + deserialize(nr, in); + deserialize(nc, in); + item.set_size(num_samples, k, nr, nc); + byte_orderer bo; + auto sbuf = in.rdbuf(); + for (auto& d : item) + { + static_assert(sizeof(d)==4, "This serialization code assumes we are writing 4 byte floats"); + if (sbuf->sgetn((char*)&d,sizeof(d)) != sizeof(d)) + { + in.setstate(std::ios::badbit); + throw serialization_error("Error reading data while deserializing dlib::resizable_tensor."); + } + bo.little_to_host(d); + } + } + +// ---------------------------------------------------------------------------------------- + + inline double dot( + const tensor& a, + const tensor& b + ) + { + DLIB_CASSERT(a.size() == b.size()); + const float* da = a.host(); + const float* db = b.host(); + double sum = 0; + for (size_t i = 0; i < a.size(); ++i) + sum += da[i]*db[i]; + return sum; + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + class alias_tensor_instance : public tensor + { + alias_tensor_instance( + ) : data_instance(0), _annotation(0), data_offset(0) {} + + public: + friend class alias_tensor; + friend class alias_tensor_const_instance; + + alias_tensor_instance& operator= (float val) + { + tensor::operator=(val); + return *this; + } + + template <typename EXP> + alias_tensor_instance& operator= (const matrix_exp<EXP>& item) + { + tensor::operator=(item); + return *this; + } + + virtual const float* host() const { return data_instance->host()+data_offset; } + virtual float* host() { return data_instance->host()+data_offset; } + virtual float* host_write_only() { return data_instance->host()+data_offset; } + virtual const float* device() const { return data_instance->device()+data_offset; } + virtual float* device() { return data_instance->device()+data_offset; } + virtual float* device_write_only() { return data_instance->device()+data_offset; } + + virtual const any& annotation() const { return *_annotation; } + virtual any& annotation() { return *_annotation; } + +#ifdef DLIB_USE_CUDA + virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor ( + ) const { return *cudnn_descriptor; } +#endif + private: + + virtual size_t get_alias_offset() const { return data_offset; } + +#ifdef DLIB_USE_CUDA + std::shared_ptr<cuda::tensor_descriptor> cudnn_descriptor; +#endif + gpu_data* data_instance; + any* _annotation; + size_t data_offset; + virtual gpu_data& data() { return *data_instance; } + virtual const gpu_data& data() const { return *data_instance; } + }; + +// ---------------------------------------------------------------------------------------- + + class alias_tensor_const_instance + { + public: + const tensor& get() const { return inst; } + operator const tensor& () { return inst; } + + alias_tensor_const_instance(const alias_tensor_instance& item) : inst(item) {} + + private: + alias_tensor_instance inst; + + friend class alias_tensor; + alias_tensor_const_instance() {} + }; + +// 
---------------------------------------------------------------------------------------- + + class alias_tensor + { + public: + + alias_tensor ( + ) {} + + alias_tensor ( + long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1 + ) + { + DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0); + + inst.m_n = n_; + inst.m_k = k_; + inst.m_nr = nr_; + inst.m_nc = nc_; + inst.m_size = n_*k_*nr_*nc_; + } + + long long num_samples( + ) const { return inst.m_n; } + + long long k( + ) const { return inst.m_k; } + + long long nr( + ) const { return inst.m_nr; } + + long long nc( + ) const { return inst.m_nc; } + + size_t size( + ) const { return inst.m_size; } + + alias_tensor_instance operator() ( + tensor& t, + size_t offset = 0 + ) const + { + DLIB_CASSERT(offset+size() <= t.size(), + "offset: "<<offset <<"\n"<< + "size(): "<<size() <<"\n"<< + "t.size(): "<<t.size() <<"\n"); + +#ifdef DLIB_USE_CUDA + if (!inst.cudnn_descriptor) + { + inst.cudnn_descriptor = std::make_shared<cuda::tensor_descriptor>(); + inst.cudnn_descriptor->set_size(inst.m_n, inst.m_k, inst.m_nr, inst.m_nc); + } +#endif + inst.data_instance = &t.data(); + inst._annotation = &t.annotation(); + // Note that t might already be an aliasing tensor so we need to take that into + // account. + inst.data_offset = t.get_alias_offset()+offset; + return inst; + } + + alias_tensor_const_instance operator() ( + const tensor& t, + size_t offset = 0 + ) const + { + alias_tensor_const_instance temp; + temp.inst = (*this)(const_cast<tensor&>(t),offset); + return temp; + } + + private: + mutable alias_tensor_instance inst; + }; + + inline void serialize(const alias_tensor& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.num_samples(), out); + serialize(item.k(), out); + serialize(item.nr(), out); + serialize(item.nc(), out); + } + + inline void deserialize(alias_tensor& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::alias_tensor."); + long long num_samples, k, nr, nc; + deserialize(num_samples, in); + deserialize(k, in); + deserialize(nr, in); + deserialize(nc, in); + item = alias_tensor(num_samples, k, nr, nc); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_TENSOR_H_ + diff --git a/ml/dlib/dlib/dnn/tensor_abstract.h b/ml/dlib/dlib/dnn/tensor_abstract.h new file mode 100644 index 000000000..73a9fff77 --- /dev/null +++ b/ml/dlib/dlib/dnn/tensor_abstract.h @@ -0,0 +1,727 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_DNn_TENSOR_ABSTRACT_H_ +#ifdef DLIB_DNn_TENSOR_ABSTRACT_H_ + +#include "../matrix.h" +#include "../any/any_abstract.h" + +namespace dlib +{ +// ---------------------------------------------------------------------------------------- + + class tensor + { + /*! + WHAT THIS OBJECT REPRESENTS + This object represents a 4D array of float values, all stored contiguously + in memory. Importantly, it keeps two copies of the floats, one on the host + CPU side and another on the GPU device side. It automatically performs the + necessary host/device transfers to keep these two copies of the data in + sync. + + All transfers to the device happen asynchronously with respect to the + default CUDA stream so that CUDA kernel computations can overlap with data + transfers. 
However, any transfers from the device to the host happen + synchronously in the default CUDA stream. Therefore, you should perform + all your CUDA kernel launches on the default stream so that transfers back + to the host do not happen before the relevant computations have completed. + + If DLIB_USE_CUDA is not #defined then this object will not use CUDA at all. + Instead, it will simply store one host side memory block of floats. + + Finally, the convention in dlib code is to interpret the tensor as a set of + num_samples() 3D arrays, each of dimension k() by nr() by nc(). Also, + while this class does not specify a memory layout, the convention is to + assume that indexing into an element at coordinates (sample,k,r,c) can be + accomplished via: + host()[((sample*t.k() + k)*t.nr() + r)*t.nc() + c] + + THREAD SAFETY + Instances of this object are not thread-safe. So don't touch one from + multiple threads at the same time. + !*/ + + public: + + virtual ~tensor(); + + long long num_samples( + ) const; + /*! + ensures + - returns the number of 3D arrays of dimension k() by nr() by nc() there + are in this object. + !*/ + + long long k( + ) const; + /*! + ensures + - returns the k dimension of this tensor. Generally, we think of a tensor + as containing num_samples() images of nr() by nc() rows and columns, each + with k() channels. + !*/ + + long long nr( + ) const; + /*! + ensures + - returns the number of rows in this tensor. + !*/ + + long long nc( + ) const; + /*! + ensures + - returns the number of columns in this tensor. + !*/ + + size_t size( + ) const; + /*! + ensures + - returns num_samples()*k()*nr()*nc() + (i.e. the total number of floats in this tensor) + !*/ + + void async_copy_to_device( + ) const; + /*! + ensures + - This function does not block. + - if (the host version of the data is newer than the device's copy) then + - Begins asynchronously copying host data to the device. + - A call to device() that happens before the transfer completes will + block until the transfer is complete. That is, it is safe to call + async_copy_to_device() and then immediately call device(). + !*/ + + typedef float* iterator; + typedef const float* const_iterator; + iterator begin() { return host(); } + const_iterator begin() const { return host(); } + iterator end() { return host()+size(); } + const_iterator end() const { return host()+size(); } + /*! + ensures + - makes a tensor iterable just like the STL containers. + !*/ + + virtual const float* host( + ) const = 0; + /*! + ensures + - returns a pointer to the host memory block of size() contiguous float + values or nullptr if size()==0. + - if (the host's copy of the data is out of date) then + - copies the data from the device to the host, while this is happening + the call to host() blocks. + !*/ + + virtual float* host( + ) = 0; + /*! + ensures + - returns a pointer to the host memory block of size() contiguous float + values or nullptr if size()==0. + - if (the host's copy of the data is out of date) then + - copies the data from the device to the host, while this is happening + the call to host() blocks. + - Marks the device side data as out of date so that the next call to + device() will perform a host to device transfer. If you want to begin + the transfer immediately then you can call async_copy_to_device() after + calling host(). + !*/ + + virtual float* host_write_only( + ) = 0; + /*! + ensures + - This function returns the same pointer as host(), except that it never + performs a device to host memory copy. 
Instead, it immediately marks the + device side data as out of date, effectively discarding it. Therefore, + the values in the data pointed to by host_write_only() are undefined and + you should only call host_write_only() if you are going to assign to + every memory location in the returned memory block. + !*/ + + virtual const float* device( + ) const = 0; + /*! + requires + - DLIB_USE_CUDA is #defined + ensures + - returns a pointer to the device memory block of size() contiguous float + values or nullptr if size()==0. + - if (the device's copy of the data is out of date) then + - copies the data from the host to the device, while this is happening + the call to device() blocks. + !*/ + + virtual float* device( + ) = 0; + /*! + requires + - DLIB_USE_CUDA is #defined + ensures + - returns a pointer to the device memory block of size() contiguous float + values or nullptr if size()==0. + - if (the device's copy of the data is out of date) then + - copies the data from the host to the device, while this is happening + the call to device() blocks. + - Marks the host side data as out of date so that the next call to + host() will perform a device to host transfer. + !*/ + + virtual float* device_write_only( + ) = 0; + /*! + requires + - DLIB_USE_CUDA is #defined + ensures + - This function returns the same pointer as device(), except that it never + performs a host to device memory copy. Instead, it immediately marks the + host side data as out of date, effectively discarding it. Therefore, the + values in the data pointed to by device_write_only() are undefined and + you should only call device_write_only() if you are going to assign to + every memory location in the returned memory block. + !*/ + + virtual const any& annotation( + ) const = 0; + /*! + ensures + - returns a const reference to the any object in this tensor. The any + object can be used to store any additional annotation you like in a + tensor. However, it should be noted that the annotation() is ignored by + serialize() and therefore not saved when a tensor is serialized. + !*/ + + virtual any& annotation( + ) = 0; + /*! + ensures + - returns a non-const reference to the any object in this tensor. The any + object can be used to store any additional annotation you like in a + tensor. However, it should be noted that the annotation() is ignored by + serialize() and therefore not saved when a tensor is serialized. + !*/ + + int device_id( + ) const; + /*! + ensures + - returns the ID of the CUDA device that allocated this memory. I.e. the + number returned by cudaGetDevice() when the memory was allocated. + - If CUDA is not being used then this function always returns 0. + !*/ + + tensor& operator= ( + float val + ); + /*! + ensures + - sets all elements of this tensor equal to val. + - returns *this + !*/ + + tensor& operator*= ( + float val + ); + /*! + ensures + - pointwise multiplies all elements of *this tensor with val. + - returns *this + !*/ + + tensor& operator/= ( + float val + ); + /*! + ensures + - pointwise divides all elements of *this tensor with val. + - returns *this + !*/ + + template <typename EXP> + tensor& operator= ( + const matrix_exp<EXP>& item + ); + /*! + requires + - num_samples() == item.nr() + - k()*nr()*nc() == item.nc() + - item contains float values + ensures + - Assigns item to *this tensor by performing: + set_ptrm(host(), num_samples(), k()*nr()*nc()) = item; + !*/ + + template <typename EXP> + tensor& operator+= ( + const matrix_exp<EXP>& item + ); + /*! 
+ requires
+ - num_samples() == item.nr()
+ - k()*nr()*nc() == item.nc()
+ - item contains float values
+ ensures
+ - Adds item to *this tensor by performing:
+ set_ptrm(host(), num_samples(), k()*nr()*nc()) += item;
+ !*/
+
+ template <typename EXP>
+ tensor& operator-= (
+ const matrix_exp<EXP>& item
+ );
+ /*!
+ requires
+ - num_samples() == item.nr()
+ - k()*nr()*nc() == item.nc()
+ - item contains float values
+ ensures
+ - Subtracts item from *this tensor by performing:
+ set_ptrm(host(), num_samples(), k()*nr()*nc()) -= item;
+ !*/
+
+ template <typename EXP>
+ void set_sample (
+ unsigned long long idx,
+ const matrix_exp<EXP>& item
+ );
+ /*!
+ requires
+ - idx < num_samples()
+ - k()*nr()*nc() == item.size()
+ - item contains float values
+ ensures
+ - Assigns item to the idx'th sample in *this by performing:
+ set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) = item;
+ !*/
+
+
+ template <typename EXP>
+ void add_to_sample (
+ unsigned long long idx,
+ const matrix_exp<EXP>& item
+ );
+ /*!
+ requires
+ - idx < num_samples()
+ - k()*nr()*nc() == item.size()
+ - item contains float values
+ ensures
+ - Adds item to the idx'th sample in *this by performing:
+ set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) += item;
+ !*/
+
+ protected:
+
+ // You can't move or copy another tensor into *this since that might modify the
+ // tensor's dimensions. If you want to do that sort of thing then use a
+ // resizable_tensor.
+ tensor(const tensor& item);
+ tensor& operator= (const tensor& item);
+ tensor(tensor&& item);
+ tensor& operator=(tensor&& item);
+ };
+
+// ----------------------------------------------------------------------------------------
+
+ void memcpy (
+ tensor& dest,
+ const tensor& src
+ );
+ /*!
+ requires
+ - dest.size() == src.size()
+ ensures
+ - Copies the data in src to dest. If the device data is current on both src
+ and dest then the copy will happen entirely on the device side.
+ - It doesn't matter what GPU device is selected by cudaSetDevice(). You can
+ always copy tensor objects to and from each other regardless.
+ - This function blocks until the copy has completed.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ bool is_vector (
+ const tensor& t
+ );
+ /*!
+ ensures
+ - returns true if and only if one of the following is true:
+ - t.size() == t.num_samples()
+ - t.size() == t.k()
+ - t.size() == t.nr()
+ - t.size() == t.nc()
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ const matrix_exp mat (
+ const tensor& t,
+ long long nr,
+ long long nc
+ );
+ /*!
+ requires
+ - nr >= 0
+ - nc >= 0
+ - nr*nc == t.size()
+ ensures
+ - returns a matrix M such that:
+ - M.nr() == nr
+ - M.nc() == nc
+ - for all valid r and c:
+ M(r,c) == t.host()[r*nc + c]
+ (i.e. the tensor is interpreted as a matrix laid out in memory
+ in row major order)
+ !*/
+
+ const matrix_exp mat (
+ const tensor& t
+ );
+ /*!
+ ensures
+ - if (t.size() != 0) then
+ - returns mat(t, t.num_samples(), t.size()/t.num_samples())
+ - else
+ - returns an empty matrix.
+ !*/
+
+ const matrix_exp image_plane (
+ const tensor& t,
+ long long sample = 0,
+ long long k = 0
+ );
+ /*!
+ requires
+ - t.size() != 0
+ - 0 <= sample < t.num_samples()
+ - 0 <= k < t.k()
+ ensures
+ - returns the k-th image plane from the sample-th image in t. That is,
+ returns a matrix M such that:
+ - M contains float valued elements.
+ - M.nr() == t.nr()
+ - M.nc() == t.nc()
+ - for all valid r and c:
+ - M(r,c) == t.host()[((sample*t.k() + k)*t.nr() + r)*t.nc() + c]
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ bool have_same_dimensions (
+ const tensor& a,
+ const tensor& b
+ );
+ /*!
+ ensures
+ - returns true if and only if all of the following are satisfied:
+ - a.num_samples() == b.num_samples()
+ - a.k() == b.k()
+ - a.nr() == b.nr()
+ - a.nc() == b.nc()
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ class resizable_tensor : public tensor
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object is just a tensor with the additional ability to be resized.
+ !*/
+
+ public:
+ resizable_tensor(
+ );
+ /*!
+ ensures
+ - #size() == 0
+ - #num_samples() == 0
+ - #k() == 0
+ - #nr() == 0
+ - #nc() == 0
+ - #capacity() == 0
+ !*/
+
+ template <typename EXP>
+ resizable_tensor(
+ const matrix_exp<EXP>& item
+ );
+ /*!
+ requires
+ - item contains float values
+ ensures
+ - #num_samples() == item.nr()
+ - #k() == item.nc()
+ - #nr() == 1
+ - #nc() == 1
+ - Assigns item to *this tensor by performing:
+ set_ptrm(host(), num_samples(), k()*nr()*nc()) = item;
+ - #capacity() == size()
+ !*/
+
+ explicit resizable_tensor(
+ long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+ );
+ /*!
+ requires
+ - n_ >= 0
+ - k_ >= 0
+ - nr_ >= 0
+ - nc_ >= 0
+ ensures
+ - #size() == n_*k_*nr_*nc_
+ - #num_samples() == n_
+ - #k() == k_
+ - #nr() == nr_
+ - #nc() == nc_
+ - #capacity() == size()
+ !*/
+
+ // This object is copyable and movable
+ resizable_tensor(const resizable_tensor&) = default;
+ resizable_tensor(resizable_tensor&&) = default;
+ resizable_tensor& operator= (const resizable_tensor&) = default;
+ resizable_tensor& operator= (resizable_tensor&&) = default;
+
+ size_t capacity (
+ ) const;
+ /*!
+ ensures
+ - returns the total number of floats allocated. This might be different
+ from the size() since calls to set_size() that make a tensor smaller
+ don't trigger reallocations. They simply adjust the nominal dimensions
+ while keeping the same allocated memory block. This makes calls to
+ set_size() very fast. If you need to deallocate a tensor then use
+ clear().
+ !*/
+
+ void clear(
+ );
+ /*!
+ ensures
+ - #size() == 0
+ - #num_samples() == 0
+ - #k() == 0
+ - #nr() == 0
+ - #nc() == 0
+ - #annotation().is_empty() == true
+ - #capacity() == 0
+ !*/
+
+ void copy_size (
+ const tensor& item
+ );
+ /*!
+ ensures
+ - resizes *this so that: have_same_dimensions(#*this, item)==true
+ !*/
+
+ void set_size(
+ long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+ );
+ /*!
+ requires
+ - n_ >= 0
+ - k_ >= 0
+ - nr_ >= 0
+ - nc_ >= 0
+ ensures
+ - #size() == n_*k_*nr_*nc_
+ - #num_samples() == n_
+ - #k() == k_
+ - #nr() == nr_
+ - #nc() == nc_
+ - #capacity() == max(#size(), capacity())
+ (i.e. capacity() never goes down when calling set_size().)
+ !*/
+
+ template <typename EXP>
+ resizable_tensor& operator= (
+ const matrix_exp<EXP>& item
+ );
+ /*!
+ requires
+ - item contains float values
+ ensures
+ - if (num_samples() == item.nr() && k()*nr()*nc() == item.nc()) then
+ - the dimensions of this tensor are not changed
+ - else
+ - #num_samples() == item.nr()
+ - #k() == item.nc()
+ - #nr() == 1
+ - #nc() == 1
+ - Assigns item to *this tensor by performing:
+ set_ptrm(host(), num_samples(), k()*nr()*nc()) = item;
+ !*/
+ };
+
+ void serialize(const tensor& item, std::ostream& out);
+ void deserialize(resizable_tensor& item, std::istream& in);
+ /*!
+ provides serialization support for tensor and resizable_tensor. Note that you can
+ serialize to/from any combination of tensor and resizable_tensor objects.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ double dot(
+ const tensor& a,
+ const tensor& b
+ );
+ /*!
+ requires
+ - a.size() == b.size()
+ ensures
+ - returns the dot product between a and b when they are both treated as
+ a.size() dimensional vectors. That is, this function pointwise multiplies
+ the vectors together, then sums the result and returns it.
+ !*/
+
+// ----------------------------------------------------------------------------------------
+
+ class alias_tensor_instance : public tensor
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This object is a tensor that aliases another tensor. That is, it doesn't
+ have its own block of memory but instead simply holds pointers to the
+ memory of another tensor object. It therefore allows you to efficiently
+ break a tensor into pieces and pass those pieces into functions.
+
+ An alias_tensor_instance doesn't own the resources it points to in any sense.
+ So it is important to make sure that the underlying owning tensor doesn't get
+ destructed before any alias tensors which point to it are destructed.
+ !*/
+
+ // You can't default initialize this object. You can only get instances of it from
+ // alias_tensor::operator().
+ alias_tensor_instance(
+ );
+ };
+
+ class alias_tensor_const_instance
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is essentially a const version of alias_tensor_instance and therefore
+ represents a tensor. However, due to the mechanics of C++, this object
+ can't inherit from tensor. So instead it provides a get() and an implicit
+ conversion to const tensor.
+ !*/
+
+ public:
+
+ // non-const alias tensors are convertible to const ones.
+ alias_tensor_const_instance(const alias_tensor_instance& item);
+
+ // Methods that cast the alias to a tensor.
+ const tensor& get() const;
+ operator const tensor& ();
+
+ private:
+ // You can't default initialize this object. You can only get instances of it from
+ // alias_tensor::operator().
+ alias_tensor_const_instance();
+ };
+
+ class alias_tensor
+ {
+ /*!
+ WHAT THIS OBJECT REPRESENTS
+ This is a tool for creating tensor objects that alias other tensor objects.
+ That is, it allows you to make a tensor that references the memory space of
+ another tensor object rather than owning its own memory. This allows you
+ to do things like interpret a single tensor in different ways or even as a
+ group of multiple tensors.
+ !*/
+ public:
+
+ alias_tensor (
+ );
+ /*!
+ ensures
+ - #size() == 0
+ - #num_samples() == 0
+ - #k() == 0
+ - #nr() == 0
+ - #nc() == 0
+ !*/
+
+ alias_tensor (
+ long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+ );
+ /*!
+ requires + - n_ >= 0 + - k_ >= 0 + - nr_ >= 0 + - nc_ >= 0 + ensures + - #size() == n_*k_*nr_*nc_ + - #num_samples() == n_ + - #k() == k_ + - #nr() == nr_ + - #nc() == nc_ + !*/ + + long long num_samples() const; + long long k() const; + long long nr() const; + long long nc() const; + size_t size() const; + + alias_tensor_instance operator() ( + tensor& t, + size_t offset = 0 + ) const; + /*! + requires + - offset+size() <= t.size() + ensures + - Returns a tensor that simply aliases the elements of t beginning with t's + offset'th element. Specifically, this function returns an aliasing + tensor T such that: + - T.size() == size() + - T.num_samples() == num_samples() + - T.k() == k() + - T.nr() == nr() + - T.nc() == nc() + - T.host() == t.host()+offset + - T.device() == t.device()+offset + - &T.annotation() == &t.annotation() + !*/ + + alias_tensor_const_instance operator() ( + const tensor& t, + size_t offset = 0 + ) const; + /*! + requires + - offset+size() <= t.size() + ensures + - This function is identical to the above version of operator() except that + it takes and returns const tensors instead of non-const tensors. + !*/ + }; + + void serialize(const alias_tensor& item, std::ostream& out); + void deserialize(alias_tensor& item, std::istream& in); + /*! + provides serialization support for alias_tensor. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_DNn_TENSOR_ABSTRACT_H_ + + diff --git a/ml/dlib/dlib/dnn/tensor_tools.cpp b/ml/dlib/dlib/dnn/tensor_tools.cpp new file mode 100644 index 000000000..c0f7fd69d --- /dev/null +++ b/ml/dlib/dlib/dnn/tensor_tools.cpp @@ -0,0 +1,985 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
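Before the implementation file begins, a short sketch of the alias_tensor interface documented above may help. The shapes are arbitrary illustrations, and only calls documented in tensor_abstract.h are used:

    // A minimal sketch: split one 10-sample tensor into two 5-sample views.
    dlib::resizable_tensor t(10, 3, 8, 8); // owning tensor
    dlib::alias_tensor half(5, 3, 8, 8);   // shape of each view

    auto top = half(t, 0);                 // aliases samples 0..4
    auto bottom = half(t, half.size());    // aliases samples 5..9 (offset counts floats)

    top = 0; // an alias shares t's memory, so this zeros t's first half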
+#ifndef DLIB_TeNSOR_TOOLS_CPP_ +#define DLIB_TeNSOR_TOOLS_CPP_ + +#include "tensor_tools.h" +#include "../string.h" +#include <atomic> + +namespace dlib +{ + namespace + { + std::atomic<bool>& dnn_prefer_fastest_algo ( + ) + { + static std::atomic<bool> var(true); + return var; + } + } + + bool dnn_prefer_fastest_algorithms ( + ) + { + return dnn_prefer_fastest_algo(); + } + + void set_dnn_prefer_fastest_algorithms( + ) + { + dnn_prefer_fastest_algo() = true; + } + + void set_dnn_prefer_smallest_algorithms( + ) + { + dnn_prefer_fastest_algo() = false; + } +} + +namespace dlib { namespace tt +{ + +// ---------------------------------------------------------------------------------------- + + void inverse_norms ( + resizable_tensor& invnorms, + const tensor& data, + const double eps + ) + { +#ifdef DLIB_USE_CUDA + cuda::inverse_norms(invnorms, data, eps); +#else + invnorms = reciprocal(sqrt(sum_cols(squared(mat(data))) + eps)); +#endif + } + + void dot_prods ( + resizable_tensor& out, + const tensor& lhs, + const tensor& rhs + ) + { +#ifdef DLIB_USE_CUDA + cuda::dot_prods(out, lhs, rhs); +#else + out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs))); +#endif + } + + void dot_prods ( + bool add_to, + tensor& out, + const tensor& lhs, + const tensor& rhs + ) + { +#ifdef DLIB_USE_CUDA + cuda::dot_prods(add_to, out, lhs, rhs); +#else + if (add_to) + out += sum_cols(pointwise_multiply(mat(lhs), mat(rhs))); + else + out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs))); +#endif + } + + void scale_columns ( + tensor& out, + const tensor& m, + const tensor& v + ) + { + DLIB_CASSERT(have_same_dimensions(out,m)); + DLIB_CASSERT(is_vector(v)); + if (m.size() == 0 && v.size() == 0) + return; + DLIB_CASSERT(m.size() != 0); + DLIB_CASSERT(m.size()/m.num_samples() == v.size()); + +#ifdef DLIB_USE_CUDA + cuda::scale_columns(out, m, v); +#else + DLIB_CASSERT(false, "shouldn't be called right now"); + out = scale_columns(mat(m), mat(v)); +#endif + } + + void scale_rows ( + tensor& out, + const tensor& m, + const tensor& v + ) + { + DLIB_CASSERT(have_same_dimensions(out,m)); + DLIB_CASSERT(is_vector(v)); + if (m.size() == 0 && v.size() == 0) + return; + DLIB_CASSERT(m.size() != 0); + DLIB_CASSERT(m.num_samples() == v.size()); + +#ifdef DLIB_USE_CUDA + cuda::scale_rows(out, m, v); +#else + out = scale_rows(mat(m), mat(v)); +#endif + } + + void scale_rows2 ( + float beta, + tensor& out, + const tensor& m1, + const tensor& m2, + const tensor& v1, + const tensor& v2 + ) + { + DLIB_CASSERT(have_same_dimensions(out,m1)); + DLIB_CASSERT(have_same_dimensions(out,m2)); + DLIB_CASSERT(have_same_dimensions(v1,v2)); + DLIB_CASSERT(is_vector(mat(v1))); + DLIB_CASSERT(v1.size() == m1.num_samples()); + +#ifdef DLIB_USE_CUDA + cuda::scale_rows2(beta, out, m1, m2, v1, v2); +#else + if (beta == 0) + out = scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2)); + else + out = beta*mat(out) + scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2)); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void exp ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(dest.size() == src.size()); + +#ifdef DLIB_USE_CUDA + cuda::exp(dest,src); +#else + dest = exp(mat(src)); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void log ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(dest.size() == src.size()); + +#ifdef DLIB_USE_CUDA + cuda::log(dest,src); +#else + dest = log(mat(src)); +#endif 
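+ // (Editor's note: this cuda-or-else fallback is the pattern for nearly
+ // every routine in this file: each tt:: wrapper forwards to cuda:: when
+ // DLIB_USE_CUDA is defined and otherwise uses an equivalent CPU path,
+ // either a cpu:: routine or a dlib matrix expression as here.)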
+ } + +// ---------------------------------------------------------------------------------------- + + void log10 ( + tensor& dest, + const tensor& src + ) + { + DLIB_CASSERT(dest.size() == src.size()); + +#ifdef DLIB_USE_CUDA + cuda::log10(dest,src); +#else + dest = log10(mat(src)); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void gemm ( + float beta, + tensor& dest, + float alpha, + const tensor& lhs, + bool trans_lhs, + const tensor& rhs, + bool trans_rhs + ) + { +#ifdef DLIB_USE_CUDA + cuda::gemm(beta, dest, alpha, lhs, trans_lhs, rhs, trans_rhs); +#else + if (beta != 0) + { + if (trans_lhs && trans_rhs) + dest = alpha*trans(mat(lhs))*trans(mat(rhs)) + beta*mat(dest); + else if (!trans_lhs && trans_rhs) + dest = alpha*mat(lhs)*trans(mat(rhs)) + beta*mat(dest); + else if (trans_lhs && !trans_rhs) + dest = alpha*trans(mat(lhs))*mat(rhs) + beta*mat(dest); + else + dest = alpha*mat(lhs)*mat(rhs) + beta*mat(dest); + } + else + { + if (trans_lhs && trans_rhs) + dest = alpha*trans(mat(lhs))*trans(mat(rhs)); + else if (!trans_lhs && trans_rhs) + dest = alpha*mat(lhs)*trans(mat(rhs)); + else if (trans_lhs && !trans_rhs) + dest = alpha*trans(mat(lhs))*mat(rhs); + else + dest = alpha*mat(lhs)*mat(rhs); + } +#endif + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + tensor_rand:: + tensor_rand( + unsigned long long seed + ) +#ifdef DLIB_USE_CUDA + :rnd(seed){} +#else + {rnd.set_seed(cast_to_string(seed)); } +#endif + + void tensor_rand:: + fill_gaussian ( + tensor& data, + float mean, + float stddev + ) + { + DLIB_CASSERT(data.size()%2 == 0); +#ifdef DLIB_USE_CUDA + rnd.fill_gaussian(data, mean, stddev); +#else + for (auto& x : data) + x = rnd.get_random_gaussian()*stddev + mean; +#endif + } + + void tensor_rand:: + fill_uniform ( + tensor& data + ) + { +#ifdef DLIB_USE_CUDA + rnd.fill_uniform(data); +#else + for (auto& x : data) + x = rnd.get_random_float(); +#endif + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + void multiply ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { + DLIB_CASSERT(dest.k() == src1.k() && src1.k() == src2.k() && + dest.nr() == src1.nr() && src1.nr() == src2.nr() && + dest.nc() == src1.nc() && src1.nc() == src2.nc() ); + const long MD = std::max(std::max(dest.num_samples(),src1.num_samples()),src2.num_samples()); + DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) && + (src1.num_samples()==1 || src1.num_samples()==MD) && + (src2.num_samples()==1 || src2.num_samples()==MD) ); +#ifdef DLIB_USE_CUDA + cuda::multiply(add_to, dest, src1, src2); +#else + cpu::multiply(add_to, dest, src1, src2); +#endif + + } + + void scale_channels ( + bool add_to, + tensor& dest, + const tensor& src, + const tensor& scales + ) + { +#ifdef DLIB_USE_CUDA + cuda::scale_channels(add_to, dest, src, scales); +#else + cpu::scale_channels(add_to, dest, src, scales); +#endif + } + + void multiply_conv ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { +#ifdef DLIB_USE_CUDA + cuda::multiply_conv(add_to, dest, src1, src2); +#else + cpu::multiply_conv(add_to, dest, src1, src2); +#endif + } + + void multiply_zero_padded ( + bool add_to, + tensor& dest, + const tensor& src1, + 
const tensor& src2 + ) + { +#ifdef DLIB_USE_CUDA + cuda::multiply_zero_padded(add_to, dest, src1, src2); +#else + cpu::multiply_zero_padded(add_to, dest, src1, src2); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const float A, + const float B + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform(dest,src,A,B); +#else + cpu::affine_transform(dest,src,A,B); +#endif + } + + void affine_transform( + tensor& dest, + const tensor& src, + const float A + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform(dest,src,A); +#else + cpu::affine_transform(dest,src,A,0); +#endif + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B, + const float C + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform(dest,src1,src2,A,B,C); +#else + cpu::affine_transform(dest,src1,src2,A,B,C); +#endif + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const float A, + const float B + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform(dest,src1,src2,A,B); +#else + cpu::affine_transform(dest,src1,src2,A,B,0); +#endif + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C, + const float D + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform(dest,src1,src2,src3,A,B,C,D); +#else + cpu::affine_transform(dest,src1,src2,src3,A,B,C,D); +#endif + } + + void affine_transform_range( + size_t begin, + size_t end, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C); +#else + cpu::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C); +#endif + } + + void affine_transform( + const rectangle& rect, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + float A, + float B, + float C + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform(rect, dest,src1,src2,src3,A,B,C); +#else + cpu::affine_transform(rect, dest,src1,src2,src3,A,B,C); +#endif + } + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C); +#else + cpu::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform(dest,src,A,B); +#else + cpu::affine_transform(dest,src,A,B); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void affine_transform_conv( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ) + { +#ifdef DLIB_USE_CUDA + cuda::affine_transform_conv(dest,src,A,B); +#else + cpu::affine_transform_conv(dest,src,A,B); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void compute_adam_update ( + size_t begin, + size_t end, + tensor& s, + tensor& m, + tensor& v, + const float t, + const float 
learning_rate, + const float weight_decay, + const float momentum1, + const float momentum2, + const tensor& params, + const tensor& params_grad + ) + { +#ifdef DLIB_USE_CUDA + cuda::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1, + momentum2, params, params_grad); +#else + cpu::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1, + momentum2, params, params_grad); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void batch_normalize_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ) + { +#ifdef DLIB_USE_CUDA + cuda::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances); +#else + cpu::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances); +#endif + } + + void batch_normalize ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& vars, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ) + { +#ifdef DLIB_USE_CUDA + cuda::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta); +#else + cpu::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta); +#endif + } + + void batch_normalize_gradient ( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ) + { + +#ifdef DLIB_USE_CUDA + cuda::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad); +#else + cpu::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void batch_normalize_conv_inference ( + const double eps, + resizable_tensor& dest, + const tensor& src, + const tensor& gamma, + const tensor& beta, + const tensor& running_means, + const tensor& running_variances + ) + { +#ifdef DLIB_USE_CUDA + cuda::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances); +#else + cpu::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances); +#endif + } + + void batch_normalize_conv ( + const double eps, + resizable_tensor& dest, + resizable_tensor& means, + resizable_tensor& vars, + const double averaging_factor, + resizable_tensor& running_means, + resizable_tensor& running_variances, + const tensor& src, + const tensor& gamma, + const tensor& beta + ) + { +#ifdef DLIB_USE_CUDA + cuda::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta); +#else + cpu::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta); +#endif + } + + void batch_normalize_conv_gradient ( + const double eps, + const tensor& gradient_input, + const tensor& means, + const tensor& invstds, + const tensor& src, + const tensor& gamma, + tensor& src_grad, + tensor& gamma_grad, + tensor& beta_grad + ) + { + +#ifdef DLIB_USE_CUDA + cuda::batch_normalize_conv_gradient(eps,gradient_input, 
means, invstds, src, gamma, src_grad, gamma_grad, beta_grad); +#else + cpu::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void threshold ( + tensor& data, + float thresh + ) + { +#ifdef DLIB_USE_CUDA + cuda::threshold(data,thresh); +#else + cpu::threshold(data,thresh); +#endif + } + + void dot ( + const tensor& a, + const tensor& b, + tensor& result, + size_t idx + ) + { +#ifdef DLIB_USE_CUDA + cuda::dot(a,b,result,idx); +#else + cpu::dot(a,b,result,idx); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void add( + float beta, + tensor& dest, + float alpha, + const tensor& src + ) + { +#ifdef DLIB_USE_CUDA + cuda::add(beta,dest,alpha,src); +#else + cpu::add(beta,dest,alpha,src); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void add ( + tensor& dest, + const tensor& src1, + const tensor& src2 + ) + { +#ifdef DLIB_USE_CUDA + cuda::add(dest, src1, src2); +#else + cpu::add(dest, src1, src2); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void assign_conv_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ) + { +#ifdef DLIB_USE_CUDA + cuda::assign_conv_bias_gradient(grad,gradient_input); +#else + cpu::assign_conv_bias_gradient(grad,gradient_input); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void assign_bias_gradient ( + tensor& grad, + const tensor& gradient_input + ) + { +#ifdef DLIB_USE_CUDA + cuda::assign_bias_gradient(grad,gradient_input); +#else + cpu::assign_bias_gradient(grad,gradient_input); +#endif + } + +// ---------------------------------------------------------------------------------------- +// ---------------------------------------------------------------------------------------- + + void softmax ( + tensor& dest, + const tensor& src + ) + { +#ifdef DLIB_USE_CUDA + cuda::softmax(dest,src); +#else + cpu::softmax(dest,src); +#endif + } + + void softmax_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { +#ifdef DLIB_USE_CUDA + cuda::softmax_gradient(grad, dest, gradient_input); +#else + cpu::softmax_gradient(grad, dest, gradient_input); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void softmax_all ( + tensor& dest, + const tensor& src + ) + { +#ifdef DLIB_USE_CUDA + cuda::softmax_all(dest,src); +#else + cpu::softmax_all(dest,src); +#endif + } + + void softmax_all_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { +#ifdef DLIB_USE_CUDA + cuda::softmax_all_gradient(grad, dest, gradient_input); +#else + cpu::softmax_all_gradient(grad, dest, gradient_input); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void sigmoid ( + tensor& dest, + const tensor& src + ) + { +#ifdef DLIB_USE_CUDA + cuda::sigmoid(dest,src); +#else + cpu::sigmoid(dest,src); +#endif + } + + void sigmoid_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { +#ifdef DLIB_USE_CUDA + cuda::sigmoid_gradient(grad, dest, gradient_input); +#else + cpu::sigmoid_gradient(grad, dest, gradient_input); +#endif + } + +// 
---------------------------------------------------------------------------------------- + + void relu ( + tensor& dest, + const tensor& src + ) + { +#ifdef DLIB_USE_CUDA + cuda::relu(dest,src); +#else + cpu::relu(dest,src); +#endif + } + + void relu_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { +#ifdef DLIB_USE_CUDA + cuda::relu_gradient(grad, dest, gradient_input); +#else + cpu::relu_gradient(grad, dest, gradient_input); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void prelu ( + tensor& dest, + const tensor& src, + const tensor& param + ) + { +#ifdef DLIB_USE_CUDA + cuda::prelu(dest, src, param); +#else + cpu::prelu(dest, src, param); +#endif + } + + void prelu_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input, + const tensor& param, + tensor& params_grad + ) + { +#ifdef DLIB_USE_CUDA + cuda::prelu_gradient(grad, src, gradient_input, param, params_grad); +#else + cpu::prelu_gradient(grad, src, gradient_input, param, params_grad); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void tanh ( + tensor& dest, + const tensor& src + ) + { +#ifdef DLIB_USE_CUDA + cuda::tanh(dest,src); +#else + cpu::tanh(dest,src); +#endif + } + + void tanh_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ) + { +#ifdef DLIB_USE_CUDA + cuda::tanh_gradient(grad, dest, gradient_input); +#else + cpu::tanh_gradient(grad, dest, gradient_input); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void resize_bilinear ( + tensor& dest, + long dest_row_stride, + long dest_channel_stride, + const tensor& src, + long src_row_stride, + long src_channel_stride + ) + { +#ifdef DLIB_USE_CUDA + cuda::resize_bilinear(dest,dest_row_stride,dest_channel_stride, src,src_row_stride,src_channel_stride); +#else + cpu::resize_bilinear(dest,dest_row_stride,dest_channel_stride, src,src_row_stride,src_channel_stride); +#endif + } + + void resize_bilinear_gradient ( + tensor& grad, + long grad_row_stride, + long grad_channel_stride, + const tensor& gradient_input, + long gradient_input_row_stride, + long gradient_input_channel_stride + ) + { +#ifdef DLIB_USE_CUDA + cuda::resize_bilinear_gradient(grad,grad_row_stride,grad_channel_stride, gradient_input,gradient_input_row_stride,gradient_input_channel_stride); +#else + cpu::resize_bilinear_gradient(grad,grad_row_stride,grad_channel_stride, gradient_input,gradient_input_row_stride,gradient_input_channel_stride); +#endif + } + +// ------------------------------------------------------------------------------------ + + void copy_tensor( + bool add_to, + tensor& dest, + size_t dest_k_offset, + const tensor& src, + size_t src_k_offset, + size_t count_k + ) + { +#ifdef DLIB_USE_CUDA + cuda::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k); +#else + cpu::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k); +#endif + } + +// ---------------------------------------------------------------------------------------- + + void inv:: + operator() ( + const tensor& m, + resizable_tensor& out + ) + { +#ifdef DLIB_USE_CUDA + finv(m,out); +#else + out = dlib::inv(mat(m)); +#endif + } + +// ---------------------------------------------------------------------------------------- + +}} + +#endif // DLIB_TeNSOR_TOOLS_CPP_ + diff --git a/ml/dlib/dlib/dnn/tensor_tools.h 
b/ml/dlib/dlib/dnn/tensor_tools.h new file mode 100644 index 000000000..9ba3154e5 --- /dev/null +++ b/ml/dlib/dlib/dnn/tensor_tools.h @@ -0,0 +1,1711 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_TeNSOR_TOOLS_H_ +#define DLIB_TeNSOR_TOOLS_H_ + +#include "tensor.h" +#include "cudnn_dlibapi.h" +#include "cublas_dlibapi.h" +#include "cusolver_dlibapi.h" +#include "curand_dlibapi.h" +#include "cpu_dlib.h" +#include "cuda_dlib.h" +#include "../rand.h" +#include <memory> +#include "../geometry/rectangle.h" +#include "../test_for_odr_violations.h" + +namespace dlib +{ + bool dnn_prefer_fastest_algorithms(); + void set_dnn_prefer_fastest_algorithms(); + void set_dnn_prefer_smallest_algorithms(); +} + +namespace dlib { namespace tt +{ + +// ---------------------------------------------------------------------------------------- + + void inverse_norms ( + resizable_tensor& invnorms, + const tensor& data, + const double eps + ); + /*! + ensures + - #invnorms == reciprocal(sqrt(sum_cols(squared(mat(data))) + eps)) + !*/ + + void dot_prods ( + resizable_tensor& out, + const tensor& lhs, + const tensor& rhs + ); + /*! + requires + - have_same_dimensions(lhs,rhs) == true + ensures + - #out.num_samples() == lhs.num_samples() + - #out.k() == #out.nr() == #out.nc() == 1 + - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs))); + !*/ + + void dot_prods ( + bool add_to, + tensor& out, + const tensor& lhs, + const tensor& rhs + ); + /*! + requires + - have_same_dimensions(lhs,rhs) == true + - out.size() == lhs.num_samples() + - out.k() == out.nr() == out.nc() == 1 + ensures + - if (add_to) then + - #out == mat(out) + sum_cols(pointwise_multiply(mat(lhs), mat(rhs))); + - else + - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs))); + !*/ + + void scale_columns ( + tensor& out, + const tensor& m, + const tensor& v + ); + /*! + requires + - have_same_dimensions(out,m) == true + - is_vector(v) == true + - v.size() == mat(m).nc() + ensures + - performs: out = scale_columns(mat(m),mat(v)); + !*/ + + void scale_rows ( + tensor& out, + const tensor& m, + const tensor& v + ); + /*! + requires + - have_same_dimensions(out,m) == true + - is_vector(v) == true + - v.size() == m.num_samples() + ensures + - performs: out = scale_rows(mat(m),mat(v)); + !*/ + + void scale_rows2 ( + float beta, + tensor& out, + const tensor& m1, + const tensor& m2, + const tensor& v1, + const tensor& v2 + ); + /*! + requires + - have_same_dimensions(out,m1) == true + - have_same_dimensions(out,m2) == true + - have_same_dimensions(v1,v2) == true + - is_vector(v1) == true + - v1.size() == m1.num_samples() + ensures + - performs: + out = beta*out + scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2)); + !*/ + +// ---------------------------------------------------------------------------------------- + + void exp ( + tensor& dest, + const tensor& src + ); + /*! + requires + - dest.size() == src.size() + ensures + - performs: dest = exp(mat(src)) + !*/ + +// ---------------------------------------------------------------------------------------- + + void log ( + tensor& dest, + const tensor& src + ); + /*! + requires + - dest.size() == src.size() + ensures + - performs: dest = log(mat(src)) + !*/ + +// ---------------------------------------------------------------------------------------- + + void log10 ( + tensor& dest, + const tensor& src + ); + /*! 
+ requires + - dest.size() == src.size() + ensures + - performs: dest = log10(mat(src)) + !*/ + +// ---------------------------------------------------------------------------------------- + + void gemm ( + float beta, + tensor& dest, + float alpha, + const tensor& lhs, + bool trans_lhs, + const tensor& rhs, + bool trans_rhs + ); + /*! + requires + - dest does not alias the memory of lhs or rhs + - The dimensions of lhs and rhs must be compatible for matrix multiplication. + In particular: + - Let L == trans_lhs ? trans(mat(lhs)) : mat(lhs) + - Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs) + - Let D == mat(dest) + - D.nr() == L.nr() && D.nc() == R.nc() + (i.e. dest must be preallocated and have the correct output dimensions) + - L.nc() == R.nr() + ensures + - performs: dest = alpha*L*R + beta*mat(dest) + !*/ + +// ---------------------------------------------------------------------------------------- + + class inv + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a functor for doing matrix inversion on the GPU. The only + reason it's an object is to avoid the reallocation of some GPU memory + blocks if you want to do a bunch of matrix inversions in a row. + !*/ + public: + + void operator() ( + const tensor& m, + resizable_tensor& out + ); + /*! + requires + - m.size() == m.num_samples()*m.num_samples() + (i.e. mat(m) must be a square matrix) + ensures + - out == inv(mat(m)); + !*/ + + private: +#ifdef DLIB_USE_CUDA + cuda::inv finv; +#endif + }; + +// ---------------------------------------------------------------------------------------- + + class tensor_rand + { + /*! + WHAT THIS OBJECT REPRESENTS + This is a tool for filling a tensor with random numbers. + + Note that the sequence of random numbers output by this object is different + when dlib is compiled with DLIB_USE_CUDA. So you should not write code + that depends on any specific sequence of numbers coming out of a + tensor_rand. + + !*/ + + public: + // not copyable + tensor_rand(const tensor_rand&) = delete; + tensor_rand& operator=(const tensor_rand&) = delete; + + tensor_rand() : tensor_rand(0) {} + tensor_rand(unsigned long long seed); + + void fill_gaussian ( + tensor& data, + float mean = 0, + float stddev = 1 + ); + /*! + requires + - data.size()%2 == 0 + ensures + - Fills data with random numbers drawn from a Gaussian distribution + with the given mean and standard deviation. + !*/ + + void fill_uniform ( + tensor& data + ); + /*! + ensures + - Fills data with uniform random numbers in the range (0.0, 1.0]. + !*/ + +#ifdef DLIB_USE_CUDA + cuda::curand_generator rnd; +#else + dlib::rand rnd; +#endif + }; + +// ---------------------------------------------------------------------------------------- + + void multiply ( + bool add_to, + tensor& dest, + const tensor& src1, + const tensor& src2 + ); + /*! + requires + - dest.k() == src1.k() == src2.k() + - dest.nr() == src1.nr() == src2.nr() + - dest.nc() == src1.nc() == src2.nc() + - dest.num_samples(), src1.num_samples(), and src2.num_samples() must each + either be 1 or whichever ones aren't equal to 1 must have the same values. + ensures + - let MD = max(dest.num_samples(), src1.num_samples(), src2.num_samples) + - This function pointwise multiplies src1 with src2 and stores the result into + #dest. However, how the multiplication happens depends on the dimensions of + the tensors. 
First, when src1 and src2 are multiplied together, if either
+              has a num_samples() dimension that is != MD, then it is first replicated to
+              produce a tensor with num_samples()==MD dimensions and then they are
+              pointwise multiplied together.
+
+              Second, if dest.num_samples()==1, then after the pointwise multiplication of
+              src1 with src2, the result has its samples summed to produce an output tensor
+              with num_samples()==1 which is then assigned to #dest.
+            - if (add_to) then
+                - Instead of assigning the result to dest, this function adds the result to dest.
+    !*/

+
+    void scale_channels (
+        bool add_to,
+        tensor& dest,
+        const tensor& src,
+        const tensor& scales
+    );
+    /*!
+        requires
+            - have_same_dimensions(dest, src) == true
+            - scales.num_samples() == src.num_samples()
+            - scales.k()  == src.k()
+            - scales.nr() == 1
+            - scales.nc() == 1
+        ensures
+            - Scales each channel of src by the corresponding value in scales.  To be
+              precise, we will have:
+                - #dest(n,k,r,c) == src(n,k,r,c)*scales(n,k,1,1)
+            - if (add_to) then
+                - Instead of assigning the result to dest, this function adds the result to dest.
+    !*/
+
+    void multiply_conv (
+        bool add_to,
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2
+    );
+    /*!
+        requires
+            - if (have_same_dimensions(dest, src1) == true) then
+                - src2.num_samples() == 1
+                - src2.nr() == 1
+                - src2.nc() == 1
+                - src2.k() == src1.k()
+            - else
+                - have_same_dimensions(src1, src2) == true
+                - dest.num_samples() == 1
+                - dest.nr() == 1
+                - dest.nc() == 1
+                - dest.k() == src1.k()
+        ensures
+            - Performs #dest == src1*src2
+              In particular, if the elements of dest, src1, and src2 were indexed by (n,k,r,c) then
+              we would have:
+                - if (have_same_dimensions(dest,src1)) then
+                    #dest(n,k,r,c) == src1(n,k,r,c)*src2(k)
+                - else
+                    #dest(k) == sum over {n,r,c} of src1(n,k,r,c)*src2(n,k,r,c)
+            - if (add_to) then
+                - Instead of assigning the result to dest, this function adds the result to dest.
+    !*/
+
+    void multiply_zero_padded (
+        bool add_to,
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2
+    );
+    /*!
+        ensures
+            - if (add_to) then
+                - performs: dest += src1 * src2
+            - else
+                - performs: dest = src1 * src2
+            - In either case, the multiplication happens pointwise according to 4D tensor
+              arithmetic.  If the dimensions don't match then missing elements are presumed
+              to be equal to 0.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void affine_transform(
+        tensor& dest,
+        const tensor& src,
+        const float A,
+        const float B
+    );
+    /*!
+        requires
+            - dest.size()==src.size()
+        ensures
+            - #dest == A*src + B
+    !*/
+
+    void affine_transform(
+        tensor& dest,
+        const tensor& src,
+        const float A
+    );
+    /*!
+        requires
+            - dest.size()==src.size()
+        ensures
+            - #dest == A*src
+    !*/
+
+    void affine_transform(
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const float A,
+        const float B,
+        const float C
+    );
+    /*!
+        requires
+            - dest.size()==src1.size()
+            - dest.size()==src2.size()
+        ensures
+            - #dest == A*src1 + B*src2 + C
+    !*/
+
+    void affine_transform(
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const float A,
+        const float B
+    );
+    /*!
+        requires
+            - dest.size()==src1.size()
+            - dest.size()==src2.size()
+        ensures
+            - #dest == A*src1 + B*src2
+    !*/
+
+    void affine_transform(
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const tensor& src3,
+        const float A,
+        const float B,
+        const float C,
+        const float D
+    );
+    /*!
+ requires + - dest.size()==src1.size() + - dest.size()==src2.size() + - dest.size()==src3.size() + ensures + - #dest == A*src1 + B*src2 + C*src3 + D + !*/ + + void affine_transform( + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C + ); + /*! + requires + - dest.size()==src1.size() + - dest.size()==src2.size() + - dest.size()==src3.size() + ensures + - #dest == A*src1 + B*src2 + C*src3 + !*/ + + void affine_transform_range( + size_t begin, + size_t end, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + const float A, + const float B, + const float C + ); + /*! + requires + - dest.size()==src1.size() + - dest.size()==src2.size() + - dest.size()==src3.size() + - begin <= end <= dest.size() + ensures + - This function operates much like + affine_transform(dest,src1,src2,src3,A,B,C,0), except that it runs over only + the half open range [begin,end) rather than processing the entire tensor. + Specifically, it does this: + - for i in the range [begin, end): + - #dest.host()[i] == A*src1.host()[i] + B*src2.host()[i] + C*src3.host()[i] + !*/ + + void affine_transform( + const rectangle& rect, + tensor& dest, + const tensor& src1, + const tensor& src2, + const tensor& src3, + float A, + float B, + float C + ); + /*! + requires + - dest.size()==src1.size() + - dest.size()==src2.size() + - dest.size()==src3.size() + - dest.num_samples()==src1.num_samples() + - dest.num_samples()==src2.num_samples() + - dest.num_samples()==src3.num_samples() + - get_rect(mat(dest)).contains(rect) == true + (i.e. rect must be entirely contained within dest) + ensures + - This function operates much like + affine_transform(dest,src1,src2,src3,A,B,C,0), except that it runs over only + the sub-rectangle indicated by rect. In particular, this function is equivalent + to: + set_subm(dest,rect) = A*subm(mat(src1),rect) + B*subm(mat(src2),rect) + C*subm(mat(src3),rect) + !*/ + +// ---------------------------------------------------------------------------------------- + + void affine_transform( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ); + /*! + requires + - have_same_dimensions(dest,src) == true + - if (A.num_samples() == 1) then + - B.num_samples() == 1 + - else + - A.num_samples() == src.num_samples() + - B.num_samples() == src.num_samples() + - A.nr() == B.nr() == src.nr() + - A.nc() == B.nc() == src.nc() + - A.k() == B.k() == src.k() + ensures + - if (A.num_samples() == 1) then + - #dest == A*src + B + (done for each sample in src) + - else + - for all valid i: + - #dest.host()[i] == A.host()[i]*src.host()[i] + B.host()[i] + !*/ + +// ---------------------------------------------------------------------------------------- + + void affine_transform_conv( + tensor& dest, + const tensor& src, + const tensor& A, + const tensor& B + ); + /*! + requires + - have_same_dimensions(dest,src) == true + - have_same_dimensions(A, B) == true + - A.num_samples() == 1 + - A.nr() == 1 + - A.nc() == 1 + - A.k() == src.k() + ensures + - Performs #dest == A*src + B + In particular, if the elements of dest and src were indexed by (n,k,r,c) then + we would have: + #dest(n,k,r,c) == A(k)*src(n,k,r,c) + B(k). 
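+              For example (an illustrative sketch of the shapes involved, not an
+              additional requirement): if src has dimensions (num_samples=2, k=3,
+              nr=4, nc=5) then A and B must both have dimensions (1,3,1,1), and
+              channel j of every sample is scaled by A(j) and shifted by B(j).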
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void compute_adam_update (
+        size_t begin,
+        size_t end,
+        tensor& s,
+        tensor& m,
+        tensor& v,
+        const float t,
+        const float learning_rate,
+        const float weight_decay,
+        const float momentum1,
+        const float momentum2,
+        const tensor& params,
+        const tensor& params_grad
+    );
+    /*!
+        requires
+            - s.size() == m.size() == v.size() == params.size() == params_grad.size()
+            - t > 0
+            - learning_rate > 0
+            - weight_decay >= 0
+            - 0 <= momentum1 < 1
+            - 0 <= momentum2 < 1
+            - begin <= end <= params.size()
+        ensures
+            - This function implements the ADAM parameter update method described in the paper:
+                Kingma, Diederik P., and Jimmy Ba.  "Adam: A method for stochastic
+                optimization."  International Conference on Learning Representations.  2015.
+              Specifically, it implements the method shown as Algorithm 1.
+            - #s is the update vector that should be added to the parameters.
+            - The function only operates in the half open range [begin,end) of the memory
+              blocks of each tensor.  E.g. to make this function run on the entire tensor
+              set begin to 0 and end to params.size().
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void batch_normalize_inference (
+        const double eps,
+        resizable_tensor& dest,
+        const tensor& src,
+        const tensor& gamma,
+        const tensor& beta,
+        const tensor& running_means,
+        const tensor& running_variances
+    );
+    /*!
+        requires
+            - eps > 0
+            - gamma.num_samples() == 1
+            - gamma.nr() == src.nr()
+            - gamma.nc() == src.nc()
+            - gamma.k()  == src.k()
+            - have_same_dimensions(gamma, beta)
+            - have_same_dimensions(gamma, running_means)
+            - have_same_dimensions(gamma, running_variances)
+        ensures
+            - Linearly transforms src as a call to batch_normalize() would if src had means
+              and variances as given by running_means and running_variances.  That is, this
+              function performs:
+                dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
+              Note that it does it in a pointwise fashion over the samples in src.
+    !*/
+
+    void batch_normalize (
+        const double eps,
+        resizable_tensor& dest,
+        resizable_tensor& means,
+        resizable_tensor& invstds,
+        const double averaging_factor,
+        resizable_tensor& running_means,
+        resizable_tensor& running_variances,
+        const tensor& src,
+        const tensor& gamma,
+        const tensor& beta
+    );
+    /*!
+        requires
+            - eps > 0
+            - src.num_samples() > 1
+            - gamma.num_samples() == 1
+            - beta.num_samples() == 1
+            - gamma.nr() == beta.nr() == src.nr()
+            - gamma.nc() == beta.nc() == src.nc()
+            - gamma.k()  == beta.k()  == src.k()
+            - 0 <= averaging_factor <= 1
+            - if (averaging_factor != 1)
+                - have_same_dimensions(running_means, means) == true
+                - have_same_dimensions(running_variances, invstds) == true
+        ensures
+            - have_same_dimensions(#dest, src) == true
+            - #means.num_samples() == 1
+            - #invstds.num_samples() == 1
+            - means.nr() == invstds.nr() == src.nr()
+            - means.nc() == invstds.nc() == src.nc()
+            - means.k()  == invstds.k()  == src.k()
+            - #dest == the batch normalized version of src.
+            - #means == the mean values of the contents of src.
+            - #invstds == 1/(the standard deviation values of the contents of src).
+            - #running_means = (1-averaging_factor)*mat(running_means) + averaging_factor*mat(#means);
+            - #running_variances = (1-averaging_factor)*mat(running_variances) + averaging_factor*(variance of contents of src);
+    !*/
+
+    void batch_normalize_gradient (
+        const double eps,
+        const tensor& gradient_input,
+        const tensor& means,
+        const tensor& invstds,
+        const tensor& src,
+        const tensor& gamma,
+        tensor& src_grad,
+        tensor& gamma_grad,
+        tensor& beta_grad
+    );
+    /*!
+        requires
+            - eps > 0
+            - invstds and means should be the output of a call to
+              batch_normalize(eps,dest,means,invstds,src,gamma,beta)
+            - have_same_dimensions(gradient_input, src) == true
+            - have_same_dimensions(src, src_grad) == true
+            - src.num_samples() > 1
+            - gamma.num_samples() == 1
+            - have_same_dimensions(gamma, gamma_grad) == true
+            - have_same_dimensions(gamma, beta_grad) == true
+            - gamma.nr() == src.nr()
+            - gamma.nc() == src.nc()
+            - gamma.k()  == src.k()
+            - have_same_dimensions(means, gamma) == true
+            - have_same_dimensions(invstds, gamma) == true
+        ensures
+            - Let f(src,gamma,beta) == dot(gradient_input, dest output of
+              batch_normalize(eps,dest,means,invstds,src,gamma,beta))
+            - Adds the gradient of f() with respect to src to #src_grad.
+            - Assigns the gradient of f() with respect to gamma to #gamma_grad.
+            - Assigns the gradient of f() with respect to beta to #beta_grad.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void batch_normalize_conv_inference (
+        const double eps,
+        resizable_tensor& dest,
+        const tensor& src,
+        const tensor& gamma,
+        const tensor& beta,
+        const tensor& running_means,
+        const tensor& running_variances
+    );
+    /*!
+        requires
+            - eps > 0
+            - gamma.num_samples() == 1
+            - gamma.nr() == 1
+            - gamma.nc() == 1
+            - gamma.k()  == src.k()
+            - have_same_dimensions(gamma, beta)
+            - have_same_dimensions(gamma, running_means)
+            - have_same_dimensions(gamma, running_variances)
+        ensures
+            - Linearly transforms src as a call to batch_normalize_conv() would if src had
+              means and variances as given by running_means and running_variances.  That
+              is, this function performs:
+                dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
+              Note that it does this in a pointwise fashion over the samples, rows, and
+              columns in src.
+    !*/
+
+    void batch_normalize_conv (
+        const double eps,
+        resizable_tensor& dest,
+        resizable_tensor& means,
+        resizable_tensor& invstds,
+        const double averaging_factor,
+        resizable_tensor& running_means,
+        resizable_tensor& running_variances,
+        const tensor& src,
+        const tensor& gamma,
+        const tensor& beta
+    );
+    /*!
+        requires
+            - eps > 0
+            - src.num_samples() > 1
+            - gamma.num_samples() == gamma.nr() == gamma.nc() == 1
+            - beta.num_samples() == beta.nr() == beta.nc() == 1
+            - gamma.k()  == beta.k()  == src.k()
+            - 0 <= averaging_factor <= 1
+            - if (averaging_factor != 1)
+                - have_same_dimensions(running_means, means) == true
+                - have_same_dimensions(running_variances, invstds) == true
+        ensures
+            - have_same_dimensions(#dest, src) == true
+            - #means.num_samples() == means.nr() == means.nc() == 1
+            - #invstds.num_samples() == invstds.nr() == invstds.nc() == 1
+            - means.k() == invstds.k() == src.k()
+            - #dest == the batch normalized version of src.
+            - #means == the mean values of the contents of src.
+            - #invstds == 1/(the standard deviation values of the contents of src).
+            - #running_means = (1-averaging_factor)*mat(running_means) + averaging_factor*mat(#means);
+            - #running_variances = (1-averaging_factor)*mat(running_variances) + averaging_factor*(variance of contents of src);
+    !*/
+
+    void batch_normalize_conv_gradient (
+        const double eps,
+        const tensor& gradient_input,
+        const tensor& means,
+        const tensor& invstds,
+        const tensor& src,
+        const tensor& gamma,
+        tensor& src_grad,
+        tensor& gamma_grad,
+        tensor& beta_grad
+    );
+    /*!
+        requires
+            - eps > 0
+            - invstds and means should be the output of a call to
+              batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta)
+            - have_same_dimensions(gradient_input, src) == true
+            - have_same_dimensions(src, src_grad) == true
+            - src.num_samples() > 1
+            - gamma.num_samples() == gamma.nr() == gamma.nc() == 1
+            - have_same_dimensions(gamma, gamma_grad) == true
+            - have_same_dimensions(gamma, beta_grad) == true
+            - gamma.k() == src.k()
+            - have_same_dimensions(means, gamma) == true
+            - have_same_dimensions(invstds, gamma) == true
+        ensures
+            - Let f(src,gamma,beta) == dot(gradient_input, dest output of
+              batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta))
+            - Adds the gradient of f() with respect to src to #src_grad.
+            - Assigns the gradient of f() with respect to gamma to #gamma_grad.
+            - Assigns the gradient of f() with respect to beta to #beta_grad.
+    !*/
+
+// -----------------------------------------------------------------------------------
+
+    void threshold (
+        tensor& data,
+        float thresh
+    );
+    /*!
+        ensures
+            - Sets all elements of data to 1 or 0 depending on whether they are above or
+              below the given threshold.  Specifically, for all valid i:
+                - #data.host()[i] == data.host()[i]>thresh ? 1 : 0
+    !*/
+
+    void dot (
+        const tensor& a,
+        const tensor& b,
+        tensor& result,
+        size_t idx
+    );
+    /*!
+        requires
+            - a.size() == b.size()
+            - idx < result.size()
+        ensures
+            - #result.host()[idx] == result.host()[idx] + dot(a,b);
+              I.e. Adds the dot product between a and b into the idx-th element of result.
+              The reason you might want to use this more complex version of dot() is
+              because, when using CUDA, it runs by generating asynchronous kernel launches
+              whereas the version of dot() that returns the result immediately as a scalar
+              must block the host while we wait for the result to be computed and then
+              transferred from the GPU to the host for return by dot().  So this version of
+              dot() might be much faster in some cases.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void add(
+        float beta,
+        tensor& dest,
+        float alpha,
+        const tensor& src
+    );
+    /*!
+        requires
+            - One of the following is true:
+                - have_same_dimensions(src, dest)
+                - src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1
+                - src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc()
+                - src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc()
+                - src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1
+            - is_same_object(src,dest) == false
+        ensures
+            - performs: dest = beta*dest + alpha*src
+              However, how the addition happens depends on the dimensions of src.  In
+              particular, this function adds the scaled values of one src tensor to dest.
+              Each dimension of the src tensor must match the corresponding dimension of
+              the dest tensor or must be equal to 1.  In the latter case, the same value
+              from the src tensor, for those dimensions, will be used to add into the dest
+              tensor.
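+              For example (an illustrative sketch of the broadcasting rule, not an
+              additional requirement): if dest has dimensions (num_samples=2, k=3,
+              nr=4, nc=5) and src has dimensions (1,3,1,1), then src behaves like a
+              per-channel bias, i.e. each of src's k values is broadcast across the
+              samples, rows, and columns of dest.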
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void add (
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2
+    );
+    /*!
+        ensures
+            - performs: dest = src1 + src2
+              The addition happens pointwise according to 4D tensor arithmetic.  If the
+              dimensions don't match then missing elements are presumed to be equal to 0.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void assign_conv_bias_gradient (
+        tensor& grad,
+        const tensor& gradient_input
+    );
+    /*!
+        requires
+            - grad.num_samples() == 1
+            - grad.k()  >= 1
+            - grad.nr() == 1
+            - grad.nc() == 1
+            - gradient_input.k() == grad.k()
+            - gradient_input.size() > 0
+            - is_same_object(grad,gradient_input) == false
+        ensures
+            - let BIAS be a tensor with the same dimensions as grad.
+            - let OUT be the output of add(1,OUT,1,BIAS)
+            - let f(gradient_input,BIAS) == dot(gradient_input,OUT)
+            - Then this function computes the gradient of f() with respect to BIAS and
+              assigns it to grad.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void assign_bias_gradient (
+        tensor& grad,
+        const tensor& gradient_input
+    );
+    /*!
+        requires
+            - grad.num_samples() == 1
+            - gradient_input.k() == grad.k()
+            - gradient_input.nr() == grad.nr()
+            - gradient_input.nc() == grad.nc()
+            - gradient_input.size() > 0
+            - is_same_object(grad,gradient_input) == false
+        ensures
+            - let BIAS be a tensor with the same dimensions as grad.
+            - let OUT be the output of add(1,OUT,1,BIAS)
+            - let f(gradient_input,BIAS) == dot(gradient_input,OUT)
+            - Then this function computes the gradient of f() with respect to BIAS and
+              assigns it to grad.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    class tensor_conv
+    {
+    public:
+        tensor_conv(const tensor_conv&) = delete;
+        tensor_conv& operator=(const tensor_conv&) = delete;
+
+        tensor_conv() {}
+
+        void clear(
+        ) { impl.clear(); }
+
+        void operator() (
+            const bool add_to_output,
+            tensor& output,
+            const tensor& data,
+            const tensor& filters
+        ) { impl(add_to_output,output,data,filters); }
+        /*!
+            requires
+                - setup() has been called.  Specifically, setup() has been called like this:
+                    this->setup(data, filters, stride_y, stride_x, padding_y, padding_x);
+                - is_same_object(output,data) == false
+                - is_same_object(output,filters) == false
+                - filters.k() == data.k()
+                - filters.nr() <= data.nr() + 2*padding_y
+                - filters.nc() <= data.nc() + 2*padding_x
+                - #output.num_samples() == data.num_samples()
+                - #output.k() == filters.num_samples()
+                - #output.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
+                - #output.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
+            ensures
+                - Convolves filters over data.  If add_to_output==true then we add the
+                  results to output, otherwise we assign to output, overwriting the
+                  previous values in output.
+                - filters contains filters.num_samples() filters.
+        !*/
+
+        void operator() (
+            const bool add_to_output,
+            resizable_tensor& output,
+            const tensor& data,
+            const tensor& filters
+        ) { impl(add_to_output,output,data,filters); }
+        /*!
+            requires
+                - setup() has been called.
Specifically, setup() has been called like this:
+                    this->setup(data, filters, stride_y, stride_x, padding_y, padding_x);
+                - is_same_object(output,data) == false
+                - is_same_object(output,filters) == false
+                - filters.k() == data.k()
+                - filters.nr() <= data.nr() + 2*padding_y
+                - filters.nc() <= data.nc() + 2*padding_x
+            ensures
+                - Convolves filters over data.  If add_to_output==true then we add the
+                  results to output, otherwise we assign to output, overwriting the
+                  previous values in output.
+                - filters contains filters.num_samples() filters.
+                - #output.num_samples() == data.num_samples()
+                - #output.k() == filters.num_samples()
+                - #output.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
+                - #output.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
+        !*/
+
+        void get_gradient_for_data (
+            const bool add_to_output,
+            const tensor& gradient_input,
+            const tensor& filters,
+            tensor& data_gradient
+        ) { impl.get_gradient_for_data(add_to_output,gradient_input,filters,data_gradient); }
+        /*!
+            requires
+                - One of the following must be true:
+                    - filters has the same dimensions as the filters object given to the
+                      last call to operator().  Also, data_gradient has the same dimensions
+                      as the data object given to the last call to operator().
+                    - setup() has been called.  Specifically, setup() has been called like this:
+                        this->setup(data_gradient, filters, stride_y, stride_x, padding_y, padding_x);
+                - gradient_input has the following dimensions:
+                    - gradient_input.num_samples() == data_gradient.num_samples()
+                    - gradient_input.k() == filters.num_samples()
+                    - gradient_input.nr() == 1+(data_gradient.nr() + 2*padding_y - filters.nr())/stride_y
+                    - gradient_input.nc() == 1+(data_gradient.nc() + 2*padding_x - filters.nc())/stride_x
+                    - NOTE, these dimensions are what you would obtain if gradient_input
+                      has the same dimensions as the last output of operator().
+                - is_same_object(data_gradient,filters) == false
+                - is_same_object(data_gradient,gradient_input) == false
+            ensures
+                - let OUT be the output of (*this)(false,OUT,data,filters).
+                - let f(data,filters) == dot(OUT, gradient_input)
+                - if (add_to_output) then
+                    - This function finds the gradient of f() with respect to data and adds
+                      this gradient to data_gradient.
+                - else
+                    - This function finds the gradient of f() with respect to data and
+                      assigns this gradient to data_gradient, overwriting the previous
+                      values in data_gradient.
+        !*/
+
+        void get_gradient_for_filters (
+            const bool add_to_output,
+            const tensor& gradient_input,
+            const tensor& data,
+            tensor& filters_gradient
+        ) { impl.get_gradient_for_filters(add_to_output,gradient_input,data,filters_gradient); }
+        /*!
+            requires
+                - One of the following must be true:
+                    - filters_gradient has the same dimensions as the filters object given
+                      to the last call to operator().  Also, data has the same dimensions
+                      as the data object given to the last call to operator().
+                    - setup() has been called.  Specifically, setup() has been called like this:
+                        this->setup(data, filters_gradient, stride_y, stride_x, padding_y, padding_x);
+                - gradient_input has the following dimensions:
+                    - gradient_input.num_samples() == data.num_samples()
+                    - gradient_input.k() == filters.num_samples()
+                    - gradient_input.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
+                    - gradient_input.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
+                    - NOTE, these dimensions are what you would obtain if gradient_input
+                      has the same dimensions as the last output of operator().
+                - is_same_object(filters_gradient,data) == false
+                - is_same_object(filters_gradient,gradient_input) == false
+            ensures
+                - let OUT be the output of (*this)(false,OUT,data,filters).
+                - let f(data,filters) == dot(OUT, gradient_input)
+                - if (add_to_output) then
+                    - This function finds the gradient of f() with respect to filters and
+                      adds this gradient to filters_gradient.
+                - else
+                    - This function finds the gradient of f() with respect to filters and
+                      assigns this gradient to filters_gradient, overwriting the previous
+                      values in filters_gradient.
+        !*/
+
+
+        void setup(
+            const tensor& data,
+            const tensor& filters,
+            int stride_y,
+            int stride_x,
+            int padding_y,
+            int padding_x
+        ) { impl.setup(data,filters,stride_y,stride_x,padding_y,padding_x); }
+        /*!
+            requires
+                - filters.k() == data.k()
+                - stride_y > 0
+                - stride_x > 0
+                - 0 <= padding_y < filters.nr()
+                - 0 <= padding_x < filters.nc()
+            ensures
+                - When operator() is called, the output tensor will have these dimensions:
+                    - output.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
+                    - output.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
+                    - output.num_samples() == data.num_samples()
+                    - output.k() == filters.num_samples()
+                - The point of setup() is to allow this object to gather information about
+                  all the tensor sizes and filter layouts involved in the computation.  In
+                  particular, the reason the tensors are input into setup() is just to
+                  observe their sizes.  setup() doesn't do anything with the contents of
+                  the tensors, or store any kind of references to the data or filter
+                  tensors.
+        !*/
+
+    private:
+#ifdef DLIB_USE_CUDA
+        cuda::tensor_conv impl;
+#else
+        cpu::tensor_conv impl;
+#endif
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class pooling
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                The pooling object is a tool for performing spatial pooling over a tensor.
+                It can be configured to do either max or average pooling.
+        !*/
+    public:
+
+        pooling(const pooling&) = delete;
+        pooling& operator=(const pooling&) = delete;
+
+        pooling (
+        ) = default;
+
+        void clear(
+        ) { impl.clear(); }
+
+        void setup_max_pooling(
+            int window_height,
+            int window_width,
+            int stride_y,
+            int stride_x,
+            int padding_y,
+            int padding_x
+        ) { impl.setup_max_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x); }
+        /*!
+            requires
+                - window_height > 0
+                - window_width > 0
+                - stride_y > 0
+                - stride_x > 0
+                - 0 <= padding_y < window_height
+                - 0 <= padding_x < window_width
+            ensures
+                - When you call operator() it will do max pooling with the given
+                  parameters.
+        !*/
+
+        void setup_avg_pooling(
+            int window_height,
+            int window_width,
+            int stride_y,
+            int stride_x,
+            int padding_y,
+            int padding_x
+        ) { impl.setup_avg_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x); }
+        /*!
+            requires
+                - window_height > 0
+                - window_width > 0
+                - stride_y > 0
+                - stride_x > 0
+                - 0 <= padding_y < window_height
+                - 0 <= padding_x < window_width
+            ensures
+                - When you call operator() it will do average pooling with the given
+                  parameters.
+        !*/
+
+        bool does_max_pooling(
+        ) const { return impl.does_max_pooling(); }
+
+        void operator() (
+            resizable_tensor& dest,
+            const tensor& src
+        ) { impl(dest, src); }
+        /*!
+            requires
+                - is_same_object(dest,src) == false
+                - either setup_max_pooling() or setup_avg_pooling() has been called.
+ - window_width <= src.nc() + 2*padding_x + - window_height <= src.nr() + 2*padding_y + ensures + - #dest.num_samples() == src.num_samples() + - #dest.k() == src.k() + - #dest.nr() == 1 + (src.nr() + 2*padding_y - window_height)/stride_y + - #dest.nc() == 1 + (src.nc() + 2*padding_x - window_width)/stride_x + - WINDOW == centered_rect(x*stride_x + window_width/2 - padding_x, + y*stride_y + window_height/2 - padding_y, + window_width, + window_height) + - for all valid s, k, r, and c: + - if (does_max_pooling()) then + - image_plane(#dest,s,k)(r,c) == max(subm_clipped(image_plane(src,s,k),WINDOW(c,r))) + - else + - image_plane(#dest,s,k)(r,c) == mean(subm_clipped(image_plane(src,s,k),WINDOW(c,r))) + !*/ + + void get_gradient( + const tensor& gradient_input, + const tensor& dest, + const tensor& src, + tensor& grad + ) { impl.get_gradient(gradient_input, dest, src, grad); } + /*! + requires + - have_same_dimensions(gradient_input,dest) == true + - have_same_dimensions(src,grad) == true + - dest contains the result of calling (*this)(dest,src) + - is_same_object(grad,gradient_input) == false + - is_same_object(grad,dest) == false + - is_same_object(grad,src) == false + ensures + - Recalling that dest is the output of (*this)(dest,src), + let f(src) == dot(gradient_input,dest) + - Then this function computes the gradient of f() with respect to src and + adds it to grad. + !*/ + + private: +#ifdef DLIB_USE_CUDA + cuda::pooling impl; +#else + cpu::pooling impl; +#endif + }; + +// ---------------------------------------------------------------------------------------- + + void softmax ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - Note that the softmax function is a vector valued function: + s(x) == exp(x)/sum(exp(x)) + - Computes the softmax function on src and writes the results to dest. The + softmax is computed per spatial location across the different channels at + each location. That is, softmax() outputs a new tensor, #dest, where each of + the spatial locations in dest (i.e. image idx, row idx, and column idx) + contains the output of s() evaluated over the channel values at each + location. + - This function supports in-place operation, i.e. having + is_same_object(dest, src)==true + !*/ + + void softmax_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! + requires + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + ensures + - We interpret dest as the output of softmax(dest,SRC) for some SRC tensor. + Then let f(SRC) == dot(gradient_input,dest). Then this function computes the + gradient of f() with respect to SRC and stores it to grad. Moreover, if + is_same_object(grad,gradient_input)==true then the output is assigned to + grad, replacing its previous contents. Otherwise the output is added to + grad. + - This function supports in-place operation, i.e. having + is_same_object(grad, gradient_input)==true + !*/ + +// ---------------------------------------------------------------------------------------- + + void softmax_all ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - Note that the softmax function is a vector valued function: + s(x) == exp(x)/sum(exp(x)) + - Computes the softmax function on src and writes the results to dest. The + softmax is computed over the entire tensor with one invocation of s(). 
So + unlike softmax() which computes many s() evaluations, one for each spatial + location, softmax_all() calls s() once for the entire tensor. + - This function supports in-place operation, i.e. having + is_same_object(dest, src)==true + !*/ + + void softmax_all_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! + requires + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + - is_same_object(grad, dest)==false + ensures + - We interpret dest as the output of softmax_all(dest,SRC) for some SRC tensor. + Then let f(SRC) == dot(gradient_input,dest) Then this function computes the + gradient of f() with respect to SRC and assigns it to grad. + - This function supports in-place operation, i.e. having + is_same_object(grad, gradient_input)==true + !*/ + +// ---------------------------------------------------------------------------------------- + + void sigmoid ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - for all valid i: + - #dest.host()[i] == 1/(1+std::exp(-src.host()[i])) + - This function supports in-place operation, i.e. having + is_same_object(dest, src)==true + !*/ + + void sigmoid_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! + requires + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + ensures + - Recalling that dest is the output of sigmoid(dest,SRC) for some SRC tensor, + let f(SRC) == dot(gradient_input,dest). Then this function computes the + gradient of f() with respect to SRC and stores it to grad. Moreover, if + is_same_object(grad,gradient_input)==true then the output is assigned to + grad, replacing its previous contents. Otherwise the output is added to + grad. + - This function supports in-place operation, i.e. having + is_same_object(grad, gradient_input)==true + !*/ + +// ---------------------------------------------------------------------------------------- + + void relu ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - for all valid i: + - #dest.host()[i] == std::max(0,src.host()[i]) + - This function supports in-place operation, i.e. having + is_same_object(dest, src)==true + !*/ + + void relu_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! + requires + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + ensures + - Recalling that dest is the output of relu(dest,SRC) for some SRC tensor, + let f(SRC) == dot(gradient_input,dest). Then this function computes the + gradient of f() with respect to SRC and stores it to grad. Moreover, if + is_same_object(grad,gradient_input)==true then the output is assigned to + grad, replacing its previous contents. Otherwise the output is added to + grad. + - This function supports in-place operation, i.e. having + is_same_object(grad, gradient_input)==true + !*/ + +// ---------------------------------------------------------------------------------------- + + void prelu ( + tensor& dest, + const tensor& src, + const tensor& param + ); + /*! + requires + - have_same_dimensions(dest, src) == true + - param.size() == 1 + ensures + - for all valid i: + - if (src.host()[i] > 0) then + - #dest.host()[i] == src.host()[i] + - else + - #dest.host()[i] == src.host()[i] * param.host()[0] + - This function supports in-place operation, i.e. 
having + is_same_object(dest, src)==true + !*/ + + void prelu_gradient ( + tensor& grad, + const tensor& src, + const tensor& gradient_input, + const tensor& param, + tensor& params_grad + ); + /*! + requires + - have_same_dimensions(grad,src) == true + - have_same_dimensions(grad,gradient_input) == true + - param.size() == 1 + - params_grad.size() == 1 + - is_same_object(grad, gradient_input) == false + ensures + - Recalling that dest is the output of prelu(dest,src,param) let + f(src,param) == dot(gradient_input,dest) + - Then this function computes the gradient of f() with respect to src and + param. It assigns the gradient with respect to param to #params_grad and + adds the gradient with respect to src to #grad. + !*/ + +// ---------------------------------------------------------------------------------------- + + void tanh ( + tensor& dest, + const tensor& src + ); + /*! + requires + - have_same_dimensions(dest, src) == true + ensures + - for all valid i: + - #dest.host()[i] == std::tanh(src.host()[i]) + - This function supports in-place operation, i.e. having + is_same_object(dest, src)==true + !*/ + + void tanh_gradient ( + tensor& grad, + const tensor& dest, + const tensor& gradient_input + ); + /*! + requires + - have_same_dimensions(dest,gradient_input) == true + - have_same_dimensions(dest,grad) == true + ensures + - Recalling that dest is the output of tanh(dest,SRC) for some SRC tensor, + let f(SRC) == dot(gradient_input,dest). Then this function computes the + gradient of f() with respect to SRC and stores it to grad. Moreover, if + is_same_object(grad,gradient_input)==true then the output is assigned to + grad, replacing its previous contents. Otherwise the output is added to + grad. + - This function supports in-place operation, i.e. having + is_same_object(grad, gradient_input)==true + !*/ + +// ---------------------------------------------------------------------------------------- + + void resize_bilinear ( + tensor& dest, + long dest_row_stride, + long dest_channel_stride, + const tensor& src, + long src_row_stride, + long src_channel_stride + ); + /*! + requires + - is_same_object(dest, src)==false + - dest.num_samples() == src.num_samples() + - dest.k() == src.k() + ensures + - for all valid i,k: image_plane(dest,i,k) is a copy of image_plane(src,i,k) + that has been bilinearly interpolated to fit into the shape of + image_plane(dest,i,k). + - Instead of supposing the row stride and channel stride in the tensors is + given by tensor::nc() and tensor::nr()*tensor::nc() respectively, we use the + provided stride values to transition from one row and channel to the next. + This is useful in combination with alias_tensor objects since it allows you + to operate on subwindows in an image. + !*/ + + void resize_bilinear_gradient ( + tensor& grad, + long grad_row_stride, + long grad_channel_stride, + const tensor& gradient_input, + long gradient_input_row_stride, + long gradient_input_channel_stride + ); + /*! + requires + - is_same_object(grad, gradient_input)==false + - gradient_input.num_samples() == grad.num_samples() + - gradient_input.k() == grad.k() + ensures + - Suppose that DEST is the output of resize_bilinear(DEST,SRC) for some SRC + tensor, let f(SRC) == dot(gradient_input,DEST). Then this function computes + the gradient of f() with respect to SRC and adds it to grad. It should be + noted that we don't need to know the contents of DEST to compute this + gradient. All that matters is that gradient_input have the same dimensions + as DEST. 
+ - Instead of supposing the row stride and channel stride in the tensors is + given by tensor::nc() and tensor::nr()*tensor::nc() respectively, we use the + provided stride values to transition from one row and channel to the next. + This is useful in combination with alias_tensor objects since it allows you + to operate on subwindows in an image. + !*/ + + inline void resize_bilinear ( + tensor& dest, + const tensor& src + ) { resize_bilinear(dest, dest.nc(), dest.nr()*dest.nc(), src, src.nc(), src.nr()*src.nc()); } + /*! + requires + - is_same_object(dest, src)==false + - dest.num_samples() == src.num_samples() + - dest.k() == src.k() + ensures + - for all valid i,k: image_plane(dest,i,k) is a copy of image_plane(src,i,k) + that has been bilinearly interpolated to fit into the shape of + image_plane(dest,i,k). + !*/ + + inline void resize_bilinear_gradient ( + tensor& grad, + const tensor& gradient_input + ) { resize_bilinear_gradient(grad, grad.nc(), grad.nr()*grad.nc(), gradient_input, gradient_input.nc(), gradient_input.nr()*gradient_input.nc()); } + /*! + requires + - is_same_object(grad, gradient_input)==false + - gradient_input.num_samples() == grad.num_samples() + - gradient_input.k() == grad.k() + ensures + - Suppose that DEST is the output of resize_bilinear(DEST,SRC) for some SRC + tensor, let f(SRC) == dot(gradient_input,DEST). Then this function computes + the gradient of f() with respect to SRC and adds it to grad. It should be + noted that we don't need to know the contents of DEST to compute this + gradient. All that matters is that gradient_input have the same dimensions + as DEST. + !*/ + +// ---------------------------------------------------------------------------------------- + + class multi_device_tensor_averager + { + /*! + WHAT THIS OBJECT REPRESENTS + This object is a tool for very quickly averaging a bunch of tensors + together. + !*/ + public: + + multi_device_tensor_averager(const multi_device_tensor_averager&) = delete; + multi_device_tensor_averager& operator=(const multi_device_tensor_averager&) = delete; + + multi_device_tensor_averager() = default; + + void set( + std::vector<tensor*> items + ) + /*! + requires + - All the tensors in items are the same size + ensures + - When you call average() we will average the tensors in items. + - It's important that the tensors already be allocated to their devices + before you call set(). This is because set() will setup the types of + between device transfers now and use them when you call average(). + !*/ + { + using namespace ::dlib::cuda; + accessible_groups.clear(); + epa.clear(); + if (items.size() < 1) + return; + + scale = 1.0/items.size(); + + // split item into groups of accessible devices + std::vector<tensor*> group, unused; + while(items.size() > 0) + { + group.push_back(items[0]); + for(size_t i = 1; i < items.size(); ++i) + { + if (can_access_peer(*items[0], *items[i])) + group.push_back(items[i]); + else + unused.push_back(items[i]); + } + accessible_groups.push_back(group); + unused.swap(items); + unused.clear(); + group.clear(); + } + for (auto&& g : accessible_groups) + { + for (size_t i = 1; i < g.size(); ++i) + { + epa.emplace_back(new enable_peer_access(*g[0], *g[i])); + } + } + } + + size_t num_device_groups( + ) const { return accessible_groups.size(); } + /*! + ensures + - The devices given to set() are grouped together when they can directly + access each other using GPUDirect. This function returns the number of + such groups. 
For example, if all devices can directly access each other + then the number of groups is 1. + !*/ + + void average() + /*! + requires + - All the devices have stopped writing to the tensors given to set(). So + you should probably call cudaDeviceSynchronize() on each of the relevant + devices before calling average(). + ensures + - Computes the average of all the tensors given to set() and then sets them + all equal to the average. + !*/ + { + using namespace ::dlib::cuda; + + + // First we average things within each group + for (auto&& g : accessible_groups) + { + raii_set_device set_dev(*g[0]); + if (g.size() == 1) + tt::affine_transform(*g[0], *g[0], scale); + else + tt::affine_transform(*g[0], *g[0], *g[1], scale, scale); + + for (size_t i = 2; i < g.size(); ++i) + tt::affine_transform(*g[0], *g[0], *g[i], 1, scale); + } + + if (accessible_groups.size() > 1) + { + tensor& total_avg = *accessible_groups[0][0]; + raii_set_device set_dev(total_avg); + accum_buffer.copy_size(total_avg); + // now we need to average things across groups + for (size_t i = 1; i < accessible_groups.size(); ++i) + { + memcpy(accum_buffer, *accessible_groups[i][0]); + tt::add(total_avg, total_avg, accum_buffer); + } + + // Now total_avg has the final average in it. So we need to send + // copies of it back to each of the groups. + for (size_t i = 1; i < accessible_groups.size(); ++i) + { + memcpy(*accessible_groups[i][0], total_avg); + } + } + + + // Now propagate averages back out to each element using point to point + // communication inside a group. + for (auto&& g : accessible_groups) + { + raii_set_device set_dev(*g[0]); + for (size_t i = 1; i < g.size(); ++i) + memcpy(*g[i], *g[0]); + } + } + + private: + std::vector<std::unique_ptr<::dlib::cuda::enable_peer_access>> epa; + std::vector<std::vector<tensor*>> accessible_groups; + float scale; + + resizable_tensor accum_buffer; + }; + +// ---------------------------------------------------------------------------------------- + + void copy_tensor( + bool add_to, + tensor& dest, + size_t dest_k_offset, + const tensor& src, + size_t src_k_offset, + size_t count_k + ); + /*! + requires + - dest.nc() == src.nc() + - dest.nr() == src.nr() + - dest.num_samples() == src.num_samples() + - dest.k() - dest_k_offset >= count_k + - src.k() - src_k_offset >= count_k + - is_same_object(dest,src) == false + - The memory areas of src and dest do not overlap. + ensures + - if (add_to) then + - performs: dest[i, k + dest_k_offset, r, c] += src[i, k + src_k_offset, r, c], where k in [0..count_k] + i.e., adds content of each sample from src in to corresponding place of sample at dest. + - else + - performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k in [0..count_k] + i.e., copies content of each sample from src in to corresponding place of sample at dest. + !*/ + +// ---------------------------------------------------------------------------------------- + +}} + +#ifdef NO_MAKEFILE +#include "tensor_tools.cpp" +#endif + +#endif // DLIB_TeNSOR_TOOLS_H_ + + diff --git a/ml/dlib/dlib/dnn/trainer.h b/ml/dlib/dlib/dnn/trainer.h new file mode 100644 index 000000000..7cb2bf5e5 --- /dev/null +++ b/ml/dlib/dlib/dnn/trainer.h @@ -0,0 +1,1333 @@ +// Copyright (C) 2015 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. 
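+
+// Example (a brief usage sketch; "my_net_type", "training_images", and
+// "training_labels" are hypothetical placeholders for user-defined types and
+// data, not names defined in this file):
+//
+//     my_net_type net;
+//     dnn_trainer<my_net_type> trainer(net);
+//     trainer.set_learning_rate(0.01);
+//     trainer.set_min_learning_rate(1e-5);
+//     trainer.set_mini_batch_size(128);
+//     trainer.be_verbose();
+//     trainer.set_synchronization_file("trainer_state.dat", std::chrono::minutes(10));
+//     trainer.train(training_images, training_labels);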
+#ifndef DLIB_DNn_TRAINER_H_ +#define DLIB_DNn_TRAINER_H_ + +#include "trainer_abstract.h" +#include "core.h" +#include "solvers.h" +#include "../statistics.h" +#include <chrono> +#include <fstream> +#include <sstream> +#include "../serialize.h" + +#include "../pipe.h" +#include "../threads.h" +#include "cuda_dlib.h" +#include "../statistics/running_gradient.h" +#include <atomic> +#include <cstdio> +#include <set> +#include <future> +#include <exception> +#include <mutex> +#include "../dir_nav.h" +#include "../md5.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + namespace impl + { + template <typename training_label_type> + struct dnn_job_t + { + dnn_job_t() = default; + dnn_job_t(const dnn_job_t&) = delete; + dnn_job_t& operator=(const dnn_job_t&) = delete; + + std::vector<std::vector<training_label_type>> labels; + std::vector<resizable_tensor> t; + std::vector<int> have_data; // have_data[i] is true if there is data in labels[i] and t[i]. + bool test_only = false; + }; + + template <typename training_label_type> + void swap(dnn_job_t<training_label_type>& a, dnn_job_t<training_label_type>& b) + { + a.labels.swap(b.labels); + a.t.swap(b.t); + a.have_data.swap(b.have_data); + std::swap(a.test_only,b.test_only); + } + } + + enum class force_flush_to_disk { + no = 0, + yes = 1 + }; + + template < + typename net_type, + typename solver_type = sgd + > + class dnn_trainer : private threaded_object + { + public: + + static_assert(is_loss_layer_type<net_type>::value, + "The last layer in a network must be a loss layer."); + + typedef typename net_type::training_label_type training_label_type; + typedef typename net_type::input_type input_type; + const static size_t num_computational_layers = net_type::num_computational_layers; + const static size_t num_layers = net_type::num_layers; + private: + typedef impl::dnn_job_t<training_label_type> job_t; + public: + + dnn_trainer() = delete; + dnn_trainer(const dnn_trainer&) = delete; + dnn_trainer& operator=(const dnn_trainer&) = delete; + + explicit dnn_trainer(net_type& net_) : job_pipe(0), net(net_) + { + solver_type default_solver; + devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, default_solver)); + + init(); + } + + dnn_trainer( + net_type& net_, + const solver_type& solver_ + ) : job_pipe(0), net(net_) + { + devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, solver_)); + + init(); + } + + dnn_trainer( + net_type& net_, + const solver_type& solver_, + const std::vector<int>& cuda_extra_devices + ) : job_pipe(0), net(net_) + { + devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, solver_)); + + const int total_devices = dlib::cuda::get_num_devices(); + + // Make device contexts for the extra device ids but be careful to avoid any + // duplicate ids. + std::set<int> temp(cuda_extra_devices.begin(), cuda_extra_devices.end()); + temp.erase(devices[0]->device_id); + for (auto id : temp) + { + DLIB_CASSERT(0 <= id && id < total_devices, "Invalid CUDA device id given to dnn_trainer."); + // Switch to this device so that any tensor objects that get allocated when + // we create the device context happen on this device. + dlib::cuda::set_device(id); + devices.push_back(std::make_shared<device_data>(id, net, solver_, clone_net())); + } + // Set the current device back to what it was before this constructor was + // called. 
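+            // Otherwise the last extra device would remain current and later tensor
+            // allocations made by the calling code would land on it.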
+ dlib::cuda::set_device(devices[0]->device_id); + + init(); + } + + ~dnn_trainer( + ) + { + job_pipe.disable(); + stop(); + wait(); + } + + net_type& get_net ( + force_flush_to_disk force_flush = force_flush_to_disk::yes + ) + { + wait_for_thread_to_pause(); + sync_to_disk(force_flush == force_flush_to_disk::yes); + propagate_exception(); + return net; + } + + + unsigned long get_mini_batch_size ( + ) const { return mini_batch_size; } + + void set_mini_batch_size ( + unsigned long batch_size + ) + { + DLIB_CASSERT(batch_size > 0); + mini_batch_size = batch_size; + } + + unsigned long get_max_num_epochs ( + ) const { return max_num_epochs; } + + void set_max_num_epochs ( + unsigned long num + ) + { + DLIB_CASSERT(num > 0); + max_num_epochs = num; + } + + void be_verbose ( + ) + { + verbose = true; + } + + void be_quiet ( + ) + { + verbose = false; + } + + + const std::vector<solver_type>& get_solvers ( + ) const + { + wait_for_thread_to_pause(); + propagate_exception(); + return devices[0]->solvers; + } + + void train_one_step ( + const std::vector<input_type>& data, + const std::vector<training_label_type>& labels + ) + { + DLIB_CASSERT(data.size() == labels.size()); + + train_one_step(data.begin(), data.end(), labels.begin()); + } + + template < + typename data_iterator, + typename label_iterator + > + void train_one_step ( + data_iterator dbegin, + data_iterator dend, + label_iterator lbegin + ) + { + DLIB_CASSERT(std::distance(dbegin, dend) > 0); + + print_periodic_verbose_status(); + sync_to_disk(); + send_job(false, dbegin, dend, lbegin); + + ++train_one_step_calls; + } + + void train_one_step ( + const std::vector<input_type>& data + ) + { + train_one_step(data.begin(), data.end()); + } + + template < + typename data_iterator + > + void train_one_step ( + data_iterator dbegin, + data_iterator dend + ) + { + DLIB_CASSERT(std::distance(dbegin, dend) > 0); + print_periodic_verbose_status(); + sync_to_disk(); + send_job(false, dbegin, dend); + ++train_one_step_calls; + } + + void test_one_step ( + const std::vector<input_type>& data, + const std::vector<training_label_type>& labels + ) + { + DLIB_CASSERT(data.size() == labels.size()); + + test_one_step(data.begin(), data.end(), labels.begin()); + } + + template < + typename data_iterator, + typename label_iterator + > + void test_one_step ( + data_iterator dbegin, + data_iterator dend, + label_iterator lbegin + ) + { + DLIB_CASSERT(std::distance(dbegin, dend) > 0); + + print_periodic_verbose_status(); + sync_to_disk(); + send_job(true, dbegin, dend, lbegin); + + ++test_one_step_calls; + } + + void test_one_step ( + const std::vector<input_type>& data + ) + { + test_one_step(data.begin(), data.end()); + } + + template < + typename data_iterator + > + void test_one_step ( + data_iterator dbegin, + data_iterator dend + ) + { + DLIB_CASSERT(std::distance(dbegin, dend) > 0); + print_periodic_verbose_status(); + sync_to_disk(); + send_job(true, dbegin, dend); + ++test_one_step_calls; + } + + void train ( + const std::vector<input_type>& data, + const std::vector<training_label_type>& labels + ) + { + DLIB_CASSERT(data.size() == labels.size() && data.size() > 0); + + // The reason these two loops don't initialize their counter variables but + // instead use class members is so we can include the state of the loops in the + // stuff written by sync_to_disk() + for (; + epoch_iteration < max_num_epochs && learning_rate >= min_learning_rate; + ++epoch_iteration) + { + using namespace std::chrono; + last_time = system_clock::now(); + 
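+                // Start each epoch with a clean running-loss estimate so the averages
+                // printed below cover only this epoch's mini-batches.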
clear_average_loss(); + for (; epoch_pos < data.size() && learning_rate >= min_learning_rate; epoch_pos += mini_batch_size) + { + if (verbose) + { + auto now_time = system_clock::now(); + if (now_time-last_time > seconds(20)) + { + last_time = now_time; + auto iter = epoch_iteration + epoch_pos/(double)data.size(); + std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " " + << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " " + << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; + print_progress(); + } + } + + sync_to_disk(); + send_job(false, data.begin()+epoch_pos, + data.begin()+std::min(epoch_pos+mini_batch_size,data.size()), + labels.begin()+epoch_pos); + } + epoch_pos = 0; + + if (verbose) + { + // Capitalize the E in Epoch so it's easy to grep out the lines that + // are for full epoch status statements. + std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " " + << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " " + << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; + print_progress(); + } + } + wait_for_thread_to_pause(); + // if we modified the network at all then be sure to sync the final result. + sync_to_disk(true); + } + + void train ( + const std::vector<input_type>& data + ) + { + DLIB_CASSERT(data.size() > 0); + + const bool has_unsupervised_loss = std::is_same<no_label_type, training_label_type>::value; + static_assert(has_unsupervised_loss, + "You can only call this version of train() when using an unsupervised loss."); + + // The reason these two loops don't initialize their counter variables but + // instead use class members is so we can include the state of the loops in the + // stuff written by sync_to_disk() + for (; + epoch_iteration < max_num_epochs && learning_rate >= min_learning_rate; + ++epoch_iteration) + { + using namespace std::chrono; + last_time = system_clock::now(); + clear_average_loss(); + for (; epoch_pos < data.size() && learning_rate >= min_learning_rate; epoch_pos += mini_batch_size) + { + if (verbose) + { + auto now_time = system_clock::now(); + if (now_time-last_time > seconds(20)) + { + last_time = now_time; + auto iter = epoch_iteration + epoch_pos/(double)data.size(); + std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " " + << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " " + << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; + print_progress(); + } + } + + sync_to_disk(); + send_job(false, data.begin()+epoch_pos, + data.begin()+std::min(epoch_pos+mini_batch_size,data.size())); + } + epoch_pos = 0; + + if (verbose) + { + // Capitalize the E in Epoch so it's easy to grep out the lines that + // are for full epoch status statements. + std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " " + << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " " + << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; + print_progress(); + } + } + wait_for_thread_to_pause(); + // if we modified the network at all then be sure to sync the final result. 
+ sync_to_disk(true); + } + + void set_synchronization_file ( + const std::string& filename, + std::chrono::seconds time_between_syncs_ = std::chrono::minutes(15) + ) + { + last_sync_time = std::chrono::system_clock::now(); + sync_filename = filename; + time_between_syncs = time_between_syncs_; + + // check if the sync file already exists, if it does we should load it. + std::ifstream fin(newest_syncfile(), std::ios::binary); + if (fin) + deserialize(*this, fin); + } + + const std::string& get_synchronization_file ( + ) + { + return sync_filename; + } + + double get_average_loss ( + ) const + { + wait_for_thread_to_pause(); + return rs.mean(); + } + + double get_average_test_loss ( + ) const + { + wait_for_thread_to_pause(); + return rs_test.mean(); + } + + void clear_average_loss ( + ) + { + wait_for_thread_to_pause(); + rs.clear(); + } + + void set_learning_rate ( + double lr + ) + { + DLIB_CASSERT(lr > 0); + wait_for_thread_to_pause(); + if (learning_rate != lr) + { + steps_without_progress = 0; + test_steps_without_progress = 0; + previous_loss_values.clear(); + test_previous_loss_values.clear(); + } + learning_rate = lr; + lr_schedule.set_size(0); + } + + double get_learning_rate( + ) const + { + return learning_rate; + } + + void set_min_learning_rate ( + double lr + ) + { + DLIB_CASSERT(lr > 0); + wait_for_thread_to_pause(); + lr_schedule.set_size(0); + min_learning_rate = lr; + } + + double get_min_learning_rate ( + ) const + { + return min_learning_rate; + } + + template <typename EXP> + void set_learning_rate_schedule ( + const matrix_exp<EXP>& schedule + ) + { + DLIB_CASSERT(schedule.size() > 0); + DLIB_CASSERT(min(schedule) > 0); + set_learning_rate(schedule(0,0)); + set_min_learning_rate(min(schedule)); + set_learning_rate_shrink_factor(1); + lr_schedule = matrix_cast<double>(reshape_to_column_vector(schedule)); + lr_schedule_pos = 0; + } + + const matrix<double,0,1>& get_learning_rate_schedule ( + ) const + { + return lr_schedule; + } + + void set_iterations_without_progress_threshold ( + unsigned long thresh + ) + { + wait_for_thread_to_pause(); + lr_schedule.set_size(0); + iter_without_progress_thresh = thresh; + } + + unsigned long get_iterations_without_progress_threshold ( + ) const + { + return iter_without_progress_thresh; + } + + unsigned long get_steps_without_progress ( + ) const + { + return steps_without_progress; + } + + void set_test_iterations_without_progress_threshold ( + unsigned long thresh + ) + { + wait_for_thread_to_pause(); + lr_schedule.set_size(0); + test_iter_without_progress_thresh = thresh; + } + + unsigned long get_test_iterations_without_progress_threshold ( + ) const + { + return test_iter_without_progress_thresh; + } + + unsigned long get_test_steps_without_progress ( + ) const + { + return test_steps_without_progress; + } + + void set_learning_rate_shrink_factor ( + double shrink + ) + { + DLIB_CASSERT(0 < shrink && shrink <= 1); + wait_for_thread_to_pause(); + lr_schedule.set_size(0); + learning_rate_shrink = shrink; + steps_without_progress = 0; + test_steps_without_progress = 0; + } + + double get_learning_rate_shrink_factor ( + ) const + { + return learning_rate_shrink; + } + + unsigned long long get_train_one_step_calls ( + ) const + { + return train_one_step_calls; + } + + unsigned long long get_test_one_step_calls ( + ) const + { + return test_one_step_calls; + } + + private: + + void record_test_loss(double loss) + { + test_previous_loss_values.push_back(loss); + if (is_finite(loss)) + rs_test.add(loss); + // discard really old loss 
values. + while (test_previous_loss_values.size() > test_iter_without_progress_thresh) + test_previous_loss_values.pop_front(); + } + + void record_loss(double loss) + { + // This kind of budgeting causes our gradient checking to use a fixed amount of + // computational resources, regardless of the size of iter_without_progress_thresh. + gradient_check_budget += 200; + + rs.add(loss); + previous_loss_values.push_back(loss); + // discard really old loss values. + while (previous_loss_values.size() > iter_without_progress_thresh) + previous_loss_values.pop_front(); + } + + template <typename T> + double compute_parameter_gradients(size_t device, job_t& next_job, const T&) + { + if (next_job.have_data[device]) + { + auto&& dev = *devices[device]; + dlib::cuda::set_device(dev.device_id); + if (next_job.test_only) + return dev.net.compute_loss(next_job.t[device], next_job.labels[device].begin()); + else + return dev.net.compute_parameter_gradients(next_job.t[device], next_job.labels[device].begin()); + } + else + { + return 0; + } + } + + double compute_parameter_gradients(size_t device, job_t& next_job, const no_label_type&) + { + if (next_job.have_data[device]) + { + auto&& dev = *devices[device]; + dlib::cuda::set_device(dev.device_id); + no_label_type pick_which_run_update; + if (next_job.test_only) + return dev.net.compute_loss(next_job.t[device]); + else + return dev.net.compute_parameter_gradients(next_job.t[device]); + } + else + { + return 0; + } + } + + void update_parameters(size_t device) + { + auto&& dev = *devices[device]; + dlib::cuda::set_device(dev.device_id); + dev.net.update_parameters(make_sstack(dev.solvers), learning_rate); + } + + void thread() try + { + training_label_type pick_which_run_update; + job_t next_job; + + std::vector<dlib::future<double>> losses(devices.size()); + + std::vector<tt::multi_device_tensor_averager> averagers; + // An array of all the parameter tensors in the first network. We will + // periodically copy these tensors to all the other devices to make sure the + // different GPUs don't go out of sync. + std::vector<tensor*> reference_params; + visit_layer_parameters(devices[0]->net, [&](size_t, tensor& t) { reference_params.push_back(&t); }); + + // We make separate thread pools with just one thread in them because we want + // to make sure each device is always executed on the same thread. We care + // about this because there are thread_local context variables for some cuda + // components and they get allocated for each combination of thread and device. + // So if we make sure the same device always uses the same thread this will + // reduce the number of contexts we allocate from num_devices*num_devices to + // just num_devices. + std::vector<std::shared_ptr<thread_pool>> tp; + for (size_t i = 0; i < devices.size(); ++i) + tp.push_back(std::make_shared<thread_pool>(1)); + + + main_iteration_counter = 0; + while(job_pipe.dequeue(next_job)) + { + if (next_job.test_only) + { + // compute the testing loss + for (size_t i = 0; i < devices.size(); ++i) + tp[i]->add_task_by_value([&,i](double& loss){ loss = compute_parameter_gradients(i, next_job, pick_which_run_update); }, losses[i]); + // aggregate loss values from all the network computations. + double theloss = 0; + for (auto&& loss : losses) + theloss += loss.get(); + record_test_loss(theloss/losses.size()); + + // Check if we should shrink the learning rate based on how the test + // error has been doing lately. 
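+                    // The check happens in two stages: a plain
+                    // count_steps_without_decrease() first, and only if that looks flat
+                    // do we re-test with count_steps_without_decrease_robust(), which
+                    // ignores the largest 10% of loss values so that one bad mini-batch
+                    // can't trigger a learning rate shrink on its own.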
+ if (learning_rate_shrink != 1) + { + test_steps_without_progress = count_steps_without_decrease(test_previous_loss_values); + if (test_steps_without_progress >= test_iter_without_progress_thresh) + { + test_steps_without_progress = count_steps_without_decrease_robust(test_previous_loss_values); + if (test_steps_without_progress >= test_iter_without_progress_thresh) + { + // optimization has flattened out, so drop the learning rate. + learning_rate = learning_rate_shrink*learning_rate; + test_steps_without_progress = 0; + // Empty out some of the previous loss values so that test_steps_without_progress + // will decrease below test_iter_without_progress_thresh. + for (unsigned long cnt = 0; cnt < test_previous_loss_values_dump_amount+test_iter_without_progress_thresh/10 && test_previous_loss_values.size() > 0; ++cnt) + test_previous_loss_values.pop_front(); + } + } + } + continue; + } + + updated_net_since_last_sync = true; + ++main_iteration_counter; + // Call compute_parameter_gradients() and update_parameters() but pick the + // right version for unsupervised or supervised training based on the type + // of training_label_type. + for (size_t i = 0; i < devices.size(); ++i) + tp[i]->add_task_by_value([&,i](double& loss){ loss = compute_parameter_gradients(i, next_job, pick_which_run_update); }, losses[i]); + // aggregate loss values from all the network computations. + double theloss = 0; + for (auto&& loss : losses) + theloss += loss.get(); + record_loss(theloss/losses.size()); + + // Now, if there is more than one active device we need to synchronize the + // gradient updates between devices. So we do that now. + if (devices.size() > 1) + { + // if this is the first iteration then we need to setup the averagers. + // We can't do this outside the loop because the tensors that get + // averaged need to be allocated to their devices before we call set() + // so that the averagers can determine how best to average them. + if (averagers.size() == 0 || sync_file_reloaded) + { + averagers = std::vector<tt::multi_device_tensor_averager>(net_type::num_computational_layers); + // setup the averagers to point to the tensors in the networks. + std::vector<std::vector<tensor*>> all_tensors(devices.size()); + for (size_t i = 0; i < all_tensors.size(); ++i) + { + all_tensors[i].resize(net_type::num_computational_layers); + visit_layer_parameter_gradients(devices[i]->net, [&](size_t j, tensor& t){ + all_tensors[i][j] = &t; + }); + } + // Now set each averager to average the tensors at the same layer in each + // network. + for (size_t i = 0; i < net_type::num_computational_layers; ++i) + { + std::vector<tensor*> temp(all_tensors.size()); + for (size_t j = 0; j < all_tensors.size(); ++j) + temp[j] = all_tensors[j][i]; + // ignore layers that don't have parameters + if (temp[0]->size() != 0) + averagers[i].set(temp); + } + + sync_file_reloaded = false; + } + + + for (auto&& d : devices) + cuda::device_synchronize(d->device_id); + + for (auto&& avg : averagers) + avg.average(); + } + + + // Now apply all the updates to each device. + for (size_t i = 0; i < devices.size(); ++i) + tp[i]->add_task_by_value([&,i](){ if (next_job.have_data[i]) update_parameters(i); }); + // and wait for the updates to all happen. + for (size_t i = 0; i < devices.size(); ++i) + tp[i]->wait_for_all_tasks(); + + + // Every now and then force all the parameters to be the same just to make + // sure they aren't drifting apart due to any non-deterministic behavior on + // the GPU. 
It's also important to do this on the first iteration because + // the different networks may be initialized differently when tensor data + // is first passed through them. So this code block deals with these + // issues. + if (devices.size() > 1 && main_iteration_counter%2000 == 1) + { + for (size_t i = 1; i < devices.size(); ++i) + { + visit_layer_parameters(devices[i]->net, [&](size_t j, tensor& t) + { + memcpy(t, *reference_params[j]); + }); + } + } + + // If we have been running for a while then check if the loss is still + // dropping. If it isn't then we will reduce the learning rate. Note that we + // have a "budget" that prevents us from calling + // count_steps_without_decrease() every iteration. We do this because + // it can be expensive to compute when previous_loss_values is large. + if (gradient_check_budget > iter_without_progress_thresh && learning_rate_shrink != 1) + { + gradient_check_budget = 0; + steps_without_progress = count_steps_without_decrease(previous_loss_values); + if (steps_without_progress >= iter_without_progress_thresh) + { + // Double check that we aren't seeing decrease. This second check + // discards the top 10% largest values and checks again. We do + // this because sometimes a mini-batch might be bad and cause the + // loss to suddenly jump up, making count_steps_without_decrease() + // return a large number. But if we discard the top 10% of the + // values in previous_loss_values then we are robust to that kind + // of noise. Another way of looking at it, if the reason + // count_steps_without_decrease() returns a large value is only + // because the most recent loss values have suddenly been large, + // then we shouldn't stop or lower the learning rate. We should + // keep going until whatever disturbance we hit is damped down. + steps_without_progress = count_steps_without_decrease_robust(previous_loss_values); + if (steps_without_progress >= iter_without_progress_thresh) + { + // optimization has flattened out, so drop the learning rate. + learning_rate = learning_rate_shrink*learning_rate; + steps_without_progress = 0; + // Empty out some of the previous loss values so that steps_without_progress + // will decrease below iter_without_progress_thresh. + for (unsigned long cnt = 0; cnt < previous_loss_values_dump_amount+iter_without_progress_thresh/10 && previous_loss_values.size() > 0; ++cnt) + previous_loss_values.pop_front(); + } + } + } + else if (lr_schedule.size() != 0) // or use the learning rate schedule if we have one. + { + if (lr_schedule_pos < lr_schedule.size()) + learning_rate = lr_schedule(lr_schedule_pos++); + else + learning_rate = lr_schedule(lr_schedule.size()-1)*0.99; + } + } + } + catch(...) + { + // If an exception happens then permanently disable the trainer object. 
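+            // The stored exception is rethrown to the caller by propagate_exception(),
+            // e.g. the next time get_net(), get_solvers(), or one of the *_one_step()
+            // methods is called.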
+ job_pipe.disable(); + std::lock_guard<std::mutex> lock(eptr_mutex); + eptr = std::current_exception(); + } + + void wait_for_thread_to_pause() const + { + job_pipe.wait_for_num_blocked_dequeues(1); + } + + const static long string_pad = 11; + const static long epoch_string_pad = 4; + const static long lr_string_pad = 4; + + void init() + { + max_num_epochs = 10000; + mini_batch_size = 128; + verbose = false; + learning_rate = 1e-2; + min_learning_rate = 1e-5; + iter_without_progress_thresh = 2000; + steps_without_progress = 0; + test_iter_without_progress_thresh = 500; + test_steps_without_progress = 0; + + learning_rate_shrink = 0.1; + epoch_iteration = 0; + epoch_pos = 0; + train_one_step_calls = 0; + test_one_step_calls = 0; + gradient_check_budget = 0; + lr_schedule_pos = 0; + + main_iteration_counter = 0; + main_iteration_counter_at_last_disk_sync = 0; + prob_loss_increasing_thresh_default_value = 0.99; + prob_loss_increasing_thresh_max_value = 0.99999; + prob_loss_increasing_thresh = prob_loss_increasing_thresh_default_value; + updated_net_since_last_sync = false; + sync_file_reloaded = false; + previous_loss_values_dump_amount = 400; + test_previous_loss_values_dump_amount = 100; + + rs_test = running_stats_decayed<double>(200); + + start(); + } + + // serialize and deserialize are private because we hold net by reference so + // allowing someone to serialize this training object is weird and will likely + // result in user errors. However, we use these functions as part of the automatic + // sync code in this object. + friend void serialize(const dnn_trainer& item, std::ostream& out) + { + item.wait_for_thread_to_pause(); + int version = 12; + serialize(version, out); + + size_t nl = dnn_trainer::num_layers; + serialize(nl, out); + serialize(item.rs, out); + serialize(item.rs_test, out); + serialize(item.previous_loss_values, out); + serialize(item.max_num_epochs, out); + serialize(item.mini_batch_size, out); + serialize(item.verbose, out); + serialize(item.net, out); + serialize(item.devices[0]->solvers, out); + serialize(item.learning_rate.load(), out); + serialize(item.min_learning_rate, out); + serialize(item.iter_without_progress_thresh.load(), out); + serialize(item.steps_without_progress.load(), out); + serialize(item.learning_rate_shrink.load(), out); + serialize(item.epoch_iteration, out); + serialize(item.epoch_pos, out); + serialize(item.train_one_step_calls, out); + serialize(item.test_one_step_calls, out); + serialize(item.lr_schedule, out); + serialize(item.lr_schedule_pos, out); + serialize(item.test_iter_without_progress_thresh.load(), out); + serialize(item.test_steps_without_progress.load(), out); + serialize(item.test_previous_loss_values, out); + serialize(item.previous_loss_values_dump_amount, out); + serialize(item.test_previous_loss_values_dump_amount, out); + + } + friend void deserialize(dnn_trainer& item, std::istream& in) + { + item.wait_for_thread_to_pause(); + int version = 0; + deserialize(version, in); + if (version != 12) + throw serialization_error("Unexpected version found while deserializing dlib::dnn_trainer."); + + size_t num_layers = 0; + deserialize(num_layers, in); + if (num_layers != dnn_trainer::num_layers) + { + std::ostringstream sout; + sout << "Error deserializing dlib::dnn_trainer. The saved sync file is for a network with " << std::endl; + sout << "a different number of layers. 
We expected the number of layers to be " << dnn_trainer::num_layers << " but" << std::endl; + sout << "instead the file contains " << num_layers << " layers." << std::endl; + throw serialization_error(sout.str()); + } + + double dtemp; long ltemp; + deserialize(item.rs, in); + deserialize(item.rs_test, in); + deserialize(item.previous_loss_values, in); + deserialize(item.max_num_epochs, in); + deserialize(item.mini_batch_size, in); + deserialize(item.verbose, in); + deserialize(item.net, in); + deserialize(item.devices[0]->solvers, in); + deserialize(dtemp, in); item.learning_rate = dtemp; + deserialize(item.min_learning_rate, in); + deserialize(ltemp, in); item.iter_without_progress_thresh = ltemp; + deserialize(ltemp, in); item.steps_without_progress = ltemp; + deserialize(dtemp, in); item.learning_rate_shrink = dtemp; + deserialize(item.epoch_iteration, in); + deserialize(item.epoch_pos, in); + deserialize(item.train_one_step_calls, in); + deserialize(item.test_one_step_calls, in); + deserialize(item.lr_schedule, in); + deserialize(item.lr_schedule_pos, in); + deserialize(ltemp, in); item.test_iter_without_progress_thresh = ltemp; + deserialize(ltemp, in); item.test_steps_without_progress = ltemp; + deserialize(item.test_previous_loss_values, in); + deserialize(item.previous_loss_values_dump_amount, in); + deserialize(item.test_previous_loss_values_dump_amount, in); + + if (item.devices.size() > 1) + { + const auto prev_dev = dlib::cuda::get_device(); + // initialize all the other device networks and solver objects + for (size_t i = 1; i < item.devices.size(); ++i) + { + // Switch to this device so that any tensor objects that get allocated when + // we copy this stuff happen on this device. + dlib::cuda::set_device(item.devices[i]->device_id); + item.devices[i]->solvers = item.devices[0]->solvers; + item.devices[i]->net = item.devices[0]->net; + } + dlib::cuda::set_device(prev_dev); + } + } + + void sync_to_disk ( + bool do_it_now = false + ) + { + // don't sync anything if we haven't updated the network since the last sync + if (!updated_net_since_last_sync) + return; + + // If the sync file isn't set then don't do anything. + if (sync_filename.size() == 0) + return; + + // Only sync if it has been long enough since the last sync or we are being + // explicitly forced to do it. + if (std::chrono::system_clock::now() - last_sync_time > time_between_syncs || + do_it_now) + { + wait_for_thread_to_pause(); + + // compact network before saving to disk. + this->net.clean(); + + // if the loss has actually been going up since the last time we saved our + // state to disk then something has probably gone wrong in the + // optimization. So in this case we do the opposite and recall the + // previously saved state in the hopes that the problem won't reoccur. 
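+                // (Recovery is possible because we alternate between two files on disk,
+                // sync_filename and sync_filename + "_", always loading the newest and
+                // overwriting the oldest; see newest_syncfile()/oldest_syncfile() below.)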
+                if (loss_increased_since_last_disk_sync())
+                {
+                    std::ifstream fin(newest_syncfile(), std::ios::binary);
+                    deserialize(*this, fin);
+                    sync_file_reloaded = true;
+                    if (verbose)
+                        std::cout << "Loss has been increasing, reloading saved state from " << newest_syncfile() << std::endl;
+                }
+                else
+                {
+
+                    const std::string filename = oldest_syncfile();
+                    serialize(filename) << *this;
+
+                    if (verbose)
+                        std::cout << "Saved state to " << filename << std::endl;
+                }
+
+                last_sync_time = std::chrono::system_clock::now();
+                main_iteration_counter_at_last_disk_sync = main_iteration_counter;
+                updated_net_since_last_sync = false;
+            }
+        }
+
+        std::string newest_syncfile (
+        )
+        {
+            return select_newest_file(sync_filename, sync_filename + "_");
+        }
+
+        std::string oldest_syncfile (
+        )
+        {
+            return select_oldest_file(sync_filename, sync_filename + "_");
+        }
+
+        bool loss_increased_since_last_disk_sync()
+        {
+            size_t gradient_updates_since_last_sync = main_iteration_counter - main_iteration_counter_at_last_disk_sync;
+
+            // if we haven't synced anything to disk yet then return false.
+            if (!std::ifstream(newest_syncfile(), std::ios::binary))
+                return false;
+
+            for (auto x : previous_loss_values)
+            {
+                // If we get a NaN value of loss assume things have gone horribly wrong and
+                // we should reload the state of the trainer.
+                if (std::isnan(x))
+                    return true;
+            }
+
+            // if we haven't seen much data yet then just say false.  Or, alternatively, if
+            // it's been too long since the last sync then don't reload either.
+            if (gradient_updates_since_last_sync < 30 || previous_loss_values.size() < 2*gradient_updates_since_last_sync)
+                return false;
+
+            // Now look at the data since a little before the last disk sync.  We will
+            // check if the loss is getting better or worse.
+            running_gradient g;
+            for (size_t i = previous_loss_values.size() - 2*gradient_updates_since_last_sync; i < previous_loss_values.size(); ++i)
+                g.add(previous_loss_values[i]);
+
+            // if the loss is very likely to be increasing then return true
+            const double prob = g.probability_gradient_greater_than(0);
+            if (prob > prob_loss_increasing_thresh && prob_loss_increasing_thresh <= prob_loss_increasing_thresh_max_value)
+            {
+                // Exponentially decay the threshold towards 1 so that if we keep finding
+                // the loss to be increasing over and over we will make the test
+                // progressively harder and harder until it fails, therefore ensuring we
+                // can't get stuck reloading from a previous state over and over.
+                prob_loss_increasing_thresh = 0.1*prob_loss_increasing_thresh + 0.9*1;
+                return true;
+            }
+            else
+            {
+                // decay back to the default threshold
+                prob_loss_increasing_thresh = std::pow(prob_loss_increasing_thresh, 10.0);
+                // but don't decay below the default value
+                prob_loss_increasing_thresh = std::max(prob_loss_increasing_thresh, prob_loss_increasing_thresh_default_value);
+
+                return false;
+            }
+        }
+
+
+        struct clone_net{};
+
+        // per device state.  All the containers have the same number of objects in them.
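+        // Note the clone_net tag constructor in device_data below: each extra GPU gets
+        // its own private copy of the network (net_copy), while the first device trains
+        // the user's network in place through the net reference.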
+ struct device_data + { + device_data( + int device_id_, + net_type& net_, + const solver_type& solver_ + ) : device_id(device_id_), net(net_), solvers(num_computational_layers, solver_) {} + + device_data( + int device_id_, + net_type& net_, + const solver_type& solver_, + clone_net + ) : device_id(device_id_), net_copy(std::make_shared<net_type>(net_)), net(*net_copy), solvers(num_computational_layers, solver_) {} + + int device_id; + std::shared_ptr<net_type> net_copy; + net_type& net; + std::vector<solver_type> solvers; + }; + + template < + typename data_iterator, + typename label_iterator + > + void send_job ( + bool test_only, + data_iterator dbegin, + data_iterator dend, + label_iterator lbegin + ) + { + propagate_exception(); + size_t num = std::distance(dbegin, dend); + size_t devs = devices.size(); + job.t.resize(devs); + job.labels.resize(devs); + job.have_data.resize(devs); + job.test_only = test_only; + + // chop the data into devs blocks, each of about block_size elements. + size_t block_size = (num+devs-1)/devs; + + const auto prev_dev = dlib::cuda::get_device(); + for (size_t i = 0; i < devs; ++i) + { + dlib::cuda::set_device(devices[i]->device_id); + + size_t start = i*block_size; + size_t stop = std::min(num, start+block_size); + + if (start < stop) + { + devices[i]->net.to_tensor(dbegin+start, dbegin+stop, job.t[i]); + job.labels[i].assign(lbegin+start, lbegin+stop); + job.have_data[i] = true; + } + else + { + job.have_data[i] = false; + } + } + + dlib::cuda::set_device(prev_dev); + job_pipe.enqueue(job); + } + + template < + typename data_iterator + > + void send_job ( + bool test_only, + data_iterator dbegin, + data_iterator dend + ) + { + typename std::vector<training_label_type>::iterator nothing; + send_job(test_only, dbegin, dend, nothing); + } + + void print_progress() + { + if (lr_schedule.size() == 0) + { + if (test_previous_loss_values.size() == 0) + std::cout << "steps without apparent progress: " << steps_without_progress; + else + std::cout << "steps without apparent progress: train=" << steps_without_progress << ", test=" << test_steps_without_progress; + } + else + { + std::ostringstream sout; + sout << "percent complete: " << std::fixed << std::setprecision(2) << 100.0*lr_schedule_pos/(double)lr_schedule.size() << "%"; + std::cout << sout.str(); + } + std::cout << std::endl; + } + + void print_periodic_verbose_status() + { + if (verbose) + { + using namespace std::chrono; + auto now_time = system_clock::now(); + if (now_time-last_time > seconds(40)) + { + last_time = now_time; + std::cout << "step#: " << rpad(cast_to_string(train_one_step_calls),epoch_string_pad) << " " + << "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " "; + if (test_previous_loss_values.size() == 0) + { + std::cout << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; + } + else + { + std::cout << "train loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "; + std::cout << "test loss: " << rpad(cast_to_string(get_average_test_loss()),string_pad) << " "; + } + print_progress(); + clear_average_loss(); + } + } + } + + std::vector<std::shared_ptr<device_data>> devices; + dlib::pipe<job_t> job_pipe; + job_t job; + + + running_stats<double> rs; + running_stats_decayed<double> rs_test; + std::deque<double> previous_loss_values; + unsigned long max_num_epochs; + size_t mini_batch_size; + bool verbose; + net_type& net; + std::atomic<double> learning_rate; + double min_learning_rate; + std::atomic<unsigned long> 
iter_without_progress_thresh; + std::atomic<unsigned long> steps_without_progress; + + std::atomic<unsigned long> test_iter_without_progress_thresh; + std::atomic<unsigned long> test_steps_without_progress; + std::deque<double> test_previous_loss_values; + + std::atomic<double> learning_rate_shrink; + std::chrono::time_point<std::chrono::system_clock> last_sync_time; + std::string sync_filename; + std::chrono::seconds time_between_syncs; + unsigned long epoch_iteration; + size_t epoch_pos; + std::chrono::time_point<std::chrono::system_clock> last_time; + unsigned long long train_one_step_calls; + unsigned long long test_one_step_calls; + matrix<double,0,1> lr_schedule; + long lr_schedule_pos; + unsigned long gradient_check_budget; + + std::exception_ptr eptr = nullptr; + mutable std::mutex eptr_mutex; + void propagate_exception() const + { + std::lock_guard<std::mutex> lock(eptr_mutex); + if (eptr) + std::rethrow_exception(eptr); + } + + // These 5 variables are not serialized + size_t main_iteration_counter; + size_t main_iteration_counter_at_last_disk_sync; + double prob_loss_increasing_thresh_default_value; + double prob_loss_increasing_thresh_max_value; + double prob_loss_increasing_thresh; + std::atomic<bool> updated_net_since_last_sync; + + bool sync_file_reloaded; + unsigned long previous_loss_values_dump_amount; + unsigned long test_previous_loss_values_dump_amount; + }; + +// ---------------------------------------------------------------------------------------- + + template < + typename net_type, + typename solver_type + > + std::ostream& operator<< ( + std::ostream& out, + dnn_trainer<net_type,solver_type>& trainer + ) + { + using std::endl; + out << "dnn_trainer details: \n"; + out << " net_type::num_layers: " << net_type::num_layers << endl; + // figure out how big the net is in MB. + std::ostringstream sout; + net_type temp = trainer.get_net(); // make a copy so that we can clean it without mutating the trainer's net. + temp.clean(); + serialize(temp, sout); + out << " net size: " << sout.str().size()/1024.0/1024.0 << "MB" << endl; + // Don't include the loss params in the hash since we print them on the next line. + // They also aren't really part of the "architecture" of the network. 
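+        // The hash below is an md5 of the network's printed description (via
+        // cast_to_string), so it identifies the architecture rather than the current
+        // parameter values.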
+        out << " net architecture hash: " << md5(cast_to_string(trainer.get_net().subnet())) << endl;
+        out << " loss: " << trainer.get_net().loss_details() << endl;
+
+        out << " synchronization file: " << trainer.get_synchronization_file() << endl;
+        out << " trainer.get_solvers()[0]: " << trainer.get_solvers()[0] << endl;
+        auto sched = trainer.get_learning_rate_schedule();
+        if (sched.size() != 0)
+        {
+            out << " using explicit user-supplied learning rate schedule" << endl;
+        }
+        else
+        {
+            out << " learning rate: "<< trainer.get_learning_rate() << endl;
+            out << " learning rate shrink factor: "<< trainer.get_learning_rate_shrink_factor() << endl;
+            out << " min learning rate: "<< trainer.get_min_learning_rate() << endl;
+            out << " iterations without progress threshold: "<< trainer.get_iterations_without_progress_threshold() << endl;
+            out << " test iterations without progress threshold: "<< trainer.get_test_iterations_without_progress_threshold() << endl;
+        }
+        return out;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_TRAINER_H_
+
diff --git a/ml/dlib/dlib/dnn/trainer_abstract.h b/ml/dlib/dlib/dnn/trainer_abstract.h
new file mode 100644
index 000000000..3bfb6dc99
--- /dev/null
+++ b/ml/dlib/dlib/dnn/trainer_abstract.h
@@ -0,0 +1,765 @@
+// Copyright (C) 2015  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_DNn_TRAINER_ABSTRACT_H_
+#ifdef DLIB_DNn_TRAINER_ABSTRACT_H_
+
+#include "core_abstract.h"
+#include "solvers_abstract.h"
+#include <vector>
+#include <chrono>
+
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    enum class force_flush_to_disk {
+        no = 0,
+        yes = 1
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename net_type,
+        typename solver_type = sgd
+        >
+    class dnn_trainer
+    {
+        /*!
+            REQUIREMENTS ON net_type
+                - net_type is an add_loss_layer object.
+
+            REQUIREMENTS ON solver_type
+                - solver_type is an implementation of the EXAMPLE_SOLVER interface defined
+                  in solvers_abstract.h
+
+            WHAT THIS OBJECT REPRESENTS
+                This object is a tool for training a deep neural network.  To use it you
+                supply a neural network type and a solver, then you call train() with your
+                training data and it will update the network you supplied so that it has
+                hopefully learned something useful from your training data.
+
+                If you are compiling with CUDA then this object will use the GPU that is
+                currently selected (i.e. the one indicated by cudaGetDevice()) when
+                dnn_trainer is constructed.  It will continue to use that device even if
+                you later change it by a call to cudaSetDevice().
+
+            EXCEPTIONS
+                If an exception is thrown by any part of the neural network during training
+                then the exception will be propagated out of the trainer to the user.
+                Moreover, the trainer instance will be unusable and should be destroyed.
+        !*/
+
+    public:
+
+        typedef typename net_type::training_label_type training_label_type;
+        typedef typename net_type::input_type input_type;
+        const static size_t num_computational_layers = net_type::num_computational_layers;
+
+        dnn_trainer() = delete;
+        dnn_trainer(const dnn_trainer&) = delete;
+        dnn_trainer& operator=(const dnn_trainer&) = delete;
+
+        dnn_trainer(
+            net_type& net,
+            const solver_type& solver = solver_type(),
+            const std::vector<int>& cuda_extra_devices = {}
+        );
+        /*!
+ requires + - for all valid i: + - 0 <= cuda_extra_devices[i] < dlib::cuda::get_num_devices() + ensures + - &#get_net() == &net + (i.e. The dnn_trainer holds a reference to net, it does not copy it. + Therefore, you must ensure net has a lifetime at least as long as the + dnn_trainer). + - #get_solvers() == a set of solvers that are all initialized with the + provided solver instance. + - #get_max_num_epochs() == 10000 + - #get_mini_batch_size() == 128 + - #get_learning_rate() == 1e-2 + - #get_min_learning_rate() == 1e-5 + - #get_iterations_without_progress_threshold() == 2000 + - #get_test_iterations_without_progress_threshold() == 500 + - #get_learning_rate_shrink_factor() == 0.1 + - #get_learning_rate_schedule().size() == 0 + - #get_train_one_step_calls() == 0 + - #get_test_one_step_calls() == 0 + - #get_synchronization_file() == "" + - if (cuda_extra_devices.size() > 0) then + - This object will use multiple graphics cards to run the learning + algorithms. In particular, it will always use whatever device is + currently selected on the calling thread (the device indicated by + cudaGetDevice()). In addition, you can ask to use additional + devices, which you do by putting their device numbers into + cuda_extra_devices. + !*/ + + net_type& get_net ( + force_flush_to_disk force_flush = force_flush_to_disk::yes + ); + /*! + ensures + - returns the neural network object used by this trainer. This is the + network that is optimized when you call train() or train_one_step(). + Recall that the dnn_trainer doesn't contain the net_type object but + simply holds a reference to an external network which was provided to the + dnn_trainer's constructor. + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + - If force_flush is yes, then this function will sync the trainer state to + disk if the current state hasn't already been synced to disk since the + last network modification. + !*/ + + const std::vector<solver_type>& get_solvers ( + ) const; + /*! + ensures + - returns the solvers used to optimize each layer of the neural network + get_net(). In particular, the first layer's solver is + get_solvers()[0], the second layer's solver is + get_solvers()[1], and so on. + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + !*/ + + unsigned long get_mini_batch_size ( + ) const; + /*! + ensures + - During training, we call the network's update() routine over and over + with training data. The number of training samples we give to each call + to update is the "mini-batch size", which is defined by + get_mini_batch_size(). + !*/ + + void set_mini_batch_size ( + unsigned long batch_size + ); + /*! + requires + - batch_size > 0 + ensures + - #get_mini_batch_size() == batch_size + !*/ + + unsigned long get_max_num_epochs ( + ) const; + /*! + ensures + - train() will execute at most get_max_num_epochs() iterations over the + training data before returning. + !*/ + + void set_max_num_epochs ( + unsigned long num + ); + /*! + requires + - num > 0 + ensures + - #get_max_num_epochs() == num + !*/ + + void set_learning_rate ( + double lr + ); + /*! + requires + - lr > 0 + ensures + - #get_learning_rate() == lr + - #get_learning_rate_schedule().size() == 0 + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + !*/ + + double get_learning_rate( + ) const; + /*! + ensures + - During each training step, a solver tells us how to modify the parameters + of each layer in the network. 
It does this by outputting a step vector
+                  that, when added to the parameters, will hopefully result in improved
+                  network performance.  The learning rate is one of the inputs to the
+                  solver and influences the size of this step vector.  This function
+                  returns the current learning rate, that is, the learning rate that will
+                  be used during the next training step.
+        !*/
+
+        void set_min_learning_rate (
+            double lr
+        );
+        /*!
+            requires
+                - lr > 0
+            ensures
+                - #get_min_learning_rate() == lr
+                - #get_learning_rate_schedule().size() == 0
+                - This function blocks until all threads inside the dnn_trainer have
+                  stopped touching the net.
+        !*/
+
+        double get_min_learning_rate (
+        ) const;
+        /*!
+            ensures
+                - During training via this->train(), this object will test if progress is
+                  still being made and if it isn't then it will reduce get_learning_rate()
+                  by setting it to get_learning_rate()*get_learning_rate_shrink_factor().
+                  However, it will not reduce it below get_min_learning_rate().  Once this
+                  minimum learning rate is crossed the training will terminate.
+                - get_min_learning_rate() doesn't apply if you are using train_one_step().
+                  You can keep calling train_one_step() as many times as you want and the
+                  learning rate will drop infinitely close to 0 if you run long enough.
+        !*/
+
+        template <typename EXP>
+        void set_learning_rate_schedule (
+            const matrix_exp<EXP>& schedule
+        );
+        /*!
+            requires
+                - schedule.size() > 0
+                - min(schedule) > 0
+            ensures
+                - #get_learning_rate_schedule() == reshape_to_column_vector(schedule)
+                - #get_learning_rate() == schedule(0,0)
+                - #get_min_learning_rate() == min(schedule)
+                - #get_learning_rate_shrink_factor() == 1
+        !*/
+
+        const matrix<double,0,1>& get_learning_rate_schedule (
+        ) const;
+        /*!
+            ensures
+                - if (this function returns a non-empty matrix) then
+                    - This trainer will use an explicit learning rate schedule defined by
+                      the learning rate values in get_learning_rate_schedule().  For
+                      example, if get_learning_rate_schedule() returned {0.1, 0.09, 0.08,
+                      0.07, 0.06} then the first training mini-batch would use a learning
+                      rate of 0.1, then the next training mini-batch uses 0.09, and then
+                      0.08, and so on until the end of the schedule is reached.
+
+                      If you continue to run training after the end of the schedule has
+                      been reached then the learning rate will be fixed to 0.99 times the
+                      final value.  So in our example, eventually the learning rate would
+                      be fixed to 0.99*0.06.  This allows you to test if we have reached the
+                      end of the schedule by checking if get_learning_rate() < 0.06.
+        !*/
+
+        unsigned long get_steps_without_progress (
+        ) const;
+        /*!
+            ensures
+                - if (get_learning_rate_shrink_factor() != 1) then
+                    - returns an estimate of how many mini-batches have executed without us
+                      observing a statistically significant decrease in the training error.
+                - else
+                    - returns 0
+        !*/
+
+        void set_iterations_without_progress_threshold (
+            unsigned long thresh
+        );
+        /*!
+            ensures
+                - #get_iterations_without_progress_threshold() == thresh
+                - #get_learning_rate_schedule().size() == 0
+                - This function blocks until all threads inside the dnn_trainer have
+                  stopped touching the net.
+        !*/
+
+        unsigned long get_iterations_without_progress_threshold (
+        ) const;
+        /*!
+            ensures
+                - This object monitors the progress of training and estimates if the
+                  training error is being reduced.
It does this by looking at the previous
+                  get_iterations_without_progress_threshold() mini-batch results and
+                  applying the statistical test defined by the running_gradient object to
+                  see if the training error is getting smaller.  If it isn't being reduced
+                  then get_learning_rate() is made smaller by a factor of get_learning_rate_shrink_factor().
+
+                  Therefore, get_iterations_without_progress_threshold() should always be
+                  set to something sensibly large so that this test can be done with
+                  reasonably high confidence.  Think of this test as saying "if the loss
+                  hasn't decreased for the previous get_iterations_without_progress_threshold()
+                  mini-batches then shrink the learning rate".
+        !*/
+
+        void set_learning_rate_shrink_factor (
+            double shrink
+        );
+        /*!
+            requires
+                - 0 < shrink && shrink <= 1
+            ensures
+                - #get_learning_rate_shrink_factor() == shrink
+                - #get_learning_rate_schedule().size() == 0
+                - This function blocks until all threads inside the dnn_trainer have
+                  stopped touching the net.
+        !*/
+
+        double get_learning_rate_shrink_factor (
+        ) const;
+        /*!
+            ensures
+                - Whenever the training routine thinks it isn't making progress anymore it
+                  will reduce get_learning_rate() by multiplying it by get_learning_rate_shrink_factor().
+                - You can disable the automatic learning rate reduction by setting
+                  get_learning_rate_shrink_factor() to 1.
+        !*/
+
+        unsigned long long get_train_one_step_calls (
+        ) const;
+        /*!
+            ensures
+                - returns the number of times train_one_step() has been called.
+        !*/
+
+        unsigned long long get_test_one_step_calls (
+        ) const;
+        /*!
+            ensures
+                - returns the number of times test_one_step() has been called.
+        !*/
+
+        void be_verbose (
+        );
+        /*!
+            ensures
+                - This object will print status messages to standard out so that a
+                  user can observe the progress of the algorithm.
+        !*/
+
+        void be_quiet (
+        );
+        /*!
+            ensures
+                - This object will not print anything to standard out.
+        !*/
+
+        void set_synchronization_file (
+            const std::string& filename,
+            std::chrono::seconds time_between_syncs = std::chrono::minutes(15)
+        );
+        /*!
+            ensures
+                - #get_synchronization_file() == filename
+                - While training is running, either via train() or repeated calls to
+                  train_one_step(), this object will save its entire state, including the
+                  state of get_net(), to disk in the file named filename every
+                  time_between_syncs seconds.
+                - If the filename file already exists then the state of this trainer will
+                  be loaded from that file by this call to set_synchronization_file().
+                  This allows you to resume a training session which was previously
+                  interrupted.
+                - It should be noted that when saving, the trainer will alternate between
+                  saving to a file called filename and another file called filename+"_".
+                  We do this because it's possible that your computer might crash (not
+                  because of dlib, just in general) before the data is safely saved to
+                  disk.  This way, you will always have a backup file if the write to disk
+                  gets corrupted or is incomplete.  Moreover, when loading, we will always
+                  load from the newest of the two possible files.
+        !*/
+
+        const std::string& get_synchronization_file (
+        );
+        /*!
+            ensures
+                - Returns the name of the file the dnn_trainer will periodically save its
+                  state to.  If the return value is "" then synchronization is disabled.
+        !*/
+
+        void train (
+            const std::vector<input_type>& data,
+            const std::vector<training_label_type>& labels
+        );
+        /*!
+            requires
+                - data.size() == labels.size()
+                - data.size() > 0
+                - net_type uses a supervised loss.
+                  i.e.
net_type::training_label_type != no_label_type. + ensures + - Trains a supervised neural network based on the given training data. + The goal of training is to find the network parameters that minimize + get_net().compute_loss(data.begin(), data.end(), labels.begin()). + - The optimizer will run until get_learning_rate() < get_min_learning_rate() + or get_max_num_epochs() training epochs have been executed. + - Each layer in the network will be optimized by its corresponding solver + in get_solvers(). + - Each call to train DOES NOT reinitialize the state of get_net() or + get_solvers(). That is, the existing state of the solvers and network is + the starting point for the optimization each time train() is called. In + particular, if you use the set_synchronization_file() method you can + resume an interrupted train() call by simply calling train() again and it + will pick up from the last synchronization point. + - You can obtain the average loss value during the final training epoch by + calling get_average_loss(). + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + !*/ + + void train ( + const std::vector<input_type>& data + ); + /*! + requires + - data.size() > 0 + - net_type uses an unsupervised loss. + i.e. net_type::training_label_type == no_label_type. + ensures + - Trains an unsupervised neural network based on the given training data. + The goal of training is to find the network parameters that minimize + get_net().compute_loss(data.begin(), data.end()). + - The optimizer will run until get_learning_rate() < get_min_learning_rate() + or get_max_num_epochs() training epochs have been executed. + - Each layer in the network will be optimized by its corresponding solver + in get_solvers(). + - Each call to train DOES NOT reinitialize the state of get_net() or + get_solvers(). That is, the existing state of the solvers and network is + the starting point for the optimization each time train() is called. In + particular, if you use the set_synchronization_file() method you can + resume an interrupted train() call by simply calling train() again and it + will pick up from the last synchronization point. + - You can obtain the average loss value during the final training epoch by + calling get_average_loss(). + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + !*/ + + void train_one_step ( + const std::vector<input_type>& data, + const std::vector<training_label_type>& labels + ); + /*! + requires + - data.size() == labels.size() + - data.size() > 0 + - net_type uses a supervised loss. + i.e. net_type::training_label_type != no_label_type. + ensures + - Performs one stochastic gradient update step based on the mini-batch of + data and labels supplied to this function. In particular, calling + train_one_step() in a loop is equivalent to calling the train() method + defined above. However, train_one_step() allows you to stream data from + disk into the training process while train() requires you to first load + all the training data into RAM. Otherwise, these training methods are + equivalent. + - You can observe the current average loss value by calling get_average_loss(). + - The network training will happen in another thread. Therefore, after + calling this function you should call get_net() before you touch the net + object from the calling thread to ensure no other threads are still + accessing the network. + - #get_train_one_step_calls() == get_train_one_step_calls() + 1. 
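+                - For example, a typical streaming loop looks like the following
+                  sketch (get_next_minibatch() here is a hypothetical user function,
+                  not part of dlib):
+                    std::vector<input_type> data;
+                    std::vector<training_label_type> labels;
+                    while (get_next_minibatch(data, labels))
+                        trainer.train_one_step(data, labels);
+                    trainer.get_net(); // block until the training thread is idle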
+ !*/ + + template < + typename data_iterator, + typename label_iterator + > + void train_one_step ( + data_iterator dbegin, + data_iterator dend, + label_iterator lbegin + ); + /*! + requires + - std::advance(lbegin, std::distance(dbegin, dend) - 1) is dereferencable + - std::distance(dbegin, dend) > 0 + - net_type uses a supervised loss. + i.e. net_type::training_label_type != no_label_type. + ensures + - Performs one stochastic gradient update step based on the mini-batch of + data and labels supplied to this function. In particular, calling + train_one_step() in a loop is equivalent to calling the train() method + defined above. However, train_one_step() allows you to stream data from + disk into the training process while train() requires you to first load + all the training data into RAM. Otherwise, these training methods are + equivalent. + - You can observe the current average loss value by calling get_average_loss(). + - The network training will happen in another thread. Therefore, after + calling this function you should call get_net() before you touch the net + object from the calling thread to ensure no other threads are still + accessing the network. + - #get_train_one_step_calls() == get_train_one_step_calls() + 1. + !*/ + + void train_one_step ( + const std::vector<input_type>& data + ); + /*! + requires + - data.size() > 0 + - net_type uses an unsupervised loss. + i.e. net_type::training_label_type == no_label_type. + ensures + - Performs one stochastic gradient update step based on the mini-batch of + data supplied to this function. In particular, calling train_one_step() + in a loop is equivalent to calling the train() method defined above. + However, train_one_step() allows you to stream data from disk into the + training process while train() requires you to first load all the + training data into RAM. Otherwise, these training methods are + equivalent. + - You can observe the current average loss value by calling get_average_loss(). + - The network training will happen in another thread. Therefore, after + calling this function you should call get_net() before you touch the net + object from the calling thread to ensure no other threads are still + accessing the network. + - #get_train_one_step_calls() == get_train_one_step_calls() + 1. + !*/ + + template < + typename data_iterator + > + void train_one_step ( + data_iterator dbegin, + data_iterator dend + ); + /*! + requires + - std::distance(dbegin, dend) > 0 + - net_type uses an unsupervised loss. + i.e. net_type::training_label_type == no_label_type. + ensures + - Performs one stochastic gradient update step based on the mini-batch of + data supplied to this function. In particular, calling train_one_step() + in a loop is equivalent to calling the train() method defined above. + However, train_one_step() allows you to stream data from disk into the + training process while train() requires you to first load all the + training data into RAM. Otherwise, these training methods are + equivalent. + - You can observe the current average loss value by calling get_average_loss(). + - The network training will happen in another thread. Therefore, after + calling this function you should call get_net() before you touch the net + object from the calling thread to ensure no other threads are still + accessing the network. + - #get_train_one_step_calls() == get_train_one_step_calls() + 1. + !*/ + + double get_average_loss ( + ) const; + /*! 
+ ensures + - returns the average loss value observed during previous calls to + train_one_step() or train(). That is, the average output of + net_type::update() during the previous mini-batch updates. + - Note that, if be_verbose() has been called, then this object will + automatically call clear_average_loss() periodically when it logs the + loss to the console. + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + !*/ + + void clear_average_loss ( + ); + /*! + ensures + - #get_average_loss() == 0 + - get_average_loss() uses a dlib::running_stats object to keep a running + average of the loss values seen during the previous mini-batch updates + applied during training. Calling clear_average_loss() resets the + running_stats object so it forgets about all previous loss values + observed. + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + !*/ + + // ---------------------- + + double get_average_test_loss ( + ) const; + /*! + ensures + - returns the average loss value observed during previous calls to + test_one_step(). + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + !*/ + + void test_one_step ( + const std::vector<input_type>& data, + const std::vector<training_label_type>& labels + ); + /*! + requires + - data.size() == labels.size() + - data.size() > 0 + - net_type uses a supervised loss. + i.e. net_type::training_label_type != no_label_type. + ensures + - Runs the given data through the network and computes and records the loss. + - This call does not modify network parameters. The point of + test_one_step() is two fold, to allow you to observe the accuracy of the + network on hold out data during training, and to allow the trainer to + automatically adjust the learning rate when the test loss stops + improving. It should be noted that you are not required to use + test_one_step() at all, but if you want to do this kind of thing it is + available. + - You can observe the current average loss value by calling get_average_test_loss(). + - The computation will happen in another thread. Therefore, after calling + this function you should call get_net() before you touch the net object + from the calling thread to ensure no other threads are still accessing + the network. + - #get_test_one_step_calls() == get_test_one_step_calls() + 1. + !*/ + + template < + typename data_iterator, + typename label_iterator + > + void test_one_step ( + data_iterator dbegin, + data_iterator dend, + label_iterator lbegin + ); + /*! + requires + - std::advance(lbegin, std::distance(dbegin, dend) - 1) is dereferencable + - std::distance(dbegin, dend) > 0 + - net_type uses a supervised loss. + i.e. net_type::training_label_type != no_label_type. + ensures + - Runs the given data through the network and computes and records the loss. + - This call does not modify network parameters. The point of + test_one_step() is two fold, to allow you to observe the accuracy of the + network on hold out data during training, and to allow the trainer to + automatically adjust the learning rate when the test loss stops + improving. It should be noted that you are not required to use + test_one_step() at all, but if you want to do this kind of thing it is + available. + - You can observe the current average loss value by calling get_average_test_loss(). + - The computation will happen in another thread. 
Therefore, after calling + this function you should call get_net() before you touch the net object + from the calling thread to ensure no other threads are still accessing + the network. + - #get_test_one_step_calls() == get_test_one_step_calls() + 1. + !*/ + + void test_one_step ( + const std::vector<input_type>& data + ); + /*! + requires + - data.size() > 0 + - net_type uses an unsupervised loss. + i.e. net_type::training_label_type == no_label_type. + ensures + - Runs the given data through the network and computes and records the loss. + - This call does not modify network parameters. The point of + test_one_step() is two fold, to allow you to observe the accuracy of the + network on hold out data during training, and to allow the trainer to + automatically adjust the learning rate when the test loss stops + improving. It should be noted that you are not required to use + test_one_step() at all, but if you want to do this kind of thing it is + available. + - You can observe the current average loss value by calling get_average_test_loss(). + - The computation will happen in another thread. Therefore, after calling + this function you should call get_net() before you touch the net object + from the calling thread to ensure no other threads are still accessing + the network. + - #get_test_one_step_calls() == get_test_one_step_calls() + 1. + !*/ + + template < + typename data_iterator + > + void test_one_step ( + data_iterator dbegin, + data_iterator dend + ); + /*! + requires + - std::distance(dbegin, dend) > 0 + - net_type uses an unsupervised loss. + i.e. net_type::training_label_type == no_label_type. + ensures + - Runs the given data through the network and computes and records the loss. + - This call does not modify network parameters. The point of + test_one_step() is two fold, to allow you to observe the accuracy of the + network on hold out data during training, and to allow the trainer to + automatically adjust the learning rate when the test loss stops + improving. It should be noted that you are not required to use + test_one_step() at all, but if you want to do this kind of thing it is + available. + - You can observe the current average loss value by calling get_average_test_loss(). + - The computation will happen in another thread. Therefore, after calling + this function you should call get_net() before you touch the net object + from the calling thread to ensure no other threads are still accessing + the network. + - #get_test_one_step_calls() == get_test_one_step_calls() + 1. + !*/ + + void set_test_iterations_without_progress_threshold ( + unsigned long thresh + ); + /*! + ensures + - #get_test_iterations_without_progress_threshold() == thresh + - #get_learning_rate_schedule().size() == 0 + - This function blocks until all threads inside the dnn_trainer have + stopped touching the net. + !*/ + + unsigned long get_test_iterations_without_progress_threshold ( + ) const; + /*! + ensures + - This object monitors the progress of training and estimates if the + testing error is being reduced. It does this by looking at the previous + get_test_iterations_without_progress_threshold() mini-batch results from + test_one_step() and applying the statistical test defined by the + running_gradient object to see if the testing error is getting smaller. + If it isn't being reduced then get_learning_rate() is made smaller by a + factor of get_learning_rate_shrink_factor(). 
+
+                  Therefore, get_test_iterations_without_progress_threshold() should
+                  always be set to something sensibly large so that this test can be done
+                  with reasonably high confidence.  Think of this test as saying "if the
+                  testing loss hasn't decreased for the previous
+                  get_test_iterations_without_progress_threshold() calls to
+                  test_one_step() then shrink the learning rate".
+        !*/
+
+        unsigned long get_test_steps_without_progress (
+        ) const;
+        /*!
+            ensures
+                - if (get_learning_rate_shrink_factor() != 1) then
+                    - returns an estimate of how many mini-batches have executed without
+                      us observing a statistically significant decrease in the testing
+                      error (i.e. the error on the data given to the trainer via
+                      test_one_step() calls).
+                - else
+                    - returns 0
+        !*/
+
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename net_type,
+        typename solver_type
+    >
+    std::ostream& operator<< (
+        std::ostream& out,
+        dnn_trainer<net_type,solver_type>& trainer
+    );
+    /*!
+        ensures
+            - Prints a log of the current parameters of trainer to out.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_TRAINER_ABSTRACT_H_
+
+
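The train/test interplay documented above is easiest to see at a call site. Here is a minimal sketch of the intended usage; the network type, data containers, and threshold values (net_type, train_images, test_labels, 500, 30, and so on) are illustrative assumptions, not part of this diff:

    // Interleave train_one_step() with periodic test_one_step() so the trainer
    // can shrink the learning rate once the held-out loss stops improving.
    dlib::dnn_trainer<net_type> trainer(net);
    trainer.set_learning_rate(0.1);
    trainer.set_test_iterations_without_progress_threshold(500);
    while (trainer.get_learning_rate() >= 1e-6)
    {
        trainer.train_one_step(train_images, train_labels);
        // Record the held-out loss every 30 steps; the ratio is arbitrary.
        if (trainer.get_train_one_step_calls() % 30 == 0)
            trainer.test_one_step(test_images, test_labels);
    }
    net = trainer.get_net();  // blocks until all trainer threads stop touching net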
diff --git a/ml/dlib/dlib/dnn/utilities.h b/ml/dlib/dlib/dnn/utilities.h
new file mode 100644
index 000000000..976128c81
--- /dev/null
+++ b/ml/dlib/dlib/dnn/utilities.h
@@ -0,0 +1,281 @@
+// Copyright (C) 2016  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_UTILITIES_H_
+#define DLIB_DNn_UTILITIES_H_
+
+#include "core.h"
+#include "utilities_abstract.h"
+#include "../geometry.h"
+#include <fstream>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    inline double log1pexp(double x)
+    {
+        using std::exp;
+        using namespace std; // Do this instead of using std::log1p because some
+                             // compilers error out otherwise (e.g. gcc 4.9 in cygwin).
+        if (x <= -37)
+            return exp(x);
+        else if (-37 < x && x <= 18)
+            return log1p(exp(x));
+        else if (18 < x && x <= 33.3)
+            return x + exp(-x);
+        else
+            return x;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline void randomize_parameters (
+        tensor& params,
+        unsigned long num_inputs_and_outputs,
+        dlib::rand& rnd
+    )
+    {
+        for (auto& val : params)
+        {
+            // Draw a random number to initialize the layer according to formula (16)
+            // from "Understanding the difficulty of training deep feedforward neural
+            // networks" by Xavier Glorot and Yoshua Bengio.
+            val = 2*rnd.get_random_float()-1;
+            val *= std::sqrt(6.0/(num_inputs_and_outputs));
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    namespace impl
+    {
+        class visitor_net_to_xml
+        {
+        public:
+
+            visitor_net_to_xml(std::ostream& out_) : out(out_) {}
+
+            template<typename input_layer_type>
+            void operator()(size_t idx, const input_layer_type& l)
+            {
+                out << "<layer idx='"<<idx<<"' type='input'>\n";
+                to_xml(l,out);
+                out << "</layer>\n";
+            }
+
+            template <typename T, typename U>
+            void operator()(size_t idx, const add_loss_layer<T,U>& l)
+            {
+                out << "<layer idx='"<<idx<<"' type='loss'>\n";
+                to_xml(l.loss_details(),out);
+                out << "</layer>\n";
+            }
+
+            template <typename T, typename U, typename E>
+            void operator()(size_t idx, const add_layer<T,U,E>& l)
+            {
+                out << "<layer idx='"<<idx<<"' type='comp'>\n";
+                to_xml(l.layer_details(),out);
+                out << "</layer>\n";
+            }
+
+            template <unsigned long ID, typename U, typename E>
+            void operator()(size_t idx, const add_tag_layer<ID,U,E>& l)
+            {
+                out << "<layer idx='"<<idx<<"' type='tag' id='"<<ID<<"'/>\n";
+            }
+
+            template <template<typename> class T, typename U>
+            void operator()(size_t idx, const add_skip_layer<T,U>& l)
+            {
+                out << "<layer idx='"<<idx<<"' type='skip' id='"<<(tag_id<T>::id)<<"'/>\n";
+            }
+
+        private:
+
+            std::ostream& out;
+        };
+    }
+
+    template <typename net_type>
+    void net_to_xml (
+        const net_type& net,
+        std::ostream& out
+    )
+    {
+        auto old_precision = out.precision(9);
+        out << "<net>\n";
+        visit_layers(net, impl::visitor_net_to_xml(out));
+        out << "</net>\n";
+        // restore the original stream precision.
+        out.precision(old_precision);
+    }
+
+    template <typename net_type>
+    void net_to_xml (
+        const net_type& net,
+        const std::string& filename
+    )
+    {
+        std::ofstream fout(filename);
+        net_to_xml(net, fout);
+    }
+
+// ----------------------------------------------------------------------------------------
+
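net_to_xml() is simplest to understand from the call site. A minimal sketch, assuming a previously defined and trained network object net whose layers all provide to_xml(); the file name is illustrative and <sstream> is assumed to be included:

    // Dump a human-readable XML description of every layer to a file...
    dlib::net_to_xml(net, "lenet.xml");

    // ...or to any std::ostream, e.g. an in-memory buffer:
    std::ostringstream ss;
    dlib::net_to_xml(net, ss);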
+    namespace impl
+    {
+
+        class visitor_net_map_input_to_output
+        {
+        public:
+
+            visitor_net_map_input_to_output(dpoint& p_) : p(p_) {}
+
+            dpoint& p;
+
+            template<typename input_layer_type>
+            void operator()(const input_layer_type& net)
+            {
+            }
+
+            template <typename T, typename U>
+            void operator()(const add_loss_layer<T,U>& net)
+            {
+                (*this)(net.subnet());
+            }
+
+            template <typename T, typename U, typename E>
+            void operator()(const add_layer<T,U,E>& net)
+            {
+                (*this)(net.subnet());
+                p = net.layer_details().map_input_to_output(p);
+            }
+            template <bool B, typename T, typename U, typename E>
+            void operator()(const dimpl::subnet_wrapper<add_layer<T,U,E>,B>& net)
+            {
+                (*this)(net.subnet());
+                p = net.layer_details().map_input_to_output(p);
+            }
+
+            template <unsigned long ID, typename U, typename E>
+            void operator()(const add_tag_layer<ID,U,E>& net)
+            {
+                // tag layers are an identity transform, so do nothing
+                (*this)(net.subnet());
+            }
+            template <bool is_first, unsigned long ID, typename U, typename E>
+            void operator()(const dimpl::subnet_wrapper<add_tag_layer<ID,U,E>,is_first>& net)
+            {
+                // tag layers are an identity transform, so do nothing
+                (*this)(net.subnet());
+            }
+
+            template <template<typename> class TAG_TYPE, typename U>
+            void operator()(const add_skip_layer<TAG_TYPE,U>& net)
+            {
+                (*this)(layer<TAG_TYPE>(net));
+            }
+            template <bool is_first, template<typename> class TAG_TYPE, typename SUBNET>
+            void operator()(const dimpl::subnet_wrapper<add_skip_layer<TAG_TYPE,SUBNET>,is_first>& net)
+            {
+                // skip layers are an identity transform, so do nothing
+                (*this)(layer<TAG_TYPE>(net));
+            }
+
+        };
+
+        class visitor_net_map_output_to_input
+        {
+        public:
+            visitor_net_map_output_to_input(dpoint& p_) : p(p_) {}
+
+            dpoint& p;
+
+            template<typename input_layer_type>
+            void operator()(const input_layer_type& net)
+            {
+            }
+
+            template <typename T, typename U>
+            void operator()(const add_loss_layer<T,U>& net)
+            {
+                (*this)(net.subnet());
+            }
+
+            template <typename T, typename U, typename E>
+            void operator()(const add_layer<T,U,E>& net)
+            {
+                p = net.layer_details().map_output_to_input(p);
+                (*this)(net.subnet());
+            }
+            template <bool B, typename T, typename U, typename E>
+            void operator()(const dimpl::subnet_wrapper<add_layer<T,U,E>,B>& net)
+            {
+                p = net.layer_details().map_output_to_input(p);
+                (*this)(net.subnet());
+            }
+
+            template <unsigned long ID, typename U, typename E>
+            void operator()(const add_tag_layer<ID,U,E>& net)
+            {
+                // tag layers are an identity transform, so do nothing
+                (*this)(net.subnet());
+            }
+            template <bool is_first, unsigned long ID, typename U, typename E>
+            void operator()(const dimpl::subnet_wrapper<add_tag_layer<ID,U,E>,is_first>& net)
+            {
+                // tag layers are an identity transform, so do nothing
+                (*this)(net.subnet());
+            }
+
+            template <template<typename> class TAG_TYPE, typename U>
+            void operator()(const add_skip_layer<TAG_TYPE,U>& net)
+            {
+                (*this)(layer<TAG_TYPE>(net));
+            }
+            template <bool is_first, template<typename> class TAG_TYPE, typename SUBNET>
+            void operator()(const dimpl::subnet_wrapper<add_skip_layer<TAG_TYPE,SUBNET>,is_first>& net)
+            {
+                // skip layers are an identity transform, so do nothing
+                (*this)(layer<TAG_TYPE>(net));
+            }
+
+        };
+    }
+
+    template <typename net_type>
+    inline dpoint input_tensor_to_output_tensor(
+        const net_type& net,
+        dpoint p
+    )
+    {
+        impl::visitor_net_map_input_to_output temp(p);
+        temp(net);
+        return p;
+    }
+
+    template <typename net_type>
+    inline dpoint output_tensor_to_input_tensor(
+        const net_type& net,
+        dpoint p
+    )
+    {
+        impl::visitor_net_map_output_to_input temp(p);
+        temp(net);
+        return p;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_UTILITIES_H_
+
+
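The two visitor classes above exist only to drive the free functions at the end of the file. A round-trip sketch, assuming a fully convolutional network object net whose layers all implement the map_input_to_output()/map_output_to_input() pair:

    // Where does input-image coordinate (32,32) land in net.get_output()?
    dlib::dpoint p(32, 32);
    dlib::dpoint q = dlib::input_tensor_to_output_tensor(net, p);
    // And back: which input coordinate does that output cell correspond to?
    dlib::dpoint r = dlib::output_tensor_to_input_tensor(net, q);
    // With striding or pooling the mapping is many-to-one, so expect r to
    // land near p rather than exactly on it.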
diff --git a/ml/dlib/dlib/dnn/utilities_abstract.h b/ml/dlib/dlib/dnn/utilities_abstract.h
new file mode 100644
index 000000000..2a9a3d3fc
--- /dev/null
+++ b/ml/dlib/dlib/dnn/utilities_abstract.h
@@ -0,0 +1,127 @@
+// Copyright (C) 2016  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_DNn_UTILITIES_ABSTRACT_H_
+#ifdef DLIB_DNn_UTILITIES_ABSTRACT_H_
+
+#include "core_abstract.h"
+#include "../geometry/vector_abstract.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    double log1pexp(
+        double x
+    );
+    /*!
+        ensures
+            - returns log(1+exp(x))
+              (except computes it using a numerically accurate method)
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    void randomize_parameters (
+        tensor& params,
+        unsigned long num_inputs_and_outputs,
+        dlib::rand& rnd
+    );
+    /*!
+        ensures
+            - This function assigns random values into params based on the given random
+              number generator.  In particular, it uses the parameter initialization
+              method of formula 16 from the paper "Understanding the difficulty of
+              training deep feedforward neural networks" by Xavier Glorot and Yoshua
+              Bengio.
+            - It is assumed that the total number of inputs and outputs from the layer
+              is num_inputs_and_outputs.  That is, you should set num_inputs_and_outputs
+              to the sum of the dimensionalities of the vectors going into and out of
+              the layer that uses params as its parameters.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename net_type>
+    void net_to_xml (
+        const net_type& net,
+        std::ostream& out
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+            - All layers in the net must provide to_xml() functions.
+        ensures
+            - Prints the given neural network object as an XML document to the given
+              output stream.
+    !*/
+
+    template <typename net_type>
+    void net_to_xml (
+        const net_type& net,
+        const std::string& filename
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+            - All layers in the net must provide to_xml() functions.
+        ensures
+            - This function is just like the above net_to_xml(), except it writes to a
+              file rather than an ostream.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename net_type>
+    dpoint input_tensor_to_output_tensor(
+        const net_type& net,
+        dpoint p
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_skip_layer, or add_tag_layer.
+            - All layers in the net must provide map_input_to_output() functions.
+        ensures
+            - Given a dpoint (i.e. a row,column coordinate) in the input tensor given to
+              net, this function returns the corresponding dpoint in the output tensor
+              net.get_output().  This kind of mapping is useful when working with fully
+              convolutional networks as you will often want to know what parts of the
+              output feature maps correspond to what parts of the input.
+            - If the network contains skip layers then any layers skipped over by the
+              skip layer are ignored for the purpose of computing this coordinate
+              mapping.  That is, if you walk the network from the output layer to the
+              input layer, where each time you encounter a skip layer you jump to the
+              layer indicated by the skip layer, you will visit exactly the layers in the
+              network involved in the input_tensor_to_output_tensor() calculation.  This
+              behavior is useful since it allows you to run some auxiliary DNN as a
+              separate branch of computation, independent of the main network's job of
+              running some kind of fully convolutional network over an image.  For
+              instance, you might want to have a branch in your network that computes
+              some global image level summarization/feature.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename net_type>
+    dpoint output_tensor_to_input_tensor(
+        const net_type& net,
+        dpoint p
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_skip_layer, or add_tag_layer.
+            - All layers in the net must provide map_output_to_input() functions.
+        ensures
+            - This function provides the reverse mapping of input_tensor_to_output_tensor().
+              That is, given a dpoint in net.get_output(), what is the corresponding
+              dpoint in the input tensor?
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_UTILITIES_ABSTRACT_H_
+
+
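The num_inputs_and_outputs convention of randomize_parameters() documented above is a frequent stumbling block, so a worked sketch may help; the tensor shape and fan-in/fan-out values are illustrative assumptions:

    // Glorot/Xavier-style init for a layer with fan-in 128 and fan-out 64,
    // i.e. num_inputs_and_outputs = 128 + 64 = 192.  Every parameter ends up
    // uniform in [-sqrt(6/192), +sqrt(6/192)], roughly [-0.177, +0.177].
    dlib::resizable_tensor params(1, 1, 128, 64);  // shape is illustrative
    dlib::rand rnd;
    dlib::randomize_parameters(params, 128 + 64, rnd);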
diff --git a/ml/dlib/dlib/dnn/validation.h b/ml/dlib/dlib/dnn/validation.h
new file mode 100644
index 000000000..c65cb4526
--- /dev/null
+++ b/ml/dlib/dlib/dnn/validation.h
@@ -0,0 +1,122 @@
+// Copyright (C) 2016  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_VALIDATION_H_
+#define DLIB_DNn_VALIDATION_H_
+
+#include "../svm/cross_validate_object_detection_trainer_abstract.h"
+#include "../svm/cross_validate_object_detection_trainer.h"
+#include "layers.h"
+#include <set>
+
+namespace dlib
+{
+    namespace impl
+    {
+        inline std::set<std::string> get_labels (
+            const std::vector<mmod_rect>& rects1,
+            const std::vector<mmod_rect>& rects2
+        )
+        {
+            std::set<std::string> labels;
+            for (auto& rr : rects1)
+                labels.insert(rr.label);
+            for (auto& rr : rects2)
+                labels.insert(rr.label);
+            return labels;
+        }
+    }
+
+    template <
+        typename SUBNET,
+        typename image_array_type
+    >
+    const matrix<double,1,3> test_object_detection_function (
+        loss_mmod<SUBNET>& detector,
+        const image_array_type& images,
+        const std::vector<std::vector<mmod_rect>>& truth_dets,
+        const test_box_overlap& overlap_tester = test_box_overlap(),
+        const double adjust_threshold = 0,
+        const test_box_overlap& overlaps_ignore_tester = test_box_overlap()
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_CASSERT( is_learning_problem(images,truth_dets) == true ,
+                    "\t matrix test_object_detection_function()"
+                    << "\n\t invalid inputs were given to this function"
+                    << "\n\t is_learning_problem(images,truth_dets): " << is_learning_problem(images,truth_dets)
+                    << "\n\t images.size(): " << images.size()
+        );
+
+        double correct_hits = 0;
+        double total_true_targets = 0;
+
+        std::vector<std::pair<double,bool>> all_dets;
+        unsigned long missing_detections = 0;
+
+        resizable_tensor temp;
+
+        for (unsigned long i = 0; i < images.size(); ++i)
+        {
+            std::vector<mmod_rect> hits;
+            detector.to_tensor(&images[i], &images[i]+1, temp);
+            detector.subnet().forward(temp);
+            detector.loss_details().to_label(temp, detector.subnet(), &hits, adjust_threshold);
+
+            for (auto& label : impl::get_labels(truth_dets[i], hits))
+            {
+                std::vector<full_object_detection> truth_boxes;
+                std::vector<rectangle> ignore;
+                std::vector<std::pair<double,rectangle>> boxes;
+                // copy hits and truth_dets into the above three objects
+                for (auto&& b : truth_dets[i])
+                {
+                    if (b.ignore)
+                    {
+                        ignore.push_back(b);
+                    }
+                    else if (b.label == label)
+                    {
+                        truth_boxes.push_back(full_object_detection(b.rect));
+                        ++total_true_targets;
+                    }
+                }
+                for (auto&& b : hits)
+                {
+                    if (b.label == label)
+                        boxes.push_back(std::make_pair(b.detection_confidence, b.rect));
+                }
+
+                correct_hits += impl::number_of_truth_hits(truth_boxes, ignore, boxes, overlap_tester, all_dets, missing_detections, overlaps_ignore_tester);
+            }
+        }
+
+        std::sort(all_dets.rbegin(), all_dets.rend());
+
+        double precision, recall;
+
+        double total_hits = all_dets.size();
+
+        if (total_hits == 0)
+            precision = 1;
+        else
+            precision = correct_hits / total_hits;
+
+        if (total_true_targets == 0)
+            recall = 1;
+        else
+            recall = correct_hits / total_true_targets;
+
+        matrix<double, 1, 3> res;
+        res = precision, recall, average_precision(all_dets, missing_detections);
+        return res;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_VALIDATION_H_
+
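test_object_detection_function() above packs its three scores into a 1x3 matrix. A usage sketch, assuming a trained loss_mmod network net and a matching held-out dataset testing_images/testing_boxes (the variable names are illustrative):

    // res(0): precision = correct detections / all reported detections
    // res(1): recall    = correct detections / all non-ignored truth boxes
    // res(2): average precision computed from the confidence-ranked detections
    dlib::matrix<double,1,3> res =
        dlib::test_object_detection_function(net, testing_images, testing_boxes);
    std::cout << "precision: " << res(0)
              << "  recall: "  << res(1)
              << "  AP: "      << res(2) << std::endl;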