1 files changed, 0 insertions, 2631 deletions
diff --git a/ml/dlib/dlib/dnn/layers_abstract.h b/ml/dlib/dlib/dnn/layers_abstract.h
deleted file mode 100644
index f07025ff8..000000000
--- a/ml/dlib/dlib/dnn/layers_abstract.h
+++ /dev/null
@@ -1,2631 +0,0 @@
-// Copyright (C) 2015  Davis E. King (davis@dlib.net)
-// License: Boost Software License   See LICENSE.txt for the full license.
-#undef DLIB_DNn_LAYERS_ABSTRACT_H_
-#ifdef DLIB_DNn_LAYERS_ABSTRACT_H_
-
-#include "tensor_abstract.h"
-#include "core_abstract.h"
-
-
-namespace dlib
-{
-
-// ----------------------------------------------------------------------------------------
-
-    class SUBNET 
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This object represents a deep neural network.  In particular, it is
-                the simplified interface through which layer objects interact with their
-                subnetworks.  A layer's two important tasks are to (1) take outputs from its
-                subnetwork and forward propagate them through itself and (2) to backwards
-                propagate an error gradient through itself and onto its subnetwork.
-                The idea of a subnetwork is illustrated in the following diagram:
-
-                  +---------------------------------------------------------+
-                  | loss <-- layer1 <-- layer2 <-- ... <-- layern <-- input |
-                  +---------------------------------------------------------+
-                                      ^                            ^
-                                      \__ subnetwork for layer1 __/
-
-                Therefore, by "subnetwork" we mean the part of the network closer to the
-                input.
-
-                Note that there is no dlib::SUBNET type.  It is shown here purely to
-                document the interface layer objects expect to see when they interact
-                with a network.
-        !*/
-
-    public:
-        // You aren't allowed to copy subnetworks from inside a layer.
-        SUBNET(const SUBNET&) = delete;
-        SUBNET& operator=(const SUBNET&) = delete;
-
-        const tensor& get_output(
-        ) const;
-        /*!
-            ensures
-                - returns the output of this subnetwork.  This is the data that the next
-                  layer in the network will take as input.
-                - have_same_dimensions(#get_gradient_input(), get_output()) == true
-        !*/
-
-        tensor& get_gradient_input(
-        );
-        /*!
-            ensures
-                - returns the error gradient for this subnetwork.  That is, this is the
-                  error gradient that this network will use to update itself.  Therefore,
-                  when performing back propagation, layers that sit on top of this
-                  subnetwork write their back propagated error gradients into
-                  get_gradient_input().  Or to put it another way, during back propagation,
-                  layers take the contents of their get_gradient_input() and back propagate
-                  it through themselves and store the results into their subnetwork's
-                  get_gradient_input().
-        !*/
-
-        const NEXT_SUBNET& subnet(
-        ) const;
-        /*!
-            ensures
-                - returns the subnetwork of *this network.  With respect to the diagram
-                  above, if *this was layer1 then subnet() would return the network that
-                  begins with layer2.
-        !*/
-
-        NEXT_SUBNET& subnet(
-        );
-        /*!
-            ensures
-                - returns the subnetwork of *this network.  With respect to the diagram
-                  above, if *this was layer1 then subnet() would return the network that
-                  begins with layer2.
-        !*/
-
-        const layer_details_type& layer_details(
-        ) const; 
-        /*!
-            ensures
-                - returns the layer_details_type instance that defines the behavior of the
-                  layer at the top of this network.  I.e. returns the layer details that
-                  defines the behavior of the layer nearest to the network output rather
-                  than the input layer.  For computational layers, this is the object
-                  implementing the EXAMPLE_COMPUTATIONAL_LAYER_ interface that defines the
-                  layer's behavior.
-        !*/
-
-        unsigned int sample_expansion_factor (
-        ) const;
-        /*!
-            ensures
-                - When to_tensor() is invoked on this network's input layer it converts N
-                  input objects into M samples, all stored inside a resizable_tensor.  It
-                  is always the case that M is some integer multiple of N.
-                  sample_expansion_factor() returns the value of this multiplier.  To be
-                  very specific, it is always true that M==I*N where I is some integer.
-                  This integer I is what is returned by sample_expansion_factor().
-
-                  It should be noted that computational layers likely do not care about the
-                  sample expansion factor.  It is only really of concern inside a loss
-                  layer where you need to know its value so that tensor samples can be
-                  matched against truth objects.  Moreover, in most cases the sample
-                  expansion factor is 1.
-        !*/
-
-    };
-
-// ----------------------------------------------------------------------------------------
-
-    class EXAMPLE_COMPUTATIONAL_LAYER_
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                Each computational layer in a deep neural network can be thought of as a
-                function, f(data,parameters), that takes in a data tensor, some parameters,
-                and produces an output tensor.  You create an entire deep network by
-                composing these functions.  Importantly, you are able to use a wide range
-                of different functions to accommodate the task you are trying to
-                accomplish.  Therefore, dlib includes a number of common layer types but if
-                you want to define your own then you simply implement a class with the same
-                interface as EXAMPLE_COMPUTATIONAL_LAYER_.
-
-                Note that there is no dlib::EXAMPLE_COMPUTATIONAL_LAYER_ type.  It is shown
-                here purely to document the interface that a layer object must implement.
-
-                The central work of defining a layer is implementing the forward and backward
-                methods.  When you do this you have four options:
-                    - Implement the forward() and backward() methods according to the
-                      specification shown below.  Do not implement forward_inplace() and
-                      backward_inplace().
-                    - Implement the forward() and backward() methods according to the
-                      specification shown below, except exclude the computed_output
-                      parameter from backward().  Doing this will allow dlib to make some
-                      layers execute in-place and therefore run a little faster and use
-                      less memory. Do not implement forward_inplace() and
-                      backward_inplace().
-                    - Implement the forward_inplace() and backward_inplace() methods
-                      according to the specification shown below.  Do not implement
-                      forward() and backward().  These in-place methods allow some types of
-                      layers to be implemented more efficiently.
-                    - Implement the forward_inplace() and backward_inplace() methods
-                      according to the specification shown below, except exclude the
-                      computed_output parameter from backward_inplace().  Doing this will
-                      allow dlib to make some layers execute in-place and therefore run a
-                      little faster and use less memory.  Do not implement forward() and
-                      backward().
-
-
-                It should also be noted that layers may define additional layer specific
-                fields and the solvers can use these fields as they see fit.  For example,
-                some layers define get_learning_rate_multiplier() and
-                get_weight_decay_multiplier() methods.  The solvers that come with dlib
-                look at these methods, if they exist, and adjust the learning rate or
-                weight decay for that layer according to the multiplier.  Therefore, you
-                can add these methods to your layer types if you want, or even define new
-                fields and new solvers that use those fields in some way.  
-        !*/
-
-    public:
-
-        EXAMPLE_COMPUTATIONAL_LAYER_(
-        );
-        /*!
-            ensures
-                - Default constructs this object.  This function is not required to do
-                  anything in particular but it must exist, that is, it is required that
-                  layer objects be default constructable. 
-        !*/
-
-        EXAMPLE_COMPUTATIONAL_LAYER_ (
-            const EXAMPLE_COMPUTATIONAL_LAYER_& item
-        );
-        /*!
-            ensures
-                - EXAMPLE_COMPUTATIONAL_LAYER_ objects are copy constructable
-        !*/
-
-        EXAMPLE_COMPUTATIONAL_LAYER_(
-            const some_other_layer_type& item
-        );
-        /*!
-            ensures
-                - Constructs this object from item.  This form of constructor is optional
-                  but it allows you to provide a conversion from one layer type to another.
-                  For example, the following code is valid only if my_layer2 can be
-                  constructed from my_layer1:
-                    relu<fc<my_layer1<fc<input<matrix<float>>>>>> my_dnn1;
-                    relu<fc<my_layer2<fc<input<matrix<float>>>>>> my_dnn2(my_dnn1);
-                  This kind of pattern is useful if you want to use one type of layer
-                  during training but a different type of layer during testing since it
-                  allows you to easily convert between related deep neural network types.  
-
-                  Additionally, if you provide a constructor to build a layer from another
-                  layer type you should also write your layer's deserialize() routine such
-                  that it can read that other layer's serialized data in addition to your
-                  own serialized data.  
-        !*/
-
-        template <typename SUBNET>
-        void setup (
-            const SUBNET& sub
-        );
-        /*!
-            requires
-                - SUBNET implements the SUBNET interface defined at the top of this file.
-            ensures
-                - performs any necessary initial memory allocations and/or sets parameters
-                  to their initial values prior to learning.  Therefore, calling setup
-                  destroys any previously learned parameters.  Also, typically setup()
-                  would look at the dimensions of the outputs of sub and configure the
-                  number of parameters in *this accordingly.
-        !*/
-
-        template <typename SUBNET>
-        void forward(
-            const SUBNET& sub, 
-            resizable_tensor& data_output
-        );
-        /*!
-            requires
-                - SUBNET implements the SUBNET interface defined at the top of this file.
-                - setup() has been called.
-            ensures
-                - Runs the output of the subnetwork through this layer and stores the
-                  results into #data_output.  In particular, forward() can use any of the
-                  outputs in sub (e.g. sub.get_output(), sub.subnet().get_output(), etc.)
-                  to compute whatever it wants.
-        !*/
-
-        template <typename SUBNET>
-        void backward(
-            const tensor& computed_output, // this parameter is optional
-            const tensor& gradient_input, 
-            SUBNET& sub, 
-            tensor& params_grad
-        );
-        /*!
-            requires
-                - SUBNET implements the SUBNET interface defined at the top of this file.
-                - setup() has been called.
-                - computed_output is the tensor resulting from calling forward(sub,computed_output).  
-                  Moreover, this was the most recent call to forward().  This means that
-                  forward() is allowed to cache intermediate results so they can be used
-                  during the backward computation.
-                - have_same_dimensions(gradient_input, computed_output) == true
-                - have_same_dimensions(sub.get_gradient_input(), sub.get_output()) == true
-                - have_same_dimensions(params_grad, get_layer_params()) == true
-            ensures
-                - This function outputs the gradients of this layer with respect to the
-                  input data from sub and also with respect to this layer's parameters.
-                  These gradients are stored into #sub and #params_grad, respectively. To be
-                  precise, the gradients are taken of a function f(sub,get_layer_params())
-                  which is defined thusly:   
-                    - Recalling that computed_output is a function of both sub and get_layer_params(), 
-                      since it is the result of calling forward(sub,computed_output):
-                      let f(sub,get_layer_params()) == dot(computed_output, gradient_input)
-                  Then we define the following gradient vectors: 
-                    - PARAMETER_GRADIENT == gradient of f(sub,get_layer_params()) with
-                      respect to get_layer_params(). 
-                    - for all valid I:
-                        - DATA_GRADIENT_I == gradient of f(sub,get_layer_params()) with
-                          respect to layer<I>(sub).get_output() (recall that forward() can
-                          draw inputs from the immediate sub layer, sub.subnet(), or
-                          any earlier layer.  So you must consider the gradients with
-                          respect to all inputs drawn from sub)
-                  Finally, backward() outputs these gradients by performing:
-                    - params_grad = PARAMETER_GRADIENT 
-                    - for all valid I:
-                        - layer<I>(sub).get_gradient_input() += DATA_GRADIENT_I
-        !*/
-
-        void forward_inplace(
-            const tensor& data_input, 
-            tensor& data_output
-        );
-        /*!
-            requires
-                - have_same_dimensions(data_input,data_output) == true
-                - setup() has been called.
-            ensures
-                - Runs the data_input tensor through this layer and stores the output into
-                  #data_output.
-                - This function supports in-place operation, i.e. having
-                  is_same_object(data_input, data_output)==true
-        !*/
-
-        void backward_inplace(
-            const tensor& computed_output, // this parameter is optional
-            const tensor& gradient_input,
-            tensor& data_grad,
-            tensor& params_grad
-        );
-        /*!
-            requires
-                - setup() has been called.
-                - computed_output is the tensor resulting from the most recent call to
-                  forward_inplace().  This means that forward_inplace() is allowed to cache
-                  intermediate results so they can be used during the backward computation.
-                - have_same_dimensions(gradient_input, data_grad) == true
-                - have_same_dimensions(gradient_input, computed_output) == true
-                - have_same_dimensions(params_grad, get_layer_params()) == true
-            ensures
-                - This function supports in-place operation, i.e. having
-                  is_same_object(gradient_input, data_grad)==true
-                - This function outputs the gradients of this layer with respect to the
-                  input data from a sublayer and also with respect to this layer's parameters.
-                  These gradients are stored into #data_grad and #params_grad, respectively. To be
-                  precise, the gradients are taken of a function f(data_input,get_layer_params())
-                  which is defined thusly:   
-                    - Recalling that computed_output is a function of both the input to
-                      forward_inplace() and get_layer_params(), since it is the result of
-                      calling forward_inplace(data_input,computed_output):
-                      let f(data_input,get_layer_params()) == dot(computed_output, gradient_input)
-                  Then we define the following gradient vectors: 
-                    - PARAMETER_GRADIENT == gradient of f(data_input,get_layer_params()) with
-                      respect to get_layer_params(). 
-                    - DATA_GRADIENT == gradient of f(data_input,get_layer_params()) with respect
-                      to data_input. 
-                  Finally, backward_inplace() outputs these gradients by performing:
-                    - params_grad = PARAMETER_GRADIENT 
-                    - if (is_same_object(gradient_input, data_grad)) then
-                        - data_grad = DATA_GRADIENT
-                    - else
-                        - data_grad += DATA_GRADIENT
-        !*/
-
-        const tensor& get_layer_params(
-        ) const; 
-        /*!
-            ensures
-                - returns the parameters that define the behavior of forward().
-        !*/
-
-        tensor& get_layer_params(
-        ); 
-        /*!
-            ensures
-                - returns the parameters that define the behavior of forward().
-        !*/
-
-
-        dpoint map_input_to_output(dpoint p) const;
-        dpoint map_output_to_input(dpoint p) const;
-        /*!
-            These two functions are optional.  If provided, they should map between
-            (column,row) coordinates in input and output tensors of forward().  Providing
-            these functions allows you to use global utility functions like
-            input_tensor_to_output_tensor().
-        !*/
-
-        void clean (
-        );
-        /*!
-            Implementing this function is optional.  If you don't need it then you don't
-            have to provide a clean().  But if you do provide it then it must behave as
-            follows:
-
-            ensures
-                - calling clean() Causes this object to forget about everything except its
-                  parameters.  This is useful if your layer caches information between
-                  forward and backward passes and you want to clean out that cache
-                  information before saving the network to disk.  
-        !*/
-
-    };
-
-    std::ostream& operator<<(std::ostream& out, const EXAMPLE_COMPUTATIONAL_LAYER_& item);
-    /*!
-        print a string describing this layer.
-    !*/
-
-    void to_xml(const EXAMPLE_COMPUTATIONAL_LAYER_& item, std::ostream& out);
-    /*!
-        This function is optional, but required if you want to print your networks with
-        net_to_xml().  Therefore, to_xml() prints a layer as XML.
-    !*/
-
-    void serialize(const EXAMPLE_COMPUTATIONAL_LAYER_& item, std::ostream& out);
-    void deserialize(EXAMPLE_COMPUTATIONAL_LAYER_& item, std::istream& in);
-    /*!
-        provides serialization support  
-    !*/
-
-    // For each layer you define, always define an add_layer template so that layers can be
-    // easily composed.  Moreover, the convention is that the layer class ends with an _
-    // while the add_layer template has the same name but without the trailing _.
-    template <typename SUBNET>
-    using EXAMPLE_COMPUTATIONAL_LAYER = add_layer<EXAMPLE_COMPUTATIONAL_LAYER_, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-// ----------------------------------------------------------------------------------------
-// ----------------------------------------------------------------------------------------
-
-    enum fc_bias_mode
-    {
-        FC_HAS_BIAS = 0,
-        FC_NO_BIAS = 1
-    };
-
-    struct num_fc_outputs
-    {
-        num_fc_outputs(unsigned long n) : num_outputs(n) {}
-        unsigned long num_outputs;
-    };
-
-    template <
-        unsigned long num_outputs,
-        fc_bias_mode bias_mode
-        >
-    class fc_
-    {
-        /*!
-            REQUIREMENTS ON num_outputs
-                num_outputs > 0
-
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, it defines a fully connected layer that
-                takes an input tensor and multiplies it by a weight matrix and outputs the
-                results.
-
-                The dimensions of the tensors output by this layer are as follows (letting
-                IN be the input tensor and OUT the output tensor):
-                    - OUT.num_samples() == IN.num_samples()
-                    - OUT.k()  == get_num_outputs()
-                    - OUT.nr() == 1
-                    - OUT.nc() == 1
-        !*/
-
-    public:
-
-        fc_(
-        );
-        /*!
-            ensures
-                - #get_num_outputs() == num_outputs
-                - #get_bias_mode() == bias_mode 
-                - #get_learning_rate_multiplier()      == 1
-                - #get_weight_decay_multiplier()       == 1
-                - #get_bias_learning_rate_multiplier() == 1
-                - #get_bias_weight_decay_multiplier()  == 0
-        !*/
-
-        fc_(
-            num_fc_outputs o
-        );
-        /*!
-            ensures
-                - #get_num_outputs() == o.num_outputs 
-                - #get_bias_mode() == bias_mode 
-                - #get_learning_rate_multiplier()      == 1
-                - #get_weight_decay_multiplier()       == 1
-                - #get_bias_learning_rate_multiplier() == 1
-                - #get_bias_weight_decay_multiplier()  == 0
-        !*/
-
-        unsigned long get_num_outputs (
-        ) const; 
-        /*!
-            ensures
-                - This layer outputs column vectors that contain get_num_outputs()
-                  elements. That is, the output tensor T from forward() will be such that:
-                    - T.num_samples() == however many samples were given to forward().
-                    - T.k() == get_num_outputs()
-                    - The rest of the dimensions of T will be 1.
-        !*/
-
-        void set_num_outputs(
-            long num
-        );
-        /*!
-            requires
-                - num > 0
-                - get_layer_params().size() == 0 || get_num_outputs() == num
-                  (i.e. You can't change the number of outputs in fc_ if the parameter
-                  tensor has already been allocated.)
-            ensures
-                - #get_num_outputs() == num
-        !*/
-
-        fc_bias_mode get_bias_mode (
-        ) const;
-        /*!
-            ensures
-                - returns the bias mode which determines if this layer includes bias terms.
-                  That is, if the bias mode is FC_HAS_BIAS then a different constant scalar
-                  is added to each of the outputs of this layer. 
-        !*/
-
-        double get_learning_rate_multiplier(
-        ) const;  
-        /*!
-            ensures
-                - returns a multiplier number.  The interpretation is that this object is
-                  requesting that the learning rate used to optimize its parameters be
-                  multiplied by get_learning_rate_multiplier().
-        !*/
-
-        double get_weight_decay_multiplier(
-        ) const; 
-        /*!
-            ensures
-                - returns a multiplier number.  The interpretation is that this object is
-                  requesting that the weight decay used to optimize its parameters be
-                  multiplied by get_weight_decay_multiplier().
-        !*/
-
-        void set_learning_rate_multiplier(
-            double val
-        );
-        /*!
-            requires
-                - val >= 0
-            ensures
-                - #get_learning_rate_multiplier() == val
-        !*/
-
-        void set_weight_decay_multiplier(
-            double val
-        ); 
-        /*!
-            requires
-                - val >= 0
-            ensures
-                - #get_weight_decay_multiplier() == val
-        !*/
-
-        double get_bias_learning_rate_multiplier(
-        ) const; 
-        /*!
-            ensures
-                - returns a multiplier number.  The interpretation is that this object is
-                  requesting that the learning rate used to optimize its bias parameters be
-                  multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
-        !*/
-
-        double get_bias_weight_decay_multiplier(
-        ) const; 
-        /*!
-            ensures
-                - returns a multiplier number.  The interpretation is that this object is
-                  requesting that the weight decay used to optimize its bias parameters be
-                  multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
-        !*/
-
-        void set_bias_learning_rate_multiplier(
-            double val
-        ); 
-        /*!
-            requires
-                - val >= 0
-            ensures
-                - #get_bias_learning_rate_multiplier() == val
-        !*/
-
-        void set_bias_weight_decay_multiplier(
-            double val
-        ); 
-        /*!
-            requires
-                - val >= 0
-            ensures
-                - #get_bias_weight_decay_multiplier() == val
-        !*/
-
-        alias_tensor_const_instance get_weights(
-        ) const;
-        /*!
-            ensures
-                - returns an alias of get_layer_params(), containing the weights matrix of
-                  the fully connected layer.
-                - #get_weights().num_samples() is the number of elements in input sample,
-                  i.e. sublayer's output's k * nc * nr.
-                - #get_bias().k() == #get_num_outputs()
-                - if get_bias_mode() == FC_HAS_BIAS:
-                    - #get_layer_params().size() == (#get_weights().size() + #get_biases().size())
-                - else:
-                    - #get_layer_params().size() == #get_weights().size()
-        !*/
-
-        alias_tensor_instance get_weights(
-        );
-        /*!
-            ensures
-                - returns an alias of get_layer_params(), containing the weights matrix of
-                  the fully connected layer.
-                - #get_weights().num_samples() is the number of elements in input sample,
-                  i.e. sublayer's output's k * nc * nr.
-                - #get_bias().k() == #get_num_outputs()
-                - if get_bias_mode() == FC_HAS_BIAS:
-                    - #get_layer_params().size() == (#get_weights().size() + #get_biases().size())
-                - else:
-                    - #get_layer_params().size() == #get_weights().size()
-        !*/
-
-        alias_tensor_const_instance get_biases(
-        ) const;
-        /*!
-            requires
-                - #get_bias_mode() == FC_HAS_BIAS
-            ensures
-                - returns an alias of get_layer_params(), containing the bias vector of
-                  the fully connected layer.
-                - #get_bias().num_samples() == 1
-                - #get_bias().k() == #get_num_outputs()
-                - #get_layer_params().size() == (#get_weights().size() + #get_biases().size())
-        !*/
-
-        alias_tensor_instance get_biases(
-        );
-        /*!
-            requires
-                - #get_bias_mode() == FC_HAS_BIAS
-            ensures
-                - returns an alias of get_layer_params(), containing the bias vector of
-                  the fully connected layer.
-                - #get_bias().num_samples() == 1
-                - #get_bias().k() == #get_num_outputs()
-                - #get_layer_params().size() == (#get_weights().size() + #get_biases().size())
-        !*/
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
-        template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
-        !*/
-
-    };
-
-    template <
-        unsigned long num_outputs,
-        typename SUBNET
-        >
-    using fc = add_layer<fc_<num_outputs,FC_HAS_BIAS>, SUBNET>;
-
-    template <
-        unsigned long num_outputs,
-        typename SUBNET
-        >
-    using fc_no_bias = add_layer<fc_<num_outputs,FC_NO_BIAS>, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-
-    struct num_con_outputs
-    {
-        num_con_outputs(unsigned long n) : num_outputs(n) {}
-        unsigned long num_outputs;
-    };
-
-    template <
-        long _num_filters,
-        long _nr,
-        long _nc,
-        int _stride_y,
-        int _stride_x,
-        int _padding_y = _stride_y!=1? 0 : _nr/2,
-        int _padding_x = _stride_x!=1? 0 : _nc/2
-        >
-    class con_
-    {
-        /*!
-            REQUIREMENTS ON TEMPLATE ARGUMENTS
-                - _num_filters > 0
-                - _nr >= 0
-                - _nc >= 0
-                - _stride_y > 0
-                - _stride_x > 0
-                - _padding_y >= 0
-                - _padding_x >= 0
-                - Also, we require that:
-                    - if (_nr == 0) then
-                        - _padding_y == 0
-                    - else
-                        - _padding_y < _nr
-                    - if (_nc == 0) then
-                        - _padding_x == 0
-                    - else
-                        - _padding_x < _nc
-
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, it defines a convolution layer that takes an
-                input tensor (nominally representing an image) and convolves it with a set
-                of filters and then outputs the results. 
-
-                The dimensions of the tensors output by this layer are as follows (letting
-                IN be the input tensor and OUT the output tensor):
-                    - OUT.num_samples() == IN.num_samples()
-                    - OUT.k()  == num_filters()
-                    - OUT.nr() == 1+(IN.nr() + 2*padding_y() - nr())/stride_y()
-                    - OUT.nc() == 1+(IN.nc() + 2*padding_x() - nc())/stride_x()
-
-                Note also that setting _nr or _nc to 0 has a special meaning of "set the
-                filter size equal to the input image size".  Specifically, it means: 
-                    - if (_nr == 0) then
-                        - nr() == IN.nr()
-                        - OUT.nr() == 1
-                    - if (_nc == 0) then
-                        - nc() == IN.nc()
-                        - OUT.nc() == 1
-        !*/
-
-    public:
-        con_(
-        );
-        /*!
-            ensures
-                - #num_filters() == _num_filters
-                - #nr() == _nr
-                - #nc() == _nc
-                - #stride_y() == _stride_y
-                - #stride_x() == _stride_x
-                - #padding_y() == _padding_y
-                - #padding_x() == _padding_x
-                - #get_learning_rate_multiplier()      == 1
-                - #get_weight_decay_multiplier()       == 1
-                - #get_bias_learning_rate_multiplier() == 1
-                - #get_bias_weight_decay_multiplier()  == 0
-        !*/
-
-        con_(
-            num_con_outputs o
-        );
-        /*!
-            ensures
-                - #num_filters() == o.num_outputs 
-                - #nr() == _nr
-                - #nc() == _nc
-                - #stride_y() == _stride_y
-                - #stride_x() == _stride_x
-                - #padding_y() == _padding_y
-                - #padding_x() == _padding_x
-                - #get_learning_rate_multiplier()      == 1
-                - #get_weight_decay_multiplier()       == 1
-                - #get_bias_learning_rate_multiplier() == 1
-                - #get_bias_weight_decay_multiplier()  == 0
-        !*/
-
-        long num_filters(
-        ) const; 
-        /*!
-            ensures
-                - returns the number of filters contained in this layer.  The k dimension
-                  of the output tensors produced by this layer will be equal to the number
-                  of filters.
-        !*/
-
-        void set_num_filters(
-            long num
-        );
-        /*!
-            requires
-                - num > 0
-                - get_layer_params().size() == 0 || num_filters() == num
-                  (i.e. You can't change the number of filters in con_ if the parameter
-                  tensor has already been allocated.)
-            ensures
-                - #num_filters() == num
-        !*/
-
-        long nr(
-        ) const; 
-        /*!
-            ensures
-                - returns the number of rows in the filters in this layer.  Note that if
-                  nr()==0 then it means the size of the filter is not yet assigned, but
-                  once setup() is called nr() will be set to the input tensor's nr().
-                  Therefore, nr()==0 has the special interpretation of "be the same size as
-                  the input tensor".
-        !*/
-
-        long nc(
-        ) const;
-        /*!
-            ensures
-                - returns the number of columns in the filters in this layer.  Note that if
-                  nc()==0 then it means the size of the filter is not yet assigned, but
-                  once setup() is called nc() will be set to the input tensor's nc().
-                  Therefore, nc()==0 has the special interpretation of "be the same size as
-                  the input tensor".
-        !*/
-
-        long stride_y(
-        ) const; 
-        /*!
-            ensures
-                - returns the vertical stride used when convolving the filters over an
-                  image.  That is, each filter will be moved stride_y() pixels down at a
-                  time when it moves over the image.
-        !*/
-
-        long stride_x(
-        ) const;
-        /*!
-            ensures
-                - returns the horizontal stride used when convolving the filters over an
-                  image.  That is, each filter will be moved stride_x() pixels right at a
-                  time when it moves over the image.
-        !*/
-
-        long padding_y(
-        ) const; 
-        /*!
-            ensures
-                - returns the number of pixels of zero padding added to the top and bottom
-                  sides of the image.
-        !*/
-
-        long padding_x(
-        ) const; 
-        /*!
-            ensures
-                - returns the number of pixels of zero padding added to the left and right 
-                  sides of the image.
-        !*/
-
-        double get_learning_rate_multiplier(
-        ) const;  
-        /*!
-            ensures
-                - returns a multiplier number.  The interpretation is that this object is
-                  requesting that the learning rate used to optimize its parameters be
-                  multiplied by get_learning_rate_multiplier().
-        !*/
-
-        double get_weight_decay_multiplier(
-        ) const; 
-        /*!
-            ensures
-                - returns a multiplier number.  The interpretation is that this object is
-                  requesting that the weight decay used to optimize its parameters be
-                  multiplied by get_weight_decay_multiplier().
-        !*/
-
-        void set_learning_rate_multiplier(
-            double val
-        );
-        /*!
-            requires
-                - val >= 0
-            ensures
-                - #get_learning_rate_multiplier() == val
-        !*/
-
-        void set_weight_decay_multiplier(
-            double val
-        ); 
-        /*!
-            requires
-                - val >= 0
-            ensures
-                - #get_weight_decay_multiplier() == val
-        !*/
-
-        double get_bias_learning_rate_multiplier(
-        ) const; 
-        /*!
-            ensures
-                - returns a multiplier number.  The interpretation is that this object is
-                  requesting that the learning rate used to optimize its bias parameters be
-                  multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
-        !*/
-
-        double get_bias_weight_decay_multiplier(
-        ) const; 
-        /*!
-            ensures
-                - returns a multiplier number.  The interpretation is that this object is
-                  requesting that the weight decay used to optimize its bias parameters be
-                  multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
-        !*/
-
-        void set_bias_learning_rate_multiplier(
-            double val
-        ); 
-        /*!
-            requires
-                - val >= 0
-            ensures
-                - #get_bias_learning_rate_multiplier() == val
-        !*/
-
-        void set_bias_weight_decay_multiplier(
-            double val
-        ); 
-        /*!
-            requires
-                - val >= 0
-            ensures
-                - #get_bias_weight_decay_multiplier() == val
-        !*/
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
-        template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
-        dpoint map_input_to_output(dpoint p) const;
-        dpoint map_output_to_input(dpoint p) const;
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
-        !*/
-
-    };
-
-    template <
-        long num_filters,
-        long nr,
-        long nc,
-        int stride_y,
-        int stride_x,
-        typename SUBNET
-        >
-    using con = add_layer<con_<num_filters,nr,nc,stride_y,stride_x>, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-
-    template <
-        long _num_filters,
-        long _nr,
-        long _nc,
-        int _stride_y,
-        int _stride_x,
-        int _padding_y = _stride_y!=1? 0 : _nr/2,
-        int _padding_x = _stride_x!=1? 0 : _nc/2
-        >
-    class cont_
-    {
-        /*!
-            REQUIREMENTS ON TEMPLATE ARGUMENTS
-                All of them must be > 0.
-                Also, we require that:
-                    - 0 <= _padding_y && _padding_y < _nr
-                    - 0 <= _padding_x && _padding_x < _nc
-
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, it defines a transposed convolution layer
-                that takes an input tensor and transpose convolves (sometimes called
-                "deconvolution") it with a set of filters and then outputs the results. 
-
-                This is essentially a convolutional layer that allows fractional strides.
-                Therefore, you can make output tensors that are larger than the input
-                tensors using this layer type. 
-
-                
-                The dimensions of the tensors output by this layer are as follows (letting
-                IN be the input tensor and OUT the output tensor):
-                    - OUT.num_samples() == IN.num_samples()
-                    - OUT.k()  == num_filters()
-                    - OUT.nr() == stride_y()*(IN.nr()-1) + nr() - 2*padding_y()
-                    - OUT.nc() == stride_x()*(IN.nc()-1) + nc() - 2*padding_x()
-        !*/
-
-    public:
-        cont_(
-        );
-        /*!
-            ensures
-                - #num_filters() == _num_filters
-                - #nr() == _nr
-                - #nc() == _nc
-                - #stride_y() == _stride_y
-                - #stride_x() == _stride_x
-                - #padding_y() == _padding_y
-                - #padding_x() == _padding_x
-                - #get_learning_rate_multiplier()      == 1
-                - #get_weight_decay_multiplier()       == 1
-                - #get_bias_learning_rate_multiplier() == 1
-                - #get_bias_weight_decay_multiplier()  == 0
-        !*/
-
-        cont_(
-            num_con_outputs o
-        );
-        /*!
-            ensures
-                - #num_filters() == o.num_outputs 
-                - #nr() == _nr
-                - #nc() == _nc
-                - #stride_y() == _stride_y
-                - #stride_x() == _stride_x
-                - #padding_y() == _padding_y
-                - #padding_x() == _padding_x
-                - #get_learning_rate_multiplier()      == 1
-                - #get_weight_decay_multiplier()       == 1
-                - #get_bias_learning_rate_multiplier() == 1
-                - #get_bias_weight_decay_multiplier()  == 0
-        !*/
-
-        long num_filters(
-        ) const; 
-        /*!
-            ensures
-                - returns the number of filters contained in this layer.  The k dimension
-                  of the output tensors produced by this layer will be equal to the number
-                  of filters.
-        !*/
-
-        void set_num_filters(
-            long num
-        );
-        /*!
-            requires
-                - num > 0
-                - get_layer_params().size() == 0 || num_filters() == num
-                  (i.e. You can't change the number of filters in cont_ if the parameter
-                  tensor has already been allocated.)
-            ensures
-                - #num_filters() == num
-        !*/
-
-        long nr(
-        ) const; 
-        /*!
-            ensures
-                - returns the number of rows in the filters in this layer.
-        !*/
-
-        long nc(
-        ) const;
-        /*!
-            ensures
-                - returns the number of columns in the filters in this layer.
-        !*/
-
-        long stride_y(
-        ) const; 
-        /*!
-            ensures
-                - returns the vertical stride used when convolving the filters over an
-                  image.  That is, each filter will be moved 1.0/stride_y() pixels down at
-                  a time when it moves over the image.
-        !*/
-
-        long stride_x(
-        ) const;
-        /*!
-            ensures
-                - returns the horizontal stride used when convolving the filters over an
-                  image.  That is, each filter will be moved 1.0/stride_x() pixels right at
-                  a time when it moves over the image.
-        !*/
-
-        long padding_y(
-        ) const; 
-        /*!
-            ensures
-                - returns the number of pixels of zero padding added to the top and bottom
-                  sides of the image.
-        !*/
-
-        long padding_x(
-        ) const; 
-        /*!
-            ensures
-                - returns the number of pixels of zero padding added to the left and right 
-                  sides of the image.
-        !*/
-
-        double get_learning_rate_multiplier(
-        ) const;  
-        /*!
-            ensures
-                - returns a multiplier number.  The interpretation is that this object is
-                  requesting that the learning rate used to optimize its parameters be
-                  multiplied by get_learning_rate_multiplier().
-        !*/
-
-        double get_weight_decay_multiplier(
-        ) const; 
-        /*!
-            ensures
-                - returns a multiplier number.  The interpretation is that this object is
-                  requesting that the weight decay used to optimize its parameters be
-                  multiplied by get_weight_decay_multiplier().
-        !*/
-
-        void set_learning_rate_multiplier(
-            double val
-        );
-        /*!
-            requires
-                - val >= 0
-            ensures
-                - #get_learning_rate_multiplier() == val
-        !*/
-
-        void set_weight_decay_multiplier(
-            double val
-        ); 
-        /*!
-            requires
-                - val >= 0
-            ensures
-                - #get_weight_decay_multiplier() == val
-        !*/
-
-        double get_bias_learning_rate_multiplier(
-        ) const; 
-        /*!
-            ensures
-                - returns a multiplier number.  The interpretation is that this object is
-                  requesting that the learning rate used to optimize its bias parameters be
-                  multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
-        !*/
-
-        double get_bias_weight_decay_multiplier(
-        ) const; 
-        /*!
-            ensures
-                - returns a multiplier number.  The interpretation is that this object is
-                  requesting that the weight decay used to optimize its bias parameters be
-                  multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
-        !*/
-
-        void set_bias_learning_rate_multiplier(
-            double val
-        ); 
-        /*!
-            requires
-                - val >= 0
-            ensures
-                - #get_bias_learning_rate_multiplier() == val
-        !*/
-
-        void set_bias_weight_decay_multiplier(
-            double val
-        ); 
-        /*!
-            requires
-                - val >= 0
-            ensures
-                - #get_bias_weight_decay_multiplier() == val
-        !*/
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
-        template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
-        dpoint map_input_to_output(dpoint p) const;
-        dpoint map_output_to_input(dpoint p) const;
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
-        !*/
-
-    };
-
-    template <
-        long num_filters,
-        long nr,
-        long nc,
-        int stride_y,
-        int stride_x,
-        typename SUBNET
-        >
-    using cont = add_layer<cont_<num_filters,nr,nc,stride_y,stride_x>, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-
-    template <
-        int scale_y, 
-        int scale_x 
-        >
-    class upsample_
-    {
-        /*!
-            REQUIREMENTS ON TEMPLATE ARGUMENTS
-                All of them must be >= 1.
-
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, it allows you to upsample a layer using
-                bilinear interpolation.  To be very specific, it upsamples each of the
-                channels in an input tensor.  Therefore, if IN is the input tensor to this
-                layer and OUT the output tensor, then we will have:
-                    - OUT.num_samples() == IN.num_samples()
-                    - OUT.k()  == IN.k() 
-                    - OUT.nr() == IN.nr()*scale_y
-                    - OUT.nc() == IN.nr()*scale_x
-                    - for all valid i,k:  image_plane(OUT,i,k) is a copy of
-                      image_plane(IN,i,k) that has been bilinearly interpolated to fit into
-                      the shape of image_plane(OUT,i,k).
-        !*/
-    public:
-
-        upsample_(
-        );
-        /*!
-            ensures
-                - This object has no state, so the constructor does nothing, aside from
-                  providing default constructability.
-        !*/
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
-        template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
-        dpoint map_input_to_output(dpoint p) const;
-        dpoint map_output_to_input(dpoint p) const;
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
-        !*/
-    };
-
-    template <
-        int scale,
-        typename SUBNET
-        >
-    using upsample = add_layer<upsample_<scale,scale>, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-
-    class dropout_
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, it defines a dropout layer.  Therefore, it
-                passes its inputs through the stochastic function f(x) which outputs either
-                0 or x.  The probability of 0 being output is given by the drop_rate
-                argument to this object's constructor.
-
-                Note that, after you finish training a network with dropout, it is a good
-                idea to replace each dropout_ layer with a multiply_ layer because the
-                multiply_ layer is faster and deterministic. 
-        !*/
-
-    public:
-
-        explicit dropout_(
-            float drop_rate = 0.5
-        );
-        /*!
-            requires
-                - 0 <= drop_rate <= 1
-            ensures
-                - #get_drop_rate() == drop_rate
-        !*/
-
-        float get_drop_rate (
-        ) const; 
-        /*!
-            ensures
-                - returns the probability that an individual input value to this layer will
-                  be replaced with 0.
-        !*/
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        void forward_inplace(const tensor& input, tensor& output);
-        void backward_inplace(const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
-        dpoint map_input_to_output(dpoint p) const;
-        dpoint map_output_to_input(dpoint p) const;
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
-        !*/
-    };
-
-    template <typename SUBNET>
-    using dropout = add_layer<dropout_, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-
-    class multiply_
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, it defines a basic layer that just
-                multiplies its input tensor with a constant value and returns the result.
-                It therefore has no learnable parameters.
-        !*/
-
-    public:
-        explicit multiply_(
-            float val = 0.5
-        ); 
-        /*!
-            ensures
-                - #get_multiply_value() == val
-        !*/
-
-        multiply_ (
-            const dropout_& item
-        ); 
-        /*!
-            ensures
-                - #get_multiply_value() == 1-item.get_drop_rate()
-                  (i.e. We construct the multiply_ layer so that it is essentially a
-                  deterministic version of the given dropout_ layer)
-        !*/
-
-        float get_multiply_value (
-        ) const;
-        /*!
-            ensures
-                - this layer simply multiplies its input tensor by get_multiply_value() and
-                  produces the result as output.
-        !*/
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        void forward_inplace(const tensor& input, tensor& output);
-        void backward_inplace(const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
-        dpoint map_input_to_output(dpoint p) const;
-        dpoint map_output_to_input(dpoint p) const;
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
-        !*/
-    };
-
-    template <typename SUBNET>
-    using multiply = add_layer<multiply_, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-
-    enum layer_mode
-    {
-        CONV_MODE = 0, // convolutional mode
-        FC_MODE = 1    // fully connected mode
-    };
-
-    const double DEFAULT_BATCH_NORM_EPS = 0.0001;
-
-    template <
-        layer_mode mode
-        >
-    class bn_
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, it defines a batch normalization layer that
-                implements the method described in the paper: 
-                    Batch Normalization: Accelerating Deep Network Training by Reducing
-                    Internal Covariate Shift by Sergey Ioffe and Christian Szegedy
-                
-                In particular, this layer produces output tensors with the same
-                dimensionality as the input tensors, except that the mean and variances of
-                the elements have been standardized to 0 and 1 respectively. 
-
-                It should also be noted that when tensors with a num_samples() dimension of
-                1 are passed to this layer it doesn't perform batch normalization.
-                Instead, it runs in "inference mode" where the learned linear normalizing
-                transformation is used to transform the tensor. 
-
-                Finally, after you finish training a batch normalized network, it is a good
-                idea to replace each bn_ layer with an affine_ layer because the affine_
-                layer is faster and will never surprise you by performing batch
-                normalization on tensors that have a num_samples() dimension > 1.  This allows
-                you to run large mini-batches of samples through your final network without
-                batch normalization executing at all. 
-        !*/
-
-    public:
-        bn_(
-        );
-        /*!
-            ensures
-                - #get_mode() == mode
-                - #get_running_stats_window_size()      == 100
-                - #get_learning_rate_multiplier()       == 1
-                - #get_weight_decay_multiplier()        == 0
-                - #get_bias_learning_rate_multiplier()  == 1
-                - #get_bias_weight_decay_multiplier()   == 1
-                - #get_eps() == tt::DEFAULT_BATCH_NORM_EPS
-        !*/
-
-        explicit bn_(
-            unsigned long window_size,
-            double eps = tt::DEFAULT_BATCH_NORM_EPS
-        );
-        /*!
-            requires
-                - eps > 0
-                - window_size > 0
-            ensures
-                - #get_mode() == mode 
-                - #get_running_stats_window_size()     == window_size
-                - #get_learning_rate_multiplier()      == 1
-                - #get_weight_decay_multiplier()       == 0
-                - #get_bias_learning_rate_multiplier() == 1
-                - #get_bias_weight_decay_multiplier()  == 1
-                - #get_eps() == eps
-        !*/
-
-        layer_mode get_mode(
-        ) const; 
-        /*!
-            ensures
-                - returns the mode of this layer, either CONV_MODE or FC_MODE.
-                  If the mode is FC_MODE then the normalization is applied across the
-                  samples in a tensor (i.e. k()*nr()*nc() different things will be
-                  normalized).  Otherwise, normalization is applied across everything
-                  except for the k() dimension, resulting in there being only k()
-                  normalization equations that are applied spatially over the tensor.
-
-                  Therefore, if you are putting batch normalization after a fully connected
-                  layer you should use FC_MODE.  Otherwise, if you are putting batch
-                  normalization after a convolutional layer you should use CONV_MODE.
-        !*/
-
-        double get_eps(
-        ) const; 
-        /*!
-            ensures
-                - When doing batch normalization, we are dividing by the standard
-                  deviation.  This epsilon value returned by this function is added to the
-                  variance to prevent the division from dividing by zero.
-        !*/
-
-        unsigned long get_running_stats_window_size (
-        ) const; 
-        /*!
-            ensures
-                - Just as recommended in the batch normalization paper, this object keeps a
-                  running average of the mean and standard deviations of the features.
-                  These averages are used during "inference mode" so you can run a single
-                  object through a batch normalized network.  They are also what is used to
-                  initialize an affine_ layer that is constructed from a bn_ layer.  This
-                  function returns the effective number of recent samples used to compute
-                  the running average.
-        !*/
-
-        void set_running_stats_window_size (
-            unsigned long new_window_size
-        );
-        /*!
-            requires
-                - new_window_size > 0
-            ensures
-                - #get_running_stats_window_size() == new_window_size
-        !*/
-
-        double get_learning_rate_multiplier(
-        ) const;  
-        /*!
-            ensures
-                - returns a multiplier number.  The interpretation is that this object is
-                  requesting that the learning rate used to optimize its parameters be
-                  multiplied by get_learning_rate_multiplier().
-        !*/
-
-        double get_weight_decay_multiplier(
-        ) const; 
-        /*!
-            ensures
-                - returns a multiplier number.  The interpretation is that this object is
-                  requesting that the weight decay used to optimize its parameters be
-                  multiplied by get_weight_decay_multiplier().
-        !*/
-
-        void set_learning_rate_multiplier(
-            double val
-        );
-        /*!
-            requires
-                - val >= 0
-            ensures
-                - #get_learning_rate_multiplier() == val
-        !*/
-
-        void set_weight_decay_multiplier(
-            double val
-        ); 
-        /*!
-            requires
-                - val >= 0
-            ensures
-                - #get_weight_decay_multiplier() == val
-        !*/
-
-        double get_bias_learning_rate_multiplier(
-        ) const; 
-        /*!
-            ensures
-                - returns a multiplier number.  The interpretation is that this object is
-                  requesting that the learning rate used to optimize its bias parameters be
-                  multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
-        !*/
-
-        double get_bias_weight_decay_multiplier(
-        ) const; 
-        /*!
-            ensures
-                - returns a multiplier number.  The interpretation is that this object is
-                  requesting that the weight decay used to optimize its bias parameters be
-                  multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
-        !*/
-
-        void set_bias_learning_rate_multiplier(
-            double val
-        ); 
-        /*!
-            requires
-                - val >= 0
-            ensures
-                - #get_bias_learning_rate_multiplier() == val
-        !*/
-
-        void set_bias_weight_decay_multiplier(
-            double val
-        ); 
-        /*!
-            requires
-                - val >= 0
-            ensures
-                - #get_bias_weight_decay_multiplier() == val
-        !*/
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
-        template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
-        dpoint map_input_to_output(dpoint p) const;
-        dpoint map_output_to_input(dpoint p) const;
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
-        !*/
-    };
-
-    template <typename SUBNET>
-    using bn_con = add_layer<bn_<CONV_MODE>, SUBNET>;
-    template <typename SUBNET>
-    using bn_fc = add_layer<bn_<FC_MODE>, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-
-    template <typename net_type>
-    void set_all_bn_running_stats_window_sizes (
-        const net_type& net,
-        unsigned long new_window_size
-    );
-    /*!
-        requires
-            - new_window_size > 0
-            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
-              add_tag_layer.
-        ensures
-            - Sets the get_running_stats_window_size() field of all bn_ layers in net to
-              new_window_size.
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    class affine_
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, it applies a simple pointwise linear
-                transformation to an input tensor.  You can think of it as having two
-                parameter tensors, A and B.  If the input tensor is called INPUT then the
-                output of this layer is:
-                    A*INPUT+B
-                where all operations are performed element wise and each sample in the
-                INPUT tensor is processed separately.
-
-                Moreover, this object has two modes that effect the dimensionalities of A
-                and B and how they are applied to compute A*INPUT+B.  If
-                get_mode()==FC_MODE then A and B each have the same dimensionality as the
-                input tensor, except their num_samples() dimensions are 1.  If
-                get_mode()==CONV_MODE then A and B have all their dimensions set to 1
-                except for k(), which is equal to INPUT.k().
-
-                In either case, the computation of A*INPUT+B is performed pointwise over all
-                the elements of INPUT using either:
-                    OUTPUT(n,k,r,c) == A(1,k,r,c)*INPUT(n,k,r,c)+B(1,k,r,c)
-                or
-                    OUTPUT(n,k,r,c) == A(1,k,1,1)*INPUT(n,k,r,c)+B(1,k,1,1)
-                as appropriate.
-
-
-                Finally, note that the parameters of this layer are not learnable and
-                therefore not modified during network updates.  Instead, the layer will
-                perform the identity transformation unless it is initialized with a bn_
-                layer, in which case it will perform whatever transformation the bn_ layer
-                has learned.
-        !*/
-
-    public:
-
-        affine_(
-        );
-        /*!
-            ensures
-                - #get_mode() == FC_MODE 
-        !*/
-
-        affine_(
-            layer_mode mode
-        );
-        /*!
-            ensures
-                - #get_mode() == mode
-        !*/
-
-        template <
-            layer_mode mode
-            >
-        affine_(
-            const bn_<mode>& layer
-        );
-        /*!
-            ensures
-                - Constructs affine_ so that it performs the same transformation as the
-                  supplied batch normalization layer.  You would want to do this after you
-                  finish training a network with bn_ layers because the affine_ layer will
-                  execute faster.  
-                - #get_mode() == layer.get_mode()
-        !*/
-
-        layer_mode get_mode(
-        ) const; 
-        /*!
-            ensures
-                - returns the mode of this layer, either CONV_MODE or FC_MODE.  
-        !*/
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        void forward_inplace(const tensor& input, tensor& output);
-        void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
-        dpoint map_input_to_output(dpoint p) const;
-        dpoint map_output_to_input(dpoint p) const;
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the
-            EXAMPLE_COMPUTATIONAL_LAYER_ interface.  Also note that get_layer_params()
-            always returns an empty tensor since there are no learnable parameters in this
-            object.
-        !*/
-
-    };
-
-    template <typename SUBNET>
-    using affine = add_layer<affine_, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-
-    template <
-        long _nr,
-        long _nc,
-        int _stride_y,
-        int _stride_x,
-        int _padding_y = _stride_y!=1? 0 : _nr/2,
-        int _padding_x = _stride_x!=1? 0 : _nc/2
-        >
-    class max_pool_
-    {
-        /*!
-            REQUIREMENTS ON TEMPLATE ARGUMENTS
-                - _nr >= 0
-                - _nc >= 0
-                - _stride_y > 0
-                - _stride_x > 0
-                - _padding_y >= 0
-                - _padding_x >= 0
-                - if (_nr != 0) then
-                    - _padding_y < _nr
-                - else
-                    - _padding_y == 0
-                - if (_nc != 0) then
-                    - _padding_x < _nr
-                - else
-                    - _padding_x == 0
-
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, it defines a max pooling layer that takes an
-                input tensor and downsamples it.  It does this by sliding a window over the
-                images in an input tensor and outputting, for each channel, the maximum
-                element within the window.  
-
-                If _nr == 0 then it means the filter size covers all the rows in the input
-                tensor, similarly for the _nc parameter.  To be precise, if we call the
-                input tensor IN and the output tensor OUT, then OUT is defined as follows:
-                    - let FILT_NR == (nr()==0) ? IN.nr() : nr()
-                    - let FILT_NC == (nc()==0) ? IN.nc() : nc()
-                    - OUT.num_samples() == IN.num_samples()
-                    - OUT.k()  == IN.k()
-                    - OUT.nr() == 1+(IN.nr() + 2*padding_y() - FILT_NR)/stride_y()
-                    - OUT.nc() == 1+(IN.nc() + 2*padding_x() - FILT_NC)/stride_x()
-                    - for all valid s, k, r, and c:
-                        - image_plane(OUT,s,k)(r,c) == max(subm_clipped(image_plane(IN,s,k),
-                                                                  centered_rect(x*stride_x() + FILT_NC/2 - padding_x(),
-                                                                                y*stride_y() + FILT_NR/2 - padding_y(),
-                                                                                FILT_NC,
-                                                                                FILT_NR)))
-        !*/
-
-    public:
-
-        max_pool_ (
-        );
-        /*!
-            ensures
-                - #nr() == _nr
-                - #nc() == _nc
-                - #stride_y() == _stride_y
-                - #stride_x() == _stride_x
-                - #padding_y() == _padding_y
-                - #padding_x() == _padding_x
-        !*/
-
-        long nr(
-        ) const; 
-        /*!
-            ensures
-                - returns the number of rows in the pooling window or 0 if the window size
-                  is "the entire input tensor".
-        !*/
-
-        long nc(
-        ) const;
-        /*!
-            ensures
-                - returns the number of rows in the pooling window or 0 if the window size
-                  is "the entire input tensor".
-        !*/
-
-        long stride_y(
-        ) const; 
-        /*!
-            ensures
-                - returns the vertical stride used when scanning the max pooling window
-                  over an image.  That is, each window will be moved stride_y() pixels down
-                  at a time when it moves over the image.
-        !*/
-
-        long stride_x(
-        ) const;
-        /*!
-            ensures
-                - returns the horizontal stride used when scanning the max pooling window
-                  over an image.  That is, each window will be moved stride_x() pixels down
-                  at a time when it moves over the image.
-        !*/
-
-        long padding_y(
-        ) const; 
-        /*!
-            ensures
-                - returns the number of pixels of zero padding added to the top and bottom
-                  sides of the image.
-        !*/
-
-        long padding_x(
-        ) const; 
-        /*!
-            ensures
-                - returns the number of pixels of zero padding added to the left and right 
-                  sides of the image.
-        !*/
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
-        template <typename SUBNET> void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
-        dpoint map_input_to_output(dpoint p) const;
-        dpoint map_output_to_input(dpoint p) const;
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ 
-            interface.  Note that this layer doesn't have any parameters, so the tensor
-            returned by get_layer_params() is always empty.
-        !*/
-    };
-
-    template <
-        long nr,
-        long nc,
-        int stride_y,
-        int stride_x,
-        typename SUBNET
-        >
-    using max_pool = add_layer<max_pool_<nr,nc,stride_y,stride_x>, SUBNET>;
-
-    template <
-        typename SUBNET
-        >
-    using max_pool_everything = add_layer<max_pool_<0,0,1,1>, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-
-    template <
-        long _nr,
-        long _nc,
-        int _stride_y,
-        int _stride_x,
-        int _padding_y = _stride_y!=1? 0 : _nr/2,
-        int _padding_x = _stride_x!=1? 0 : _nc/2
-        >
-    class avg_pool_
-    {
-        /*!
-            REQUIREMENTS ON TEMPLATE ARGUMENTS
-                - _nr >= 0
-                - _nc >= 0
-                - _stride_y > 0
-                - _stride_x > 0
-                - _padding_y >= 0
-                - _padding_x >= 0
-                - if (_nr != 0) then
-                    - _padding_y < _nr
-                - else
-                    - _padding_y == 0
-                - if (_nc != 0) then
-                    - _padding_x < _nr
-                - else
-                    - _padding_x == 0
-
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, it defines an average pooling layer that
-                takes an input tensor and downsamples it.  It does this by sliding a window
-                over the images in an input tensor and outputting, for each channel, the
-                average element within the window.  
-
-                If _nr == 0 then it means the filter size covers all the rows in the input
-                tensor, similarly for the _nc parameter.  To be precise, if we call the
-                input tensor IN and the output tensor OUT, then OUT is defined as follows:
-                    - let FILT_NR == (nr()==0) ? IN.nr() : nr()
-                    - let FILT_NC == (nc()==0) ? IN.nc() : nc()
-                    - OUT.num_samples() == IN.num_samples()
-                    - OUT.k()  == IN.k()
-                    - OUT.nr() == 1+(IN.nr() + 2*padding_y() - FILT_NR)/stride_y()
-                    - OUT.nc() == 1+(IN.nc() + 2*padding_x() - FILT_NC)/stride_x()
-                    - for all valid s, k, r, and c:
-                        - image_plane(OUT,s,k)(r,c) == mean(subm_clipped(image_plane(IN,s,k),
-                                                                  centered_rect(x*stride_x() + FILT_NC/2 - padding_x(),
-                                                                                y*stride_y() + FILT_NR/2 - padding_y(),
-                                                                                FILT_NC,
-                                                                                FILT_NR)))
-        !*/
-
-    public:
-
-        avg_pool_ (
-        );
-        /*!
-            ensures
-                - #nr() == _nr
-                - #nc() == _nc
-                - #stride_y() == _stride_y
-                - #stride_x() == _stride_x
-                - #padding_y() == _padding_y
-                - #padding_x() == _padding_x
-        !*/
-
-        long nr(
-        ) const; 
-        /*!
-            ensures
-                - returns the number of rows in the pooling window or 0 if the window size
-                  is "the entire input tensor".
-        !*/
-
-        long nc(
-        ) const;
-        /*!
-            ensures
-                - returns the number of rows in the pooling window or 0 if the window size
-                  is "the entire input tensor".
-        !*/
-
-        long stride_y(
-        ) const; 
-        /*!
-            ensures
-                - returns the vertical stride used when scanning the pooling window
-                  over an image.  That is, each window will be moved stride_y() pixels down
-                  at a time when it moves over the image.
-        !*/
-
-        long stride_x(
-        ) const;
-        /*!
-            ensures
-                - returns the horizontal stride used when scanning the pooling window
-                  over an image.  That is, each window will be moved stride_x() pixels down
-                  at a time when it moves over the image.
-        !*/
-
-        long padding_y(
-        ) const; 
-        /*!
-            ensures
-                - returns the number of pixels of zero padding added to the top and bottom
-                  sides of the image.
-        !*/
-
-        long padding_x(
-        ) const; 
-        /*!
-            ensures
-                - returns the number of pixels of zero padding added to the left and right 
-                  sides of the image.
-        !*/
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
-        template <typename SUBNET> void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
-        dpoint map_input_to_output(dpoint p) const;
-        dpoint map_output_to_input(dpoint p) const;
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ 
-            interface.  Note that this layer doesn't have any parameters, so the tensor
-            returned by get_layer_params() is always empty.
-        !*/
-
-    };
-
-    template <
-        long nr,
-        long nc,
-        int stride_y,
-        int stride_x,
-        typename SUBNET
-        >
-    using avg_pool = add_layer<avg_pool_<nr,nc,stride_y,stride_x>, SUBNET>;
-
-    template <
-        typename SUBNET
-        >
-    using avg_pool_everything = add_layer<avg_pool_<0,0,1,1>, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-
-    class relu_
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, it defines a rectified linear layer.
-                Therefore, it passes its inputs through the function 
-                    f(x)=max(x,0) 
-                where f() is applied pointwise across the input tensor.
-        !*/
-
-    public:
-
-        relu_(
-        );
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        void forward_inplace(const tensor& input, tensor& output);
-        void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
-        dpoint map_input_to_output(dpoint p) const;
-        dpoint map_output_to_input(dpoint p) const;
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ 
-            interface.  Note that this layer doesn't have any parameters, so the tensor
-            returned by get_layer_params() is always empty.
-        !*/
-    };
-
-    template <typename SUBNET>
-    using relu = add_layer<relu_, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-
-    class prelu_
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, it defines a parametric rectified linear
-                layer.  Therefore, it passes its inputs through the function 
-                    f(x) = x>0 ? x : p*x 
-                where f() is applied pointwise across the input tensor and p is a scalar
-                parameter learned by this layer.
-
-
-                This is the layer type introduced in the paper:
-                    He, Kaiming, et al. "Delving deep into rectifiers: Surpassing
-                    human-level performance on imagenet classification." Proceedings of the
-                    IEEE International Conference on Computer Vision. 2015.
-        !*/
-
-    public:
-
-        explicit prelu_(
-            float initial_param_value = 0.25
-        );
-        /*!
-            ensures
-                - The p parameter will be initialized with initial_param_value.
-                - #get_initial_param_value() == initial_param_value.
-        !*/
-
-        float get_initial_param_value (
-        ) const;
-        /*!
-            ensures
-                - returns the initial value of the prelu parameter. 
-        !*/
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        void forward_inplace(const tensor& input, tensor& output);
-        void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
-        dpoint map_input_to_output(dpoint p) const;
-        dpoint map_output_to_input(dpoint p) const;
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
-        !*/
-    };
-
-    template <typename SUBNET>
-    using prelu = add_layer<prelu_, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-
-    class sig_
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, it defines a sigmoid layer.  Therefore, it
-                passes its inputs through the function 
-                    f(x)=1/(1+exp(-x)) 
-                where f() is applied pointwise across the input tensor.
-        !*/
-
-    public:
-
-        sig_(
-        );
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        void forward_inplace(const tensor& input, tensor& output);
-        void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
-        dpoint map_input_to_output(dpoint p) const;
-        dpoint map_output_to_input(dpoint p) const;
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ 
-            interface.  Note that this layer doesn't have any parameters, so the tensor
-            returned by get_layer_params() is always empty.
-        !*/
-    };
-
-    template <typename SUBNET>
-    using sig = add_layer<sig_, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-
-    class htan_
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, it defines a hyperbolic tangent layer.
-                Therefore, it passes its inputs through the function 
-                    f(x)=std::tanh(x)
-                where f() is applied pointwise across the input tensor.
-        !*/
-
-    public:
-
-        htan_(
-        );
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        void forward_inplace(const tensor& input, tensor& output);
-        void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
-        dpoint map_input_to_output(dpoint p) const;
-        dpoint map_output_to_input(dpoint p) const;
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ 
-            interface.  Note that this layer doesn't have any parameters, so the tensor
-            returned by get_layer_params() is always empty.
-        !*/
-    };
-
-    template <typename SUBNET>
-    using htan = add_layer<htan_, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-
-    class softmax_
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, it defines a softmax layer.  To be precise,
-                we define the softmax function s(x) as:
-                    s(x) == exp(x)/sum(exp(x)) 
-                where x is a vector.  Then this layer treats its input tensor as a
-                collection of multi-channel images and applies s() to each spatial location
-                in each image.  In each application, the tensor::k() channel elements at
-                each position are input to s() and then replaced by the outputs of s().   
-
-                This means that, for example, if you collapsed each output image to a 1
-                channel image by adding the channels then you would end up with images
-                where each pixel value was 1.  This is because the sum of the outputs of
-                s() will always be equal to 1.
-        !*/
-
-    public:
-
-        softmax_(
-        );
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        void forward_inplace(const tensor& input, tensor& output);
-        void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ 
-            interface.  Note that this layer doesn't have any parameters, so the tensor
-            returned by get_layer_params() is always empty.
-        !*/
-    };
-
-    template <typename SUBNET>
-    using softmax = add_layer<softmax_, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-
-    class softmax_all_
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, it defines a softmax layer.  To be precise,
-                we define the softmax function s(x) as:
-                    s(x) == exp(x)/sum(exp(x)) 
-                where x is a vector.  Then this layer treats its input tensor as a
-                collection of tensor::num_samples() vectors and applies s() to each vector
-                in the tensor.  Therefore, there are logically tensor::num_samples()
-                invocations of s().
-        !*/
-
-    public:
-
-        softmax_all_(
-        );
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        void forward_inplace(const tensor& input, tensor& output);
-        void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ 
-            interface.  Note that this layer doesn't have any parameters, so the tensor
-            returned by get_layer_params() is always empty.
-        !*/
-    };
-
-    template <typename SUBNET>
-    using softmax_all = add_layer<softmax_all_, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-
-    template <
-        template<typename> class tag
-        >
-    class add_prev_
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  This layer simply adds the output of two previous layers.
-                In particular, it adds the tensor from its immediate predecessor layer,
-                sub.get_output(), with the tensor from a deeper layer,
-                layer<tag>(sub).get_output().
-
-                Therefore, you supply a tag via add_prev_'s template argument that tells it
-                what layer to add to the output of the previous layer.  The result of this
-                addition is output by add_prev_.  Finally, the addition happens pointwise
-                according to 4D tensor arithmetic.  If the dimensions don't match then
-                missing elements are presumed to be equal to 0.  Moreover, each dimension
-                of the output tensor is equal to the maximum dimension of either of the
-                inputs.  That is, if the tensors A and B are being added to produce C then:
-                    - C.num_samples() == max(A.num_samples(), B.num_samples())
-                    - C.k()  == max(A.k(), B.k())
-                    - C.nr() == max(A.nr(), B.nr())
-                    - C.nc() == max(A.nc(), B.nc())
-        !*/
-
-    public:
-        add_prev_(
-        ); 
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
-        template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
-        dpoint map_input_to_output(dpoint p) const;
-        dpoint map_output_to_input(dpoint p) const;
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
-        !*/
-    };
-
-
-    template <
-        template<typename> class tag,
-        typename SUBNET
-        >
-    using add_prev = add_layer<add_prev_<tag>, SUBNET>;
-
-    // Here we add some convenient aliases for using add_prev_ with the tag layers. 
-    template <typename SUBNET> using add_prev1  = add_prev<tag1, SUBNET>;
-    template <typename SUBNET> using add_prev2  = add_prev<tag2, SUBNET>;
-    template <typename SUBNET> using add_prev3  = add_prev<tag3, SUBNET>;
-    template <typename SUBNET> using add_prev4  = add_prev<tag4, SUBNET>;
-    template <typename SUBNET> using add_prev5  = add_prev<tag5, SUBNET>;
-    template <typename SUBNET> using add_prev6  = add_prev<tag6, SUBNET>;
-    template <typename SUBNET> using add_prev7  = add_prev<tag7, SUBNET>;
-    template <typename SUBNET> using add_prev8  = add_prev<tag8, SUBNET>;
-    template <typename SUBNET> using add_prev9  = add_prev<tag9, SUBNET>;
-    template <typename SUBNET> using add_prev10 = add_prev<tag10, SUBNET>;
-    using add_prev1_  = add_prev_<tag1>;
-    using add_prev2_  = add_prev_<tag2>;
-    using add_prev3_  = add_prev_<tag3>;
-    using add_prev4_  = add_prev_<tag4>;
-    using add_prev5_  = add_prev_<tag5>;
-    using add_prev6_  = add_prev_<tag6>;
-    using add_prev7_  = add_prev_<tag7>;
-    using add_prev8_  = add_prev_<tag8>;
-    using add_prev9_  = add_prev_<tag9>;
-    using add_prev10_ = add_prev_<tag10>;
-
-// ----------------------------------------------------------------------------------------
-
-    template <
-        template<typename> class tag
-        >
-    class mult_prev_
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  This layer simply multiplies the output of two previous
-                layers.  In particular, it multiplies the tensor from its immediate
-                predecessor layer, sub.get_output(), with the tensor from a deeper layer,
-                layer<tag>(sub).get_output().
-
-                Therefore, you supply a tag via mult_prev_'s template argument that tells
-                it what layer to multiply with the output of the previous layer.  The
-                result of this multiplication is output by mult_prev_.  Finally, the
-                multiplication happens pointwise according to 4D tensor arithmetic.  If the
-                dimensions don't match then missing elements are presumed to be equal to 0.
-                Moreover, each dimension of the output tensor is equal to the maximum
-                dimension of either of the inputs.  That is, if the tensors A and B are
-                being multiplied to produce C then:
-                    - C.num_samples() == max(A.num_samples(), B.num_samples())
-                    - C.k()  == max(A.k(), B.k())
-                    - C.nr() == max(A.nr(), B.nr())
-                    - C.nc() == max(A.nc(), B.nc())
-        !*/
-
-    public:
-        mult_prev_(
-        ); 
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
-        template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
-        !*/
-    };
-
-
-    template <
-        template<typename> class tag,
-        typename SUBNET
-        >
-    using mult_prev = add_layer<mult_prev_<tag>, SUBNET>;
-
-    // Here we add some convenient aliases for using mult_prev_ with the tag layers. 
-    template <typename SUBNET> using mult_prev1  = mult_prev<tag1, SUBNET>;
-    template <typename SUBNET> using mult_prev2  = mult_prev<tag2, SUBNET>;
-    template <typename SUBNET> using mult_prev3  = mult_prev<tag3, SUBNET>;
-    template <typename SUBNET> using mult_prev4  = mult_prev<tag4, SUBNET>;
-    template <typename SUBNET> using mult_prev5  = mult_prev<tag5, SUBNET>;
-    template <typename SUBNET> using mult_prev6  = mult_prev<tag6, SUBNET>;
-    template <typename SUBNET> using mult_prev7  = mult_prev<tag7, SUBNET>;
-    template <typename SUBNET> using mult_prev8  = mult_prev<tag8, SUBNET>;
-    template <typename SUBNET> using mult_prev9  = mult_prev<tag9, SUBNET>;
-    template <typename SUBNET> using mult_prev10 = mult_prev<tag10, SUBNET>;
-    using mult_prev1_  = mult_prev_<tag1>;
-    using mult_prev2_  = mult_prev_<tag2>;
-    using mult_prev3_  = mult_prev_<tag3>;
-    using mult_prev4_  = mult_prev_<tag4>;
-    using mult_prev5_  = mult_prev_<tag5>;
-    using mult_prev6_  = mult_prev_<tag6>;
-    using mult_prev7_  = mult_prev_<tag7>;
-    using mult_prev8_  = mult_prev_<tag8>;
-    using mult_prev9_  = mult_prev_<tag9>;
-    using mult_prev10_ = mult_prev_<tag10>;
-
-// ----------------------------------------------------------------------------------------
-
-    template <
-        template<typename> class tag
-        >
-    class scale_
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  This layer scales the output channels of the tagged layer
-                by multiplying it with the output of the previous layer.  To be specific:
-                    - Let INPUT  == layer<tag>(sub).get_output()
-                    - Let SCALES == sub.get_output()
-                    - This layer takes INPUT and SCALES as input.
-                    - The output of this layer has the same dimensions as INPUT.
-                    - This layer requires:
-                        - SCALES.num_samples() == INPUT.num_samples()
-                        - SCALES.k()  == INPUT.k()
-                        - SCALES.nr() == 1
-                        - SCALES.nc() == 1
-                    - The output tensor is produced by pointwise multiplying SCALES with
-                      INPUT at each spatial location.  Therefore, if OUT is the output of
-                      this layer then we would have:
-                        OUT(n,k,r,c) == INPUT(n,k,r,c)*SCALES(n,k)
-        !*/
-
-    public:
-        scale_(
-        ); 
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
-        template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
-        !*/
-    };
-
-
-    template <
-        template<typename> class tag,
-        typename SUBNET
-        >
-    using scale = add_layer<scale_<tag>, SUBNET>;
-
-    // Here we add some convenient aliases for using scale_ with the tag layers. 
-    template <typename SUBNET> using scale1  = scale<tag1, SUBNET>;
-    template <typename SUBNET> using scale2  = scale<tag2, SUBNET>;
-    template <typename SUBNET> using scale3  = scale<tag3, SUBNET>;
-    template <typename SUBNET> using scale4  = scale<tag4, SUBNET>;
-    template <typename SUBNET> using scale5  = scale<tag5, SUBNET>;
-    template <typename SUBNET> using scale6  = scale<tag6, SUBNET>;
-    template <typename SUBNET> using scale7  = scale<tag7, SUBNET>;
-    template <typename SUBNET> using scale8  = scale<tag8, SUBNET>;
-    template <typename SUBNET> using scale9  = scale<tag9, SUBNET>;
-    template <typename SUBNET> using scale10 = scale<tag10, SUBNET>;
-    using scale1_  = scale_<tag1>;
-    using scale2_  = scale_<tag2>;
-    using scale3_  = scale_<tag3>;
-    using scale4_  = scale_<tag4>;
-    using scale5_  = scale_<tag5>;
-    using scale6_  = scale_<tag6>;
-    using scale7_  = scale_<tag7>;
-    using scale8_  = scale_<tag8>;
-    using scale9_  = scale_<tag9>;
-    using scale10_ = scale_<tag10>;
-
-// ----------------------------------------------------------------------------------------
-
-    template<
-        template<typename> class... TAG_TYPES
-        >
-    class concat_
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  This layer simply concatenates the output of tagged layers.
-                Importantly, each input layer must have the same dimensions (i.e.
-                num_samples, nr, and nc) except for the k channel, which may vary.  This is
-                because the concatenation happens along the k dimension.  That is, the
-                output of this network is a tensor, OUT, that is the concatenation of the
-                tensors:
-                    for each (tag in TAG_TYPES)
-                        layer<tag>(subnet).get_output()
-                Therefore, out.num_samples(), out.nr(), and out.nc() match the dimensions
-                of the input tensors while OUT.k() is the sum of the input layer's k()
-                dimensions.
-        !*/
-
-    public:
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
-        template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
-        dpoint map_input_to_output(dpoint p) const;
-        dpoint map_output_to_input(dpoint p) const;
-        const tensor& get_layer_params() const;
-        tensor& get_layer_params();
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
-        !*/
-    };
-
-
-    // concat layer definitions
-    template <template<typename> class TAG1,
-              template<typename> class TAG2,
-              typename SUBNET>
-    using concat2 = add_layer<concat_<TAG1, TAG2>, SUBNET>;
-
-    template <template<typename> class TAG1,
-              template<typename> class TAG2,
-              template<typename> class TAG3,
-              typename SUBNET>
-    using concat3 = add_layer<concat_<TAG1, TAG2, TAG3>, SUBNET>;
-
-    template <template<typename> class TAG1,
-              template<typename> class TAG2,
-              template<typename> class TAG3,
-              template<typename> class TAG4,
-              typename SUBNET>
-    using concat4 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4>, SUBNET>;
-
-    template <template<typename> class TAG1,
-              template<typename> class TAG2,
-              template<typename> class TAG3,
-              template<typename> class TAG4,
-              template<typename> class TAG5,
-              typename SUBNET>
-    using concat5 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4, TAG5>, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-    
-    /*!A inception layer definitions !*/
-
-    // Now define inception layer tag types.  These layer aliases allow creating
-    // the networks described in the paper: 
-    //   Szegedy, Christian, et al. "Going deeper with convolutions." Proceedings of
-    //   the IEEE Conference on Computer Vision and Pattern Recognition. 2015.
-    // See the dnn_inception_ex.cpp example for a complete example of their use.  Note also
-    // that we use tag ID numbers >= 1000 to avoid conflict with user's tag layers.
-    template <typename SUBNET> using itag0  = add_tag_layer< 1000 + 0, SUBNET>;
-    template <typename SUBNET> using itag1  = add_tag_layer< 1000 + 1, SUBNET>;
-    template <typename SUBNET> using itag2  = add_tag_layer< 1000 + 2, SUBNET>;
-    template <typename SUBNET> using itag3  = add_tag_layer< 1000 + 3, SUBNET>;
-    template <typename SUBNET> using itag4  = add_tag_layer< 1000 + 4, SUBNET>;
-    template <typename SUBNET> using itag5  = add_tag_layer< 1000 + 5, SUBNET>;
-    // skip to inception input
-    template <typename SUBNET> using iskip  = add_skip_layer< itag0, SUBNET>;
-
-    // here are some templates to be used for creating inception layer groups
-    template <template<typename>class B1,
-              template<typename>class B2,
-              typename SUBNET>
-    using inception2 = concat2<itag1, itag2, itag1<B1<iskip< itag2<B2< itag0<SUBNET>>>>>>>;
-
-    template <template<typename>class B1,
-              template<typename>class B2,
-              template<typename>class B3,
-              typename SUBNET>
-    using inception3 = concat3<itag1, itag2, itag3, itag1<B1<iskip< itag2<B2<iskip< itag3<B3<  itag0<SUBNET>>>>>>>>>>;
-
-    template <template<typename>class B1,
-              template<typename>class B2,
-              template<typename>class B3,
-              template<typename>class B4,
-              typename SUBNET>
-    using inception4 = concat4<itag1, itag2, itag3, itag4,
-                itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip<  itag4<B4<  itag0<SUBNET>>>>>>>>>>>>>;
-
-    template <template<typename>class B1,
-              template<typename>class B2,
-              template<typename>class B3,
-              template<typename>class B4,
-              template<typename>class B5,
-              typename SUBNET>
-    using inception5 = concat5<itag1, itag2, itag3, itag4, itag5,
-                itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip<  itag4<B4<iskip<  itag5<B5<  itag0<SUBNET>>>>>>>>>>>>>>>>;
-
-// ----------------------------------------------------------------------------------------
-
-    const double DEFAULT_L2_NORM_EPS = 1e-5;
-
-    class l2normalize_
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  It takes tensors as input and L2 normalizes them.  In particular,
-                it has the following properties:
-                    - The output tensors from this layer have the same dimensions as the
-                      input tensors.
-                    - If you think of each input tensor as a set of tensor::num_samples()
-                      vectors, then the output tensor contains the same vectors except they
-                      have been length normalized so that their L2 norms are all 1.  I.e. 
-                      for each vector v we will have ||v||==1.
-        !*/
-
-    public:
-
-        explicit l2normalize_(
-            double eps = tt::DEFAULT_L2_NORM_EPS
-        );
-        /*!
-            requires
-                - eps > 0
-            ensures
-                - #get_eps() == eps
-        !*/
-
-        double get_eps(
-        ) const; 
-        /*!
-            ensures
-                - When we normalize a vector we divide it by its L2 norm.  However, the
-                  get_eps() value is added to the squared norm prior to division to avoid
-                  ever dividing by zero. 
-        !*/
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        void forward_inplace(const tensor& input, tensor& output);
-        void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
-        !*/
-    };
-
-// ----------------------------------------------------------------------------------------
-
-    template <
-        long _offset,
-        long _k,
-        long _nr,
-        long _nc
-        >
-    class extract_
-    {
-        /*!
-            REQUIREMENTS ON TEMPLATE ARGUMENTS
-                - 0 <= _offset
-                - 0 < _k
-                - 0 < _nr
-                - 0 < _nc
-
-            WHAT THIS OBJECT REPRESENTS
-                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, the output of this layer is simply a copy of
-                the input tensor.  However, you can configure the extract layer to output
-                only some subset of the input tensor and also to reshape it.  Therefore,
-                the dimensions of the tensor output by this layer are as follows (letting
-                IN be the input tensor and OUT the output tensor):
-                    - OUT.num_samples() == IN.num_samples()
-                    - OUT.k()  == _k 
-                    - OUT.nr() == _nr 
-                    - OUT.nc() == _nc 
-
-                So the output will always have the same number of samples as the input, but
-                within each sample (the k,nr,nc part) we will copy only a subset of the
-                values.  Moreover, the _offset parameter controls which part of each sample
-                we take.  To be very precise, we will have:
-                    - let IN_SIZE   = IN.k()*IN.nr()*IN.nc()
-                    - let OUT_SIZE  = _k*_nr*_nc 
-                    - for i in range[0,IN.num_samples()) and j in range[0,OUT_SIZE):
-                        - OUT.host()[i*OUT_SIZE+j] == IN.host()[i*IN_SIZE+_offset+j]
-
-
-                Finally, all this means that the input tensor to this layer must have a big
-                enough size to accommodate taking a _k*_nr*_nc slice from each of its
-                samples.  
-        !*/
-
-    public:
-
-        template <typename SUBNET> void setup (const SUBNET& sub);
-        template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
-        template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
-        const tensor& get_layer_params() const; 
-        tensor& get_layer_params(); 
-        /*!
-            These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
-        !*/
-    };
-
-    template <
-        long offset,
-        long k,
-        long nr,
-        long nc,
-        typename SUBNET
-        >
-    using extract = add_layer<extract_<offset,k,nr,nc>, SUBNET>;
-
-// ----------------------------------------------------------------------------------------
-
-}
-
-#endif // DLIB_DNn_LAYERS_ABSTRACT_H_
-