1 files changed, 0 insertions, 1711 deletions
diff --git a/ml/dlib/dlib/dnn/tensor_tools.h b/ml/dlib/dlib/dnn/tensor_tools.h
deleted file mode 100644
index 9ba3154e5..000000000
--- a/ml/dlib/dlib/dnn/tensor_tools.h
+++ /dev/null
@@ -1,1711 +0,0 @@
-// Copyright (C) 2015  Davis E. King (davis@dlib.net)
-// License: Boost Software License   See LICENSE.txt for the full license.
-#ifndef DLIB_TeNSOR_TOOLS_H_
-#define DLIB_TeNSOR_TOOLS_H_
-
-#include "tensor.h"
-#include "cudnn_dlibapi.h"
-#include "cublas_dlibapi.h"
-#include "cusolver_dlibapi.h"
-#include "curand_dlibapi.h"
-#include "cpu_dlib.h"
-#include "cuda_dlib.h"
-#include "../rand.h"
-#include <memory>
-#include "../geometry/rectangle.h"
-#include "../test_for_odr_violations.h"
-
-namespace dlib
-{
-    bool dnn_prefer_fastest_algorithms();
-    void set_dnn_prefer_fastest_algorithms();
-    void set_dnn_prefer_smallest_algorithms();
-}
-
-namespace dlib { namespace tt
-{
-
-// ----------------------------------------------------------------------------------------
-
-    void inverse_norms (
-        resizable_tensor& invnorms,
-        const tensor& data,
-        const double eps
-    );
-    /*!
-        ensures
-            - #invnorms == reciprocal(sqrt(sum_cols(squared(mat(data))) + eps))
-    !*/
-
-    void dot_prods (
-        resizable_tensor& out,
-        const tensor& lhs,
-        const tensor& rhs
-    );
-    /*!
-        requires
-            - have_same_dimensions(lhs,rhs) == true
-        ensures
-            - #out.num_samples() == lhs.num_samples()
-            - #out.k() == #out.nr() == #out.nc() == 1
-            - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs))); 
-    !*/
-
-    void dot_prods (
-        bool add_to,
-        tensor& out,
-        const tensor& lhs,
-        const tensor& rhs
-    );
-    /*!
-        requires
-            - have_same_dimensions(lhs,rhs) == true
-            - out.size() == lhs.num_samples()
-            - out.k() == out.nr() == out.nc() == 1
-        ensures
-            - if (add_to) then
-                - #out == mat(out) + sum_cols(pointwise_multiply(mat(lhs), mat(rhs))); 
-            - else
-                - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs))); 
-    !*/
-
-    void scale_columns (
-        tensor& out,
-        const tensor& m,
-        const tensor& v
-    );
-    /*!
-        requires
-            - have_same_dimensions(out,m) == true
-            - is_vector(v) == true
-            - v.size() == mat(m).nc()
-        ensures
-            - performs: out = scale_columns(mat(m),mat(v));
-    !*/
-
-    void scale_rows (
-        tensor& out,
-        const tensor& m,
-        const tensor& v
-    );
-    /*!
-        requires
-            - have_same_dimensions(out,m) == true
-            - is_vector(v) == true
-            - v.size() == m.num_samples()
-        ensures
-            - performs: out = scale_rows(mat(m),mat(v));
-    !*/
-
-    void scale_rows2 (
-        float beta, 
-        tensor& out,
-        const tensor& m1,
-        const tensor& m2,
-        const tensor& v1,
-        const tensor& v2
-    );
-    /*!
-        requires
-            - have_same_dimensions(out,m1) == true
-            - have_same_dimensions(out,m2) == true
-            - have_same_dimensions(v1,v2) == true
-            - is_vector(v1) == true
-            - v1.size() == m1.num_samples()
-        ensures
-            - performs: 
-                out = beta*out + scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2));
-    !*/
-
-// ----------------------------------------------------------------------------------------
-    
-    void exp (
-        tensor& dest,
-        const tensor& src
-    );
-    /*!
-        requires
-            - dest.size() == src.size()
-        ensures
-            - performs: dest = exp(mat(src))
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void log (
-        tensor& dest,
-        const tensor& src
-    );
-    /*!
-        requires
-            - dest.size() == src.size()
-        ensures
-            - performs: dest = log(mat(src))
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void log10 (
-        tensor& dest,
-        const tensor& src
-    );
-    /*!
-        requires
-            - dest.size() == src.size()
-        ensures
-            - performs: dest = log10(mat(src))
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void gemm (
-        float beta,
-        tensor& dest,
-        float alpha,
-        const tensor& lhs,
-        bool trans_lhs,
-        const tensor& rhs,
-        bool trans_rhs
-    );
-    /*!
-        requires
-            - dest does not alias the memory of lhs or rhs
-            - The dimensions of lhs and rhs must be compatible for matrix multiplication.
-              In particular:
-                - Let L == trans_lhs ? trans(mat(lhs)) : mat(lhs)
-                - Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs)
-                - Let D == mat(dest)
-                - D.nr() == L.nr() && D.nc() == R.nc()
-                  (i.e. dest must be preallocated and have the correct output dimensions)
-                - L.nc() == R.nr()
-        ensures
-            - performs: dest = alpha*L*R + beta*mat(dest)
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    class inv
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This is a functor for doing matrix inversion on the GPU.  The only
-                reason it's an object is to avoid the reallocation of some GPU memory
-                blocks if you want to do a bunch of matrix inversions in a row.
-        !*/
-    public:
-
-        void operator() (
-            const tensor& m,
-            resizable_tensor& out
-        );
-        /*!
-            requires
-                - m.size() == m.num_samples()*m.num_samples()
-                  (i.e. mat(m) must be a square matrix)
-            ensures
-                - out == inv(mat(m));
-        !*/
-
-    private:
-#ifdef DLIB_USE_CUDA
-        cuda::inv finv;
-#endif
-    };
-
-// ----------------------------------------------------------------------------------------
-
-    class tensor_rand
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This is a tool for filling a tensor with random numbers.  
-
-                Note that the sequence of random numbers output by this object is different
-                when dlib is compiled with DLIB_USE_CUDA.  So you should not write code
-                that depends on any specific sequence of numbers coming out of a
-                tensor_rand.
-
-        !*/
-
-    public:
-        // not copyable
-        tensor_rand(const tensor_rand&) = delete;
-        tensor_rand& operator=(const tensor_rand&) = delete;
-
-        tensor_rand() : tensor_rand(0) {}
-        tensor_rand(unsigned long long seed);
-
-        void fill_gaussian (
-            tensor& data,
-            float mean = 0,
-            float stddev = 1
-        );
-        /*!
-            requires
-                - data.size()%2 == 0
-            ensures
-                - Fills data with random numbers drawn from a Gaussian distribution
-                  with the given mean and standard deviation.
-        !*/
-
-        void fill_uniform (
-            tensor& data
-        );
-        /*!
-            ensures
-                - Fills data with uniform random numbers in the range (0.0, 1.0].
-        !*/
-
-#ifdef DLIB_USE_CUDA
-        cuda::curand_generator rnd;
-#else
-        dlib::rand rnd;
-#endif
-    };
-
-// ----------------------------------------------------------------------------------------
-
-    void multiply (
-        bool add_to,
-        tensor& dest,
-        const tensor& src1,
-        const tensor& src2
-    );
-    /*!
-        requires
-            - dest.k()  == src1.k()  == src2.k()
-            - dest.nr() == src1.nr() == src2.nr()
-            - dest.nc() == src1.nc() == src2.nc()
-            - dest.num_samples(), src1.num_samples(), and src2.num_samples() must each
-              either be 1 or whichever ones aren't equal to 1 must have the same values.
-        ensures
-            - let MD = max(dest.num_samples(), src1.num_samples(), src2.num_samples)
-            - This function pointwise multiplies src1 with src2 and stores the result into
-              #dest.  However, how the multiplication happens depends on the dimensions of
-              the tensors.  First, when src1 and src2 are multiplied together, if either
-              has a num_samples() dimension that is != MD, then it is first replicated to
-              produce a tensor with num_samples()==MD dimensions and then they are
-              pointwise multiplied together.
-
-              Second, if dest.num_samples()==1, then after the pointwise multiplication of
-              src1 with src2, the result has its samples summed to produce an output tensor
-              with num_samples()==1 which is then assigned to #dest.
-            - if (add_to) then
-                - Instead of assigning the result to dest, this function adds the result to dest.
-    !*/
-
-    void scale_channels (
-        bool add_to,
-        tensor& dest,
-        const tensor& src,
-        const tensor& scales
-    );
-    /*!
-        requires
-            - have_same_dimensions(dest, src) == true
-            - scales.num_samples() == src.num_samples()
-            - scales.k()           == src.k()
-            - scales.nr()          == 1
-            - scales.nc()          == 1
-        ensures
-            - Scales each channel of src by the corresponding value in scales.  To be
-              precise, we will have:
-                - #dest(n,k,r,c) == src(n,k,r,c)*scales(n,k,1,1)
-            - if (add_to) then
-                - Instead of assigning the result to dest, this function adds the result to dest.
-    !*/
-
-    void multiply_conv (
-        bool add_to,
-        tensor& dest,
-        const tensor& src1,
-        const tensor& src2
-    );
-    /*!
-        requires
-            - if (have_same_dimensions(dest, src1) == true) then
-                - src2.num_samples() == 1
-                - src2.nr() == 1
-                - src2.nc() == 1
-                - src2.k() == src1.k()
-            - else
-                - have_same_dimensions(src1, src2) == true) 
-                - dest.num_samples() == 1
-                - dest.nr() == 1
-                - dest.nc() == 1
-                - dest.k() == src1.k()
-        ensures
-            - Performs #dest == src1*src2 
-              In particular, if the elements of dest, src1, and src2 were indexed by (n,k,r,c) then
-              we would have:
-                - if (have_same_dimensions(dest,src1)) then
-                    #dest(n,k,r,c) == src1(n,k,r,c)*src2(k)
-                - else
-                    #dest(k) == sum over {n,r,c} of src1(n,k,r,c)*src2(n,k,r,c)
-            - if (add_to) then
-                - Instead of assigning the result to dest, this function adds the result to dest.
-    !*/
-
-    void multiply_zero_padded (
-        bool add_to,
-        tensor& dest,
-        const tensor& src1,
-        const tensor& src2
-    );
-    /*!
-        ensures
-            - if (add_to) then
-                - performs: dest += src1 * src2
-            - else
-                - performs: dest = src1 * src2
-            - In either case, the multiplication happens pointwise according to 4D tensor
-              arithmetic.  If the dimensions don't match then missing elements are presumed
-              to be equal to 0.
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void affine_transform(
-        tensor& dest,
-        const tensor& src,
-        const float A,
-        const float B
-    );
-    /*!
-        requires
-            - dest.size()==src.size()
-        ensures
-            - #dest == A*src + B
-    !*/
-
-    void affine_transform(
-        tensor& dest,
-        const tensor& src,
-        const float A
-    );
-    /*!
-        requires
-            - dest.size()==src.size()
-        ensures
-            - #dest == A*src 
-    !*/
-
-    void affine_transform(
-        tensor& dest,
-        const tensor& src1,
-        const tensor& src2,
-        const float A,
-        const float B,
-        const float C
-    );
-    /*!
-        requires
-            - dest.size()==src1.size()
-            - dest.size()==src2.size()
-        ensures
-            - #dest == A*src1 + B*src2 + C
-    !*/
-
-    void affine_transform(
-        tensor& dest,
-        const tensor& src1,
-        const tensor& src2,
-        const float A,
-        const float B
-    );
-    /*!
-        requires
-            - dest.size()==src1.size()
-            - dest.size()==src2.size()
-        ensures
-            - #dest == A*src1 + B*src2
-    !*/
-
-    void affine_transform(
-        tensor& dest,
-        const tensor& src1,
-        const tensor& src2,
-        const tensor& src3,
-        const float A,
-        const float B,
-        const float C,
-        const float D
-    );
-    /*!
-        requires 
-            - dest.size()==src1.size()
-            - dest.size()==src2.size()
-            - dest.size()==src3.size()
-        ensures
-            - #dest == A*src1 + B*src2 + C*src3 + D
-    !*/
-
-    void affine_transform(
-        tensor& dest,
-        const tensor& src1,
-        const tensor& src2,
-        const tensor& src3,
-        const float A,
-        const float B,
-        const float C
-    );
-    /*!
-        requires 
-            - dest.size()==src1.size()
-            - dest.size()==src2.size()
-            - dest.size()==src3.size()
-        ensures
-            - #dest == A*src1 + B*src2 + C*src3
-    !*/
-
-    void affine_transform_range(
-        size_t begin,
-        size_t end,
-        tensor& dest,
-        const tensor& src1,
-        const tensor& src2,
-        const tensor& src3,
-        const float A,
-        const float B,
-        const float C
-    );
-    /*!
-        requires 
-            - dest.size()==src1.size()
-            - dest.size()==src2.size()
-            - dest.size()==src3.size()
-            - begin <= end <= dest.size()
-        ensures
-            - This function operates much like
-              affine_transform(dest,src1,src2,src3,A,B,C,0), except that it runs over only
-              the half open range [begin,end) rather than processing the entire tensor.
-              Specifically, it does this:
-                - for i in the range [begin, end):
-                    - #dest.host()[i] == A*src1.host()[i] + B*src2.host()[i] + C*src3.host()[i]
-    !*/
-
-    void affine_transform(
-        const rectangle& rect,
-        tensor& dest, 
-        const tensor& src1, 
-        const tensor& src2, 
-        const tensor& src3, 
-        float A, 
-        float B,
-        float C
-    );
-    /*!
-        requires
-            - dest.size()==src1.size()
-            - dest.size()==src2.size()
-            - dest.size()==src3.size()
-            - dest.num_samples()==src1.num_samples()
-            - dest.num_samples()==src2.num_samples()
-            - dest.num_samples()==src3.num_samples()
-            - get_rect(mat(dest)).contains(rect) == true
-              (i.e. rect must be entirely contained within dest)
-        ensures
-            - This function operates much like
-              affine_transform(dest,src1,src2,src3,A,B,C,0), except that it runs over only
-              the sub-rectangle indicated by rect.  In particular, this function is equivalent
-              to:
-                set_subm(dest,rect) = A*subm(mat(src1),rect) + B*subm(mat(src2),rect) + C*subm(mat(src3),rect)
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void affine_transform(
-        tensor& dest,
-        const tensor& src,
-        const tensor& A,
-        const tensor& B
-    );
-    /*!
-        requires
-            - have_same_dimensions(dest,src) == true
-            - if (A.num_samples() == 1) then
-                - B.num_samples() == 1
-            - else
-                - A.num_samples() == src.num_samples()
-                - B.num_samples() == src.num_samples()
-            - A.nr() == B.nr() == src.nr()
-            - A.nc() == B.nc() == src.nc()
-            - A.k()  == B.k()  == src.k()
-        ensures
-            - if (A.num_samples() == 1) then
-                - #dest == A*src + B
-                    (done for each sample in src)
-            - else
-                - for all valid i:
-                    - #dest.host()[i] == A.host()[i]*src.host()[i] + B.host()[i]  
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void affine_transform_conv(
-        tensor& dest,
-        const tensor& src,
-        const tensor& A,
-        const tensor& B
-    );
-    /*!
-        requires
-            - have_same_dimensions(dest,src) == true
-            - have_same_dimensions(A, B) == true
-            - A.num_samples() == 1
-            - A.nr() == 1
-            - A.nc() == 1
-            - A.k() == src.k()
-        ensures
-            - Performs #dest == A*src + B
-              In particular, if the elements of dest and src were indexed by (n,k,r,c) then
-              we would have:
-                #dest(n,k,r,c) == A(k)*src(n,k,r,c) + B(k).
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void compute_adam_update (
-        size_t begin,
-        size_t end,
-        tensor& s,
-        tensor& m,
-        tensor& v,
-        const float t,
-        const float learning_rate,
-        const float weight_decay,
-        const float momentum1,
-        const float momentum2,
-        const tensor& params,
-        const tensor& params_grad
-    );
-    /*!
-        requires
-            - s.size() == m.size() = v.size() == params.size() == params_grad.size()
-            - t > 0
-            - learning_rate > 0
-            - weight_decay >= 0
-            - 0 <= momentum1 < 1
-            - 0 <= momentum2 < 1
-            - begin <= end <= params.size()
-        ensures
-            - This function implements the ADAM parameter update method described in the paper:
-                Kingma, Diederik P., and Jimmy Ba Adam. "A method for stochastic
-                optimization." International Conference on Learning Representation. 2015.
-              Specifically, it implements the method shown as Algorithm 1.
-            - #s is the update vector that should be added to the parameters.
-            - The function only operates in the half open range [begin,end) of the memory
-              blocks of each tensor.  E.g. to make this function run on the entire tensor
-              set begin to 0 and end to params.size().
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void batch_normalize_inference (
-        const double eps,
-        resizable_tensor& dest,
-        const tensor& src,
-        const tensor& gamma, 
-        const tensor& beta,
-        const tensor& running_means,
-        const tensor& running_variances
-    );
-    /*!
-        requires
-            - eps > 0
-            - gamma.num_samples() == 1 
-            - gamma.nr() == src.nr() 
-            - gamma.nc() == src.nc() 
-            - gamma.k()  == src.k()
-            - have_same_dimensions(gamma, beta) 
-            - have_same_dimensions(gamma, running_means) 
-            - have_same_dimensions(gamma, running_variances)
-        ensures
-            - Linearly transforms src as a call to batch_normalize() would if src had means
-              and variances as given by running_means and running_variances.  That is, this
-              function performs: 
-                dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
-              Note that it does it in a pointwise fashion over the samples in src.
-    !*/
-
-    void batch_normalize (
-        const double eps,
-        resizable_tensor& dest,
-        resizable_tensor& means,
-        resizable_tensor& invstds,
-        const double averaging_factor,
-        resizable_tensor& running_means,
-        resizable_tensor& running_variances,
-        const tensor& src,
-        const tensor& gamma, 
-        const tensor& beta 
-    );
-    /*!
-        requires
-            - eps > 0
-            - src.num_samples() > 1
-            - gamma.num_samples() == 1
-            - beta.num_samples() == 1
-            - gamma.nr() == beta.nr() == src.nr()
-            - gamma.nc() == beta.nc() == src.nc()
-            - gamma.k()  == beta.k()  == src.k()
-            - 0 <= averaging_factor <= 1
-            - if (averaging_factor != 1)
-                - have_same_dimensions(running_means, means) == true
-                - have_same_dimensions(running_variances, invstds) == true
-        ensures
-            - have_same_dimensions(#dest, src) == true
-            - #means.num_samples() == 1
-            - #invstds.num_samples() == 1
-            - means.nr() == invstds.nr() == src.nr()
-            - means.nc() == invstds.nc() == src.nc()
-            - means.k()  == invstds.k()  == src.k()
-            - #src == the batch normalized version of src.
-            - #means == the mean values of the contents of src.
-            - #invstds == 1/(the standard deviation values of the contents of src).
-            - #running_means = (1-averaging_factor)*mat(#running_means) + averaging_factor*mat(#means);
-            - #running_variances = (1-averaging_factor)*mat(#running_variances) + averaging_factor*(variance of contents of src);
-    !*/
-
-    void batch_normalize_gradient (
-        const double eps,
-        const tensor& gradient_input,
-        const tensor& means,
-        const tensor& invstds,
-        const tensor& src,
-        const tensor& gamma,
-        tensor& src_grad,
-        tensor& gamma_grad, 
-        tensor& beta_grad 
-    );
-    /*!
-        requires
-            - eps > 0
-            - invstds and means should be the output of a call to
-              batch_normalize(eps,dest,means,invstds,src,gamma,beta)
-            - have_same_dimensions(gradient_input, src) == true
-            - have_same_dimensions(src, src_grad) == true
-            - src.num_samples() > 1
-            - gamma.num_samples() == 1
-            - have_same_dimensions(gamma, gamma_grad) == true
-            - have_same_dimensions(gamma, beta_grad) == true
-            - gamma.nr() == src.nr()
-            - gamma.nc() == src.nc()
-            - gamma.k()  == src.k()
-            - have_same_dimensions(means, gamma) == true
-            - have_same_dimensions(invstds, gamma) == true
-        ensures
-            - Let f(src,gamma,beta) == dot(gradient_input, dest output of
-              batch_normalize(eps,dest,means,invstds,src,gamma,beta))
-            - Adds the gradient of f() with respect to src to #src_grad.
-            - Assigns the gradient of f() with respect to gamma to #gamma_grad.
-            - Assigns the gradient of f() with respect to beta to #beta_grad.
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void batch_normalize_conv_inference (
-        const double eps,
-        resizable_tensor& dest,
-        const tensor& src,
-        const tensor& gamma, 
-        const tensor& beta,
-        const tensor& running_means,
-        const tensor& running_variances
-    );
-    /*!
-        requires
-            - eps > 0
-            - gamma.num_samples() == 1 
-            - gamma.nr() == 1 
-            - gamma.nc() == 1 
-            - gamma.k()  == src.k()
-            - have_same_dimensions(gamma, beta) 
-            - have_same_dimensions(gamma, running_means) 
-            - have_same_dimensions(gamma, running_variances)
-        ensures
-            - Linearly transforms src as a call to batch_normalize_conv() would if src had
-              means and variances as given by running_means and running_variances.  That
-              is, this function performs: 
-                dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
-              Note that it does this in a pointwise fashion over the samples, rows, and
-              columns in src.
-    !*/
-
-    void batch_normalize_conv (
-        const double eps,
-        resizable_tensor& dest,
-        resizable_tensor& means,
-        resizable_tensor& invstds,
-        const double averaging_factor,
-        resizable_tensor& running_means,
-        resizable_tensor& running_variances,
-        const tensor& src,
-        const tensor& gamma, 
-        const tensor& beta 
-    );
-    /*!
-        requires
-            - eps > 0
-            - src.num_samples() > 1
-            - gamma.num_samples()==gamma.nr()==gamma.nc() == 1
-            - beta.num_samples() ==beta.nr() ==gamma.nc() == 1
-            - gamma.k()  == beta.k()  == src.k()
-            - 0 <= averaging_factor <= 1
-            - if (averaging_factor != 1)
-                - have_same_dimensions(running_means, means) == true
-                - have_same_dimensions(running_variances, invstds) == true
-        ensures
-            - have_same_dimensions(#dest, src) == true
-            - #means.num_samples()==means.nr()==means.nc() == 1
-            - #invstds.num_samples() ==invstds.nr() ==invstds.nc() == 1
-            - means.k()  == invstds.k()  == src.k()
-            - #src == the batch normalized version of src.
-            - #means == the mean values of the contents of src.
-            - #invstds == 1/(the standard deviation values of the contents of src).
-            - #running_means = (1-averaging_factor)*mat(#running_means) + averaging_factor*mat(#means);
-            - #running_variances = (1-averaging_factor)*mat(#running_variances) + averaging_factor*(variance of contents of src);
-    !*/
-
-    void batch_normalize_conv_gradient (
-        const double eps,
-        const tensor& gradient_input,
-        const tensor& means,
-        const tensor& invstds,
-        const tensor& src,
-        const tensor& gamma,
-        tensor& src_grad,
-        tensor& gamma_grad, 
-        tensor& beta_grad 
-    );
-    /*!
-        requires
-            - eps > 0
-            - invstds and means should be the output of a call to
-              batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta)
-            - have_same_dimensions(gradient_input, src) == true
-            - have_same_dimensions(src, src_grad) == true
-            - src.num_samples() > 1
-            - gamma.num_samples()==gamma.nr()==gamma.nc() == 1
-            - have_same_dimensions(gamma, gamma_grad) == true
-            - have_same_dimensions(gamma, beta_grad) == true
-            - gamma.k()  == src.k()
-            - have_same_dimensions(means, gamma) == true
-            - have_same_dimensions(invstds, gamma) == true
-        ensures
-            - Let f(src,gamma,beta) == dot(gradient_input, dest output of
-              batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta))
-            - Adds the gradient of f() with respect to src to #src_grad.
-            - Assigns the gradient of f() with respect to gamma to #gamma_grad.
-            - Assigns the gradient of f() with respect to beta to #beta_grad.
-    !*/
-
-// -----------------------------------------------------------------------------------
-
-    void threshold (
-        tensor& data,
-        float thresh
-    );
-    /*!
-        ensures
-            - Sets all elements of data to 1 or 0 depending on if they are above or below
-              the given threshold.  Specifically, for all valid i:
-                - #data.host()[i] == data.host()[i]>thresh ? 1 : 0
-    !*/
-
-    void dot (
-        const tensor& a,
-        const tensor& b,
-        tensor& result,
-        size_t idx
-    );
-    /*!
-        requires
-            - a.size() == b.size()
-            - idx < result.size()
-        ensures
-            - #result.host()[idx] == result.host()[idx] + dot(a,b);
-              I.e. Adds the dot product between a and b into the idx-th element of result.
-              The reason you might want to use this more complex version of dot() is
-              because, when using CUDA, it runs by generating asynchronous kernel launches
-              whereas the version of dot() that returns the result immediately as a scalar
-              must block the host while we wait for the result to be computed and then
-              transfered from the GPU do the host for return by dot().  So this version of
-              dot() might be much faster in some cases.
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void add(
-        float beta,
-        tensor& dest,
-        float alpha,
-        const tensor& src
-    );
-    /*!
-        requires
-            - One of the following is true: 
-                - have_same_dimensions(src, dest)
-                - src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1
-                - src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc()
-                - src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc()
-                - src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1
-            - is_same_object(src,dest) == false
-        ensures
-            - performs: dest = beta*dest + alpha*src
-              However, how the addition happens depends on the dimensions of src.  In
-              particular, this function adds the scaled values of one src tensor to dest.
-              Each dimension of the src tensor must match the corresponding dimension of
-              the dest tensor or must be equal to 1. In the latter case, the same value
-              from the src tensor, for those dimensions, will be used to add into the dest
-              tensor.
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void add (
-        tensor& dest,
-        const tensor& src1,
-        const tensor& src2
-    );
-    /*!
-        ensures
-            - performs: dest = src1 + src2
-              The addition happens pointwise according to 4D tensor arithmetic.  If the
-              dimensions don't match then missing elements are presumed to be equal to 0.
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void assign_conv_bias_gradient (
-        tensor& grad,
-        const tensor& gradient_input
-    );
-    /*!
-        requires
-            - grad.num_samples() == 1
-            - grad.k()  >= 1
-            - grad.nr() == 1
-            - grad.nc() == 1
-            - gradient_input.k() == grad.k()
-            - gradient_input.size() > 0
-            - is_same_object(grad,gradient_input) == false
-        ensures
-            - let BIAS be a tensor with the same dimensions as grad.
-            - let OUT be the output of add(1,OUT,1,BIAS)
-            - let f(gradient_input,BIAS) == dot(gradient_input,OUT)
-            - Then this function computes the gradient of f() with respect to BIAS and
-              assigns it to grad.
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void assign_bias_gradient (
-        tensor& grad,
-        const tensor& gradient_input
-    );
-    /*!
-        requires
-            - grad.num_samples() == 1
-            - gradient_input.k() == grad.k()
-            - gradient_input.nr() == grad.nr()
-            - gradient_input.nc() == grad.nc()
-            - gradient_input.size() > 0
-            - is_same_object(grad,gradient_input) == false
-        ensures
-            - let BIAS be a tensor with the same dimensions as grad.
-            - let OUT be the output of add(1,OUT,1,BIAS)
-            - let f(gradient_input,BIAS) == dot(gradient_input,OUT)
-            - Then this function computes the gradient of f() with respect to BIAS and
-              assigns it to grad.
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    class tensor_conv
-    {
-    public:
-        tensor_conv(const tensor_conv&) = delete;
-        tensor_conv& operator=(const tensor_conv&) = delete;
-
-        tensor_conv() {}
-
-        void clear(
-        ) { impl.clear(); }
-
-        void operator() (
-            const bool add_to_output,
-            tensor& output,
-            const tensor& data,
-            const tensor& filters
-        ) { impl(add_to_output,output,data,filters); }
-        /*!
-            requires
-                - setup() has been called.  Specifically, setup() has been called like this:
-                    this->setup(data, filters, stride_y, stride_x, padding_y, padding_x);
-                - is_same_object(output,data) == false
-                - is_same_object(output,filters) == false
-                - filters.k() == data.k()
-                - filters.nr() <= src.nr() + 2*padding_y
-                - filters.nc() <= src.nc() + 2*padding_x
-                - #output.num_samples() == data.num_samples()
-                - #output.k() == filters.num_samples()
-                - #output.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
-                - #output.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
-            ensures
-                - Convolves filters over data.  If add_to_output==true then we add the
-                  results to output, otherwise we assign to output, overwriting the
-                  previous values in output.
-                - filters contains filters.num_samples() filters. 
-        !*/
-
-        void operator() (
-            const bool add_to_output,
-            resizable_tensor& output,
-            const tensor& data,
-            const tensor& filters
-        ) { impl(add_to_output,output,data,filters); }
-        /*!
-            requires
-                - setup() has been called.  Specifically, setup() has been called like this:
-                    this->setup(data, filters, stride_y, stride_x, padding_y, padding_x);
-                - is_same_object(output,data) == false
-                - is_same_object(output,filters) == false
-                - filters.k() == data.k()
-                - filters.nr() <= src.nr() + 2*padding_y
-                - filters.nc() <= src.nc() + 2*padding_x
-            ensures
-                - Convolves filters over data.  If add_to_output==true then we add the
-                  results to output, otherwise we assign to output, overwriting the
-                  previous values in output.  
-                - filters contains filters.num_samples() filters. 
-                - #output.num_samples() == data.num_samples()
-                - #output.k() == filters.num_samples()
-                - #output.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
-                - #output.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
-        !*/
-
-        void get_gradient_for_data (
-            const bool add_to_output,
-            const tensor& gradient_input, 
-            const tensor& filters,
-            tensor& data_gradient
-        ) { impl.get_gradient_for_data(add_to_output,gradient_input,filters,data_gradient); }
-        /*!
-            requires
-                - One of the following must be true:
-                    - filters has the same dimensions as the filters object given to the
-                      last call to operator().  Also, data_gradient has the same dimensions
-                      as the data object given to the last call to operator().
-                    - setup() has been called.  Specifically, setup() has been called like this:
-                      this->setup(data_gradient, filters, stride_y, stride_x, padding_y, padding_x);
-                - gradient_input has the following dimensions:
-                    - gradient_input.num_samples() == data_gradient.num_samples()
-                    - gradient_input.k() == filters.num_samples()
-                    - gradient_input.nr() == 1+(data_gradient.nr() + 2*padding_y - filters.nr())/stride_y
-                    - gradient_input.nc() == 1+(data_gradient.nc() + 2*padding_x - filters.nc())/stride_x
-                    - NOTE, these dimensions are what you would obtain if gradient_input
-                      has the same dimensions as the last output of operator().  
-                - is_same_object(data_gradient,filters) == false
-                - is_same_object(data_gradient,gradient_input) == false
-            ensures
-                - let OUT be the output of (*this)(OUT,data,filters,sx,sy).
-                - let f(data,filters) == dot(OUT, gradient_input)
-                - if (add_to_output) then
-                    - This function finds the gradient of f() with respect to data and adds
-                      this gradient to data_gradient.
-                - else
-                    - This function finds the gradient of f() with respect to data and
-                      assigns this gradient to data_gradient, overwriting the previous
-                      values in data_gradient.
-        !*/
-
-        void get_gradient_for_filters (
-            const bool add_to_output,
-            const tensor& gradient_input, 
-            const tensor& data,
-            tensor& filters_gradient
-        ) { impl.get_gradient_for_filters(add_to_output,gradient_input,data,filters_gradient); }
-        /*!
-            requires
-                - One of the following must be true:
-                    - filters_gradient has the same dimensions as the filters object given
-                      to the last call to operator().  Also, data has the same dimensions
-                      as the data object given to the last call to operator().
-                    - setup() has been called.  Specifically, setup() has been called like this:
-                      this->setup(data, filters_gradient, stride_y, stride_x, padding_y, padding_x);
-                - gradient_input has the following dimensions:
-                    - gradient_input.num_samples() == data.num_samples()
-                    - gradient_input.k() == filters.num_samples()
-                    - gradient_input.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
-                    - gradient_input.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
-                    - NOTE, these dimensions are what you would obtain if gradient_input
-                      has the same dimensions as the last output of operator().  
-                - is_same_object(filters_gradient,data) == false
-                - is_same_object(filters_gradient,gradient_input) == false
-            ensures
-                - let OUT be the output of (*this)(OUT,data,filters,sx,sy).
-                - let f(data,filters) == dot(OUT, gradient_input)
-                - if (add_to_output) then
-                    - This function finds the gradient of f() with respect to filters and
-                      adds this gradient to filters_gradient.
-                - else 
-                    - This function finds the gradient of f() with respect to filters and
-                      assigns this gradient to filters_gradient, overwriting the previous
-                      values in filters_gradient.
-        !*/
-
- 
-        void setup(
-            const tensor& data,
-            const tensor& filters,
-            int stride_y,
-            int stride_x,
-            int padding_y,
-            int padding_x
-        ) {impl.setup(data,filters,stride_y,stride_x,padding_y,padding_x); }
-        /*!
-            requires
-                - filters.k() == data.k()
-                - stride_y > 0
-                - stride_x > 0
-                - 0 <= padding_y < filters.nr()
-                - 0 <= padding_x < filters.nc()
-            ensures
-                - When operator() is called, the output tensor will have these dimensions:
-                    - output.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
-                    - output.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
-                    - output.num_samples() == data.num_samples()
-                    - output.k() == filters.num_samples()
-                - The point of setup() is to allow this object to gather information about
-                  all the tensor sizes and filter layouts involved in the computation.  In
-                  particular, the reason the tensors are input into setup() is just to
-                  observe their sizes.  setup() doesn't do anything with the contents of
-                  the tensors, or store any kind of references to the data or filter
-                  tensors. 
-        !*/
-       
-    private:
-#ifdef DLIB_USE_CUDA
-        cuda::tensor_conv impl;
-#else
-        cpu::tensor_conv impl;
-#endif
-
-    };
-
-// ----------------------------------------------------------------------------------------
-
-    class pooling
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                The pooling object is a tool for performing spatial pooling over a tensor.
-                It can be configured to do either max or average pooling.
-        !*/
-    public:
-
-        pooling(const pooling&) = delete;
-        pooling& operator=(const pooling&) = delete;
-
-        pooling (
-        ) = default;
-
-        void clear(
-        ) { impl.clear(); }
-
-        void setup_max_pooling(
-            int window_height,
-            int window_width,
-            int stride_y,
-            int stride_x,
-            int padding_y,
-            int padding_x
-        ) { impl.setup_max_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x); }
-        /*!
-            requires
-                - window_height > 0
-                - window_width > 0
-                - stride_y > 0
-                - stride_x > 0
-                - 0 <= padding_y < window_height
-                - 0 <= padding_x < window_width
-            ensures
-                - When you call operator() it will do max pooling with the given
-                  parameters.
-        !*/
-
-        void setup_avg_pooling(
-            int window_height,
-            int window_width,
-            int stride_y,
-            int stride_x,
-            int padding_y,
-            int padding_x
-        ) { impl.setup_avg_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x); }
-        /*!
-            requires
-                - window_height > 0
-                - window_width > 0
-                - stride_y > 0
-                - stride_x > 0
-                - 0 <= padding_y < window_height
-                - 0 <= padding_x < window_width
-            ensures
-                - When you call operator() it will do average pooling with the given
-                  parameters.
-        !*/
-
-        bool does_max_pooling(
-        ) const { return impl.does_max_pooling(); }
-
-        void operator() (
-            resizable_tensor& dest,
-            const tensor& src
-        ) { impl(dest, src); }
-        /*!
-            requires
-                - is_same_object(dest,src) == false
-                - either setup_max_pooling() or setup_avg_pooling() has been called.
-                - window_width  <= src.nc() + 2*padding_x
-                - window_height <= src.nr() + 2*padding_y
-            ensures
-                - #dest.num_samples() == src.num_samples()
-                - #dest.k() == src.k()
-                - #dest.nr() == 1 + (src.nr() + 2*padding_y - window_height)/stride_y
-                - #dest.nc() == 1 + (src.nc() + 2*padding_x - window_width)/stride_x
-                - WINDOW == centered_rect(x*stride_x + window_width/2 - padding_x,
-                                          y*stride_y + window_height/2 - padding_y,
-                                          window_width,
-                                          window_height)
-                - for all valid s, k, r, and c:
-                    - if (does_max_pooling()) then
-                        - image_plane(#dest,s,k)(r,c) == max(subm_clipped(image_plane(src,s,k),WINDOW(c,r)))
-                    - else
-                        - image_plane(#dest,s,k)(r,c) == mean(subm_clipped(image_plane(src,s,k),WINDOW(c,r)))
-        !*/
-
-        void get_gradient(
-            const tensor& gradient_input, 
-            const tensor& dest,
-            const tensor& src,
-            tensor& grad 
-        ) { impl.get_gradient(gradient_input, dest, src, grad); }
-        /*!
-            requires
-                - have_same_dimensions(gradient_input,dest) == true
-                - have_same_dimensions(src,grad) == true
-                - dest contains the result of calling (*this)(dest,src)
-                - is_same_object(grad,gradient_input) == false
-                - is_same_object(grad,dest) == false
-                - is_same_object(grad,src) == false
-            ensures
-                - Recalling that dest is the output of (*this)(dest,src),
-                  let f(src) == dot(gradient_input,dest)
-                - Then this function computes the gradient of f() with respect to src and
-                  adds it to grad.
-        !*/
-
-        private:
-#ifdef DLIB_USE_CUDA
-        cuda::pooling impl;
-#else
-        cpu::pooling impl;
-#endif
-    };
-
-// ----------------------------------------------------------------------------------------
-
-    void softmax (
-        tensor& dest,
-        const tensor& src
-    );
-    /*!
-        requires
-            - have_same_dimensions(dest, src) == true
-        ensures
-            - Note that the softmax function is a vector valued function: 
-                s(x) == exp(x)/sum(exp(x)) 
-            - Computes the softmax function on src and writes the results to dest.  The
-              softmax is computed per spatial location across the different channels at
-              each location.  That is, softmax() outputs a new tensor, #dest, where each of
-              the spatial locations in dest (i.e. image idx, row idx, and column idx)
-              contains the output of s() evaluated over the channel values at each
-              location.
-            - This function supports in-place operation, i.e. having
-              is_same_object(dest, src)==true
-    !*/
-
-    void softmax_gradient (
-        tensor& grad,
-        const tensor& dest,
-        const tensor& gradient_input
-    );
-    /*!
-        requires
-            - have_same_dimensions(dest,gradient_input) == true 
-            - have_same_dimensions(dest,grad) == true 
-        ensures
-            - We interpret dest as the output of softmax(dest,SRC) for some SRC tensor.
-              Then let f(SRC) == dot(gradient_input,dest).  Then this function computes the
-              gradient of f() with respect to SRC and stores it to grad.  Moreover, if
-              is_same_object(grad,gradient_input)==true then the output is assigned to
-              grad, replacing its previous contents.  Otherwise the output is added to
-              grad.
-            - This function supports in-place operation, i.e. having
-              is_same_object(grad, gradient_input)==true
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void softmax_all (
-        tensor& dest,
-        const tensor& src
-    );
-    /*!
-        requires
-            - have_same_dimensions(dest, src) == true
-        ensures
-            - Note that the softmax function is a vector valued function: 
-              s(x) == exp(x)/sum(exp(x)) 
-            - Computes the softmax function on src and writes the results to dest.  The
-              softmax is computed over the entire tensor with one invocation of s().  So
-              unlike softmax() which computes many s() evaluations, one for each spatial
-              location, softmax_all() calls s() once for the entire tensor.
-            - This function supports in-place operation, i.e. having
-              is_same_object(dest, src)==true
-    !*/
-
-    void softmax_all_gradient (
-        tensor& grad,
-        const tensor& dest,
-        const tensor& gradient_input
-    );
-    /*!
-        requires
-            - have_same_dimensions(dest,gradient_input) == true 
-            - have_same_dimensions(dest,grad) == true 
-            - is_same_object(grad, dest)==false
-        ensures
-            - We interpret dest as the output of softmax_all(dest,SRC) for some SRC tensor.
-              Then let f(SRC) == dot(gradient_input,dest) Then this function computes the
-              gradient of f() with respect to SRC and assigns it to grad.
-            - This function supports in-place operation, i.e. having
-              is_same_object(grad, gradient_input)==true
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void sigmoid (
-        tensor& dest,
-        const tensor& src
-    );
-    /*!
-        requires
-            - have_same_dimensions(dest, src) == true
-        ensures
-            - for all valid i:
-                - #dest.host()[i] == 1/(1+std::exp(-src.host()[i])) 
-            - This function supports in-place operation, i.e. having
-              is_same_object(dest, src)==true
-    !*/
-
-    void sigmoid_gradient (
-        tensor& grad,
-        const tensor& dest,
-        const tensor& gradient_input
-    );
-    /*!
-        requires
-            - have_same_dimensions(dest,gradient_input) == true 
-            - have_same_dimensions(dest,grad) == true 
-        ensures
-            - Recalling that dest is the output of sigmoid(dest,SRC) for some SRC tensor,
-              let f(SRC) == dot(gradient_input,dest).  Then this function computes the
-              gradient of f() with respect to SRC and stores it to grad.  Moreover, if
-              is_same_object(grad,gradient_input)==true then the output is assigned to
-              grad, replacing its previous contents.  Otherwise the output is added to
-              grad.
-            - This function supports in-place operation, i.e. having
-              is_same_object(grad, gradient_input)==true
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void relu (
-        tensor& dest,
-        const tensor& src
-    );
-    /*!
-        requires
-            - have_same_dimensions(dest, src) == true
-        ensures
-            - for all valid i:
-                - #dest.host()[i] == std::max(0,src.host()[i]) 
-            - This function supports in-place operation, i.e. having
-              is_same_object(dest, src)==true
-    !*/
-
-    void relu_gradient (
-        tensor& grad,
-        const tensor& dest,
-        const tensor& gradient_input
-    );
-    /*!
-        requires
-            - have_same_dimensions(dest,gradient_input) == true 
-            - have_same_dimensions(dest,grad) == true 
-        ensures
-            - Recalling that dest is the output of relu(dest,SRC) for some SRC tensor,
-              let f(SRC) == dot(gradient_input,dest).  Then this function computes the
-              gradient of f() with respect to SRC and stores it to grad.  Moreover, if
-              is_same_object(grad,gradient_input)==true then the output is assigned to
-              grad, replacing its previous contents.  Otherwise the output is added to
-              grad.
-            - This function supports in-place operation, i.e. having
-              is_same_object(grad, gradient_input)==true
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void prelu (
-        tensor& dest,
-        const tensor& src,
-        const tensor& param
-    );
-    /*!
-        requires
-            - have_same_dimensions(dest, src) == true
-            - param.size() == 1
-        ensures
-            - for all valid i:
-                - if (src.host()[i] > 0) then
-                    - #dest.host()[i] == src.host()[i]
-                - else
-                    - #dest.host()[i] == src.host()[i] * param.host()[0]
-            - This function supports in-place operation, i.e. having
-              is_same_object(dest, src)==true
-    !*/
-
-    void prelu_gradient (
-        tensor& grad,
-        const tensor& src,
-        const tensor& gradient_input,
-        const tensor& param,
-        tensor& params_grad 
-    );
-    /*!
-        requires
-            - have_same_dimensions(grad,src) == true 
-            - have_same_dimensions(grad,gradient_input) == true 
-            - param.size() == 1
-            - params_grad.size() == 1
-            - is_same_object(grad, gradient_input) == false
-        ensures
-            - Recalling that dest is the output of prelu(dest,src,param) let 
-              f(src,param) == dot(gradient_input,dest)
-            - Then this function computes the gradient of f() with respect to src and
-              param.  It assigns the gradient with respect to param to #params_grad and
-              adds the gradient with respect to src to #grad.
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void tanh (
-        tensor& dest,
-        const tensor& src
-    );
-    /*!
-        requires
-            - have_same_dimensions(dest, src) == true
-        ensures
-            - for all valid i:
-                - #dest.host()[i] == std::tanh(src.host()[i]) 
-            - This function supports in-place operation, i.e. having
-              is_same_object(dest, src)==true
-    !*/
-
-    void tanh_gradient (
-        tensor& grad,
-        const tensor& dest,
-        const tensor& gradient_input
-    );
-    /*!
-        requires
-            - have_same_dimensions(dest,gradient_input) == true 
-            - have_same_dimensions(dest,grad) == true 
-        ensures
-            - Recalling that dest is the output of tanh(dest,SRC) for some SRC tensor,
-              let f(SRC) == dot(gradient_input,dest).  Then this function computes the
-              gradient of f() with respect to SRC and stores it to grad.  Moreover, if
-              is_same_object(grad,gradient_input)==true then the output is assigned to
-              grad, replacing its previous contents.  Otherwise the output is added to
-              grad.
-            - This function supports in-place operation, i.e. having
-              is_same_object(grad, gradient_input)==true
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    void resize_bilinear (
-        tensor& dest,
-        long dest_row_stride,
-        long dest_channel_stride,
-        const tensor& src,
-        long src_row_stride,
-        long src_channel_stride
-    );
-    /*!
-        requires
-            - is_same_object(dest, src)==false
-            - dest.num_samples() == src.num_samples()
-            - dest.k() == src.k()
-        ensures
-            - for all valid i,k:  image_plane(dest,i,k) is a copy of image_plane(src,i,k)
-              that has been bilinearly interpolated to fit into the shape of
-              image_plane(dest,i,k).
-            - Instead of supposing the row stride and channel stride in the tensors is
-              given by tensor::nc() and tensor::nr()*tensor::nc() respectively, we use the
-              provided stride values to transition from one row and channel to the next.
-              This is useful in combination with alias_tensor objects since it allows you
-              to operate on subwindows in an image.
-    !*/
-
-    void resize_bilinear_gradient (
-        tensor& grad,
-        long grad_row_stride,
-        long grad_channel_stride,
-        const tensor& gradient_input,
-        long gradient_input_row_stride,
-        long gradient_input_channel_stride
-    );
-    /*!
-        requires
-            - is_same_object(grad, gradient_input)==false
-            - gradient_input.num_samples() == grad.num_samples()
-            - gradient_input.k() == grad.k()
-        ensures
-            - Suppose that DEST is the output of resize_bilinear(DEST,SRC) for some SRC
-              tensor, let f(SRC) == dot(gradient_input,DEST).  Then this function computes
-              the gradient of f() with respect to SRC and adds it to grad.   It should be
-              noted that we don't need to know the contents of DEST to compute this
-              gradient.  All that matters is that gradient_input have the same dimensions
-              as DEST.
-            - Instead of supposing the row stride and channel stride in the tensors is
-              given by tensor::nc() and tensor::nr()*tensor::nc() respectively, we use the
-              provided stride values to transition from one row and channel to the next.
-              This is useful in combination with alias_tensor objects since it allows you
-              to operate on subwindows in an image.
-    !*/
-
-    inline void resize_bilinear (
-        tensor& dest,
-        const tensor& src
-    ) { resize_bilinear(dest, dest.nc(), dest.nr()*dest.nc(), src, src.nc(), src.nr()*src.nc()); }
-    /*!
-        requires
-            - is_same_object(dest, src)==false
-            - dest.num_samples() == src.num_samples()
-            - dest.k() == src.k()
-        ensures
-            - for all valid i,k:  image_plane(dest,i,k) is a copy of image_plane(src,i,k)
-              that has been bilinearly interpolated to fit into the shape of
-              image_plane(dest,i,k).
-    !*/
-
-    inline void resize_bilinear_gradient (
-        tensor& grad,
-        const tensor& gradient_input
-    ) { resize_bilinear_gradient(grad, grad.nc(), grad.nr()*grad.nc(), gradient_input, gradient_input.nc(), gradient_input.nr()*gradient_input.nc()); }
-    /*!
-        requires
-            - is_same_object(grad, gradient_input)==false
-            - gradient_input.num_samples() == grad.num_samples()
-            - gradient_input.k() == grad.k()
-        ensures
-            - Suppose that DEST is the output of resize_bilinear(DEST,SRC) for some SRC
-              tensor, let f(SRC) == dot(gradient_input,DEST).  Then this function computes
-              the gradient of f() with respect to SRC and adds it to grad.   It should be
-              noted that we don't need to know the contents of DEST to compute this
-              gradient.  All that matters is that gradient_input have the same dimensions
-              as DEST.
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-    class multi_device_tensor_averager
-    {
-        /*!
-            WHAT THIS OBJECT REPRESENTS
-                This object is a tool for very quickly averaging a bunch of tensors
-                together.
-        !*/
-    public:
-
-        multi_device_tensor_averager(const multi_device_tensor_averager&) = delete;
-        multi_device_tensor_averager& operator=(const multi_device_tensor_averager&) = delete;
-
-        multi_device_tensor_averager() = default;
-
-        void set(
-            std::vector<tensor*> items
-        )
-        /*!
-            requires
-                - All the tensors in items are the same size
-            ensures
-                - When you call average() we will average the tensors in items.
-                - It's important that the tensors already be allocated to their devices
-                  before you call set().  This is because set() will setup the types of
-                  between device transfers now and use them when you call average().  
-        !*/
-        {
-            using namespace ::dlib::cuda;
-            accessible_groups.clear();
-            epa.clear();
-            if (items.size() < 1)
-                return;
-
-            scale = 1.0/items.size();
-
-            // split item into groups of accessible devices
-            std::vector<tensor*> group, unused;
-            while(items.size() > 0)
-            {
-                group.push_back(items[0]);
-                for(size_t i = 1; i < items.size(); ++i)
-                {
-                    if (can_access_peer(*items[0], *items[i]))
-                        group.push_back(items[i]);
-                    else
-                        unused.push_back(items[i]);
-                }
-                accessible_groups.push_back(group);
-                unused.swap(items);
-                unused.clear();
-                group.clear();
-            }
-            for (auto&& g : accessible_groups)
-            {
-                for (size_t i = 1; i < g.size(); ++i)
-                {
-                    epa.emplace_back(new enable_peer_access(*g[0], *g[i]));
-                }
-            }
-        }
-
-        size_t num_device_groups(
-        ) const { return accessible_groups.size(); }
-        /*!
-            ensures
-                - The devices given to set() are grouped together when they can directly
-                  access each other using GPUDirect.  This function returns the number of
-                  such groups.  For example, if all devices can directly access each other
-                  then the number of groups is 1.
-        !*/
-
-        void average()
-        /*!
-            requires
-                - All the devices have stopped writing to the tensors given to set().  So
-                  you should probably call cudaDeviceSynchronize() on each of the relevant
-                  devices before calling average().
-            ensures
-                - Computes the average of all the tensors given to set() and then sets them
-                  all equal to the average.
-        !*/
-        {
-            using namespace ::dlib::cuda;
-
-
-            // First we average things within each group
-            for (auto&& g : accessible_groups)
-            {
-                raii_set_device set_dev(*g[0]);
-                if (g.size() == 1)
-                    tt::affine_transform(*g[0], *g[0], scale);
-                else 
-                    tt::affine_transform(*g[0], *g[0], *g[1], scale, scale);
-
-                for (size_t i = 2; i < g.size(); ++i)
-                    tt::affine_transform(*g[0], *g[0], *g[i], 1, scale);
-            }
-
-            if (accessible_groups.size() > 1)
-            {
-                tensor& total_avg = *accessible_groups[0][0];
-                raii_set_device set_dev(total_avg);
-                accum_buffer.copy_size(total_avg);
-                // now we need to average things across groups
-                for (size_t i = 1; i < accessible_groups.size(); ++i)
-                {
-                    memcpy(accum_buffer, *accessible_groups[i][0]);
-                    tt::add(total_avg, total_avg, accum_buffer);
-                }
-
-                // Now total_avg has the final average in it.  So we need to send
-                // copies of it back to each of the groups.
-                for (size_t i = 1; i < accessible_groups.size(); ++i)
-                {
-                    memcpy(*accessible_groups[i][0], total_avg);
-                }
-            }
-
-
-            // Now propagate averages back out to each element using point to point
-            // communication inside a group.
-            for (auto&& g : accessible_groups)
-            {
-                raii_set_device set_dev(*g[0]);
-                for (size_t i = 1; i < g.size(); ++i)
-                    memcpy(*g[i], *g[0]); 
-            }
-        }
-
-    private:
-        std::vector<std::unique_ptr<::dlib::cuda::enable_peer_access>> epa;
-        std::vector<std::vector<tensor*>> accessible_groups;
-        float scale;
-
-        resizable_tensor accum_buffer;
-    };
-
-// ----------------------------------------------------------------------------------------
-
-    void copy_tensor(
-            bool add_to,
-            tensor& dest,
-            size_t dest_k_offset,
-            const tensor& src,
-            size_t src_k_offset,
-            size_t count_k
-    );
-    /*!
-        requires
-            - dest.nc() == src.nc()
-            - dest.nr() == src.nr()
-            - dest.num_samples() == src.num_samples()
-            - dest.k() - dest_k_offset >= count_k
-            - src.k() - src_k_offset >= count_k
-            - is_same_object(dest,src) == false
-            - The memory areas of src and dest do not overlap.
-        ensures
-            - if (add_to) then
-                - performs: dest[i, k + dest_k_offset, r, c] += src[i, k + src_k_offset, r, c], where k in [0..count_k]
-                  i.e., adds content of each sample from src in to corresponding place of sample at dest.
-            - else
-                - performs: dest[i, k + dest_k_offset, r, c]  = src[i, k + src_k_offset, r, c], where k in [0..count_k]
-                  i.e., copies content of each sample from src in to corresponding place of sample at dest.
-    !*/
-
-// ----------------------------------------------------------------------------------------
-
-}}
-
-#ifdef NO_MAKEFILE
-#include "tensor_tools.cpp"
-#endif
-
-#endif // DLIB_TeNSOR_TOOLS_H_
-
-