path: root/ml/dlib/dlib/dnn/cuda_dlib.cu
author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-03-09 13:19:48 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-03-09 13:20:02 +0000
commit     58daab21cd043e1dc37024a7f99b396788372918 (patch)
tree       96771e43bb69f7c1c2b0b4f7374cb74d7866d0cb /ml/dlib/dlib/dnn/cuda_dlib.cu
parent     Releasing debian version 1.43.2-1. (diff)
Merging upstream version 1.44.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'ml/dlib/dlib/dnn/cuda_dlib.cu')
-rw-r--r--  ml/dlib/dlib/dnn/cuda_dlib.cu  1630
1 file changed, 1630 insertions, 0 deletions
diff --git a/ml/dlib/dlib/dnn/cuda_dlib.cu b/ml/dlib/dlib/dnn/cuda_dlib.cu
new file mode 100644
index 000000000..6c37593f1
--- /dev/null
+++ b/ml/dlib/dlib/dnn/cuda_dlib.cu
@@ -0,0 +1,1630 @@
+// Copyright (C) 2015 Davis E. King (davis@dlib.net)
+// License: Boost Software License. See LICENSE.txt for the full license.
+
+#include "cuda_utils.h"
+#include "cuda_dlib.h"
+
+
+namespace dlib
+{
+ namespace cuda
+ {
+
+ // -----------------------------------------------------------------------------------
+
+ void set_device (
+ int dev
+ )
+ {
+ CHECK_CUDA(cudaSetDevice(dev));
+ }
+
+ int get_device (
+ )
+ {
+ int dev = 0;
+ CHECK_CUDA(cudaGetDevice(&dev));
+ return dev;
+ }
+
+ std::string get_device_name (
+ int device
+ )
+ {
+ cudaDeviceProp props;
+ CHECK_CUDA(cudaGetDeviceProperties(&props, device));
+ return props.name;
+ }
+
+ void set_current_device_blocking_sync(
+ )
+ {
+ CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
+ }
+
+ int get_num_devices (
+ )
+ {
+ int num_devices;
+ CHECK_CUDA(cudaGetDeviceCount(&num_devices));
+ return num_devices;
+ }
+
+ bool can_access_peer (int device_id, int peer_device_id)
+ {
+ int can_access;
+ CHECK_CUDA(cudaDeviceCanAccessPeer(&can_access, device_id, peer_device_id));
+ return can_access != 0;
+ }
+ bool can_access_peer (const tensor& device, const tensor& peer_device)
+ {
+ return can_access_peer(device.device_id(), peer_device.device_id());
+ }
+
+ void device_synchronize (int dev)
+ {
+ raii_set_device set_dev(dev);
+ CHECK_CUDA(cudaDeviceSynchronize());
+ }
+ void device_synchronize (const tensor& dev) { device_synchronize(dev.device_id()); }
+
+ enable_peer_access::
+ enable_peer_access(
+ int device_id,
+ int peer_device_id
+ ) : call_disable(false), device_id(device_id), peer_device_id(peer_device_id)
+ {
+ raii_set_device set_dev(device_id);
+
+ auto err = cudaDeviceEnablePeerAccess(peer_device_id, 0);
+ if (err == cudaSuccess)
+ {
+ call_disable = true;
+ }
+ else if (err == cudaErrorPeerAccessAlreadyEnabled)
+ {
+ // call cudaGetLastError() to dispose of this error since we don't
+ // care.
+ auto err2 = cudaGetLastError();
+ if (err2 != cudaErrorPeerAccessAlreadyEnabled)
+ CHECK_CUDA(err2);
+ }
+ else
+ {
+ CHECK_CUDA(err);
+ }
+ }
+
+
+ enable_peer_access::
+ ~enable_peer_access() noexcept(false)
+ {
+ if (call_disable)
+ {
+ raii_set_device set_dev(device_id);
+ CHECK_CUDA(cudaDeviceDisablePeerAccess(peer_device_id));
+ }
+ }
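+
+        // A rough usage sketch (illustrative only): the class above is meant to be used
+        // as an RAII guard, e.g.
+        //
+        //     enable_peer_access access(0, 1);  // let device 0 access device 1's memory
+        //     ... launch kernels that read buffers resident on device 1 ...
+        //     // peer access is disabled again when `access` is destroyed, unless it had
+        //     // already been enabled before this object was constructed.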
+
+ // -----------------------------------------------------------------------------------
+ // -----------------------------------------------------------------------------------
+ // -----------------------------------------------------------------------------------
+
+ __global__ void _cuda_inverse_norms(float* invnorms, const float* data, size_t nr, size_t nc, const float eps)
+ {
+ // initialize invnorms before we begin.
+ for (auto i : grid_stride_range_y(0, nr))
+ for (auto j : grid_stride_range(0, 1))
+ invnorms[i] = eps;
+ __syncthreads();
+
+ for (auto i : grid_stride_range_y(0, nr))
+ {
+ auto p = data + i*nc;
+ float temp = 0;
+ for (auto j : grid_stride_range(0, nc))
+ temp += p[j]*p[j];
+
+ // and store the sum into invnorms[i]
+ warp_reduce_atomic_add(invnorms[i], temp);
+ }
+ __syncthreads();
+
+ for (auto i : grid_stride_range_y(0, nr))
+ for (auto j : grid_stride_range(0, 1))
+ invnorms[i] = 1.0/std::sqrt(invnorms[i]);
+ }
+
+ void inverse_norms (
+ resizable_tensor& invnorms,
+ const tensor& data,
+ const double eps
+ )
+ {
+ invnorms.set_size(data.num_samples());
+ launch_kernel(_cuda_inverse_norms, max_jobs(data.size()/data.num_samples(), data.num_samples()),
+ invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps);
+ }
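+
+        // In effect, with data viewed as a num_samples x (size/num_samples) matrix, the
+        // kernel above computes, for each row i:
+        //     invnorms(i) = 1/sqrt(eps + sum_j data(i,j)^2)
+        // The y grid dimension walks the rows while the x dimension reduces within a row.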
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_dot_prods(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc)
+ {
+ // initialize out before we begin.
+ for (auto i : grid_stride_range_y(0, nr))
+ for (auto j : grid_stride_range(0, 1))
+ out[i] = 0;
+ __syncthreads();
+
+ for (auto i : grid_stride_range_y(0, nr))
+ {
+ auto l = lhs + i*nc;
+ auto r = rhs + i*nc;
+ float temp = 0;
+ for (auto j : grid_stride_range(0, nc))
+ temp += l[j]*r[j];
+
+ // and store the sum into out[i]
+ warp_reduce_atomic_add(out[i], temp);
+ }
+ }
+
+ __global__ void _cuda_dot_prods_add_to(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc)
+ {
+ for (auto i : grid_stride_range_y(0, nr))
+ {
+ auto l = lhs + i*nc;
+ auto r = rhs + i*nc;
+ float temp = 0;
+ for (auto j : grid_stride_range(0, nc))
+ temp += l[j]*r[j];
+
+ // and store the sum into out[i]
+ warp_reduce_atomic_add(out[i], temp);
+ }
+ }
+
+ void dot_prods (
+ resizable_tensor& out,
+ const tensor& lhs,
+ const tensor& rhs
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(lhs,rhs));
+
+ out.set_size(lhs.num_samples());
+ if (out.size() == 0)
+ return;
+
+ const auto nr = lhs.num_samples();
+ const auto nc = lhs.size()/lhs.num_samples();
+
+ launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc);
+ }
+
+ void dot_prods (
+ bool add_to,
+ tensor& out,
+ const tensor& lhs,
+ const tensor& rhs
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(lhs,rhs));
+ DLIB_CASSERT(out.k() == 1 && out.nr() == 1 && out.nc() == 1);
+ DLIB_CASSERT(out.size() == lhs.num_samples());
+
+ const auto nr = lhs.num_samples();
+ const auto nc = lhs.size()/lhs.num_samples();
+
+ if (add_to)
+ launch_kernel(_cuda_dot_prods_add_to, max_jobs(nc,nr), out.device(), lhs.device(), rhs.device(), nr, nc);
+ else
+ launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc);
+ }
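+
+        // Summary of the kernels above: with lhs and rhs viewed as
+        // num_samples x (size/num_samples) matrices, each output element is the
+        // per-sample dot product
+        //     out(i) = sum_j lhs(i,j)*rhs(i,j)
+        // The add_to variant skips the zero initialization and therefore accumulates
+        // into the existing contents of out.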
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_scale_columns(float* out, const float* m, const float* v, size_t nr, size_t nc)
+ {
+ for (auto j : grid_stride_range(0, nr*nc))
+ {
+ out[j] = m[j]*v[j%nc];
+ }
+ }
+
+ void scale_columns (
+ tensor& out,
+ const tensor& m,
+ const tensor& v
+ )
+ {
+ launch_kernel(_cuda_scale_columns, max_jobs(m.size()), out.device(), m.device(), v.device(), m.num_samples(), m.size()/m.num_samples());
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_scale_rows(float* out, const float* m, const float* v, size_t nr, size_t nc)
+ {
+ for (auto j : grid_stride_range(0, nr*nc))
+ {
+ out[j] = m[j]*v[j/nc];
+ }
+ }
+
+ void scale_rows (
+ tensor& out,
+ const tensor& m,
+ const tensor& v
+ )
+ {
+ launch_kernel(_cuda_scale_rows, max_jobs(m.size()), out.device(), m.device(), v.device(), m.num_samples(), m.size()/m.num_samples());
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_scale_rows2(float* out, const float* m1, const float* m2, const float* v1, const float* v2, size_t nr, size_t nc)
+ {
+ for (auto j : grid_stride_range(0, nr*nc))
+ {
+ out[j] = (m1[j] - m2[j]*v1[j/nc]) * v2[j/nc];
+ }
+ }
+
+ __global__ void _cuda_scale_rows2_beta(const float beta, float* out, const float* m1, const float* m2, const float* v1, const float* v2, size_t nr, size_t nc)
+ {
+ for (auto j : grid_stride_range(0, nr*nc))
+ {
+ out[j] = beta*out[j] + (m1[j] - m2[j]*v1[j/nc]) * v2[j/nc];
+ }
+ }
+
+ void scale_rows2 (
+ float beta,
+ tensor& out,
+ const tensor& m1,
+ const tensor& m2,
+ const tensor& v1,
+ const tensor& v2
+ )
+ {
+ if (beta == 0)
+ {
+ launch_kernel(_cuda_scale_rows2, max_jobs(m1.size()), out.device(),
+ m1.device(), m2.device(), v1.device(), v2.device(), m1.num_samples(),
+ m1.size()/m1.num_samples());
+ }
+ else
+ {
+ launch_kernel(_cuda_scale_rows2_beta, max_jobs(m1.size()), beta,
+ out.device(), m1.device(), m2.device(), v1.device(), v2.device(),
+ m1.num_samples(), m1.size()/m1.num_samples());
+ }
+ }
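+
+        // Element-wise meaning of scale_rows2, with m1/m2 viewed as
+        // num_samples x (size/num_samples) matrices and v1/v2 as per-row vectors:
+        //     out(r,c) = beta*out(r,c) + (m1(r,c) - m2(r,c)*v1(r)) * v2(r)
+        // The beta==0 branch simply avoids reading the previous contents of out.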
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_exp(float* dest, const float* src, size_t n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ dest[i] = ::exp(src[i]);
+ }
+
+ void exp (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_ASSERT(dest.size() == src.size());
+ launch_kernel(_cuda_exp, max_jobs(src.size()), dest.device(), src.device(), src.size());
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_log(float* dest, const float* src, size_t n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ dest[i] = ::log(src[i]);
+ }
+
+ void log (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_ASSERT(dest.size() == src.size());
+ launch_kernel(_cuda_log, max_jobs(src.size()), dest.device(), src.device(), src.size());
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_log10(float* dest, const float* src, size_t n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ dest[i] = ::log10(src[i]);
+ }
+
+ void log10 (
+ tensor& dest,
+ const tensor& src
+ )
+ {
+ DLIB_ASSERT(dest.size() == src.size());
+ launch_kernel(_cuda_log10, max_jobs(src.size()), dest.device(), src.device(), src.size());
+ }
+
+ // -----------------------------------------------------------------------------------
+
+ __global__ void _cuda_multiply1(float* d, const float* s1, const float* s2, size_t n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = s1[i]*s2[i];
+ }
+ }
+ __global__ void _cuda_multiply2(float* d, const float* s1, const float* s2,
+ size_t n, size_t s1_n, size_t s2_n, size_t max_size)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = 0;
+ for (size_t j = i; j < max_size; j += n)
+ d[i] += s1[j%s1_n]*s2[j%s2_n];
+ }
+ }
+
+ __global__ void _cuda_multiply3(float* d, const float* s1, const float* s2,
+ size_t n, size_t s1_n, size_t s2_n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = s1[i%s1_n]*s2[i%s2_n];
+ }
+ }
+
+ __global__ void _cuda_multiply1_add_to(float* d, const float* s1, const float* s2, size_t n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] += s1[i]*s2[i];
+ }
+ }
+ __global__ void _cuda_multiply2_add_to(float* d, const float* s1, const float* s2,
+ size_t n, size_t s1_n, size_t s2_n, size_t max_size)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ for (size_t j = i; j < max_size; j += n)
+ d[i] += s1[j%s1_n]*s2[j%s2_n];
+ }
+ }
+
+ __global__ void _cuda_multiply3_add_to(float* d, const float* s1, const float* s2,
+ size_t n, size_t s1_n, size_t s2_n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] += s1[i%s1_n]*s2[i%s2_n];
+ }
+ }
+
+ void multiply (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ )
+ {
+
+ DLIB_CASSERT(dest.k() == src1.k() && src1.k() == src2.k() &&
+ dest.nr() == src1.nr() && src1.nr() == src2.nr() &&
+ dest.nc() == src1.nc() && src1.nc() == src2.nc() );
+ const long MD = std::max(std::max(dest.num_samples(),src1.num_samples()),src2.num_samples());
+ DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) &&
+ (src1.num_samples()==1 || src1.num_samples()==MD) &&
+ (src2.num_samples()==1 || src2.num_samples()==MD) );
+
+ if (dest.size() == 0)
+ return;
+
+ const size_t max_size = std::max(std::max(dest.size(),src1.size()),src2.size());
+ if (dest.size() == src1.size() && src1.size() == src2.size())
+ {
+ if (add_to)
+ launch_kernel(_cuda_multiply1_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size());
+ else
+ launch_kernel(_cuda_multiply1,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size());
+ }
+ else if (dest.num_samples() == 1)
+ {
+ if (add_to)
+ launch_kernel(_cuda_multiply2_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
+ dest.size(), src1.size(), src2.size(), max_size);
+ else
+ launch_kernel(_cuda_multiply2,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
+ dest.size(), src1.size(), src2.size(), max_size);
+ }
+ else
+ {
+ if (add_to)
+ launch_kernel(_cuda_multiply3_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
+ dest.size(), src1.size(), src2.size());
+ else
+ launch_kernel(_cuda_multiply3,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
+ dest.size(), src1.size(), src2.size());
+ }
+ }
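+
+        // Broadcasting rules implemented above: all tensors must agree on k, nr and nc,
+        // and num_samples must either match or be 1.  _cuda_multiply1 handles the
+        // equal-size case, _cuda_multiply3 broadcasts a single-sample operand via the
+        // modulo indexing, and _cuda_multiply2 additionally sums over the sample
+        // dimension when dest itself has only one sample.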
+
+ // ------------------------------------------------------------------------------------
+
+ __global__ void _cuda_multiply_conv(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ auto k = (i/bs)%ks;
+ d[i] = s1[i]*s2[k];
+ }
+ }
+
+ __global__ void _cuda_multiply_conv2(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks)
+ {
+ // zero initialize d before we begin.
+ for (auto i : grid_stride_range_y(0, ks))
+ for (auto j : grid_stride_range(0, 1))
+ d[i] = 0;
+ __syncthreads();
+
+ // loop over all the image planes
+ for (auto i : grid_stride_range_y(0, n))
+ {
+ // sum all the elements in the i-th image plane
+ float temp = 0;
+ for (auto j : grid_stride_range(i*bs, (i+1)*bs))
+ temp += s1[j]*s2[j];
+ auto k = i%ks;
+ // and store the sum into d[k]
+ warp_reduce_atomic_add(d[k], temp);
+ }
+ }
+
+ __global__ void _cuda_multiply_conv_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ auto k = (i/bs)%ks;
+ d[i] += s1[i]*s2[k];
+ }
+ }
+
+ __global__ void _cuda_multiply_conv2_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks)
+ {
+ // loop over all the image planes
+ for (auto i : grid_stride_range_y(0, n))
+ {
+ // sum all the elements in the i-th image plane
+ float temp = 0;
+ for (auto j : grid_stride_range(i*bs, (i+1)*bs))
+ temp += s1[j]*s2[j];
+ auto k = i%ks;
+ // and store the sum into d[k]
+ warp_reduce_atomic_add(d[k], temp);
+ }
+ }
+
+
+ void multiply_conv (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ )
+ {
+ if (have_same_dimensions(dest,src1))
+ {
+ DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 && src2.nc() == 1 && src2.k() == src1.k());
+ if (dest.size() == 0)
+ return;
+
+ if (add_to)
+ launch_kernel(_cuda_multiply_conv_add_to,max_jobs(dest.size()),
+ dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k());
+ else
+ launch_kernel(_cuda_multiply_conv,max_jobs(dest.size()),
+ dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k());
+ }
+ else
+ {
+ DLIB_CASSERT(have_same_dimensions(src1,src2));
+ DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 && dest.nc() == 1 && dest.k() == src1.k());
+ if (dest.size() == 0)
+ return;
+
+
+ const auto bs = src1.nr()*src1.nc();
+ const auto n = src1.num_samples()*src1.k();
+ if (add_to)
+ launch_kernel(_cuda_multiply_conv2_add_to, max_jobs(bs,n),
+ dest.device(), src1.device(), n, src2.device(), bs, src1.k());
+ else
+ launch_kernel(_cuda_multiply_conv2, max_jobs(bs,n),
+ dest.device(), src1.device(), n, src2.device(), bs, src1.k());
+ }
+
+ }
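+
+        // The two branches above correspond to the two supported shapes:
+        //  - dest has the same dimensions as src1: each element of src1 is scaled by the
+        //    src2 entry for its channel (k = (i/bs)%ks in _cuda_multiply_conv).
+        //  - dest holds one value per channel: _cuda_multiply_conv2 reduces s1[j]*s2[j]
+        //    over every pixel of every sample belonging to that channel.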
+
+ // ------------------------------------------------------------------------------------
+
+ __global__ void _cuda_scale_channels_add_to(float* d, const float* src, size_t n, const float* scales, size_t bs)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ auto k = i/bs;
+ d[i] += src[i]*scales[k];
+ }
+ }
+
+ __global__ void _cuda_scale_channels(float* d, const float* src, size_t n, const float* scales, size_t bs)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ auto k = i/bs;
+ d[i] = src[i]*scales[k];
+ }
+ }
+
+ void scale_channels (
+ bool add_to,
+ tensor& dest,
+ const tensor& src,
+ const tensor& scales
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(dest,src) &&
+ scales.num_samples() == src.num_samples() &&
+ scales.k() == src.k() &&
+ scales.nr() == 1 &&
+ scales.nc() == 1 );
+
+ if (dest.size() == 0)
+ return;
+
+ if (add_to)
+ launch_kernel(_cuda_scale_channels_add_to,max_jobs(dest.size()),
+ dest.device(), src.device(), src.size(), scales.device(), src.nr()*src.nc());
+ else
+ launch_kernel(_cuda_scale_channels,max_jobs(dest.size()),
+ dest.device_write_only(), src.device(), src.size(), scales.device(), src.nr()*src.nc());
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ __global__ void _cuda_mult1(float* d, const float* s1, const float* s2, size_t n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = s1[i]*s2[i];
+ }
+ }
+
+ __global__ void _cuda_mult1_add_to(float* d, const float* s1, const float* s2, size_t n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] += s1[i]*s2[i];
+ }
+ }
+
+ __global__ void _cuda_mult2(float* d, const float* s1, const float* s2,
+ size_t dn, size_t dk, size_t dr, size_t dc,
+ size_t s1n, size_t s1k, size_t s1r, size_t s1c,
+ size_t s2n, size_t s2k, size_t s2r, size_t s2c)
+ {
+ for (auto i : grid_stride_range(0, dn*dk*dr*dc))
+ {
+ size_t n,k,r,c;
+ unpack_idx(i, dk,dr,dc, n,k,r,c);
+
+ float v1 = 0;
+ float v2 = 0;
+
+ if (n < s1n &&
+ k < s1k &&
+ r < s1r &&
+ c < s1c )
+ {
+ v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)];
+ }
+
+ if (n < s2n &&
+ k < s2k &&
+ r < s2r &&
+ c < s2c )
+ {
+ v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)];
+ }
+
+ d[i] = v1*v2;
+ }
+ }
+
+ __global__ void _cuda_mult2_add_to(float* d, const float* s1, const float* s2,
+ size_t dn, size_t dk, size_t dr, size_t dc,
+ size_t s1n, size_t s1k, size_t s1r, size_t s1c,
+ size_t s2n, size_t s2k, size_t s2r, size_t s2c)
+ {
+ for (auto i : grid_stride_range(0, dn*dk*dr*dc))
+ {
+ size_t n,k,r,c;
+ unpack_idx(i, dk,dr,dc, n,k,r,c);
+
+ float v1 = 0;
+ float v2 = 0;
+
+ if (n < s1n &&
+ k < s1k &&
+ r < s1r &&
+ c < s1c )
+ {
+ v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)];
+ }
+
+ if (n < s2n &&
+ k < s2k &&
+ r < s2r &&
+ c < s2c )
+ {
+ v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)];
+ }
+
+ d[i] += v1*v2;
+ }
+ }
+
+ void multiply_zero_padded (
+ bool add_to,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ )
+ {
+ if (dest.size() == 0)
+ return;
+
+ // Do the simple and fast version if everything has the same dimensions
+ if (have_same_dimensions(dest, src1) &&
+ have_same_dimensions(dest, src2))
+ {
+ if (add_to)
+ launch_kernel(_cuda_mult1_add_to,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size());
+ else
+ launch_kernel(_cuda_mult1,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size());
+ }
+ else
+ {
+ if (add_to)
+ {
+ // Otherwise, do the more complex version with bounds checking.
+ launch_kernel(_cuda_mult2_add_to,max_jobs(dest.size()),
+ dest.device(), src1.device(), src2.device(),
+ dest.num_samples(), dest.k(), dest.nr(), dest.nc(),
+ src1.num_samples(), src1.k(), src1.nr(), src1.nc(),
+ src2.num_samples(), src2.k(), src2.nr(), src2.nc()
+ );
+ }
+ else
+ {
+ // Otherwise, do the more complex version with bounds checking.
+ launch_kernel(_cuda_mult2,max_jobs(dest.size()),
+ dest.device(), src1.device(), src2.device(),
+ dest.num_samples(), dest.k(), dest.nr(), dest.nc(),
+ src1.num_samples(), src1.k(), src1.nr(), src1.nc(),
+ src2.num_samples(), src2.k(), src2.nr(), src2.nc()
+ );
+ }
+ }
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ __global__ void _cuda_add1(float* d, const float* s1, const float* s2, size_t n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = s1[i]+s2[i];
+ }
+ }
+
+ __global__ void _cuda_add2(float* d, const float* s1, const float* s2,
+ size_t dn, size_t dk, size_t dr, size_t dc,
+ size_t s1n, size_t s1k, size_t s1r, size_t s1c,
+ size_t s2n, size_t s2k, size_t s2r, size_t s2c)
+ {
+ for (auto i : grid_stride_range(0, dn*dk*dr*dc))
+ {
+ size_t n,k,r,c;
+ unpack_idx(i, dk,dr,dc, n,k,r,c);
+
+ float v1 = 0;
+ float v2 = 0;
+
+ if (n < s1n &&
+ k < s1k &&
+ r < s1r &&
+ c < s1c )
+ {
+ v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)];
+ }
+
+ if (n < s2n &&
+ k < s2k &&
+ r < s2r &&
+ c < s2c )
+ {
+ v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)];
+ }
+
+ d[i] = v1+v2;
+ }
+ }
+
+ void add (
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2
+ )
+ {
+ if (dest.size() == 0)
+ return;
+
+ // Do the simple and fast version if everything has the same dimensions
+ if (have_same_dimensions(dest, src1) &&
+ have_same_dimensions(dest, src2))
+ {
+ launch_kernel(_cuda_add1,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size());
+ }
+ else
+ {
+ // Otherwise, do the more complex version with bounds checking.
+ launch_kernel(_cuda_add2,max_jobs(dest.size()),
+ dest.device(), src1.device(), src2.device(),
+ dest.num_samples(), dest.k(), dest.nr(), dest.nc(),
+ src1.num_samples(), src1.k(), src1.nr(), src1.nc(),
+ src2.num_samples(), src2.k(), src2.nr(), src2.nc()
+ );
+ }
+
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ __global__ void _cuda_affine_transform1(float* d, const float* s, size_t n, float A, float B)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = A*s[i] + B;
+ }
+ }
+
+ __global__ void _cuda_affine_transform1_0(float* d, const float* s, size_t n, float A)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = A*s[i];
+ }
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const float A,
+ const float B
+ )
+ {
+ DLIB_CASSERT(dest.size()==src.size());
+ if (B != 0)
+ launch_kernel(_cuda_affine_transform1,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A, B);
+ else
+ launch_kernel(_cuda_affine_transform1_0,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A);
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const float A
+ )
+ {
+ DLIB_CASSERT(dest.size()==src.size());
+ launch_kernel(_cuda_affine_transform1_0,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A);
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_affine_transform_rect(
+ float* d,
+ const float* s1,
+ const float* s2,
+ const float* s3,
+ float A,
+ float B,
+ float C,
+ size_t start_idx,
+ size_t n,
+ size_t rect_nc,
+ size_t total_nc
+ )
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ size_t r = i/rect_nc;
+ size_t c = i%rect_nc;
+ size_t idx = r*total_nc + c + start_idx;
+ d[idx] = A*s1[idx] + B*s2[idx] + C*s3[idx];
+ }
+ }
+
+ void affine_transform(
+ const rectangle& rect,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ float A,
+ float B,
+ float C
+ )
+ {
+ DLIB_CASSERT(dest.size() == src1.size());
+ DLIB_CASSERT(dest.size() == src2.size());
+ DLIB_CASSERT(dest.size() == src3.size());
+ DLIB_CASSERT(dest.num_samples() == src1.num_samples());
+ DLIB_CASSERT(dest.num_samples() == src2.num_samples());
+ DLIB_CASSERT(dest.num_samples() == src3.num_samples());
+ DLIB_CASSERT(rectangle(0,0, dest.size()/dest.num_samples()-1, dest.num_samples()-1).contains(rect));
+ launch_kernel(_cuda_affine_transform_rect,max_jobs(rect.area()),
+ dest.device(), src1.device(), src2.device(), src3.device(), A, B, C,
+ rect.left() + rect.top()*(dest.size()/dest.num_samples()),
+ rect.area(),
+ rect.width(),
+ dest.size()/dest.num_samples());
+ }
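+
+        // Here the tensors are treated as num_samples x (size/num_samples) matrices and
+        // the transform d = A*s1 + B*s2 + C*s3 is applied only to elements inside rect;
+        // start_idx locates the rectangle's top-left corner in that flattened view.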
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_affine_transform4(float* d, const float* s1, const float* s2, size_t n, float A, float B, float C)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = A*s1[i] + B*s2[i] + C;
+ }
+ }
+
+ __global__ void _cuda_affine_transform4_0(float* d, const float* s1, const float* s2, size_t n, float A, float B)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = A*s1[i] + B*s2[i];
+ }
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const float A,
+ const float B,
+ const float C
+ )
+ {
+ DLIB_CASSERT(dest.size()==src1.size());
+ DLIB_CASSERT(dest.size()==src2.size());
+ if (C != 0)
+ launch_kernel(_cuda_affine_transform4,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B, C);
+ else
+ launch_kernel(_cuda_affine_transform4_0,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B);
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const float A,
+ const float B
+ )
+ {
+ DLIB_CASSERT(dest.size()==src1.size());
+ DLIB_CASSERT(dest.size()==src2.size());
+ launch_kernel(_cuda_affine_transform4_0,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B);
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_add_scaled(float* d, const float* s, size_t n, float scale)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] += scale*s[i];
+ }
+ }
+
+ void add_scaled(
+ tensor& dest,
+ const float scale,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(dest.size()==src.size());
+ launch_kernel(_cuda_add_scaled,max_jobs(dest.size()),dest.device(), src.device(), dest.size(), scale);
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_add_cv_to_all_columns(float beta, float* dest, float alpha, const float* src, size_t size, size_t stride)
+ {
+ for (auto i : grid_stride_range(0, size))
+ {
+ dest[i] = beta*dest[i] + alpha*src[i/stride];
+ }
+ }
+
+ __global__ void _cuda_add_cv_to_all_columns_no_beta(float* dest, float alpha, const float* src, size_t size, size_t stride)
+ {
+ for (auto i : grid_stride_range(0, size))
+ {
+ dest[i] = alpha*src[i/stride];
+ }
+ }
+
+ void add_cv_to_all_columns(
+ float beta,
+ tensor& dest,
+ float alpha,
+ const tensor& src
+ )
+ {
+ DLIB_CASSERT(dest.num_samples() == src.num_samples() && src.num_samples() == src.size());
+ if (beta == 0)
+ launch_kernel(_cuda_add_cv_to_all_columns_no_beta, max_jobs(dest.size()), dest.device(), alpha, src.device(), dest.size(), dest.size()/dest.num_samples());
+ else
+ launch_kernel(_cuda_add_cv_to_all_columns, max_jobs(dest.size()), beta, dest.device(), alpha, src.device(), dest.size(), dest.size()/dest.num_samples());
+ }
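+
+        // Viewing dest as a num_samples x (size/num_samples) matrix and src as a column
+        // vector with one entry per sample, the kernels above compute
+        //     dest(r,c) = beta*dest(r,c) + alpha*src(r)
+        // i.e. the scaled column vector is broadcast across every column of dest.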
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_affine_transform5(
+ float* d, const float* s1, const float* s2, const float* s3, size_t n, float A, float B, float C, float D
+ )
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = A*s1[i] + B*s2[i] + C*s3[i] + D;
+ }
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ const float A,
+ const float B,
+ const float C,
+ const float D
+ )
+ {
+ DLIB_CASSERT(dest.size()==src1.size());
+ DLIB_CASSERT(dest.size()==src2.size());
+ DLIB_CASSERT(dest.size()==src3.size());
+ launch_kernel(_cuda_affine_transform5,max_jobs(dest.size()),dest.device(), src1.device(),
+ src2.device(), src3.device(), dest.size(), A, B, C, D);
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_affine_transform_range(
+ float* d, const float* s1, const float* s2, const float* s3, size_t begin, size_t end, float A, float B, float C
+ )
+ {
+ for (auto i : grid_stride_range(begin, end))
+ {
+ d[i] = A*s1[i] + B*s2[i] + C*s3[i];
+ }
+ }
+
+
+ void affine_transform_range(
+ size_t begin,
+ size_t end,
+ tensor& dest,
+ const tensor& src1,
+ const tensor& src2,
+ const tensor& src3,
+ const float A,
+ const float B,
+ const float C
+ )
+ {
+ DLIB_CASSERT(dest.size()==src1.size());
+ DLIB_CASSERT(dest.size()==src2.size());
+ DLIB_CASSERT(dest.size()==src3.size());
+ DLIB_CASSERT(begin <= end && end <= dest.size());
+ launch_kernel(_cuda_affine_transform_range,max_jobs(end-begin),
+ dest.device(), src1.device(),
+ src2.device(), src3.device(), begin, end, A, B, C);
+ }
+
+ // -----------------------------------------------------------------------------------
+
+ __global__ void _cuda_affine_transform2(float* d, const float* s, size_t n, const float* A, const float* B)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = A[i]*s[i] + B[i];
+ }
+ }
+ __global__ void _cuda_affine_transform3(float* d, const float* s, size_t n, const float* A, const float* B, size_t bs)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = A[i%bs]*s[i] + B[i%bs];
+ }
+ }
+
+ void affine_transform(
+ tensor& dest,
+ const tensor& src,
+ const tensor& A,
+ const tensor& B
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(dest, src));
+ DLIB_CASSERT(
+ ((A.num_samples()==1 && B.num_samples()==1) ||
+ (A.num_samples()==src.num_samples() && B.num_samples()==src.num_samples())));
+ DLIB_CASSERT(
+ A.nr()==B.nr() && B.nr()==src.nr() &&
+ A.nc()==B.nc() && B.nc()==src.nc() &&
+ A.k() ==B.k() && B.k()==src.k(),
+ "\nA.nr(): " << A.nr() << "\nB.nr(): " << B.nr() << "\nsrc.nr(): " << src.nr()
+ <<"\nA.nc(): " << A.nc() << "\nB.nc(): " << B.nc() << "\nsrc.nc(): " << src.nc()
+ <<"\nA.k(): " << A.k() << "\nB.k(): " << B.k() << "\nsrc.k(): " << src.k()
+ );
+
+ if (A.num_samples() == 1)
+ {
+ launch_kernel(_cuda_affine_transform3,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A.device(), B.device(), A.size());
+ }
+ else
+ {
+ launch_kernel(_cuda_affine_transform2,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A.device(), B.device());
+ }
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_compute_adam_update(
+ size_t begin,
+ size_t end,
+ float* s,
+ float* m,
+ float* v,
+ const float alpha,
+ const float weight_decay,
+ const float momentum1,
+ const float momentum2,
+ const float* params,
+ const float* params_grad
+ )
+ {
+ const float eps = 1e-8;
+ // The loop is equivalent to doing this:
+ // m = momentum1*m + (1-momentum1) * (weight_decay*params + params_grad);
+ // v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad);
+ // s = -alpha*m/(sqrt(v) + eps);
+ for (auto i : grid_stride_range(begin, end))
+ {
+ float g = (weight_decay*params[i] + params_grad[i]);
+ m[i] = momentum1*m[i] + (1-momentum1)*g;
+ v[i] = momentum2*v[i] + (1-momentum2)*g*g;
+ s[i] = -alpha*m[i]/(std::sqrt(v[i]) + eps);
+ }
+ }
+
+ void compute_adam_update (
+ size_t begin,
+ size_t end,
+ tensor& s,
+ tensor& m,
+ tensor& v,
+ const float t,
+ const float learning_rate,
+ const float weight_decay,
+ const float momentum1,
+ const float momentum2,
+ const tensor& params,
+ const tensor& params_grad
+ )
+ {
+ DLIB_CASSERT(s.size() == m.size() &&
+ s.size() == v.size() &&
+ s.size() == params.size() &&
+ s.size() == params_grad.size());
+ DLIB_CASSERT(begin <= end && end <= params.size());
+ const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t));
+
+ launch_kernel(_cuda_compute_adam_update,max_jobs(end-begin),
+ begin, end, s.device(), m.device(), v.device(), alpha, weight_decay,
+ momentum1, momentum2, params.device(), params_grad.device());
+ }
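+
+        // A compact restatement of the update computed above (eps = 1e-8 inside the
+        // kernel), matching the comment in _cuda_compute_adam_update:
+        //     g = weight_decay*params + params_grad
+        //     m = momentum1*m + (1-momentum1)*g
+        //     v = momentum2*v + (1-momentum2)*g*g
+        //     s = -alpha*m/(sqrt(v) + eps)
+        // where alpha folds the usual Adam bias corrections into the learning rate:
+        //     alpha = learning_rate*sqrt(1-momentum2^t)/(1-momentum1^t)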
+
+ // -----------------------------------------------------------------------------------
+
+ __global__ void _cuda_affine_transform_conv(float* d, const float* s, size_t n, const float* A, const float* B, size_t bs, size_t ks)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ auto k = (i/bs)%ks;
+ d[i] = A[k]*s[i] + B[k];
+ }
+ }
+
+ void affine_transform_conv(
+ tensor& dest,
+ const tensor& src,
+ const tensor& A,
+ const tensor& B
+ )
+ {
+ DLIB_CASSERT(have_same_dimensions(dest, src));
+ DLIB_CASSERT(have_same_dimensions(A, B));
+ DLIB_CASSERT(A.num_samples() == 1 && A.nr() == 1 && A.nc() == 1 && A.k() == src.k());
+
+ launch_kernel(_cuda_affine_transform_conv,max_jobs(dest.size()),
+ dest.device(), src.device(), src.size(), A.device(), B.device(), src.nr()*src.nc(), src.k());
+ }
+
+ // -----------------------------------------------------------------------------------
+
+ __global__ void _add_bias_gradient(float* out, const float* in, size_t n, size_t total_n)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ out[i] = in[i];
+ for (size_t j = i+n; j < total_n; j+=n)
+ out[i] += in[j];
+ }
+ }
+
+ void assign_bias_gradient (
+ tensor& grad,
+ const tensor& gradient_input
+ )
+ {
+ DLIB_CASSERT(
+ grad.num_samples() == 1 &&
+ gradient_input.k() == grad.k() &&
+ gradient_input.nr() == grad.nr() &&
+ gradient_input.nc() == grad.nc() &&
+ gradient_input.size() > 0);
+
+ launch_kernel(_add_bias_gradient,max_jobs(grad.size()),grad.device(), gradient_input.device(), grad.size(), gradient_input.size());
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _set_tensor(float* out, size_t n, const float val)
+ {
+ for (auto i : grid_stride_range(0, n))
+ out[i] = val;
+ }
+
+ void set_tensor (
+ tensor& t,
+ float value
+ )
+ {
+ launch_kernel(_set_tensor, max_jobs(t.size()), t.device(), t.size(), value);
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _scale_tensor(float* out, size_t n, const float val)
+ {
+ for (auto i : grid_stride_range(0, n))
+ out[i] *= val;
+ }
+
+ void scale_tensor (
+ tensor& t,
+ float value
+ )
+ {
+ launch_kernel(_scale_tensor, max_jobs(t.size()), t.device(), t.size(), value);
+ }
+
+ // -----------------------------------------------------------------------------------
+ // -----------------------------------------------------------------------------------
+
+ __global__ void _cuda_threshold(float* d, size_t n, float thresh)
+ {
+ for (auto i : grid_stride_range(0, n))
+ {
+ d[i] = d[i]>thresh ? 1:0;
+ }
+ }
+
+ void threshold (
+ tensor& data,
+ float thresh
+ )
+ {
+ launch_kernel(_cuda_threshold,max_jobs(data.size()),data.device(), data.size(), thresh);
+ }
+
+ // ------------------------------------------------------------------------------------
+
+ __global__ void _cuda_dot(const float* a, const float* b, size_t n, float* result)
+ {
+ // Parallel sum everything into local temp variables.
+ float temp = 0;
+ for(auto i : grid_stride_range(0, n))
+ temp += a[i]*b[i];
+
+ // Then do the warp reduce add thing to merge into one output value.
+ warp_reduce_atomic_add(*result, temp);
+ }
+
+
+ void dot (
+ const tensor& a,
+ const tensor& b,
+ tensor& result,
+ size_t idx
+ )
+ {
+ DLIB_CASSERT(a.size() == b.size());
+ DLIB_CASSERT(idx < result.size());
+
+ launch_kernel(_cuda_dot, max_jobs(a.size()), a.device(), b.device(), a.size(), result.device()+idx);
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_prelu(const float* s, float* d, size_t n, const float* pp)
+ {
+ const float p = *pp;
+ for (auto i : grid_stride_range(0, n))
+ {
+ if (s[i] > 0)
+ d[i] = s[i];
+ else
+ d[i] = p*s[i];
+ }
+ }
+
+ void prelu (
+ tensor& dest,
+ const tensor& src,
+ const tensor& param
+ )
+ {
+ launch_kernel(_cuda_prelu, max_jobs(dest.size()),
+ src.device(), dest.device(), src.size(), param.device());
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_prelu_gradient(float* out, const float* s, const float* gi, size_t n, const float* pp, float* ppgrad)
+ {
+ const float p = *pp;
+ float pgrad = 0;
+ for(auto i : grid_stride_range(0, n))
+ {
+ if (s[i] > 0)
+ {
+ out[i] += gi[i];
+ }
+ else
+ {
+ out[i] += p*gi[i];
+ pgrad += gi[i]*s[i];
+ }
+ }
+
+ // Then do the warp reduce add thing to merge into one output value.
+ warp_reduce_atomic_add(*ppgrad, pgrad);
+ }
+
+ void prelu_gradient (
+ tensor& grad,
+ const tensor& src,
+ const tensor& gradient_input,
+ const tensor& param,
+ tensor& params_grad
+ )
+ {
+ params_grad = 0;
+ launch_kernel(_cuda_prelu_gradient, max_jobs(grad.size()),
+ grad.device(), src.device(), gradient_input.device(), grad.size(),
+ param.device(), params_grad.device());
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_resize_bilinear(size_t dsize, size_t dchan_size, size_t dnc, float* d,
+ size_t schan_size, int snr, int snc, const float* s,
+ const float x_scale, const float y_scale)
+ {
+ for(auto i : grid_stride_range(0, dsize))
+ {
+ const int idx = i%dchan_size;
+ const int channel = i/dchan_size;
+ const int sidx = channel*schan_size;
+ const int r = idx/dnc;
+ const int c = idx%dnc;
+
+ const float y = r*y_scale;
+ const int top = static_cast<int>(::floor(y));
+ const int bottom = ::min(top+1, snr-1);
+ const float tb_frac = y - top;
+
+ const float x = c*x_scale;
+ const int left = static_cast<int>(::floor(x));
+ const int right = ::min(left+1, snc-1);
+ const float lr_frac = x - left;
+
+ float tl = s[sidx+top*snc+left];
+ float tr = s[sidx+top*snc+right];
+ float bl = s[sidx+bottom*snc+left];
+ float br = s[sidx+bottom*snc+right];
+
+ float temp = (1-tb_frac)*((1-lr_frac)*tl + lr_frac*tr) +
+ tb_frac*((1-lr_frac)*bl + lr_frac*br);
+
+ d[i] = temp;
+ }
+ }
+
+ __global__ void _cuda_resize_bilinear_strided(size_t dsize, size_t dchan_size, size_t dnc, float* d,
+ size_t schan_size, int snr, int snc, const float* s,
+ const float x_scale, const float y_scale,
+ size_t dest_row_stride, size_t src_row_stride, size_t dest_chan_size_strided
+ )
+ {
+ for(auto i : grid_stride_range(0, dsize))
+ {
+ const int idx = i%dchan_size;
+ const int channel = i/dchan_size;
+ const int sidx = channel*schan_size;
+ const int r = idx/dnc;
+ const int c = idx%dnc;
+ const int didx = channel*dest_chan_size_strided + r*dest_row_stride+c;
+
+ const float y = r*y_scale;
+ const int top = static_cast<int>(::floor(y));
+ const int bottom = ::min(top+1, snr-1);
+ const float tb_frac = y - top;
+
+ const float x = c*x_scale;
+ const int left = static_cast<int>(::floor(x));
+ const int right = ::min(left+1, snc-1);
+ const float lr_frac = x - left;
+
+ float tl = s[sidx+top*src_row_stride+left];
+ float tr = s[sidx+top*src_row_stride+right];
+ float bl = s[sidx+bottom*src_row_stride+left];
+ float br = s[sidx+bottom*src_row_stride+right];
+
+ float temp = (1-tb_frac)*((1-lr_frac)*tl + lr_frac*tr) +
+ tb_frac*((1-lr_frac)*bl + lr_frac*br);
+
+ d[didx] = temp;
+ }
+ }
+
+ void resize_bilinear (
+ tensor& dest,
+ long dest_row_stride,
+ long dest_channel_stride,
+ const tensor& src,
+ long src_row_stride,
+ long src_channel_stride
+ )
+ {
+ DLIB_CASSERT(is_same_object(dest, src)==false);
+ DLIB_CASSERT(dest.num_samples() == src.num_samples());
+ DLIB_CASSERT(dest.k() == src.k());
+
+ if (dest.size() == 0 || src.size() == 0)
+ return;
+
+ const float x_scale = (src.nc()-1)/(float)std::max<long>((dest.nc()-1),1);
+ const float y_scale = (src.nr()-1)/(float)std::max<long>((dest.nr()-1),1);
+
+ if (dest.nc() == dest_row_stride && dest.nr()*dest.nc()==dest_channel_stride &&
+ src.nc() == src_row_stride && src.nr()*src.nc()==src_channel_stride)
+ {
+ launch_kernel(_cuda_resize_bilinear,
+ dest.size(), dest.nr()*dest.nc(), dest.nc(), dest.device(),
+ src.nr()*src.nc(), src.nr(), src.nc(), src.device(),
+ x_scale, y_scale);
+ }
+ else
+ {
+ launch_kernel(_cuda_resize_bilinear_strided,
+ dest.size(), dest.nr()*dest.nc(), dest.nc(), dest.device(),
+ src_channel_stride, src.nr(), src.nc(), src.device(),
+ x_scale, y_scale, dest_row_stride, src_row_stride, dest_channel_stride);
+ }
+ }
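+
+        // x_scale and y_scale map a destination pixel (r,c) to the source location
+        // (r*y_scale, c*x_scale); each output value is the bilinear blend of the four
+        // surrounding source pixels.  The strided kernel is only needed when dest or src
+        // is a sub-window of a larger tensor, i.e. when the row/channel strides differ
+        // from the tensors' own nc() and nr()*nc().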
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_resize_bilinear_gradient(size_t dsize, size_t dchan_size, size_t dnc, const float* d,
+ size_t schan_size, int snr, int snc, float* s,
+ const float x_scale, const float y_scale)
+ {
+ for(auto i : grid_stride_range(0, dsize))
+ {
+ const float tmp = d[i];
+
+ const int idx = i%dchan_size;
+ const int channel = i/dchan_size;
+ const int sidx = channel*schan_size;
+ const int r = idx/dnc;
+ const int c = idx%dnc;
+
+ const float y = r*y_scale;
+ const int top = static_cast<int>(::floor(y));
+ const int bottom = ::min(top+1, snr-1);
+ const float tb_frac = y - top;
+
+ const float x = c*x_scale;
+ const int left = static_cast<int>(::floor(x));
+ const int right = ::min(left+1, snc-1);
+ const float lr_frac = x - left;
+
+
+ atomicAdd(s+sidx+top*snc+left, tmp*(1-tb_frac)*(1-lr_frac));
+ atomicAdd(s+sidx+top*snc+right, tmp*(1-tb_frac)*(lr_frac));
+ atomicAdd(s+sidx+bottom*snc+left, tmp*(tb_frac)*(1-lr_frac));
+ atomicAdd(s+sidx+bottom*snc+right, tmp*(tb_frac)*(lr_frac));
+ }
+ }
+
+ __global__ void _cuda_resize_bilinear_gradient_strided(size_t dsize, size_t dchan_size, size_t dnc, const float* d,
+ size_t schan_size, int snr, int snc, float* s,
+ const float x_scale, const float y_scale,
+ size_t dest_row_stride, size_t src_row_stride, size_t dest_chan_size_strided
+ )
+ {
+ for(auto i : grid_stride_range(0, dsize))
+ {
+
+ const int idx = i%dchan_size;
+ const int channel = i/dchan_size;
+ const int didx = channel*dest_chan_size_strided;
+ const int sidx = channel*schan_size;
+ const int r = idx/dnc;
+ const int c = idx%dnc;
+
+ const float tmp = d[didx + r*dest_row_stride+c];
+
+ const float y = r*y_scale;
+ const int top = static_cast<int>(::floor(y));
+ const int bottom = ::min(top+1, snr-1);
+ const float tb_frac = y - top;
+
+ const float x = c*x_scale;
+ const int left = static_cast<int>(::floor(x));
+ const int right = ::min(left+1, snc-1);
+ const float lr_frac = x - left;
+
+
+ atomicAdd(s+sidx+top*src_row_stride+left, tmp*(1-tb_frac)*(1-lr_frac));
+ atomicAdd(s+sidx+top*src_row_stride+right, tmp*(1-tb_frac)*(lr_frac));
+ atomicAdd(s+sidx+bottom*src_row_stride+left, tmp*(tb_frac)*(1-lr_frac));
+ atomicAdd(s+sidx+bottom*src_row_stride+right, tmp*(tb_frac)*(lr_frac));
+ }
+ }
+
+ void resize_bilinear_gradient (
+ tensor& grad,
+ long grad_row_stride,
+ long grad_channel_stride,
+ const tensor& gradient_input,
+ long gradient_input_row_stride,
+ long gradient_input_channel_stride
+ )
+ {
+ DLIB_CASSERT(is_same_object(grad, gradient_input)==false);
+ DLIB_CASSERT(gradient_input.num_samples() == grad.num_samples());
+ DLIB_CASSERT(gradient_input.k() == grad.k());
+
+ if (grad.size() == 0 || gradient_input.size() == 0)
+ return;
+
+ const float x_scale = (grad.nc()-1)/(float)std::max<long>((gradient_input.nc()-1),1);
+ const float y_scale = (grad.nr()-1)/(float)std::max<long>((gradient_input.nr()-1),1);
+
+ if (grad.nc() == grad_row_stride && grad.nr()*grad.nc()==grad_channel_stride &&
+ gradient_input.nc() == gradient_input_row_stride && gradient_input.nr()*gradient_input.nc()==gradient_input_channel_stride)
+ {
+ launch_kernel(_cuda_resize_bilinear_gradient,
+ gradient_input.size(), gradient_input.nr()*gradient_input.nc(), gradient_input.nc(), gradient_input.device(),
+ grad.nr()*grad.nc(), grad.nr(), grad.nc(), grad.device(),
+ x_scale, y_scale);
+ }
+ else
+ {
+ launch_kernel(_cuda_resize_bilinear_gradient_strided,
+ gradient_input.size(), gradient_input.nr()*gradient_input.nc(), gradient_input.nc(), gradient_input.device(),
+ grad_channel_stride, grad.nr(), grad.nc(), grad.device(),
+ x_scale, y_scale, gradient_input_row_stride, grad_row_stride, gradient_input_channel_stride);
+ }
+ }
+
+ // ----------------------------------------------------------------------------------------
+
+ __global__ void _cuda_copy_tensor_add_to (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size)
+ {
+ for(auto i : grid_stride_range(0, size))
+ {
+ size_t blk = i/block_size;
+ size_t j = i%block_size;
+ dest[blk*dest_stride + j] += src[blk*src_stride + j];
+ }
+ }
+
+ __global__ void _cuda_copy_tensor (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size)
+ {
+ for(auto i : grid_stride_range(0, size))
+ {
+ size_t blk = i/block_size;
+ size_t j = i%block_size;
+ dest[blk*dest_stride + j] = src[blk*src_stride + j];
+ }
+ }
+
+ void copy_tensor(
+ bool add_to,
+ tensor& dest,
+ size_t dest_k_offset,
+ const tensor& src,
+ size_t src_k_offset,
+ size_t count_k
+ )
+ {
+ const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
+ const size_t src_sample_size = static_cast<size_t>(src.nc() * src.nr() * src.k());
+
+ const size_t block_size = count_k * dest.nc() * dest.nr();
+
+ DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
+ dest.nc() == src.nc() && dest.nr() == src.nr(), "All sources should fit into dest tensor size");
+ DLIB_CASSERT(dest.k() - dest_k_offset >= count_k, "Not enough space in dest tensor");
+ DLIB_CASSERT(src.k() - src_k_offset >= count_k, "Not enough space in src tensor");
+
+ float* dest_p = dest.device() + dest_k_offset * dest.nc() * dest.nr();
+            const float* src_p = src.device() + src_k_offset * src.nc() * src.nr();
+
+ if (add_to)
+ {
+ launch_kernel(_cuda_copy_tensor_add_to, max_jobs(dest.size()),
+ dest_p, block_size*dest.num_samples(),
+ src_p, dest_sample_size, src_sample_size, block_size);
+ }
+ else
+ {
+ launch_kernel(_cuda_copy_tensor, max_jobs(dest.size()),
+ dest_p, block_size*dest.num_samples(),
+ src_p, dest_sample_size, src_sample_size, block_size);
+ }
+ }
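+
+        // copy_tensor moves a contiguous band of count_k channels from each sample of
+        // src into each sample of dest: dest_p/src_p skip the first dest_k_offset /
+        // src_k_offset channels, block_size is the number of floats in one such band,
+        // and the per-sample strides advance the pointers from one sample to the next.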
+
+ // ----------------------------------------------------------------------------------------
+
+ }
+}
+