summaryrefslogtreecommitdiffstats
path: root/src/boost/libs/compute/example
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 18:24:20 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 18:24:20 +0000
commit483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
treee5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/boost/libs/compute/example
parentInitial commit. (diff)
downloadceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz
ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip
Adding upstream version 14.2.21.upstream/14.2.21upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/boost/libs/compute/example')
-rw-r--r--src/boost/libs/compute/example/CMakeLists.txt166
-rw-r--r--src/boost/libs/compute/example/amd_cpp_kernel.cpp116
-rw-r--r--src/boost/libs/compute/example/batched_determinant.cpp96
-rw-r--r--src/boost/libs/compute/example/black_scholes.cpp168
-rw-r--r--src/boost/libs/compute/example/copy_data.cpp49
-rw-r--r--src/boost/libs/compute/example/fizz_buzz.cpp160
-rw-r--r--src/boost/libs/compute/example/hello_world.cpp30
-rw-r--r--src/boost/libs/compute/example/host_sort.cpp56
-rw-r--r--src/boost/libs/compute/example/inline_ptx.cpp72
-rw-r--r--src/boost/libs/compute/example/k_means.cpp229
-rw-r--r--src/boost/libs/compute/example/list_devices.cpp45
-rw-r--r--src/boost/libs/compute/example/longest_vector.cpp58
-rw-r--r--src/boost/libs/compute/example/mandelbrot.cpp224
-rw-r--r--src/boost/libs/compute/example/mapped_view.cpp45
-rw-r--r--src/boost/libs/compute/example/matrix_transpose.cpp355
-rw-r--r--src/boost/libs/compute/example/memory_limits.cpp37
-rw-r--r--src/boost/libs/compute/example/monte_carlo.cpp73
-rw-r--r--src/boost/libs/compute/example/nbody.cpp236
-rw-r--r--src/boost/libs/compute/example/opencl_test.cpp165
-rw-r--r--src/boost/libs/compute/example/opencv_convolution.cpp265
-rw-r--r--src/boost/libs/compute/example/opencv_flip.cpp101
-rw-r--r--src/boost/libs/compute/example/opencv_histogram.cpp228
-rw-r--r--src/boost/libs/compute/example/opencv_optical_flow.cpp289
-rw-r--r--src/boost/libs/compute/example/opencv_sobel_filter.cpp254
-rw-r--r--src/boost/libs/compute/example/opengl_sphere.cpp242
-rw-r--r--src/boost/libs/compute/example/point_centroid.cpp68
-rw-r--r--src/boost/libs/compute/example/price_cross.cpp87
-rw-r--r--src/boost/libs/compute/example/print_vector.cpp45
-rw-r--r--src/boost/libs/compute/example/qimage_blur.cpp145
-rw-r--r--src/boost/libs/compute/example/random_walk.cpp153
-rw-r--r--src/boost/libs/compute/example/resize_image.cpp253
-rw-r--r--src/boost/libs/compute/example/simple_kernel.cpp84
-rw-r--r--src/boost/libs/compute/example/simple_moving_average.cpp139
-rw-r--r--src/boost/libs/compute/example/sort_vector.cpp68
-rw-r--r--src/boost/libs/compute/example/threefry_engine.cpp43
-rw-r--r--src/boost/libs/compute/example/time_copy.cpp63
-rw-r--r--src/boost/libs/compute/example/transform_sqrt.cpp58
-rw-r--r--src/boost/libs/compute/example/vector_addition.cpp57
38 files changed, 5022 insertions, 0 deletions
diff --git a/src/boost/libs/compute/example/CMakeLists.txt b/src/boost/libs/compute/example/CMakeLists.txt
new file mode 100644
index 00000000..a5708cb0
--- /dev/null
+++ b/src/boost/libs/compute/example/CMakeLists.txt
@@ -0,0 +1,166 @@
+# ---------------------------------------------------------------------------
+# Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+#
+# Distributed under the Boost Software License, Version 1.0
+# See accompanying file LICENSE_1_0.txt or copy at
+# http://www.boost.org/LICENSE_1_0.txt
+#
+# ---------------------------------------------------------------------------
+
+include_directories(../include)
+
+set(EXAMPLES
+ amd_cpp_kernel
+ black_scholes
+ copy_data
+ fizz_buzz
+ hello_world
+ host_sort
+ inline_ptx
+ longest_vector
+ list_devices
+ mapped_view
+ memory_limits
+ monte_carlo
+ point_centroid
+ price_cross
+ print_vector
+ sort_vector
+ simple_kernel
+ time_copy
+ transform_sqrt
+ vector_addition
+ simple_moving_average
+ matrix_transpose
+)
+
+# boost library link dependencies
+set(EXAMPLE_BOOST_COMPONENTS program_options)
+
+if (${BOOST_COMPUTE_USE_OFFLINE_CACHE})
+ set(EXAMPLE_BOOST_COMPONENTS ${EXAMPLE_BOOST_COMPONENTS} system filesystem)
+endif()
+
+if(${BOOST_COMPUTE_THREAD_SAFE} AND NOT ${BOOST_COMPUTE_USE_CPP11})
+ set(EXAMPLE_BOOST_COMPONENTS ${EXAMPLE_BOOST_COMPONENTS} system thread)
+endif()
+
+if(MSVC AND EXAMPLE_BOOST_COMPONENTS)
+ set(EXAMPLE_BOOST_COMPONENTS ${EXAMPLE_BOOST_COMPONENTS} chrono)
+endif()
+
+if(EXAMPLE_BOOST_COMPONENTS)
+ list(REMOVE_DUPLICATES EXAMPLE_BOOST_COMPONENTS)
+endif()
+find_package(Boost 1.54 REQUIRED COMPONENTS ${EXAMPLE_BOOST_COMPONENTS})
+include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
+
+foreach(EXAMPLE ${EXAMPLES})
+ add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
+ target_link_libraries(${EXAMPLE} ${OpenCL_LIBRARIES} ${Boost_LIBRARIES})
+
+ # add example program to list of tests (if testing is enabled)
+ if(${BOOST_COMPUTE_BUILD_TESTS})
+ add_test("example.${EXAMPLE}" ${EXAMPLE})
+ endif()
+endforeach()
+
+# opencl test example
+add_executable(opencl_test opencl_test.cpp)
+target_link_libraries(opencl_test ${OpenCL_LIBRARIES})
+
+# eigen examples
+if(${BOOST_COMPUTE_HAVE_EIGEN})
+ find_package(Eigen REQUIRED)
+ include_directories(SYSTEM ${EIGEN_INCLUDE_DIRS})
+ add_executable(batched_determinant batched_determinant.cpp)
+ target_link_libraries(batched_determinant ${OpenCL_LIBRARIES} ${Boost_LIBRARIES})
+endif()
+
+# opencv examples
+if(${BOOST_COMPUTE_HAVE_OPENCV})
+ find_package(OpenCV REQUIRED)
+ include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS})
+
+ set(OPENCV_EXAMPLES
+ k_means
+ opencv_flip
+ random_walk
+ opencv_optical_flow
+ opencv_convolution
+ opencv_sobel_filter
+ opencv_histogram
+ )
+
+ foreach(EXAMPLE ${OPENCV_EXAMPLES})
+ add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
+ target_link_libraries(${EXAMPLE} ${OpenCL_LIBRARIES} ${Boost_LIBRARIES} ${OpenCV_LIBS})
+ endforeach()
+endif()
+
+# opengl/vtk examples
+if(${BOOST_COMPUTE_HAVE_VTK})
+ find_package(VTK REQUIRED)
+ include(${VTK_USE_FILE})
+ add_executable(opengl_sphere opengl_sphere.cpp)
+ target_link_libraries(opengl_sphere ${OpenCL_LIBRARIES} ${Boost_LIBRARIES} ${VTK_LIBRARIES})
+ if(APPLE)
+ target_link_libraries(opengl_sphere "-framework OpenGL")
+ elseif(UNIX)
+ target_link_libraries(opengl_sphere GL)
+ endif()
+endif()
+
+# qt examples
+if(${BOOST_COMPUTE_HAVE_QT})
+
+ # look for Qt4 in the first place
+ find_package(Qt4 QUIET)
+
+ if(${QT4_FOUND})
+ # build with Qt4
+ find_package(Qt4 REQUIRED COMPONENTS QtCore QtGui QtOpenGL)
+ set(QT_USE_QTOPENGL TRUE)
+ include(${QT_USE_FILE})
+ else()
+
+ # look for Qt5
+ find_package(Qt5Widgets QUIET)
+
+ if(${Qt5Widgets_FOUND})
+ # build with Qt5
+ find_package(Qt5Core REQUIRED)
+ find_package(Qt5Widgets REQUIRED)
+ find_package(Qt5OpenGL REQUIRED)
+ include_directories(${Qt5OpenGL_INCLUDE_DIRS})
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${Qt5OpenGL_EXECUTABLE_COMPILE_FLAGS}")
+ set(QT_LIBRARIES ${Qt5OpenGL_LIBRARIES})
+ else()
+ # no valid Qt framework found
+ message(FATAL_ERROR "Error: Did not find Qt4 or Qt5")
+ endif()
+ endif()
+
+ # required by both versions
+ set(CMAKE_AUTOMOC TRUE)
+ include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+ # add examples
+ add_executable(qimage_blur qimage_blur.cpp)
+ target_link_libraries(qimage_blur ${OpenCL_LIBRARIES} ${Boost_LIBRARIES} ${QT_LIBRARIES})
+
+ set(QT_OPENGL_EXAMPLES
+ mandelbrot
+ nbody
+ resize_image
+ )
+ foreach(EXAMPLE ${QT_OPENGL_EXAMPLES})
+ add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
+ target_link_libraries(${EXAMPLE} ${OpenCL_LIBRARIES} ${Boost_LIBRARIES} ${QT_LIBRARIES})
+ if(APPLE)
+ target_link_libraries(${EXAMPLE} "-framework OpenGL")
+ elseif(UNIX)
+ target_link_libraries(${EXAMPLE} GL)
+ endif()
+ endforeach()
+endif()
diff --git a/src/boost/libs/compute/example/amd_cpp_kernel.cpp b/src/boost/libs/compute/example/amd_cpp_kernel.cpp
new file mode 100644
index 00000000..0207312f
--- /dev/null
+++ b/src/boost/libs/compute/example/amd_cpp_kernel.cpp
@@ -0,0 +1,116 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+// this example shows how to use the static c++ kernel language
+// extension (currently only supported by AMD) to compile and
+// execute a templated c++ kernel.
+// Using platform vendor info to decide if this is AMD platform
+int main()
+{
+ // get default device and setup context
+ compute::device device = compute::system::default_device();
+ compute::context context(device);
+ compute::command_queue queue(context, device);
+
+ // check the platform vendor string
+ if(device.platform().vendor() != "Advanced Micro Devices, Inc."){
+ std::cerr << "error: static C++ kernel language is only "
+ << "supported on AMD devices."
+ << std::endl;
+ return 0;
+ }
+
+ // create input int values and copy them to the device
+ int int_data[] = { 1, 2, 3, 4};
+ compute::vector<int> int_vector(int_data, int_data + 4, queue);
+
+ // create input float values and copy them to the device
+ float float_data[] = { 2.0f, 4.0f, 6.0f, 8.0f };
+ compute::vector<float> float_vector(float_data, float_data + 4, queue);
+
+ // create kernel source with a templated function and templated kernel
+ const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+ // define our templated function which returns the square of its input
+ template<typename T>
+ inline T square(const T x)
+ {
+ return x * x;
+ }
+
+ // define our templated kernel which calls square on each value in data
+ template<typename T>
+ __kernel void square_kernel(__global T *data)
+ {
+ const uint i = get_global_id(0);
+ data[i] = square(data[i]);
+ }
+
+ // explicitly instantiate the square kernel for int's. this allows
+ // for it to be called from the host with the given mangled name.
+ template __attribute__((mangled_name(square_kernel_int)))
+ __kernel void square_kernel(__global int *data);
+
+ // also instantiate the square kernel for float's.
+ template __attribute__((mangled_name(square_kernel_float)))
+ __kernel void square_kernel(__global float *data);
+ );
+
+ // build the program. must enable the c++ static kernel language
+ // by passing the "-x clc++" compile option.
+ compute::program square_program =
+ compute::program::build_with_source(source, context, "-x clc++");
+
+ // create the square kernel for int's by using its mangled name declared
+ // in the explicit template instantiation.
+ compute::kernel square_int_kernel(square_program, "square_kernel_int");
+ square_int_kernel.set_arg(0, int_vector);
+
+ // execute the square int kernel
+ queue.enqueue_1d_range_kernel(square_int_kernel, 0, int_vector.size(), 4);
+
+ // print out the squared int values
+ std::cout << "int's: ";
+ compute::copy(
+ int_vector.begin(), int_vector.end(),
+ std::ostream_iterator<int>(std::cout, " "),
+ queue
+ );
+ std::cout << std::endl;
+
+ // now create the square kernel for float's
+ compute::kernel square_float_kernel(square_program, "square_kernel_float");
+ square_float_kernel.set_arg(0, float_vector);
+
+ // execute the square int kernel
+ queue.enqueue_1d_range_kernel(square_float_kernel, 0, float_vector.size(), 4);
+
+ // print out the squared float values
+ std::cout << "float's: ";
+ compute::copy(
+ float_vector.begin(), float_vector.end(),
+ std::ostream_iterator<float>(std::cout, " "),
+ queue
+ );
+ std::cout << std::endl;
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/batched_determinant.cpp b/src/boost/libs/compute/example/batched_determinant.cpp
new file mode 100644
index 00000000..0029151e
--- /dev/null
+++ b/src/boost/libs/compute/example/batched_determinant.cpp
@@ -0,0 +1,96 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <Eigen/Core>
+#include <Eigen/LU>
+
+#include <boost/compute/function.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/transform.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/types/fundamental.hpp>
+
+namespace compute = boost::compute;
+
+// this example shows how to compute the determinant of many 4x4 matrices
+// using a determinant function and the transform() algorithm. in OpenCL the
+// float16 type can be used to store a 4x4 matrix and the components are laid
+// out in the following order:
+//
+// M = [ s0 s4 s8 sc ]
+// [ s1 s5 s9 sd ]
+// [ s2 s6 sa se ]
+// [ s3 s7 sb sf ]
+//
+// the input matrices are created using eigen's random matrix and then
+// used again at the end to verify the results of the determinant function.
+int main()
+{
+ // get default device and setup context
+ compute::device gpu = compute::system::default_device();
+ compute::context context(gpu);
+ compute::command_queue queue(context, gpu);
+ std::cout << "device: " << gpu.name() << std::endl;
+
+ size_t n = 1000;
+
+ // create random 4x4 matrices on the host
+ std::vector<Eigen::Matrix4f> matrices(n);
+ for(size_t i = 0; i < n; i++){
+ matrices[i] = Eigen::Matrix4f::Random();
+ }
+
+ // copy matrices to the device
+ using compute::float16_;
+ compute::vector<float16_> input(n, context);
+ compute::copy(
+ matrices.begin(), matrices.end(), input.begin(), queue
+ );
+
+ // function returning the determinant of a 4x4 matrix.
+ BOOST_COMPUTE_FUNCTION(float, determinant4x4, (const float16_ m),
+ {
+ return m.s0*m.s5*m.sa*m.sf + m.s0*m.s6*m.sb*m.sd + m.s0*m.s7*m.s9*m.se +
+ m.s1*m.s4*m.sb*m.se + m.s1*m.s6*m.s8*m.sf + m.s1*m.s7*m.sa*m.sc +
+ m.s2*m.s4*m.s9*m.sf + m.s2*m.s5*m.sb*m.sc + m.s2*m.s7*m.s8*m.sd +
+ m.s3*m.s4*m.sa*m.sd + m.s3*m.s5*m.s8*m.se + m.s3*m.s6*m.s9*m.sc -
+ m.s0*m.s5*m.sb*m.se - m.s0*m.s6*m.s9*m.sf - m.s0*m.s7*m.sa*m.sd -
+ m.s1*m.s4*m.sa*m.sf - m.s1*m.s6*m.sb*m.sc - m.s1*m.s7*m.s8*m.se -
+ m.s2*m.s4*m.sb*m.sd - m.s2*m.s5*m.s8*m.sf - m.s2*m.s7*m.s9*m.sc -
+ m.s3*m.s4*m.s9*m.se - m.s3*m.s5*m.sa*m.sc - m.s3*m.s6*m.s8*m.sd;
+ });
+
+ // calculate determinants on the gpu
+ compute::vector<float> determinants(n, context);
+ compute::transform(
+ input.begin(), input.end(), determinants.begin(), determinant4x4, queue
+ );
+
+ // check determinants
+ std::vector<float> host_determinants(n);
+ compute::copy(
+ determinants.begin(), determinants.end(), host_determinants.begin(), queue
+ );
+
+ for(size_t i = 0; i < n; i++){
+ float det = matrices[i].determinant();
+
+ if(std::abs(det - host_determinants[i]) > 1e-6){
+ std::cerr << "error: wrong determinant at " << i << " ("
+ << host_determinants[i] << " != " << det << ")"
+ << std::endl;
+ return -1;
+ }
+ }
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/black_scholes.cpp b/src/boost/libs/compute/example/black_scholes.cpp
new file mode 100644
index 00000000..13a0e097
--- /dev/null
+++ b/src/boost/libs/compute/example/black_scholes.cpp
@@ -0,0 +1,168 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <cstdlib>
+#include <iostream>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy_n.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+// return a random float between lo and hi
+float rand_float(float lo, float hi)
+{
+ float x = (float) std::rand() / (float) RAND_MAX;
+
+ return (1.0f - x) * lo + x * hi;
+}
+
+// this example demostrates a black-scholes option pricing kernel.
+int main()
+{
+ // number of options
+ const int N = 4000000;
+
+ // black-scholes parameters
+ const float risk_free_rate = 0.02f;
+ const float volatility = 0.30f;
+
+ // get default device and setup context
+ compute::device gpu = compute::system::default_device();
+ compute::context context(gpu);
+ compute::command_queue queue(context, gpu);
+ std::cout << "device: " << gpu.name() << std::endl;
+
+ // initialize option data on host
+ std::vector<float> stock_price_data(N);
+ std::vector<float> option_strike_data(N);
+ std::vector<float> option_years_data(N);
+
+ std::srand(5347);
+ for(int i = 0; i < N; i++){
+ stock_price_data[i] = rand_float(5.0f, 30.0f);
+ option_strike_data[i] = rand_float(1.0f, 100.0f);
+ option_years_data[i] = rand_float(0.25f, 10.0f);
+ }
+
+ // create memory buffers on the device
+ compute::vector<float> call_result(N, context);
+ compute::vector<float> put_result(N, context);
+ compute::vector<float> stock_price(N, context);
+ compute::vector<float> option_strike(N, context);
+ compute::vector<float> option_years(N, context);
+
+ // copy initial values to the device
+ compute::copy_n(stock_price_data.begin(), N, stock_price.begin(), queue);
+ compute::copy_n(option_strike_data.begin(), N, option_strike.begin(), queue);
+ compute::copy_n(option_years_data.begin(), N, option_years.begin(), queue);
+
+ // source code for black-scholes program
+ const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+ // approximation of the cumulative normal distribution function
+ static float cnd(float d)
+ {
+ const float A1 = 0.319381530f;
+ const float A2 = -0.356563782f;
+ const float A3 = 1.781477937f;
+ const float A4 = -1.821255978f;
+ const float A5 = 1.330274429f;
+ const float RSQRT2PI = 0.39894228040143267793994605993438f;
+
+ float K = 1.0f / (1.0f + 0.2316419f * fabs(d));
+ float cnd =
+ RSQRT2PI * exp(-0.5f * d * d) *
+ (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));
+
+ if(d > 0){
+ cnd = 1.0f - cnd;
+ }
+
+ return cnd;
+ }
+
+ // black-scholes option pricing kernel
+ __kernel void black_scholes(__global float *call_result,
+ __global float *put_result,
+ __global const float *stock_price,
+ __global const float *option_strike,
+ __global const float *option_years,
+ float risk_free_rate,
+ float volatility)
+ {
+ const uint opt = get_global_id(0);
+
+ float S = stock_price[opt];
+ float X = option_strike[opt];
+ float T = option_years[opt];
+ float R = risk_free_rate;
+ float V = volatility;
+
+ float sqrtT = sqrt(T);
+ float d1 = (log(S / X) + (R + 0.5f * V * V) * T) / (V * sqrtT);
+ float d2 = d1 - V * sqrtT;
+ float CNDD1 = cnd(d1);
+ float CNDD2 = cnd(d2);
+
+ float expRT = exp(-R * T);
+ call_result[opt] = S * CNDD1 - X * expRT * CNDD2;
+ put_result[opt] = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1);
+ }
+ );
+
+ // build black-scholes program
+ compute::program program = compute::program::create_with_source(source, context);
+ program.build();
+
+ // setup black-scholes kernel
+ compute::kernel kernel(program, "black_scholes");
+ kernel.set_arg(0, call_result);
+ kernel.set_arg(1, put_result);
+ kernel.set_arg(2, stock_price);
+ kernel.set_arg(3, option_strike);
+ kernel.set_arg(4, option_years);
+ kernel.set_arg(5, risk_free_rate);
+ kernel.set_arg(6, volatility);
+
+ // execute black-scholes kernel
+ queue.enqueue_1d_range_kernel(kernel, 0, N, 0);
+
+ // print out the first option's put and call prices
+ float call0, put0;
+ compute::copy_n(put_result.begin(), 1, &put0, queue);
+ compute::copy_n(call_result.begin(), 1, &call0, queue);
+
+ std::cout << "option 0 call price: " << call0 << std::endl;
+ std::cout << "option 0 put price: " << put0 << std::endl;
+
+ // due to the differences in the random-number generators between Operating Systems
+ // and/or compilers, we will get different "expected" results for this example
+#ifdef __APPLE__
+ double expected_call0 = 0.000249461;
+ double expected_put0 = 26.2798;
+#elif _MSC_VER
+ double expected_call0 = 8.21412;
+ double expected_put0 = 2.25904;
+#else
+ double expected_call0 = 0.0999f;
+ double expected_put0 = 43.0524f;
+#endif
+
+ // check option prices
+ if(std::abs(call0 - expected_call0) > 1e-4 || std::abs(put0 - expected_put0) > 1e-4){
+ std::cerr << "error: option prices are wrong" << std::endl;
+ return -1;
+ }
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/copy_data.cpp b/src/boost/libs/compute/example/copy_data.cpp
new file mode 100644
index 00000000..0550287f
--- /dev/null
+++ b/src/boost/libs/compute/example/copy_data.cpp
@@ -0,0 +1,49 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+//[copy_data_example
+
+#include <vector>
+
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/container/vector.hpp>
+
+namespace compute = boost::compute;
+
+int main()
+{
+ // get default device and setup context
+ compute::device device = compute::system::default_device();
+ compute::context context(device);
+ compute::command_queue queue(context, device);
+
+ // create data array on host
+ int host_data[] = { 1, 3, 5, 7, 9 };
+
+ // create vector on device
+ compute::vector<int> device_vector(5, context);
+
+ // copy from host to device
+ compute::copy(
+ host_data, host_data + 5, device_vector.begin(), queue
+ );
+
+ // create vector on host
+ std::vector<int> host_vector(5);
+
+ // copy data back to host
+ compute::copy(
+ device_vector.begin(), device_vector.end(), host_vector.begin(), queue
+ );
+
+ return 0;
+}
+
+//]
diff --git a/src/boost/libs/compute/example/fizz_buzz.cpp b/src/boost/libs/compute/example/fizz_buzz.cpp
new file mode 100644
index 00000000..2c69995f
--- /dev/null
+++ b/src/boost/libs/compute/example/fizz_buzz.cpp
@@ -0,0 +1,160 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/accumulate.hpp>
+#include <boost/compute/algorithm/exclusive_scan.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/utility/dim.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+const char fizz_buzz_source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+ // returns the length of the string for the number 'n'. This is used
+ // during the first pass when we calculate the amount of space needed
+ // for each string in the fizz-buzz sequence.
+ inline uint fizz_buzz_string_length(uint n)
+ {
+ if((n % 5 == 0) && (n % 3 == 0)){
+ return sizeof("fizzbuzz");
+ }
+ else if(n % 5 == 0){
+ return sizeof("fizz");
+ }
+ else if(n % 3 == 0){
+ return sizeof("buzz");
+ }
+ else {
+ uint digits = 0;
+ while(n){
+ n /= 10;
+ digits++;
+ }
+ return digits + 1;
+ }
+ }
+
+ // first-pass kernel which calculates the string length for each number
+ // and writes it to the string_lengths array. these will then be passed
+ // to exclusive_scan() to calculate the output offsets for each string.
+ __kernel void fizz_buzz_allocate_strings(__global uint *string_lengths)
+ {
+ const uint i = get_global_id(0);
+ const uint n = i + 1;
+
+ string_lengths[i] = fizz_buzz_string_length(n);
+ }
+
+ // copy the string 's' with length 'n' to 'result' (just like strncpy())
+ inline void copy_string(__constant const char *s, uint n, __global char *result)
+ {
+ while(n--){
+ result[n] = s[n];
+ }
+ }
+
+ // reverse the string [start, end).
+ inline void reverse_string(__global char *start, __global char *end)
+ {
+ while(start < end){
+ char tmp = *end;
+ *end = *start;
+ *start = tmp;
+ start++;
+ end--;
+ }
+ }
+
+ // second-pass kernel which copies the fizz-buzz string for each number to
+ // buffer using the previously calculated offsets.
+ __kernel void fizz_buzz_copy_strings(__global const uint *offsets, __global char *buffer)
+ {
+ const uint i = get_global_id(0);
+ const uint n = i + 1;
+ const uint offset = offsets[i];
+
+ if((n % 5 == 0) && (n % 3 == 0)){
+ copy_string("fizzbuzz\n", 9, buffer + offset);
+ }
+ else if(n % 5 == 0){
+ copy_string("fizz\n", 5, buffer + offset);
+ }
+ else if(n % 3 == 0){
+ copy_string("buzz\n", 5, buffer + offset);
+ }
+ else {
+ // convert number to string and write it to the output
+ __global char *number = buffer + offset;
+ uint n_ = n;
+ while(n_){
+ *number++ = (n_%10) + '0';
+ n_ /= 10;
+ }
+ reverse_string(buffer + offset, number - 1);
+ *number = '\n';
+ }
+ }
+);
+
+int main()
+{
+ using compute::dim;
+ using compute::uint_;
+
+ // fizz-buzz up to 100
+ size_t n = 100;
+
+ // get the default device
+ compute::device device = compute::system::default_device();
+ compute::context ctx(device);
+ compute::command_queue queue(ctx, device);
+
+ // compile the fizz-buzz program
+ compute::program fizz_buzz_program =
+ compute::program::create_with_source(fizz_buzz_source, ctx);
+ fizz_buzz_program.build();
+
+ // create a vector for the output string and computing offsets
+ compute::vector<char> output(ctx);
+ compute::vector<uint_> offsets(n, ctx);
+
+ // run the allocate kernel to calculate string lengths
+ compute::kernel allocate_kernel(fizz_buzz_program, "fizz_buzz_allocate_strings");
+ allocate_kernel.set_arg(0, offsets);
+ queue.enqueue_nd_range_kernel(allocate_kernel, dim(0), dim(n), dim(1));
+
+ // allocate space for the output string
+ output.resize(
+ compute::accumulate(offsets.begin(), offsets.end(), 0, queue)
+ );
+
+ // scan string lengths for each number to calculate the output offsets
+ compute::exclusive_scan(
+ offsets.begin(), offsets.end(), offsets.begin(), queue
+ );
+
+ // run the copy kernel to fill the output buffer
+ compute::kernel copy_kernel(fizz_buzz_program, "fizz_buzz_copy_strings");
+ copy_kernel.set_arg(0, offsets);
+ copy_kernel.set_arg(1, output);
+ queue.enqueue_nd_range_kernel(copy_kernel, dim(0), dim(n), dim(1));
+
+ // copy the string to the host and print it to stdout
+ std::string str;
+ str.resize(output.size());
+ compute::copy(output.begin(), output.end(), str.begin(), queue);
+ std::cout << str;
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/hello_world.cpp b/src/boost/libs/compute/example/hello_world.cpp
new file mode 100644
index 00000000..ef45fdd6
--- /dev/null
+++ b/src/boost/libs/compute/example/hello_world.cpp
@@ -0,0 +1,30 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+//[hello_world_example
+
+#include <iostream>
+
+#include <boost/compute/core.hpp>
+
+namespace compute = boost::compute;
+
+int main()
+{
+ // get the default device
+ compute::device device = compute::system::default_device();
+
+ // print the device's name and platform
+ std::cout << "hello from " << device.name();
+ std::cout << " (platform: " << device.platform().name() << ")" << std::endl;
+
+ return 0;
+}
+//]
diff --git a/src/boost/libs/compute/example/host_sort.cpp b/src/boost/libs/compute/example/host_sort.cpp
new file mode 100644
index 00000000..b5ff52cf
--- /dev/null
+++ b/src/boost/libs/compute/example/host_sort.cpp
@@ -0,0 +1,56 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <vector>
+
+#include <boost/spirit/include/karma.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/sort.hpp>
+
+namespace compute = boost::compute;
+namespace karma = boost::spirit::karma;
+
+int rand_int()
+{
+ return rand() % 100;
+}
+
+// this example demonstrates how to sort a std::vector of ints on the GPU
+int main()
+{
+ // get default device and setup context
+ compute::device gpu = compute::system::default_device();
+ compute::context context(gpu);
+ compute::command_queue queue(context, gpu);
+ std::cout << "device: " << gpu.name() << std::endl;
+
+ // create vector of random values on the host
+ std::vector<int> vector(8);
+ std::generate(vector.begin(), vector.end(), rand_int);
+
+ // print input vector
+ std::cout << "input: [ "
+ << karma::format(karma::int_ % ", ", vector)
+ << " ]"
+ << std::endl;
+
+ // sort vector
+ compute::sort(vector.begin(), vector.end(), queue);
+
+ // print sorted vector
+ std::cout << "output: [ "
+ << karma::format(karma::int_ % ", ", vector)
+ << " ]"
+ << std::endl;
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/inline_ptx.cpp b/src/boost/libs/compute/example/inline_ptx.cpp
new file mode 100644
index 00000000..fe4e32f0
--- /dev/null
+++ b/src/boost/libs/compute/example/inline_ptx.cpp
@@ -0,0 +1,72 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/transform.hpp>
+#include <boost/compute/container/vector.hpp>
+
+namespace compute = boost::compute;
+
+// this example shows how to embed PTX assembly instructions
+// directly into boost.compute functions and use them with the
+// transform() algorithm.
+int main()
+{
+ // get default device and setup context
+ compute::device gpu = compute::system::default_device();
+ compute::context context(gpu);
+ compute::command_queue queue(context, gpu);
+ std::cout << "device: " << gpu.name() << std::endl;
+
+ // check to ensure we have an nvidia device
+ if(gpu.vendor() != "NVIDIA Corporation"){
+ std::cerr << "error: inline PTX assembly is only supported "
+ << "on NVIDIA devices."
+ << std::endl;
+ return 0;
+ }
+
+ // create input values and copy them to the device
+ using compute::uint_;
+ uint_ data[] = { 0x00, 0x01, 0x11, 0xFF };
+ compute::vector<uint_> input(data, data + 4, queue);
+
+ // function returning the number of bits set (aka population count or
+ // popcount) using the "popc" inline ptx assembly instruction.
+ BOOST_COMPUTE_FUNCTION(uint_, nvidia_popc, (uint_ x),
+ {
+ uint count;
+ asm("popc.b32 %0, %1;" : "=r"(count) : "r"(x));
+ return count;
+ });
+
+ // calculate the popcount for each input value
+ compute::vector<uint_> output(input.size(), context);
+ compute::transform(
+ input.begin(), input.end(), output.begin(), nvidia_popc, queue
+ );
+
+ // copy results back to the host and print them out
+ std::vector<uint_> counts(output.size());
+ compute::copy(output.begin(), output.end(), counts.begin(), queue);
+
+ for(size_t i = 0; i < counts.size(); i++){
+ std::cout << "0x" << std::hex << data[i]
+ << " has " << counts[i]
+ << " bits set" << std::endl;
+ }
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/k_means.cpp b/src/boost/libs/compute/example/k_means.cpp
new file mode 100644
index 00000000..cd291a9b
--- /dev/null
+++ b/src/boost/libs/compute/example/k_means.cpp
@@ -0,0 +1,229 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/image/image2d.hpp>
+#include <boost/compute/interop/opencv/core.hpp>
+#include <boost/compute/interop/opencv/highgui.hpp>
+#include <boost/compute/random/default_random_engine.hpp>
+#include <boost/compute/random/uniform_real_distribution.hpp>
+#include <boost/compute/utility/dim.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+using compute::dim;
+using compute::int_;
+using compute::float_;
+using compute::float2_;
+
+// the k-means example implements the k-means clustering algorithm
+int main()
+{
+ // number of clusters
+ size_t k = 6;
+
+ // number of points
+ size_t n_points = 4500;
+
+ // height and width of image
+ size_t height = 800;
+ size_t width = 800;
+
+ // get default device and setup context
+ compute::device gpu = compute::system::default_device();
+ compute::context context(gpu);
+ compute::command_queue queue(context, gpu);
+
+ // generate random, uniformily-distributed points
+ compute::default_random_engine random_engine(queue);
+ compute::uniform_real_distribution<float_> uniform_distribution(0, 800);
+
+ compute::vector<float2_> points(n_points, context);
+ uniform_distribution.generate(
+ compute::make_buffer_iterator<float_>(points.get_buffer(), 0),
+ compute::make_buffer_iterator<float_>(points.get_buffer(), n_points * 2),
+ random_engine,
+ queue
+ );
+
+ // initialize all points to cluster 0
+ compute::vector<int_> clusters(n_points, context);
+ compute::fill(clusters.begin(), clusters.end(), 0, queue);
+
+ // create initial means with the first k points
+ compute::vector<float2_> means(k, context);
+ compute::copy_n(points.begin(), k, means.begin(), queue);
+
+ // k-means clustering program source
+ const char k_means_source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+ __kernel void assign_clusters(__global const float2 *points,
+ __global const float2 *means,
+ const int k,
+ __global int *clusters)
+ {
+ const uint gid = get_global_id(0);
+
+ const float2 point = points[gid];
+
+ // find the closest cluster
+ float current_distance = 0;
+ int closest_cluster = -1;
+
+ // find closest cluster mean to the point
+ for(int i = 0; i < k; i++){
+ const float2 mean = means[i];
+
+ int distance_to_mean = distance(point, mean);
+ if(closest_cluster == -1 || distance_to_mean < current_distance){
+ current_distance = distance_to_mean;
+ closest_cluster = i;
+ }
+ }
+
+ // write new cluster
+ clusters[gid] = closest_cluster;
+ }
+
+ __kernel void update_means(__global const float2 *points,
+ const uint n_points,
+ __global float2 *means,
+ __global const int *clusters)
+ {
+ const uint k = get_global_id(0);
+
+ float2 sum = { 0, 0 };
+ float count = 0;
+ for(uint i = 0; i < n_points; i++){
+ if(clusters[i] == k){
+ sum += points[i];
+ count += 1;
+ }
+ }
+
+ means[k] = sum / count;
+ }
+ );
+
+ // build the k-means program
+ compute::program k_means_program =
+ compute::program::build_with_source(k_means_source, context);
+
+ // setup the k-means kernels
+ compute::kernel assign_clusters_kernel(k_means_program, "assign_clusters");
+ assign_clusters_kernel.set_arg(0, points);
+ assign_clusters_kernel.set_arg(1, means);
+ assign_clusters_kernel.set_arg(2, int_(k));
+ assign_clusters_kernel.set_arg(3, clusters);
+
+ compute::kernel update_means_kernel(k_means_program, "update_means");
+ update_means_kernel.set_arg(0, points);
+ update_means_kernel.set_arg(1, int_(n_points));
+ update_means_kernel.set_arg(2, means);
+ update_means_kernel.set_arg(3, clusters);
+
+ // run the k-means algorithm
+ for(int iteration = 0; iteration < 25; iteration++){
+ queue.enqueue_1d_range_kernel(assign_clusters_kernel, 0, n_points, 0);
+ queue.enqueue_1d_range_kernel(update_means_kernel, 0, k, 0);
+ }
+
+ // create output image
+ compute::image2d image(
+ context, width, height, compute::image_format(CL_RGBA, CL_UNSIGNED_INT8)
+ );
+
+ // program with two kernels, one to fill the image with white, and then
+ // one the draw to points calculated in coordinates on the image
+ const char draw_walk_source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+ __kernel void draw_points(__global const float2 *points,
+ __global const int *clusters,
+ __write_only image2d_t image)
+ {
+ const uint i = get_global_id(0);
+ const float2 coord = points[i];
+
+ // map cluster number to color
+ uint4 color = { 0, 0, 0, 0 };
+ switch(clusters[i]){
+ case 0:
+ color = (uint4)(255, 0, 0, 255);
+ break;
+ case 1:
+ color = (uint4)(0, 255, 0, 255);
+ break;
+ case 2:
+ color = (uint4)(0, 0, 255, 255);
+ break;
+ case 3:
+ color = (uint4)(255, 255, 0, 255);
+ break;
+ case 4:
+ color = (uint4)(255, 0, 255, 255);
+ break;
+ case 5:
+ color = (uint4)(0, 255, 255, 255);
+ break;
+ }
+
+ // draw a 3x3 pixel point
+ for(int x = -1; x <= 1; x++){
+ for(int y = -1; y <= 1; y++){
+ if(coord.x + x > 0 && coord.x + x < get_image_width(image) &&
+ coord.y + y > 0 && coord.y + y < get_image_height(image)){
+ write_imageui(image, (int2)(coord.x, coord.y) + (int2)(x, y), color);
+ }
+ }
+ }
+ }
+
+ __kernel void fill_gray(__write_only image2d_t image)
+ {
+ const int2 coord = { get_global_id(0), get_global_id(1) };
+
+ if(coord.x < get_image_width(image) && coord.y < get_image_height(image)){
+ uint4 gray = { 15, 15, 15, 15 };
+ write_imageui(image, coord, gray);
+ }
+ }
+ );
+
+ // build the program
+ compute::program draw_program =
+ compute::program::build_with_source(draw_walk_source, context);
+
+ // fill image with dark gray
+ compute::kernel fill_kernel(draw_program, "fill_gray");
+ fill_kernel.set_arg(0, image);
+
+ queue.enqueue_nd_range_kernel(
+ fill_kernel, dim(0, 0), dim(width, height), dim(1, 1)
+ );
+
+ // draw points colored according to cluster
+ compute::kernel draw_kernel(draw_program, "draw_points");
+ draw_kernel.set_arg(0, points);
+ draw_kernel.set_arg(1, clusters);
+ draw_kernel.set_arg(2, image);
+ queue.enqueue_1d_range_kernel(draw_kernel, 0, n_points, 0);
+
+ // show image
+ compute::opencv_imshow("k-means", image, queue);
+
+ // wait and return
+ cv::waitKey(0);
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/list_devices.cpp b/src/boost/libs/compute/example/list_devices.cpp
new file mode 100644
index 00000000..b7bdbc94
--- /dev/null
+++ b/src/boost/libs/compute/example/list_devices.cpp
@@ -0,0 +1,45 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <boost/compute/core.hpp>
+
+namespace compute = boost::compute;
+
+int main()
+{
+ std::vector<compute::platform> platforms = compute::system::platforms();
+
+ for(size_t i = 0; i < platforms.size(); i++){
+ const compute::platform &platform = platforms[i];
+
+ std::cout << "Platform '" << platform.name() << "'" << std::endl;
+
+ std::vector<compute::device> devices = platform.devices();
+ for(size_t j = 0; j < devices.size(); j++){
+ const compute::device &device = devices[j];
+
+ std::string type;
+ if(device.type() & compute::device::gpu)
+ type = "GPU Device";
+ else if(device.type() & compute::device::cpu)
+ type = "CPU Device";
+ else if(device.type() & compute::device::accelerator)
+ type = "Accelerator Device";
+ else
+ type = "Unknown Device";
+
+ std::cout << " " << type << ": " << device.name() << std::endl;
+ }
+ }
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/longest_vector.cpp b/src/boost/libs/compute/example/longest_vector.cpp
new file mode 100644
index 00000000..faada332
--- /dev/null
+++ b/src/boost/libs/compute/example/longest_vector.cpp
@@ -0,0 +1,58 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <iterator>
+
+#include <boost/compute/algorithm/max_element.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/functional/geometry.hpp>
+#include <boost/compute/iterator/transform_iterator.hpp>
+#include <boost/compute/types/fundamental.hpp>
+
+namespace compute = boost::compute;
+
+// this example shows how to use the max_element() algorithm along with
+// a transform_iterator and the length() function to find the longest
+// 4-component vector in an array of vectors
+int main()
+{
+ using compute::float4_;
+
+ // vectors data
+ float data[] = { 1.0f, 2.0f, 3.0f, 0.0f,
+ 4.0f, 5.0f, 6.0f, 0.0f,
+ 7.0f, 8.0f, 9.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f };
+
+ // create device vector with the vector data
+ compute::vector<float4_> vector(
+ reinterpret_cast<float4_ *>(data),
+ reinterpret_cast<float4_ *>(data) + 4
+ );
+
+ // find the longest vector
+ compute::vector<float4_>::const_iterator iter =
+ compute::max_element(
+ compute::make_transform_iterator(
+ vector.begin(), compute::length<float4_>()
+ ),
+ compute::make_transform_iterator(
+ vector.end(), compute::length<float4_>()
+ )
+ ).base();
+
+ // print the index of the longest vector
+ std::cout << "longest vector index: "
+ << std::distance(vector.begin(), iter)
+ << std::endl;
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/mandelbrot.cpp b/src/boost/libs/compute/example/mandelbrot.cpp
new file mode 100644
index 00000000..5ddb3687
--- /dev/null
+++ b/src/boost/libs/compute/example/mandelbrot.cpp
@@ -0,0 +1,224 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+
+#include <QtGlobal>
+#if QT_VERSION >= 0x050000
+#include <QtWidgets>
+#else
+#include <QtGui>
+#endif
+#include <QtOpenGL>
+
+#ifndef Q_MOC_RUN
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/interop/opengl.hpp>
+#include <boost/compute/utility/dim.hpp>
+#include <boost/compute/utility/source.hpp>
+#endif // Q_MOC_RUN
+
+namespace compute = boost::compute;
+
+// opencl source code
+const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+ // map value to color
+ float4 color(uint i)
+ {
+ uchar c = i;
+ uchar x = 35;
+ uchar y = 25;
+ uchar z = 15;
+ uchar max = 255;
+
+ if(i == 256)
+ return (float4)(0, 0, 0, 255);
+ else
+ return (float4)(max-x*i, max-y*i, max-z*i, max) / 255.f;
+ }
+
+ __kernel void mandelbrot(__write_only image2d_t image)
+ {
+ const uint x_coord = get_global_id(0);
+ const uint y_coord = get_global_id(1);
+ const uint width = get_global_size(0);
+ const uint height = get_global_size(1);
+
+ float x_origin = ((float) x_coord / width) * 3.25f - 2.0f;
+ float y_origin = ((float) y_coord / height) * 2.5f - 1.25f;
+
+ float x = 0.0f;
+ float y = 0.0f;
+
+ uint i = 0;
+ while(x*x + y*y <= 4.f && i < 256){
+ float tmp = x*x - y*y + x_origin;
+ y = 2*x*y + y_origin;
+ x = tmp;
+ i++;
+ }
+
+ int2 coord = { x_coord, y_coord };
+ write_imagef(image, coord, color(i));
+ };
+);
+
+class MandelbrotWidget : public QGLWidget
+{
+ Q_OBJECT
+
+public:
+ MandelbrotWidget(QWidget *parent = 0);
+ ~MandelbrotWidget();
+
+ void initializeGL();
+ void resizeGL(int width, int height);
+ void paintGL();
+ void keyPressEvent(QKeyEvent* event);
+
+private:
+ compute::context context_;
+ compute::command_queue queue_;
+ compute::program program_;
+ GLuint gl_texture_;
+ compute::opengl_texture cl_texture_;
+};
+
+MandelbrotWidget::MandelbrotWidget(QWidget *parent)
+ : QGLWidget(parent)
+{
+ gl_texture_ = 0;
+}
+
+MandelbrotWidget::~MandelbrotWidget()
+{
+}
+
+void MandelbrotWidget::initializeGL()
+{
+ // setup opengl
+ glDisable(GL_LIGHTING);
+
+ // create the OpenGL/OpenCL shared context
+ context_ = compute::opengl_create_shared_context();
+
+ // get gpu device
+ compute::device gpu = context_.get_device();
+ std::cout << "device: " << gpu.name() << std::endl;
+
+ // setup command queue
+ queue_ = compute::command_queue(context_, gpu);
+
+ // build mandelbrot program
+ program_ = compute::program::create_with_source(source, context_);
+ program_.build();
+}
+
+void MandelbrotWidget::resizeGL(int width, int height)
+{
+#if QT_VERSION >= 0x050000
+ // scale height/width based on device pixel ratio
+ width /= windowHandle()->devicePixelRatio();
+ height /= windowHandle()->devicePixelRatio();
+#endif
+
+ // resize viewport
+ glViewport(0, 0, width, height);
+
+ // delete old texture
+ if(gl_texture_){
+ glDeleteTextures(1, &gl_texture_);
+ gl_texture_ = 0;
+ }
+
+ // generate new texture
+ glGenTextures(1, &gl_texture_);
+ glBindTexture(GL_TEXTURE_2D, gl_texture_);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+ glTexImage2D(
+ GL_TEXTURE_2D, 0, GL_RGBA, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0
+ );
+
+ // create opencl object for the texture
+ cl_texture_ = compute::opengl_texture(
+ context_, GL_TEXTURE_2D, 0, gl_texture_, CL_MEM_WRITE_ONLY
+ );
+}
+
+void MandelbrotWidget::paintGL()
+{
+ using compute::dim;
+
+ float w = width();
+ float h = height();
+
+ glMatrixMode(GL_PROJECTION);
+ glLoadIdentity();
+ glOrtho(0.0, w, 0.0, h, -1.0, 1.0);
+ glMatrixMode(GL_MODELVIEW);
+ glLoadIdentity();
+
+ // setup the mandelbrot kernel
+ compute::kernel kernel(program_, "mandelbrot");
+ kernel.set_arg(0, cl_texture_);
+
+ // acquire the opengl texture so it can be used in opencl
+ compute::opengl_enqueue_acquire_gl_objects(1, &cl_texture_.get(), queue_);
+
+ // execute the mandelbrot kernel
+ queue_.enqueue_nd_range_kernel(
+ kernel, dim(0, 0), dim(width(), height()), dim(1, 1)
+ );
+
+ // release the opengl texture so it can be used by opengl
+ compute::opengl_enqueue_release_gl_objects(1, &cl_texture_.get(), queue_);
+
+ // ensure opencl is finished before rendering in opengl
+ queue_.finish();
+
+ // draw a single quad with the mandelbrot image texture
+ glEnable(GL_TEXTURE_2D);
+ glBindTexture(GL_TEXTURE_2D, gl_texture_);
+
+ glBegin(GL_QUADS);
+ glTexCoord2f(0, 0); glVertex2f(0, 0);
+ glTexCoord2f(0, 1); glVertex2f(0, h);
+ glTexCoord2f(1, 1); glVertex2f(w, h);
+ glTexCoord2f(1, 0); glVertex2f(w, 0);
+ glEnd();
+}
+
+void MandelbrotWidget::keyPressEvent(QKeyEvent* event)
+{
+ if(event->key() == Qt::Key_Escape) {
+ this->close();
+ }
+}
+
+// the mandelbrot example shows how to create a mandelbrot image in
+// OpenCL and render the image as a texture in OpenGL
+int main(int argc, char *argv[])
+{
+ QApplication app(argc, argv);
+
+ MandelbrotWidget widget;
+ widget.show();
+
+ return app.exec();
+}
+
+#include "mandelbrot.moc"
diff --git a/src/boost/libs/compute/example/mapped_view.cpp b/src/boost/libs/compute/example/mapped_view.cpp
new file mode 100644
index 00000000..1e05107a
--- /dev/null
+++ b/src/boost/libs/compute/example/mapped_view.cpp
@@ -0,0 +1,45 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/reduce.hpp>
+#include <boost/compute/container/mapped_view.hpp>
+
+namespace compute = boost::compute;
+
+// this example demonstrates how to use the mapped_view class to map
+// an array of numbers to device memory and use the reduce() algorithm
+// to calculate the sum.
+int main()
+{
+ // get default device and setup context
+ compute::device gpu = compute::system::default_device();
+ compute::context context(gpu);
+ compute::command_queue queue(context, gpu);
+ std::cout << "device: " << gpu.name() << std::endl;
+
+ // create data on host
+ int data[] = { 4, 2, 3, 7, 8, 9, 1, 6 };
+
+ // create mapped view on device
+ compute::mapped_view<int> view(data, 8, context);
+
+ // use reduce() to calculate sum on the device
+ int sum = 0;
+ compute::reduce(view.begin(), view.end(), &sum, queue);
+
+ // print the sum on the host
+ std::cout << "sum: " << sum << std::endl;
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/matrix_transpose.cpp b/src/boost/libs/compute/example/matrix_transpose.cpp
new file mode 100644
index 00000000..ee9b1e9d
--- /dev/null
+++ b/src/boost/libs/compute/example/matrix_transpose.cpp
@@ -0,0 +1,355 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Benoit Dequidt <benoit.dequidt@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <cstdlib>
+
+#include <boost/program_options.hpp>
+
+#include <boost/compute/core.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/type_traits/type_name.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+namespace po = boost::program_options;
+
+using compute::uint_;
+
+const uint_ TILE_DIM = 32;
+const uint_ BLOCK_ROWS = 8;
+
+// generate a copy kernel program
+compute::kernel make_copy_kernel(const compute::context& context)
+{
+ // source for the copy_kernel program
+ const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+ __kernel void copy_kernel(__global const float *src, __global float *dst)
+ {
+ uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
+ uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
+
+ uint width = get_num_groups(0) * TILE_DIM;
+
+ for(uint i = 0 ; i < TILE_DIM ; i+= BLOCK_ROWS){
+ dst[(y+i)*width +x] = src[(y+i)*width + x];
+ }
+ }
+ );
+
+ // setup compilation flags for the copy program
+ std::stringstream options;
+ options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
+
+ // create and build the copy program
+ compute::program program =
+ compute::program::build_with_source(source, context, options.str());
+
+ // create and return the copy kernel
+ return program.create_kernel("copy_kernel");
+}
+
+// generate a naive transpose kernel
+compute::kernel make_naive_transpose_kernel(const compute::context& context)
+{
+ // source for the naive_transpose kernel
+ const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+ __kernel void naive_transpose(__global const float *src, __global float *dst)
+ {
+ uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
+ uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
+
+ uint width = get_num_groups(0) * TILE_DIM;
+
+ for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
+ dst[x*width + y+i] = src[(y+i)*width + x];
+ }
+ }
+ );
+
+ // setup compilation flags for the naive_transpose program
+ std::stringstream options;
+ options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
+
+ // create and build the naive_transpose program
+ compute::program program =
+ compute::program::build_with_source(source, context, options.str());
+
+ // create and return the naive_transpose kernel
+ return program.create_kernel("naive_transpose");
+}
+
+// generates a coalesced transpose kernel
+compute::kernel make_coalesced_transpose_kernel(const compute::context& context)
+{
+ // source for the coalesced_transpose kernel
+ const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+ __kernel void coalesced_transpose(__global const float *src, __global float *dst)
+ {
+ __local float tile[TILE_DIM][TILE_DIM];
+
+ // compute indexes
+ uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
+ uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
+
+ uint width = get_num_groups(0) * TILE_DIM;
+
+ // load inside local memory
+ for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
+ tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // transpose indexes
+ x = get_group_id(1) * TILE_DIM + get_local_id(0);
+ y = get_group_id(0) * TILE_DIM + get_local_id(1);
+
+ // write output from local memory
+ for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS){
+ dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
+ }
+ }
+ );
+
+ // setup compilation flags for the coalesced_transpose program
+ std::stringstream options;
+ options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
+
+ // create and build the coalesced_transpose program
+ compute::program program =
+ compute::program::build_with_source(source, context, options.str());
+
+ // create and return coalesced_transpose kernel
+ return program.create_kernel("coalesced_transpose");
+}
+
+// generate a coalesced withtout bank conflicts kernel
+compute::kernel make_coalesced_no_bank_conflicts_kernel(const compute::context& context)
+{
+ const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+ __kernel void coalesced_no_bank_conflicts(__global const float *src, __global float *dst)
+ {
+ // TILE_DIM+1 is here to avoid bank conflicts in local memory
+ __local float tile[TILE_DIM][TILE_DIM+1];
+
+ // compute indexes
+ uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
+ uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
+
+ uint width = get_num_groups(0) * TILE_DIM;
+
+ // load inside local memory
+ for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
+ tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // transpose indexes
+ x = get_group_id(1) * TILE_DIM + get_local_id(0);
+ y = get_group_id(0) * TILE_DIM + get_local_id(1);
+
+ // write output from local memory
+ for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS){
+ dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
+ }
+ }
+ );
+
+ // setup compilation flags for the coalesced_no_bank_conflicts program
+ std::stringstream options;
+ options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
+
+ // create and build the coalesced_no_bank_conflicts program
+ compute::program program =
+ compute::program::build_with_source(source, context, options.str());
+
+ // create and return the coalesced_no_bank_conflicts kernel
+ return program.create_kernel("coalesced_no_bank_conflicts");
+}
+
+// compare 'expectedResult' to 'transposedMatrix'. prints an error message if not equal.
+bool check_transposition(const std::vector<float>& expectedResult,
+ uint_ size,
+ const std::vector<float>& transposedMatrix)
+{
+ for(uint_ i = 0 ; i < size ; ++i){
+ if(expectedResult[i] != transposedMatrix[i]){
+ std::cout << "idx = " << i << " , expected " << expectedResult[i]
+ << " , got " << transposedMatrix[i] << std::endl;
+ std::cout << "FAILED" << std::endl;
+ return false;
+ }
+ }
+ return true;
+}
+
+// generate a matrix inside 'in' and do the tranposition inside 'out'
+void generate_matrix(std::vector<float>& in, std::vector<float>& out, uint_ rows, uint_ cols)
+{
+ // generate a matrix
+ for(uint_ i = 0 ; i < rows ; ++i){
+ for(uint_ j = 0 ; j < cols ; ++j){
+ in[i*cols + j] = i*cols + j;
+ }
+ }
+
+ // store transposed result
+ for(uint_ j = 0; j < cols ; ++j){
+ for(uint_ i = 0 ; i < rows ; ++i){
+ out[j*rows + i] = in[i*cols + j];
+ }
+ }
+}
+
+// neccessary for 64-bit integer on win32
+#ifdef _WIN32
+#define uint64_t unsigned __int64
+#endif
+
+int main(int argc, char *argv[])
+{
+ // setup command line arguments
+ po::options_description options("options");
+ options.add_options()
+ ("help", "show usage instructions")
+ ("rows", po::value<uint_>()->default_value(4096), "number of matrix rows")
+ ("cols", po::value<uint_>()->default_value(4096), "number of matrix columns")
+ ;
+
+ // parse command line
+ po::variables_map vm;
+ po::store(po::parse_command_line(argc, argv, options), vm);
+ po::notify(vm);
+
+ // check command line arguments
+ if(vm.count("help")){
+ std::cout << options << std::endl;
+ return 0;
+ }
+
+ // get number rows and columns for the matrix
+ const uint_ rows = vm["rows"].as<uint_>();
+ const uint_ cols = vm["cols"].as<uint_>();
+
+ // get the default device
+ compute::device device = compute::system::default_device();
+
+ // print out device name and matrix information
+ std::cout << "Device: " << device.name() << std::endl;
+ std::cout << "Matrix Size: " << rows << "x" << cols << std::endl;
+ std::cout << "Grid Size: " << rows/TILE_DIM << "x" << cols/TILE_DIM << " blocks" << std::endl;
+ std::cout << "Local Size: " << TILE_DIM << "x" << BLOCK_ROWS << " threads" << std::endl;
+ std::cout << std::endl;
+
+ // On OSX this example does not work on CPU devices
+ #if defined(__APPLE__)
+ if(device.type() & compute::device::cpu) {
+ std::cout << "On OSX this example does not work on CPU devices" << std::endl;
+ return 0;
+ }
+ #endif
+
+ const size_t global_work_size[2] = {rows, cols*BLOCK_ROWS/TILE_DIM};
+ const size_t local_work_size[2] = {TILE_DIM, BLOCK_ROWS};
+
+ // setup input data on the host
+ const uint_ size = rows * cols;
+ std::vector<float> h_input(size);
+ std::vector<float> h_output(size);
+ std::vector<float> expectedResult(size);
+ generate_matrix(h_input, expectedResult, rows, cols);
+
+ // create a context for the device
+ compute::context context(device);
+
+ // device vectors
+ compute::vector<float> d_input(size, context);
+ compute::vector<float> d_output(size, context);
+
+ // command_queue with profiling
+ compute::command_queue queue(context, device, compute::command_queue::enable_profiling);
+
+ // copy input data
+ compute::copy(h_input.begin(), h_input.end(), d_input.begin(), queue);
+
+ // simple copy kernel
+ std::cout << "Testing copy_kernel:" << std::endl;
+ compute::kernel kernel = make_copy_kernel(context);
+ kernel.set_arg(0, d_input);
+ kernel.set_arg(1, d_output);
+
+ compute::event start;
+ start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
+ queue.finish();
+ uint64_t elapsed = start.duration<boost::chrono::nanoseconds>().count();
+
+ std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
+ std::cout << " BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
+ compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);
+
+ check_transposition(h_input, rows*cols, h_output);
+ std::cout << std::endl;
+
+ // naive_transpose kernel
+ std::cout << "Testing naive_transpose:" << std::endl;
+ kernel = make_naive_transpose_kernel(context);
+ kernel.set_arg(0, d_input);
+ kernel.set_arg(1, d_output);
+
+ start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
+ queue.finish();
+ elapsed = start.duration<boost::chrono::nanoseconds>().count();
+ std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
+ std::cout << " BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
+ compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);
+
+ check_transposition(expectedResult, rows*cols, h_output);
+ std::cout << std::endl;
+
+ // coalesced_transpose kernel
+ std::cout << "Testing coalesced_transpose:" << std::endl;
+ kernel = make_coalesced_transpose_kernel(context);
+ kernel.set_arg(0, d_input);
+ kernel.set_arg(1, d_output);
+
+ start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
+ queue.finish();
+ elapsed = start.duration<boost::chrono::nanoseconds>().count();
+ std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
+ std::cout << " BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
+
+ compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);
+
+ check_transposition(expectedResult, rows*cols, h_output);
+ std::cout << std::endl;
+
+ // coalesced_no_bank_conflicts kernel
+ std::cout << "Testing coalesced_no_bank_conflicts:" << std::endl;
+
+ kernel = make_coalesced_no_bank_conflicts_kernel(context);
+ kernel.set_arg(0, d_input);
+ kernel.set_arg(1, d_output);
+
+ start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
+ queue.finish();
+ elapsed = start.duration<boost::chrono::nanoseconds>().count();
+ std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
+ std::cout << " BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
+
+ compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);
+
+ check_transposition(expectedResult, rows*cols, h_output);
+ std::cout << std::endl;
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/memory_limits.cpp b/src/boost/libs/compute/example/memory_limits.cpp
new file mode 100644
index 00000000..8d0e4a7a
--- /dev/null
+++ b/src/boost/libs/compute/example/memory_limits.cpp
@@ -0,0 +1,37 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <boost/compute/core.hpp>
+
+namespace compute = boost::compute;
+
+int main()
+{
+ // get the default device
+ compute::device device = compute::system::default_device();
+
+ std::cout << "device: " << device.name() << std::endl;
+ std::cout << " global memory size: "
+ << device.get_info<cl_ulong>(CL_DEVICE_GLOBAL_MEM_SIZE) / 1024 / 1024
+ << " MB"
+ << std::endl;
+ std::cout << " local memory size: "
+ << device.get_info<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE) / 1024
+ << " KB"
+ << std::endl;
+ std::cout << " constant memory size: "
+ << device.get_info<cl_ulong>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) / 1024
+ << " KB"
+ << std::endl;
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/monte_carlo.cpp b/src/boost/libs/compute/example/monte_carlo.cpp
new file mode 100644
index 00000000..8ae26209
--- /dev/null
+++ b/src/boost/libs/compute/example/monte_carlo.cpp
@@ -0,0 +1,73 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <boost/compute/function.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/count_if.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/random/default_random_engine.hpp>
+#include <boost/compute/types/fundamental.hpp>
+
+namespace compute = boost::compute;
+
+int main()
+{
+ // get default device and setup context
+ compute::device gpu = compute::system::default_device();
+ compute::context context(gpu);
+ compute::command_queue queue(context, gpu);
+
+ std::cout << "device: " << gpu.name() << std::endl;
+
+ using compute::uint_;
+ using compute::uint2_;
+
+
+#ifdef CI_BUILD // lower number of points for CI builds
+ size_t n = 2000;
+#else
+ // ten million random points
+ size_t n = 10000000;
+#endif
+
+ // generate random numbers
+ compute::default_random_engine rng(queue);
+ compute::vector<uint_> vector(n * 2, context);
+ rng.generate(vector.begin(), vector.end(), queue);
+
+ // function returing true if the point is within the unit circle
+ BOOST_COMPUTE_FUNCTION(bool, is_in_unit_circle, (const uint2_ point),
+ {
+ const float x = point.x / (float) UINT_MAX - 1;
+ const float y = point.y / (float) UINT_MAX - 1;
+
+ return (x*x + y*y) < 1.0f;
+ });
+
+ // iterate over vector<uint> as vector<uint2>
+ compute::buffer_iterator<uint2_> start =
+ compute::make_buffer_iterator<uint2_>(vector.get_buffer(), 0);
+ compute::buffer_iterator<uint2_> end =
+ compute::make_buffer_iterator<uint2_>(vector.get_buffer(), vector.size() / 2);
+
+ // count number of random points within the unit circle
+ size_t count = compute::count_if(start, end, is_in_unit_circle, queue);
+
+ // print out values
+ float count_f = static_cast<float>(count);
+ std::cout << "count: " << count << " / " << n << std::endl;
+ std::cout << "ratio: " << count_f / float(n) << std::endl;
+ std::cout << "pi = " << (count_f / float(n)) * 4.0f << std::endl;
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/nbody.cpp b/src/boost/libs/compute/example/nbody.cpp
new file mode 100644
index 00000000..9379c63b
--- /dev/null
+++ b/src/boost/libs/compute/example/nbody.cpp
@@ -0,0 +1,236 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Fabian Köhler <fabian2804@googlemail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#define GL_GLEXT_PROTOTYPES
+#ifdef __APPLE__
+#include <OpenGL/gl.h>
+#include <OpenGL/glext.h>
+#else
+#include <GL/gl.h>
+#include <GL/glext.h>
+#endif
+
+#include <QtGlobal>
+#if QT_VERSION >= 0x050000
+#include <QtWidgets>
+#else
+#include <QtGui>
+#endif
+#include <QtOpenGL>
+#include <QTimer>
+
+#include <boost/program_options.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include <boost/random/mersenne_twister.hpp>
+
+#ifndef Q_MOC_RUN
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/interop/opengl.hpp>
+#include <boost/compute/utility/source.hpp>
+#endif // Q_MOC_RUN
+
+namespace compute = boost::compute;
+namespace po = boost::program_options;
+
+using compute::uint_;
+using compute::float4_;
+
+const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+ __kernel void updateVelocity(__global const float4* position, __global float4* velocity, float dt, uint N)
+ {
+ uint gid = get_global_id(0);
+
+ float4 r = { 0.0f, 0.0f, 0.0f, 0.0f };
+ float f = 0.0f;
+ for(uint i = 0; i != gid; i++) {
+ if(i != gid) {
+ r = position[i]-position[gid];
+ f = length(r)+0.001f;
+ f *= f*f;
+ f = dt/f;
+ velocity[gid] += f*r;
+ }
+ }
+ }
+ __kernel void updatePosition(__global float4* position, __global const float4* velocity, float dt)
+ {
+ uint gid = get_global_id(0);
+
+ position[gid].xyz += dt*velocity[gid].xyz;
+ }
+);
+
+class NBodyWidget : public QGLWidget
+{
+ Q_OBJECT
+
+public:
+ NBodyWidget(std::size_t particles, float dt, QWidget* parent = 0);
+ ~NBodyWidget();
+
+ void initializeGL();
+ void resizeGL(int width, int height);
+ void paintGL();
+ void updateParticles();
+ void keyPressEvent(QKeyEvent* event);
+
+private:
+ QTimer* timer;
+
+ compute::context m_context;
+ compute::command_queue m_queue;
+ compute::program m_program;
+ compute::opengl_buffer m_position;
+ compute::vector<float4_>* m_velocity;
+ compute::kernel m_velocity_kernel;
+ compute::kernel m_position_kernel;
+
+ bool m_initial_draw;
+
+ const uint_ m_particles;
+ const float m_dt;
+};
+
+NBodyWidget::NBodyWidget(std::size_t particles, float dt, QWidget* parent)
+ : QGLWidget(parent), m_initial_draw(true), m_particles(particles), m_dt(dt)
+{
+ // create a timer to redraw as fast as possible
+ timer = new QTimer(this);
+ connect(timer, SIGNAL(timeout()), this, SLOT(updateGL()));
+ timer->start(1);
+}
+
+NBodyWidget::~NBodyWidget()
+{
+ delete m_velocity;
+
+ // delete the opengl buffer
+ GLuint vbo = m_position.get_opengl_object();
+ glDeleteBuffers(1, &vbo);
+}
+
+void NBodyWidget::initializeGL()
+{
+ // create context, command queue and program
+ m_context = compute::opengl_create_shared_context();
+ m_queue = compute::command_queue(m_context, m_context.get_device());
+ m_program = compute::program::create_with_source(source, m_context);
+ m_program.build();
+
+ // prepare random particle positions that will be transferred to the vbo
+ float4_* temp = new float4_[m_particles];
+ boost::random::uniform_real_distribution<float> dist(-0.5f, 0.5f);
+ boost::random::mt19937_64 gen;
+ for(size_t i = 0; i < m_particles; i++) {
+ temp[i][0] = dist(gen);
+ temp[i][1] = dist(gen);
+ temp[i][2] = dist(gen);
+ temp[i][3] = 1.0f;
+ }
+
+ // create an OpenGL vbo
+ GLuint vbo = 0;
+ glGenBuffers(1, &vbo);
+ glBindBuffer(GL_ARRAY_BUFFER, vbo);
+ glBufferData(GL_ARRAY_BUFFER, m_particles*sizeof(float4_), temp, GL_DYNAMIC_DRAW);
+
+ // create a OpenCL buffer from the vbo
+ m_position = compute::opengl_buffer(m_context, vbo);
+ delete[] temp;
+
+ // create buffer for velocities
+ m_velocity = new compute::vector<float4_>(m_particles, m_context);
+ compute::fill(m_velocity->begin(), m_velocity->end(), float4_(0.0f, 0.0f, 0.0f, 0.0f), m_queue);
+
+ // create compute kernels
+ m_velocity_kernel = m_program.create_kernel("updateVelocity");
+ m_velocity_kernel.set_arg(0, m_position);
+ m_velocity_kernel.set_arg(1, m_velocity->get_buffer());
+ m_velocity_kernel.set_arg(2, m_dt);
+ m_velocity_kernel.set_arg(3, m_particles);
+ m_position_kernel = m_program.create_kernel("updatePosition");
+ m_position_kernel.set_arg(0, m_position);
+ m_position_kernel.set_arg(1, m_velocity->get_buffer());
+ m_position_kernel.set_arg(2, m_dt);
+}
+void NBodyWidget::resizeGL(int width, int height)
+{
+ // update viewport
+ glViewport(0, 0, width, height);
+}
+void NBodyWidget::paintGL()
+{
+ // clear buffer
+ glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+
+ // check if this is the first draw
+ if(m_initial_draw) {
+ // do not update particles
+ m_initial_draw = false;
+ } else {
+ // update particles
+ updateParticles();
+ }
+
+ // draw
+ glVertexPointer(4, GL_FLOAT, 0, 0);
+ glEnableClientState(GL_VERTEX_ARRAY);
+ glDrawArrays(GL_POINTS, 0, m_particles);
+ glFinish();
+}
+void NBodyWidget::updateParticles()
+{
+ // enqueue kernels to update particles and make sure that the command queue is finished
+ compute::opengl_enqueue_acquire_buffer(m_position, m_queue);
+ m_queue.enqueue_1d_range_kernel(m_velocity_kernel, 0, m_particles, 0).wait();
+ m_queue.enqueue_1d_range_kernel(m_position_kernel, 0, m_particles, 0).wait();
+ m_queue.finish();
+ compute::opengl_enqueue_release_buffer(m_position, m_queue);
+}
+void NBodyWidget::keyPressEvent(QKeyEvent* event)
+{
+ if(event->key() == Qt::Key_Escape) {
+ this->close();
+ }
+}
+
+int main(int argc, char** argv)
+{
+ // parse command line arguments
+ po::options_description options("options");
+ options.add_options()
+ ("help", "show usage")
+ ("particles", po::value<uint_>()->default_value(1000), "number of particles")
+ ("dt", po::value<float>()->default_value(0.00001f), "width of each integration step");
+ po::variables_map vm;
+ po::store(po::parse_command_line(argc, argv, options), vm);
+ po::notify(vm);
+
+ if(vm.count("help") > 0) {
+ std::cout << options << std::endl;
+ return 0;
+ }
+
+ const uint_ particles = vm["particles"].as<uint_>();
+ const float dt = vm["dt"].as<float>();
+
+ QApplication app(argc, argv);
+ NBodyWidget nbody(particles, dt);
+
+ nbody.show();
+
+ return app.exec();
+}
+
+#include "nbody.moc"
diff --git a/src/boost/libs/compute/example/opencl_test.cpp b/src/boost/libs/compute/example/opencl_test.cpp
new file mode 100644
index 00000000..15b85202
--- /dev/null
+++ b/src/boost/libs/compute/example/opencl_test.cpp
@@ -0,0 +1,165 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+// See boost/compute/detail/diagnostic.hpp
+// GCC
+#if ((__GNUC__ * 100) + __GNUC_MINOR__) >= 402
+#define BOOST_COMPUTE_GCC_DIAG_STR(s) #s
+#define BOOST_COMPUTE_GCC_DIAG_JOINSTR(x,y) BOOST_COMPUTE_GCC_DIAG_STR(x ## y)
+# define BOOST_COMPUTE_GCC_DIAG_DO_PRAGMA(x) _Pragma (#x)
+# define BOOST_COMPUTE_GCC_DIAG_PRAGMA(x) BOOST_COMPUTE_GCC_DIAG_DO_PRAGMA(GCC diagnostic x)
+# if ((__GNUC__ * 100) + __GNUC_MINOR__) >= 406
+# define BOOST_COMPUTE_GCC_DIAG_OFF(x) BOOST_COMPUTE_GCC_DIAG_PRAGMA(push) \
+ BOOST_COMPUTE_GCC_DIAG_PRAGMA(ignored BOOST_COMPUTE_GCC_DIAG_JOINSTR(-W,x))
+# define BOOST_COMPUTE_GCC_DIAG_ON(x) BOOST_COMPUTE_GCC_DIAG_PRAGMA(pop)
+# else
+# define BOOST_COMPUTE_GCC_DIAG_OFF(x) \
+ BOOST_COMPUTE_GCC_DIAG_PRAGMA(ignored BOOST_COMPUTE_GCC_DIAG_JOINSTR(-W,x))
+# define BOOST_COMPUTE_GCC_DIAG_ON(x) \
+ BOOST_COMPUTE_GCC_DIAG_PRAGMA(warning BOOST_COMPUTE_GCC_DIAG_JOINSTR(-W,x))
+# endif
+#else // Ensure these macros do nothing for other compilers.
+# define BOOST_COMPUTE_GCC_DIAG_OFF(x)
+# define BOOST_COMPUTE_GCC_DIAG_ON(x)
+#endif
+
+// Clang
+#ifdef __clang__
+# define BOOST_COMPUTE_CLANG_DIAG_STR(s) # s
+// stringize s to "no-sign-compare"
+# define BOOST_COMPUTE_CLANG_DIAG_JOINSTR(x,y) BOOST_COMPUTE_CLANG_DIAG_STR(x ## y)
+// join -W with no-unused-variable to "-Wno-sign-compare"
+# define BOOST_COMPUTE_CLANG_DIAG_DO_PRAGMA(x) _Pragma (#x)
+// _Pragma is unary operator #pragma ("")
+# define BOOST_COMPUTE_CLANG_DIAG_PRAGMA(x) \
+ BOOST_COMPUTE_CLANG_DIAG_DO_PRAGMA(clang diagnostic x)
+# define BOOST_COMPUTE_CLANG_DIAG_OFF(x) BOOST_COMPUTE_CLANG_DIAG_PRAGMA(push) \
+ BOOST_COMPUTE_CLANG_DIAG_PRAGMA(ignored BOOST_COMPUTE_CLANG_DIAG_JOINSTR(-W,x))
+// For example: #pragma clang diagnostic ignored "-Wno-sign-compare"
+# define BOOST_COMPUTE_CLANG_DIAG_ON(x) BOOST_COMPUTE_CLANG_DIAG_PRAGMA(pop)
+// For example: #pragma clang diagnostic warning "-Wno-sign-compare"
+#else // Ensure these macros do nothing for other compilers.
+# define BOOST_COMPUTE_CLANG_DIAG_OFF(x)
+# define BOOST_COMPUTE_CLANG_DIAG_ON(x)
+# define BOOST_COMPUTE_CLANG_DIAG_PRAGMA(x)
+#endif
+
+// MSVC
+#if defined(_MSC_VER)
+# define BOOST_COMPUTE_MSVC_DIAG_DO_PRAGMA(x) __pragma(x)
+# define BOOST_COMPUTE_MSVC_DIAG_PRAGMA(x) \
+ BOOST_COMPUTE_MSVC_DIAG_DO_PRAGMA(warning(x))
+# define BOOST_COMPUTE_MSVC_DIAG_OFF(x) BOOST_COMPUTE_MSVC_DIAG_PRAGMA(push) \
+ BOOST_COMPUTE_MSVC_DIAG_PRAGMA(disable: x)
+# define BOOST_COMPUTE_MSVC_DIAG_ON(x) BOOST_COMPUTE_MSVC_DIAG_PRAGMA(pop)
+#else // Ensure these macros do nothing for other compilers.
+# define BOOST_COMPUTE_MSVC_DIAG_OFF(x)
+# define BOOST_COMPUTE_MSVC_DIAG_ON(x)
+#endif
+
+#include <iostream>
+
+// include the proper opencl header for the system
+#if defined(__APPLE__)
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+// the opencl_test example displays the opencl platforms and devices found
+// on the system using the opencl api directly. if this test fails to compile
+// and/or run, there is a problem with the opencl implementation found on the
+// system. users should ensure this test runs successfuly before using any of
+// the boost.compute apis (which depend on a working opencl implementation).
+int main()
+{
+ // Suppress deprecated declarations warning
+ BOOST_COMPUTE_MSVC_DIAG_OFF(4996); // MSVC
+ BOOST_COMPUTE_GCC_DIAG_OFF(deprecated-declarations); // GCC
+ BOOST_COMPUTE_CLANG_DIAG_OFF(deprecated-declarations); // Clang
+
+ // query number of opencl platforms
+ cl_uint num_platforms = 0;
+ cl_int ret = clGetPlatformIDs(0, NULL, &num_platforms);
+ if(ret != CL_SUCCESS){
+ std::cerr << "failed to query platforms: " << ret << std::endl;
+ return -1;
+ }
+
+ // check that at least one platform was found
+ if(num_platforms == 0){
+ std::cerr << "found 0 platforms" << std::endl;
+ return 0;
+ }
+
+ // get platform ids
+ cl_platform_id *platforms = new cl_platform_id[num_platforms];
+ clGetPlatformIDs(num_platforms, platforms, NULL);
+
+ // iterate through each platform and query its devices
+ for(cl_uint i = 0; i < num_platforms; i++){
+ cl_platform_id platform = platforms[i];
+
+ // query number of opencl devices
+ cl_uint num_devices = 0;
+ ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+ if(ret != CL_SUCCESS){
+ std::cerr << "failed to lookup devices for platform " << i << std::endl;
+ continue;
+ }
+
+ // print number of devices found
+ std::cout << "platform " << i << " has " << num_devices << " devices:" << std::endl;
+
+ // get device ids for the platform
+ cl_device_id *devices = new cl_device_id[num_devices];
+ ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+ if(ret != CL_SUCCESS){
+ std::cerr << "failed to query platform devices" << std::endl;
+ delete[] devices;
+ continue;
+ }
+
+ // iterate through each device on the platform and print its name
+ for(cl_uint j = 0; j < num_devices; j++){
+ cl_device_id device = devices[j];
+
+ // get length of the device name string
+ size_t name_length = 0;
+ ret = clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &name_length);
+ if(ret != CL_SUCCESS){
+ std::cerr << "failed to query device name length for device " << j << std::endl;
+ continue;
+ }
+
+ // get the device name string
+ char *name = new char[name_length];
+ ret = clGetDeviceInfo(device, CL_DEVICE_NAME, name_length, name, NULL);
+ if(ret != CL_SUCCESS){
+ std::cerr << "failed to query device name string for device " << j << std::endl;
+ delete[] name;
+ continue;
+ }
+
+ // print out the device name
+ std::cout << " device: " << name << std::endl;
+
+ delete[] name;
+ }
+ delete[] devices;
+ }
+ delete[] platforms;
+
+ BOOST_COMPUTE_CLANG_DIAG_ON(deprecated-declarations); // Clang
+ BOOST_COMPUTE_GCC_DIAG_ON(deprecated-declarations); // GCC
+ BOOST_COMPUTE_MSVC_DIAG_OFF(4996); // MSVC
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/opencv_convolution.cpp b/src/boost/libs/compute/example/opencv_convolution.cpp
new file mode 100644
index 00000000..7ba53436
--- /dev/null
+++ b/src/boost/libs/compute/example/opencv_convolution.cpp
@@ -0,0 +1,265 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Mageswaran.D <mageswaran1989@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <string>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/interop/opencv/core.hpp>
+#include <boost/compute/interop/opencv/highgui.hpp>
+#include <boost/compute/utility/source.hpp>
+
+#include <boost/program_options.hpp>
+
+namespace compute = boost::compute;
+namespace po = boost::program_options;
+
+// Create convolution program
+const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE (
+ __kernel void convolution(__read_only image2d_t sourceImage,
+ __write_only image2d_t outputImage,
+ __constant float* filter,
+ int filterWidth)
+ {
+ const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+ CLK_ADDRESS_CLAMP_TO_EDGE |
+ CLK_FILTER_NEAREST;
+
+ // Store each work-item's unique row and column
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+
+ // Half the width of the filter is needed for indexing
+ // memory later
+ int halfWidth = (int)(filterWidth/2);
+
+ // All accesses to images return data as four-element vector
+ // (i.e., float4).
+ float4 sum = {0.0f, 0.0f, 0.0f, 0.0f};
+
+ // Iterator for the filter
+ int filterIdx = 0;
+
+ // Each work-item iterates around its local area based on the
+ // size of the filter
+ int2 coords; // Coordinates for accessing the image
+
+ // Iterate the filter rows
+ for(int i = -halfWidth; i <= halfWidth; i++)
+ {
+ coords.y = y + i;
+
+ // Iterate over the filter columns
+ for(int j = -halfWidth; j <= halfWidth; j++)
+ {
+ coords.x = x + j;
+
+ float4 pixel;
+
+ // Read a pixel from the image.
+ // Work on a channel
+ pixel = read_imagef(sourceImage, sampler, coords);
+ sum.x += pixel.x * filter[filterIdx++];
+ //sum.y += pixel.y * filter[filterIdx++];
+ //sum.z += pixel.z * filter[filterIdx++];
+ }
+ }
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ // Copy the data to the output image if the
+ // work-item is in bounds
+ if(y < get_image_height(sourceImage) &&
+ x < get_image_width(sourceImage))
+ {
+ coords.x = x;
+ coords.y = y;
+
+ //Same channel is copied in all three channels
+ //write_imagef(outputImage, coords,
+ // (float4)(sum.x,sum.x,sum.x,1.0f));
+
+ write_imagef(outputImage, coords, sum);
+ }
+ }
+);
+
+// This example shows how to read two images or use camera
+// with OpenCV, transfer the frames to the GPU,
+// and apply a convolution written in OpenCL
+int main(int argc, char *argv[])
+{
+ ///////////////////////////////////////////////////////////////////////////
+
+ // setup the command line arguments
+ po::options_description desc;
+ desc.add_options()
+ ("help", "show available options")
+ ("camera", po::value<int>()->default_value(-1),
+ "if not default camera, specify a camera id")
+ ("image", po::value<std::string>(), "path to image file");
+
+ // Parse the command lines
+ po::variables_map vm;
+ po::store(po::parse_command_line(argc, argv, desc), vm);
+ po::notify(vm);
+
+ //check the command line arguments
+ if(vm.count("help"))
+ {
+ std::cout << desc << std::endl;
+ return 0;
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+
+ //OpenCV variables
+ cv::Mat cv_mat;
+ cv::VideoCapture cap; //OpenCV camera handle.
+
+ //Filter Variables
+ float filter[9] = {
+ -1.0, 0.0, 1.0,
+ -2.0, 0.0, 2.0,
+ -1.0, 0.0, 1.0,
+ };
+
+ // The convolution filter is 3x3
+ int filterWidth = 3;
+
+ //OpenCL variables
+ // Get default device and setup context
+ compute::device gpu = compute::system::default_device();
+ compute::context context(gpu);
+ compute::command_queue queue(context, gpu);
+ compute::buffer dev_filter(context, sizeof(filter),
+ compute::memory_object::read_only |
+ compute::memory_object::copy_host_ptr,
+ filter);
+
+ compute::program filter_program =
+ compute::program::create_with_source(source, context);
+
+ try
+ {
+ filter_program.build();
+ }
+ catch(compute::opencl_error e)
+ {
+ std::cout<<"Build Error: "<<std::endl
+ <<filter_program.build_log();
+ return -1;
+ }
+
+ // create fliter kernel and set arguments
+ compute::kernel filter_kernel(filter_program, "convolution");
+
+ ///////////////////////////////////////////////////////////////////////////
+
+ //check for image paths
+ if(vm.count("image"))
+ {
+ // Read image with OpenCV
+ cv_mat = cv::imread(vm["image"].as<std::string>(),
+ CV_LOAD_IMAGE_COLOR);
+ if(!cv_mat.data){
+ std::cerr << "Failed to load image" << std::endl;
+ return -1;
+ }
+ }
+ else //by default use camera
+ {
+ //open camera
+ cap.open(vm["camera"].as<int>());
+ // read first frame
+ cap >> cv_mat;
+ if(!cv_mat.data){
+ std::cerr << "failed to capture frame" << std::endl;
+ return -1;
+ }
+ }
+
+ // Convert image to BGRA (OpenCL requires 16-byte aligned data)
+ cv::cvtColor(cv_mat, cv_mat, CV_BGR2BGRA);
+
+ // Transfer image/frame data to gpu
+ compute::image2d dev_input_image =
+ compute::opencv_create_image2d_with_mat(
+ cv_mat, compute::image2d::read_write, queue
+ );
+
+ // Create output image
+ // Be sure what will be your ouput image/frame size
+ compute::image2d dev_output_image(
+ context,
+ dev_input_image.width(),
+ dev_input_image.height(),
+ dev_input_image.format(),
+ compute::image2d::write_only
+ );
+
+ filter_kernel.set_arg(0, dev_input_image);
+ filter_kernel.set_arg(1, dev_output_image);
+ filter_kernel.set_arg(2, dev_filter);
+ filter_kernel.set_arg(3, filterWidth);
+
+ // run flip kernel
+ size_t origin[2] = { 0, 0 };
+ size_t region[2] = { dev_input_image.width(),
+ dev_input_image.height() };
+
+ ///////////////////////////////////////////////////////////////////////////
+
+ queue.enqueue_nd_range_kernel(filter_kernel, 2, origin, region, 0);
+
+ //check for image paths
+ if(vm.count("image"))
+ {
+ // show host image
+ cv::imshow("Original Image", cv_mat);
+
+ // show gpu image
+ compute::opencv_imshow("Convoluted Image", dev_output_image, queue);
+
+ // wait and return
+ cv::waitKey(0);
+ }
+ else
+ {
+ char key = '\0';
+ while(key != 27) //check for escape key
+ {
+ cap >> cv_mat;
+
+ // Convert image to BGRA (OpenCL requires 16-byte aligned data)
+ cv::cvtColor(cv_mat, cv_mat, CV_BGR2BGRA);
+
+ // Update the device image memory with current frame data
+ compute::opencv_copy_mat_to_image(cv_mat,
+ dev_input_image,queue);
+
+ // Run the kernel on the device
+ queue.enqueue_nd_range_kernel(filter_kernel, 2, origin, region, 0);
+
+ // Show host image
+ cv::imshow("Camera Frame", cv_mat);
+
+ // Show GPU image
+ compute::opencv_imshow("Convoluted Frame", dev_output_image, queue);
+
+ // wait
+ key = cv::waitKey(10);
+ }
+ }
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/opencv_flip.cpp b/src/boost/libs/compute/example/opencv_flip.cpp
new file mode 100644
index 00000000..8cc26ef2
--- /dev/null
+++ b/src/boost/libs/compute/example/opencv_flip.cpp
@@ -0,0 +1,101 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/interop/opencv/core.hpp>
+#include <boost/compute/interop/opencv/highgui.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+// this example shows how to read an image with OpenCV, transfer the
+// image to the GPU, and apply a simple flip filter written in OpenCL
+int main(int argc, char *argv[])
+{
+ // check command line
+ if(argc < 2){
+ std::cerr << "usage: " << argv[0] << " FILENAME" << std::endl;
+ return -1;
+ }
+
+ // read image with opencv
+ cv::Mat cv_image = cv::imread(argv[1], CV_LOAD_IMAGE_COLOR);
+ if(!cv_image.data){
+ std::cerr << "failed to load image" << std::endl;
+ return -1;
+ }
+
+ // get default device and setup context
+ compute::device gpu = compute::system::default_device();
+ compute::context context(gpu);
+ compute::command_queue queue(context, gpu);
+
+ // convert image to BGRA (OpenCL requires 16-byte aligned data)
+ cv::cvtColor(cv_image, cv_image, CV_BGR2BGRA);
+
+ // transfer image to gpu
+ compute::image2d input_image =
+ compute::opencv_create_image2d_with_mat(
+ cv_image, compute::image2d::read_write, queue
+ );
+
+ // create output image
+ compute::image2d output_image(
+ context,
+ input_image.width(),
+ input_image.height(),
+ input_image.format(),
+ compute::image2d::write_only
+ );
+
+ // create flip program
+ const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+ __kernel void flip_kernel(__read_only image2d_t input,
+ __write_only image2d_t output)
+ {
+ const sampler_t sampler = CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+ int height = get_image_height(input);
+ int2 input_coord = { get_global_id(0), get_global_id(1) };
+ int2 output_coord = { input_coord.x, height - input_coord.y - 1 };
+ float4 value = read_imagef(input, sampler, input_coord);
+ write_imagef(output, output_coord, value);
+ }
+ );
+
+ compute::program flip_program =
+ compute::program::create_with_source(source, context);
+ flip_program.build();
+
+ // create flip kernel and set arguments
+ compute::kernel flip_kernel(flip_program, "flip_kernel");
+ flip_kernel.set_arg(0, input_image);
+ flip_kernel.set_arg(1, output_image);
+
+ // run flip kernel
+ size_t origin[2] = { 0, 0 };
+ size_t region[2] = { input_image.width(), input_image.height() };
+ queue.enqueue_nd_range_kernel(flip_kernel, 2, origin, region, 0);
+
+ // show host image
+ cv::imshow("opencv image", cv_image);
+
+ // show gpu image
+ compute::opencv_imshow("filtered image", output_image, queue);
+
+ // wait and return
+ cv::waitKey(0);
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/opencv_histogram.cpp b/src/boost/libs/compute/example/opencv_histogram.cpp
new file mode 100644
index 00000000..e339030b
--- /dev/null
+++ b/src/boost/libs/compute/example/opencv_histogram.cpp
@@ -0,0 +1,228 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Mageswaran.D <mageswaran1989@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+//Code sample for calculating histogram using OpenCL and
+//displaying image histogram in OpenCV.
+
+#include <iostream>
+#include <string>
+
+#include <opencv2/imgproc/imgproc.hpp>
+#include <opencv2/highgui/highgui.hpp>
+
+#include <boost/compute/source.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/interop/opencv/core.hpp>
+#include <boost/compute/interop/opencv/highgui.hpp>
+#include <boost/program_options.hpp>
+
+namespace compute = boost::compute;
+namespace po = boost::program_options;
+
+// number of bins
+int histSize = 256;
+
+// Set the ranges ( for B,G,R) )
+// TryOut: consider the range in kernel calculation
+float range[] = { 0, 256 } ;
+const float* histRange = { range };
+
+// Create naive histogram program
+// Needs "cl_khr_local_int32_base_atomics" extension
+const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE (
+ __kernel void histogram(read_only image2d_t src_image,
+ __global int* b_hist,
+ __global int* g_hist,
+ __global int* r_hist)
+ {
+ sampler_t sampler =( CLK_NORMALIZED_COORDS_FALSE |
+ CLK_FILTER_NEAREST |
+ CLK_ADDRESS_CLAMP_TO_EDGE);
+
+ int image_width = get_image_width(src_image);
+ int image_height = get_image_height(src_image);
+
+ int2 coords = (int2)(get_global_id(0), get_global_id(1));
+ float4 pixel = read_imagef(src_image,sampler, coords);
+
+ //boundary condition
+ if ((coords.x < image_width) && (coords.y < image_height))
+ {
+ uchar indx_x, indx_y, indx_z;
+ indx_x = convert_uchar_sat(pixel.x * 255.0f);
+ indx_y = convert_uchar_sat(pixel.y * 255.0f);
+ indx_z = convert_uchar_sat(pixel.z * 255.0f);
+
+ atomic_inc(&b_hist[(uint)indx_z]);
+ atomic_inc(&g_hist[(uint)indx_y]);
+ atomic_inc(&r_hist[(uint)indx_x]);
+ }
+ }
+);
+
+inline void showHistogramWindow(cv::Mat &b_hist, cv::Mat &g_hist, cv::Mat &r_hist,
+ std::string window_name)
+{
+ // Draw the histograms for B, G and R
+ int hist_w = 1024;
+ int hist_h = 768;
+ int bin_w = cvRound((double)hist_w/histSize);
+
+ cv::Mat histImage(hist_h, hist_w, CV_8UC3, cv::Scalar(0,0,0));
+
+ // Normalize the result to [ 0, histImage.rows ]
+ cv::normalize(b_hist, b_hist, 0, histImage.rows, cv::NORM_MINMAX, -1, cv::Mat());
+ cv::normalize(g_hist, g_hist, 0, histImage.rows, cv::NORM_MINMAX, -1, cv::Mat());
+ cv::normalize(r_hist, r_hist, 0, histImage.rows, cv::NORM_MINMAX, -1, cv::Mat());
+
+ // Draw for each channel
+ for (int i = 1; i < histSize; i++ )
+ {
+ cv::line(histImage,
+ cv::Point(bin_w*(i-1), hist_h - cvRound(b_hist.at<float>(i-1))),
+ cv::Point(bin_w*(i), hist_h - cvRound(b_hist.at<float>(i))),
+ cv::Scalar(255, 0, 0),
+ 2,
+ 8,
+ 0);
+
+ cv::line(histImage,
+ cv::Point(bin_w*(i-1), hist_h - cvRound(g_hist.at<float>(i-1))),
+ cv::Point(bin_w*(i), hist_h - cvRound(g_hist.at<float>(i))),
+ cv::Scalar(0, 255, 0),
+ 2,
+ 8,
+ 0);
+
+ cv::line(histImage,
+ cv::Point( bin_w*(i-1), hist_h - cvRound(r_hist.at<float>(i-1))),
+ cv::Point( bin_w*(i), hist_h - cvRound(r_hist.at<float>(i)) ),
+ cv::Scalar( 0, 0, 255),
+ 2,
+ 8,
+ 0);
+ }
+
+ // Display
+ cv::namedWindow(window_name, CV_WINDOW_AUTOSIZE );
+ cv::imshow(window_name, histImage );
+}
+
+//Get the device context
+//Create GPU array/vector
+//Copy the image & set up the kernel
+//Execute the kernel
+//Copy GPU data back to CPU cv::Mat data pointer
+//OpenCV conversion for convienient display
+void calculateHistogramUsingCL(cv::Mat src, compute::command_queue &queue)
+{
+ compute::context context = queue.get_context();
+
+ // Convert image to BGRA (OpenCL requires 16-byte aligned data)
+ cv::cvtColor(src, src, CV_BGR2BGRA);
+
+ //3 channels & 256 bins : alpha channel is ignored
+ compute::vector<int> gpu_b_hist(histSize, context);
+ compute::vector<int> gpu_g_hist(histSize, context);
+ compute::vector<int> gpu_r_hist(histSize, context);
+
+ // Transfer image to gpu
+ compute::image2d gpu_src =
+ compute::opencv_create_image2d_with_mat(
+ src, compute::image2d::read_only,
+ queue
+ );
+
+ compute::program histogram_program =
+ compute::program::create_with_source(source, context);
+ histogram_program.build();
+
+ // create histogram kernel and set arguments
+ compute::kernel histogram_kernel(histogram_program, "histogram");
+ histogram_kernel.set_arg(0, gpu_src);
+ histogram_kernel.set_arg(1, gpu_b_hist.get_buffer());
+ histogram_kernel.set_arg(2, gpu_g_hist.get_buffer());
+ histogram_kernel.set_arg(3, gpu_r_hist.get_buffer());
+
+ // run histogram kernel
+ // each kernel thread updating red, green & blue bins
+ size_t origin[2] = { 0, 0 };
+ size_t region[2] = { gpu_src.width(),
+ gpu_src.height() };
+
+ queue.enqueue_nd_range_kernel(histogram_kernel, 2, origin, region, 0);
+
+ //Make sure kernel get executed and data copied back
+ queue.finish();
+
+ //create Mat and copy GPU bins to CPU memory
+ cv::Mat b_hist(256, 1, CV_32SC1);
+ compute::copy(gpu_b_hist.begin(), gpu_b_hist.end(), b_hist.data, queue);
+ cv::Mat g_hist(256, 1, CV_32SC1);
+ compute::copy(gpu_g_hist.begin(), gpu_g_hist.end(), g_hist.data, queue);
+ cv::Mat r_hist(256, 1, CV_32SC1);
+ compute::copy(gpu_r_hist.begin(), gpu_r_hist.end(), r_hist.data, queue);
+
+ b_hist.convertTo(b_hist, CV_32FC1); //converted for displaying
+ g_hist.convertTo(g_hist, CV_32FC1);
+ r_hist.convertTo(r_hist, CV_32FC1);
+
+ showHistogramWindow(b_hist, g_hist, r_hist, "Histogram");
+}
+
+int main( int argc, char** argv )
+{
+ // Get default device and setup context
+ compute::device gpu = compute::system::default_device();
+ compute::context context(gpu);
+ compute::command_queue queue(context, gpu);
+
+ cv::Mat src;
+
+ // setup the command line arguments
+ po::options_description desc;
+ desc.add_options()
+ ("help", "show available options")
+ ("image", po::value<std::string>(), "path to image file");
+
+ // Parse the command lines
+ po::variables_map vm;
+ po::store(po::parse_command_line(argc, argv, desc), vm);
+ po::notify(vm);
+
+ //check the command line arguments
+ if(vm.count("help"))
+ {
+ std::cout << desc << std::endl;
+ return 0;
+ }
+
+ //check for image paths
+ if(vm.count("image"))
+ {
+ // Read image with OpenCV
+ src = cv::imread(vm["image"].as<std::string>(),
+ CV_LOAD_IMAGE_COLOR);
+ if(!src.data){
+ std::cerr << "Failed to load image" << std::endl;
+ return -1;
+ }
+ calculateHistogramUsingCL(src, queue);
+ cv::imshow("Image", src);
+ cv::waitKey(0);
+ }
+ else
+ {
+ std::cout << desc << std::endl;
+ return 0;
+ }
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/opencv_optical_flow.cpp b/src/boost/libs/compute/example/opencv_optical_flow.cpp
new file mode 100644
index 00000000..87f330ae
--- /dev/null
+++ b/src/boost/libs/compute/example/opencv_optical_flow.cpp
@@ -0,0 +1,289 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Mageswaran.D <mageswaran1989@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <string>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/interop/opencv/core.hpp>
+#include <boost/compute/interop/opencv/highgui.hpp>
+#include <boost/compute/utility/source.hpp>
+
+#include <boost/program_options.hpp>
+
+namespace compute = boost::compute;
+namespace po = boost::program_options;
+
+// Create naive optical flow program
+const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE (
+ const sampler_t sampler = CLK_ADDRESS_CLAMP_TO_EDGE;
+
+ __kernel void optical_flow (
+ read_only
+ image2d_t current_image,
+ read_only image2d_t previous_image,
+ write_only image2d_t optical_flow,
+ const float scale,
+ const float offset,
+ const float lambda,
+ const float threshold )
+ {
+ int2 coords = (int2)(get_global_id(0), get_global_id(1));
+ float4 current_pixel = read_imagef(current_image,
+ sampler,
+ coords);
+ float4 previous_pixel = read_imagef(previous_image,
+ sampler,
+ coords);
+ int2 x1 = (int2)(offset, 0.f);
+ int2 y1 = (int2)(0.f, offset);
+
+ //get the difference
+ float4 curdif = previous_pixel - current_pixel;
+
+ //calculate the gradient
+ //Image 2 first
+ float4 gradx = read_imagef(previous_image,
+ sampler,
+ coords+x1) -
+ read_imagef(previous_image,
+ sampler,
+ coords-x1);
+ //Image 1
+ gradx += read_imagef(current_image,
+ sampler,
+ coords+x1) -
+ read_imagef(current_image,
+ sampler,
+ coords-x1);
+ //Image 2 first
+ float4 grady = read_imagef(previous_image,
+ sampler,
+ coords+y1) -
+ read_imagef(previous_image,
+ sampler,
+ coords-y1);
+ //Image 1
+ grady += read_imagef(current_image,
+ sampler,
+ coords+y1) -
+ read_imagef(current_image,
+ sampler,
+ coords-y1);
+
+ float4 sqr = (gradx*gradx) + (grady*grady) +
+ (float4)(lambda,lambda, lambda, lambda);
+ float4 gradmag = sqrt(sqr);
+
+ ///////////////////////////////////////////////////
+ float4 vx = curdif * (gradx / gradmag);
+ float vxd = vx.x;//assumes greyscale
+
+ //format output for flowrepos, out(-x,+x,-y,+y)
+ float2 xout = (float2)(fmax(vxd,0.f),fabs(fmin(vxd,0.f)));
+ xout *= scale;
+ ///////////////////////////////////////////////////
+ float4 vy = curdif*(grady/gradmag);
+ float vyd = vy.x;//assumes greyscale
+
+ //format output for flowrepos, out(-x,+x,-y,+y)
+ float2 yout = (float2)(fmax(vyd,0.f),fabs(fmin(vyd,0.f)));
+ yout *= scale;
+ ///////////////////////////////////////////////////
+
+ float4 out = (float4)(xout, yout);
+ float cond = (float)isgreaterequal(length(out), threshold);
+ out *= cond;
+
+ write_imagef(optical_flow, coords, out);
+ }
+);
+
+// This example shows how to read two images or use camera
+// with OpenCV, transfer the frames to the GPU,
+// and apply a naive optical flow algorithm
+// written in OpenCL
+int main(int argc, char *argv[])
+{
+ // setup the command line arguments
+ po::options_description desc;
+ desc.add_options()
+ ("help", "show available options")
+ ("camera", po::value<int>()->default_value(-1),
+ "if not default camera, specify a camera id")
+ ("image1", po::value<std::string>(), "path to image file 1")
+ ("image2", po::value<std::string>(), "path to image file 2");
+
+ // Parse the command lines
+ po::variables_map vm;
+ po::store(po::parse_command_line(argc, argv, desc), vm);
+ po::notify(vm);
+
+ //check the command line arguments
+ if(vm.count("help"))
+ {
+ std::cout << desc << std::endl;
+ return 0;
+ }
+
+ //OpenCV variables
+ cv::Mat previous_cv_image;
+ cv::Mat current_cv_image;
+ cv::VideoCapture cap; //OpenCV camera handle
+
+ //check for image paths
+ if(vm.count("image1") && vm.count("image2"))
+ {
+ // Read image 1 with OpenCV
+ previous_cv_image = cv::imread(vm["image1"].as<std::string>(),
+ CV_LOAD_IMAGE_COLOR);
+ if(!previous_cv_image.data){
+ std::cerr << "Failed to load image" << std::endl;
+ return -1;
+ }
+
+ // Read image 2 with opencv
+ current_cv_image = cv::imread(vm["image2"].as<std::string>(),
+ CV_LOAD_IMAGE_COLOR);
+ if(!current_cv_image.data){
+ std::cerr << "Failed to load image" << std::endl;
+ return -1;
+ }
+ }
+ else //by default use camera
+ {
+ //open camera
+ cap.open(vm["camera"].as<int>());
+ // read first frame
+ cap >> previous_cv_image;
+ if(!previous_cv_image.data){
+ std::cerr << "failed to capture frame" << std::endl;
+ return -1;
+ }
+
+ // read second frame
+ cap >> current_cv_image;
+ if(!current_cv_image.data){
+ std::cerr << "failed to capture frame" << std::endl;
+ return -1;
+ }
+
+ }
+
+ // Get default device and setup context
+ compute::device gpu = compute::system::default_device();
+ compute::context context(gpu);
+ compute::command_queue queue(context, gpu);
+
+ // Convert image to BGRA (OpenCL requires 16-byte aligned data)
+ cv::cvtColor(previous_cv_image, previous_cv_image, CV_BGR2BGRA);
+ cv::cvtColor(current_cv_image, current_cv_image, CV_BGR2BGRA);
+
+ // Transfer image to gpu
+ compute::image2d dev_previous_image =
+ compute::opencv_create_image2d_with_mat(
+ previous_cv_image, compute::image2d::read_write, queue
+ );
+ // Transfer image to gpu
+ compute::image2d dev_current_image =
+ compute::opencv_create_image2d_with_mat(
+ current_cv_image, compute::image2d::read_write, queue
+ );
+
+ // Create output image
+ compute::image2d dev_output_image(
+ context,
+ dev_previous_image.width(),
+ dev_previous_image.height(),
+ dev_previous_image.format(),
+ compute::image2d::write_only
+ );
+
+ compute::program optical_program =
+ compute::program::create_with_source(source, context);
+ optical_program.build();
+
+ // create flip kernel and set arguments
+ compute::kernel optical_kernel(optical_program, "optical_flow");
+ float scale = 10;
+ float offset = 1;
+ float lambda = 0.0025;
+ float threshold = 1.0;
+
+ optical_kernel.set_arg(0, dev_previous_image);
+ optical_kernel.set_arg(1, dev_current_image);
+ optical_kernel.set_arg(2, dev_output_image);
+ optical_kernel.set_arg(3, scale);
+ optical_kernel.set_arg(4, offset);
+ optical_kernel.set_arg(5, lambda);
+ optical_kernel.set_arg(6, threshold);
+
+ // run flip kernel
+ size_t origin[2] = { 0, 0 };
+ size_t region[2] = { dev_previous_image.width(),
+ dev_previous_image.height() };
+ queue.enqueue_nd_range_kernel(optical_kernel, 2, origin, region, 0);
+
+ //check for image paths
+ if(vm.count("image1") && vm.count("image2"))
+ {
+ // show host image
+ cv::imshow("Previous Frame", previous_cv_image);
+ cv::imshow("Current Frame", current_cv_image);
+
+ // show gpu image
+ compute::opencv_imshow("filtered image", dev_output_image, queue);
+
+ // wait and return
+ cv::waitKey(0);
+ }
+ else
+ {
+ char key = '\0';
+ while(key != 27) //check for escape key
+ {
+ cap >> current_cv_image;
+
+ // Convert image to BGRA (OpenCL requires 16-byte aligned data)
+ cv::cvtColor(current_cv_image, current_cv_image, CV_BGR2BGRA);
+
+ // Update the device image memory with current frame data
+ compute::opencv_copy_mat_to_image(previous_cv_image,
+ dev_previous_image,
+ queue);
+ compute::opencv_copy_mat_to_image(current_cv_image,
+ dev_current_image,
+ queue);
+
+ // Run the kernel on the device
+ queue.enqueue_nd_range_kernel(optical_kernel, 2, origin, region, 0);
+
+ // Show host image
+ cv::imshow("Previous Frame", previous_cv_image);
+ cv::imshow("Current Frame", current_cv_image);
+
+ // Show GPU image
+ compute::opencv_imshow("filtered image", dev_output_image, queue);
+
+ // Copy current frame container to previous frame container
+ current_cv_image.copyTo(previous_cv_image);
+
+ // wait
+ key = cv::waitKey(10);
+ }
+
+ }
+ return 0;
+}
+
diff --git a/src/boost/libs/compute/example/opencv_sobel_filter.cpp b/src/boost/libs/compute/example/opencv_sobel_filter.cpp
new file mode 100644
index 00000000..4fcfa206
--- /dev/null
+++ b/src/boost/libs/compute/example/opencv_sobel_filter.cpp
@@ -0,0 +1,254 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Mageswaran.D <mageswaran1989@gmail.com>
+//
+// Book Refered: OpenCL Programming Guide
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+//---------------------------------------------------------------------------//
+// About Sobel Filter:
+// * Edge Filter - distinguishes the differrent color region
+// * Finds the gradient in x and y-axes
+// * Three step process
+// -> Find x-axis gradient with kernel/matrix
+// Gx = [-1 0 +1]
+// [-2 0 +2]
+// [-1 0 +1]
+// -> Find y-axis gradient with kernel/matrix
+// Gy = [-1 -2 -1]
+// [ 0 0 0]
+// [+1 +2 +1]
+// * Gradient magnitude G = sqrt(Gx^2 + Gy^2)
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <string>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/interop/opencv/core.hpp>
+#include <boost/compute/interop/opencv/highgui.hpp>
+#include <boost/compute/utility/source.hpp>
+
+#include <boost/program_options.hpp>
+
+namespace compute = boost::compute;
+namespace po = boost::program_options;
+
+// Create sobel filter program
+const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE (
+ //For out of boundary pixels, edge pixel
+ // value is returned
+ const sampler_t sampler = CLK_ADDRESS_CLAMP_TO_EDGE |
+ CLK_FILTER_NEAREST;
+ kernel void sobel_rgb(read_only image2d_t src, write_only image2d_t dst)
+ {
+ int x = (int)get_global_id(0);
+ int y = (int)get_global_id(1);
+
+ if (x >= get_image_width(src) || y >= get_image_height(src))
+ return;
+
+ // [(x-1, y+1), (x, y+1), (x+1, y+1)]
+ // [(x-1, y ), (x, y ), (x+1, y )]
+ // [(x-1, y-1), (x, y-1), (x+1, y-1)]
+
+ // [p02, p12, p22]
+ // [p01, pixel, p21]
+ // [p00, p10, p20]
+
+ //Basically finding influence of neighbour pixels on current pixel
+ float4 p00 = read_imagef(src, sampler, (int2)(x - 1, y - 1));
+ float4 p10 = read_imagef(src, sampler, (int2)(x, y - 1));
+ float4 p20 = read_imagef(src, sampler, (int2)(x + 1, y - 1));
+
+ float4 p01 = read_imagef(src, sampler, (int2)(x - 1, y));
+ //pixel that we are working on
+ float4 p21 = read_imagef(src, sampler, (int2)(x + 1, y));
+
+ float4 p02 = read_imagef(src, sampler, (int2)(x - 1, y + 1));
+ float4 p12 = read_imagef(src, sampler, (int2)(x, y + 1));
+ float4 p22 = read_imagef(src, sampler, (int2)(x + 1, y + 1));
+
+ //Find Gx = kernel + 3x3 around current pixel
+ // Gx = [-1 0 +1] [p02, p12, p22]
+ // [-2 0 +2] + [p01, pixel, p21]
+ // [-1 0 +1] [p00, p10, p20]
+ float3 gx = -p00.xyz + p20.xyz +
+ 2.0f * (p21.xyz - p01.xyz)
+ -p02.xyz + p22.xyz;
+
+ //Find Gy = kernel + 3x3 around current pixel
+ // Gy = [-1 -2 -1] [p02, p12, p22]
+ // [ 0 0 0] + [p01, pixel, p21]
+ // [+1 +2 +1] [p00, p10, p20]
+ float3 gy = p00.xyz + p20.xyz +
+ 2.0f * (- p12.xyz + p10.xyz) -
+ p02.xyz - p22.xyz;
+ //Find G
+ float3 g = native_sqrt(gx * gx + gy * gy);
+
+ // we could also approximate this as g = fabs(gx) + fabs(gy)
+ write_imagef(dst, (int2)(x, y), (float4)(g.x, g.y, g.z, 1.0f));
+ }
+);
+
+// This example shows how to apply sobel filter on images or on camera frames
+// with OpenCV, transfer the frames to the GPU, and apply a sobel filter
+// written in OpenCL
+int main(int argc, char *argv[])
+{
+ ///////////////////////////////////////////////////////////////////////////
+
+ // setup the command line arguments
+ po::options_description desc;
+ desc.add_options()
+ ("help", "show available options")
+ ("camera", po::value<int>()->default_value(-1),
+ "if not default camera, specify a camera id")
+ ("image", po::value<std::string>(), "path to image file");
+
+ // Parse the command lines
+ po::variables_map vm;
+ po::store(po::parse_command_line(argc, argv, desc), vm);
+ po::notify(vm);
+
+ //check the command line arguments
+ if(vm.count("help"))
+ {
+ std::cout << desc << std::endl;
+ return 0;
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+
+ //OpenCV variables
+ cv::Mat cv_mat;
+ cv::VideoCapture cap; //OpenCV camera handle.
+
+ //OpenCL variables
+ // Get default device and setup context
+ compute::device gpu = compute::system::default_device();
+ compute::context context(gpu);
+ compute::command_queue queue(context, gpu);
+ compute::program filter_program =
+ compute::program::create_with_source(source, context);
+
+ try
+ {
+ filter_program.build();
+ }
+ catch(compute::opencl_error e)
+ {
+ std::cout<<"Build Error: "<<std::endl
+ <<filter_program.build_log();
+ }
+
+ // create fliter kernel and set arguments
+ compute::kernel filter_kernel(filter_program, "sobel_rgb");
+
+ ///////////////////////////////////////////////////////////////////////////
+
+ //check for image paths
+ if(vm.count("image"))
+ {
+ // Read image with OpenCV
+ cv_mat = cv::imread(vm["image"].as<std::string>(),
+ CV_LOAD_IMAGE_COLOR);
+ if(!cv_mat.data){
+ std::cerr << "Failed to load image" << std::endl;
+ return -1;
+ }
+ }
+ else //by default use camera
+ {
+ //open camera
+ cap.open(vm["camera"].as<int>());
+ // read first frame
+ cap >> cv_mat;
+ if(!cv_mat.data){
+ std::cerr << "failed to capture frame" << std::endl;
+ return -1;
+ }
+ }
+
+ // Convert image to BGRA (OpenCL requires 16-byte aligned data)
+ cv::cvtColor(cv_mat, cv_mat, CV_BGR2BGRA);
+
+ // Transfer image/frame data to gpu
+ compute::image2d dev_input_image =
+ compute::opencv_create_image2d_with_mat(
+ cv_mat, compute::image2d::read_write, queue
+ );
+
+ // Create output image
+ // Be sure what will be your ouput image/frame size
+ compute::image2d dev_output_image(
+ context,
+ dev_input_image.width(),
+ dev_input_image.height(),
+ dev_input_image.format(),
+ compute::image2d::write_only
+ );
+
+ filter_kernel.set_arg(0, dev_input_image);
+ filter_kernel.set_arg(1, dev_output_image);
+
+
+ // run flip kernel
+ size_t origin[2] = { 0, 0 };
+ size_t region[2] = { dev_input_image.width(),
+ dev_input_image.height() };
+
+ ///////////////////////////////////////////////////////////////////////////
+
+ queue.enqueue_nd_range_kernel(filter_kernel, 2, origin, region, 0);
+
+ //check for image paths
+ if(vm.count("image"))
+ {
+ // show host image
+ cv::imshow("Original Image", cv_mat);
+
+ // show gpu image
+ compute::opencv_imshow("Filtered Image", dev_output_image, queue);
+
+ // wait and return
+ cv::waitKey(0);
+ }
+ else
+ {
+ char key = '\0';
+ while(key != 27) //check for escape key
+ {
+ cap >> cv_mat;
+
+ // Convert image to BGRA (OpenCL requires 16-byte aligned data)
+ cv::cvtColor(cv_mat, cv_mat, CV_BGR2BGRA);
+
+ // Update the device image memory with current frame data
+ compute::opencv_copy_mat_to_image(cv_mat,
+ dev_input_image,queue);
+
+ // Run the kernel on the device
+ queue.enqueue_nd_range_kernel(filter_kernel, 2, origin, region, 0);
+
+ // Show host image
+ cv::imshow("Camera Frame", cv_mat);
+
+ // Show GPU image
+ compute::opencv_imshow("Filtered RGB Frame", dev_output_image, queue);
+
+ // wait
+ key = cv::waitKey(10);
+ }
+ }
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/opengl_sphere.cpp b/src/boost/libs/compute/example/opengl_sphere.cpp
new file mode 100644
index 00000000..38999f34
--- /dev/null
+++ b/src/boost/libs/compute/example/opengl_sphere.cpp
@@ -0,0 +1,242 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+
+#include <GL/gl.h>
+
+#include <vtkActor.h>
+#include <vtkCamera.h>
+#include <vtkgl.h>
+#include <vtkInteractorStyleSwitch.h>
+#include <vtkMapper.h>
+#include <vtkObjectFactory.h>
+#include <vtkOpenGLExtensionManager.h>
+#include <vtkOpenGLRenderWindow.h>
+#include <vtkProperty.h>
+#include <vtkRenderer.h>
+#include <vtkRenderWindow.h>
+#include <vtkRenderWindowInteractor.h>
+#include <vtkSmartPointer.h>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/iota.hpp>
+#include <boost/compute/interop/opengl.hpp>
+#include <boost/compute/interop/vtk.hpp>
+#include <boost/compute/utility/dim.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+// tesselates a sphere with radius, phi_slices, and theta_slices. returns
+// a shared opencl/opengl buffer containing the vertex data.
+compute::opengl_buffer tesselate_sphere(float radius,
+ size_t phi_slices,
+ size_t theta_slices,
+ compute::command_queue &queue)
+{
+ using compute::dim;
+
+ const compute::context &context = queue.get_context();
+
+ const size_t vertex_count = phi_slices * theta_slices;
+
+ // create opengl buffer
+ GLuint vbo;
+ vtkgl::GenBuffersARB(1, &vbo);
+ vtkgl::BindBufferARB(vtkgl::ARRAY_BUFFER, vbo);
+ vtkgl::BufferDataARB(vtkgl::ARRAY_BUFFER,
+ sizeof(float) * 4 * vertex_count,
+ NULL,
+ vtkgl::STREAM_DRAW);
+ vtkgl::BindBufferARB(vtkgl::ARRAY_BUFFER, 0);
+
+ // create shared opengl/opencl buffer
+ compute::opengl_buffer vertex_buffer(context, vbo);
+
+ // tesselate_sphere kernel source
+ const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+ __kernel void tesselate_sphere(float radius,
+ uint phi_slices,
+ uint theta_slices,
+ __global float4 *vertex_buffer)
+ {
+ const uint phi_i = get_global_id(0);
+ const uint theta_i = get_global_id(1);
+
+ const float phi = phi_i * 2.f * M_PI_F / phi_slices;
+ const float theta = theta_i * 2.f * M_PI_F / theta_slices;
+
+ float4 v;
+ v.x = radius * cos(theta) * cos(phi);
+ v.y = radius * cos(theta) * sin(phi);
+ v.z = radius * sin(theta);
+ v.w = 1.f;
+
+ vertex_buffer[phi_i*phi_slices+theta_i] = v;
+ }
+ );
+
+ // build tesselate_sphere program
+ compute::program program =
+ compute::program::create_with_source(source, context);
+ program.build();
+
+ // setup tesselate_sphere kernel
+ compute::kernel kernel(program, "tesselate_sphere");
+ kernel.set_arg<compute::float_>(0, radius);
+ kernel.set_arg<compute::uint_>(1, phi_slices);
+ kernel.set_arg<compute::uint_>(2, theta_slices);
+ kernel.set_arg(3, vertex_buffer);
+
+ // acqurire buffer so that it is accessible to OpenCL
+ compute::opengl_enqueue_acquire_buffer(vertex_buffer, queue);
+
+ // execute tesselate_sphere kernel
+ queue.enqueue_nd_range_kernel(
+ kernel, dim(0, 0), dim(phi_slices, theta_slices), dim(1, 1)
+ );
+
+ // release buffer so that it is accessible to OpenGL
+ compute::opengl_enqueue_release_buffer(vertex_buffer, queue);
+
+ return vertex_buffer;
+}
+
+// simple vtkMapper subclass to render the tesselated sphere on the gpu.
+class gpu_sphere_mapper : public vtkMapper
+{
+public:
+ vtkTypeMacro(gpu_sphere_mapper, vtkMapper)
+
+ static gpu_sphere_mapper* New()
+ {
+ return new gpu_sphere_mapper;
+ }
+
+ void Render(vtkRenderer *renderer, vtkActor *actor)
+ {
+ if(!m_initialized){
+ Initialize(renderer, actor);
+ m_initialized = true;
+ }
+
+ if(!m_tesselated){
+ m_vertex_count = m_phi_slices * m_theta_slices;
+
+ // tesselate sphere
+ m_vertex_buffer = tesselate_sphere(
+ m_radius, m_phi_slices, m_theta_slices, m_command_queue
+ );
+
+ // ensure tesselation is finished (seems to be required on AMD)
+ m_command_queue.finish();
+
+ // set tesselated flag to true
+ m_tesselated = true;
+ }
+
+ // draw sphere
+ glEnableClientState(GL_VERTEX_ARRAY);
+ vtkgl::BindBufferARB(vtkgl::ARRAY_BUFFER, m_vertex_buffer.get_opengl_object());
+ glVertexPointer(4, GL_FLOAT, sizeof(float)*4, 0);
+ glDrawArrays(GL_POINTS, 0, m_vertex_count);
+ }
+
+ void Initialize(vtkRenderer *renderer, vtkActor *actor)
+ {
+ // initialize opengl extensions
+ vtkOpenGLExtensionManager *extensions =
+ static_cast<vtkOpenGLRenderWindow *>(renderer->GetRenderWindow())
+ ->GetExtensionManager();
+ extensions->LoadExtension("GL_ARB_vertex_buffer_object");
+
+ // initialize opencl/opengl shared context
+ m_context = compute::opengl_create_shared_context();
+ compute::device device = m_context.get_device();
+ std::cout << "device: " << device.name() << std::endl;
+
+ // create command queue for the gpu device
+ m_command_queue = compute::command_queue(m_context, device);
+ }
+
+ double* GetBounds()
+ {
+ static double bounds[6];
+ bounds[0] = -m_radius; bounds[1] = m_radius;
+ bounds[2] = -m_radius; bounds[3] = m_radius;
+ bounds[4] = -m_radius; bounds[5] = m_radius;
+ return bounds;
+ }
+
+protected:
+ gpu_sphere_mapper()
+ {
+ m_radius = 5.0f;
+ m_phi_slices = 100;
+ m_theta_slices = 100;
+ m_initialized = false;
+ m_tesselated = false;
+ }
+
+private:
+ float m_radius;
+ int m_phi_slices;
+ int m_theta_slices;
+ int m_vertex_count;
+ bool m_initialized;
+ bool m_tesselated;
+ compute::context m_context;
+ compute::command_queue m_command_queue;
+ compute::opengl_buffer m_vertex_buffer;
+};
+
+int main(int argc, char *argv[])
+{
+ // create gpu sphere mapper
+ vtkSmartPointer<gpu_sphere_mapper> mapper =
+ vtkSmartPointer<gpu_sphere_mapper>::New();
+
+ // create actor for gpu sphere mapper
+ vtkSmartPointer<vtkActor> actor =
+ vtkSmartPointer<vtkActor>::New();
+ actor->GetProperty()->LightingOff();
+ actor->GetProperty()->SetInterpolationToFlat();
+ actor->SetMapper(mapper);
+
+ // create render window
+ vtkSmartPointer<vtkRenderer> renderer =
+ vtkSmartPointer<vtkRenderer>::New();
+ renderer->SetBackground(.1, .2, .31);
+ vtkSmartPointer<vtkRenderWindow> renderWindow =
+ vtkSmartPointer<vtkRenderWindow>::New();
+ renderWindow->SetSize(800, 600);
+ renderWindow->AddRenderer(renderer);
+ vtkSmartPointer<vtkRenderWindowInteractor> renderWindowInteractor =
+ vtkSmartPointer<vtkRenderWindowInteractor>::New();
+ vtkInteractorStyleSwitch *interactorStyle =
+ vtkInteractorStyleSwitch::SafeDownCast(
+ renderWindowInteractor->GetInteractorStyle()
+ );
+ interactorStyle->SetCurrentStyleToTrackballCamera();
+ renderWindowInteractor->SetRenderWindow(renderWindow);
+ renderer->AddActor(actor);
+
+ // render
+ renderer->ResetCamera();
+ vtkCamera *camera = renderer->GetActiveCamera();
+ camera->Elevation(-90.0);
+ renderWindowInteractor->Initialize();
+ renderWindow->Render();
+ renderWindowInteractor->Start();
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/point_centroid.cpp b/src/boost/libs/compute/example/point_centroid.cpp
new file mode 100644
index 00000000..e5691f7c
--- /dev/null
+++ b/src/boost/libs/compute/example/point_centroid.cpp
@@ -0,0 +1,68 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+//[point_centroid_example
+
+#include <iostream>
+
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/accumulate.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/types/fundamental.hpp>
+
+namespace compute = boost::compute;
+
+// the point centroid example calculates and displays the
+// centroid of a set of 3D points stored as float4's
+int main()
+{
+ using compute::float4_;
+
+ // get default device and setup context
+ compute::device device = compute::system::default_device();
+ compute::context context(device);
+ compute::command_queue queue(context, device);
+
+ // point coordinates
+ float points[] = { 1.0f, 2.0f, 3.0f, 0.0f,
+ -2.0f, -3.0f, 4.0f, 0.0f,
+ 1.0f, -2.0f, 2.5f, 0.0f,
+ -7.0f, -3.0f, -2.0f, 0.0f,
+ 3.0f, 4.0f, -5.0f, 0.0f };
+
+ // create vector for five points
+ compute::vector<float4_> vector(5, context);
+
+ // copy point data to the device
+ compute::copy(
+ reinterpret_cast<float4_ *>(points),
+ reinterpret_cast<float4_ *>(points) + 5,
+ vector.begin(),
+ queue
+ );
+
+ // calculate sum
+ float4_ sum = compute::accumulate(
+ vector.begin(), vector.end(), float4_(0, 0, 0, 0), queue
+ );
+
+ // calculate centroid
+ float4_ centroid;
+ for(size_t i = 0; i < 3; i++){
+ centroid[i] = sum[i] / 5.0f;
+ }
+
+ // print centroid
+ std::cout << "centroid: " << centroid << std::endl;
+
+ return 0;
+}
+
+//]
diff --git a/src/boost/libs/compute/example/price_cross.cpp b/src/boost/libs/compute/example/price_cross.cpp
new file mode 100644
index 00000000..95196494
--- /dev/null
+++ b/src/boost/libs/compute/example/price_cross.cpp
@@ -0,0 +1,87 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/copy_n.hpp>
+#include <boost/compute/algorithm/find_if.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/iterator/zip_iterator.hpp>
+
+namespace compute = boost::compute;
+
+// this example shows how to use the find_if() algorithm to detect the
+// point at which two vectors of prices (such as stock prices) cross.
+int main()
+{
+ // get default device and setup context
+ compute::device gpu = compute::system::default_device();
+ compute::context context(gpu);
+ compute::command_queue queue(context, gpu);
+
+ // prices #1 (from 10.0 to 11.0)
+ std::vector<float> prices1;
+ for(float i = 10.0; i <= 11.0; i += 0.1){
+ prices1.push_back(i);
+ }
+
+ // prices #2 (from 11.0 to 10.0)
+ std::vector<float> prices2;
+ for(float i = 11.0; i >= 10.0; i -= 0.1){
+ prices2.push_back(i);
+ }
+
+ // create gpu vectors
+ compute::vector<float> gpu_prices1(prices1.size(), context);
+ compute::vector<float> gpu_prices2(prices2.size(), context);
+
+ // copy prices to gpu
+ compute::copy(prices1.begin(), prices1.end(), gpu_prices1.begin(), queue);
+ compute::copy(prices2.begin(), prices2.end(), gpu_prices2.begin(), queue);
+
+ // function returning true if the second price is less than the first price
+ BOOST_COMPUTE_FUNCTION(bool, check_price_cross, (boost::tuple<float, float> prices),
+ {
+ // first price
+ const float first = boost_tuple_get(prices, 0);
+
+ // second price
+ const float second = boost_tuple_get(prices, 1);
+
+ // return true if second price is less than first
+ return second < first;
+ });
+
+ // find cross point (should be 10.5)
+ compute::vector<float>::iterator iter = boost::get<0>(
+ compute::find_if(
+ compute::make_zip_iterator(
+ boost::make_tuple(gpu_prices1.begin(), gpu_prices2.begin())
+ ),
+ compute::make_zip_iterator(
+ boost::make_tuple(gpu_prices1.end(), gpu_prices2.end())
+ ),
+ check_price_cross,
+ queue
+ ).get_iterator_tuple()
+ );
+
+ // print out result
+ int index = std::distance(gpu_prices1.begin(), iter);
+ std::cout << "price cross at index: " << index << std::endl;
+
+ float value;
+ compute::copy_n(iter, 1, &value, queue);
+ std::cout << "value: " << value << std::endl;
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/print_vector.cpp b/src/boost/libs/compute/example/print_vector.cpp
new file mode 100644
index 00000000..9ec5c6c7
--- /dev/null
+++ b/src/boost/libs/compute/example/print_vector.cpp
@@ -0,0 +1,45 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/iota.hpp>
+#include <boost/compute/container/vector.hpp>
+
+namespace compute = boost::compute;
+
+// this example demonstrates how to print the values in a vector
+int main()
+{
+ // get default device and setup context
+ compute::device gpu = compute::system::default_device();
+ compute::context context(gpu);
+ compute::command_queue queue(context, gpu);
+ std::cout << "device: " << gpu.name() << std::endl;
+
+ // create vector on the device and fill with the sequence 1..10
+ compute::vector<int> vector(10, context);
+ compute::iota(vector.begin(), vector.end(), 1, queue);
+
+//[print_vector_example
+ std::cout << "vector: [ ";
+ boost::compute::copy(
+ vector.begin(), vector.end(),
+ std::ostream_iterator<int>(std::cout, ", "),
+ queue
+ );
+ std::cout << "]" << std::endl;
+//]
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/qimage_blur.cpp b/src/boost/libs/compute/example/qimage_blur.cpp
new file mode 100644
index 00000000..cbfa3145
--- /dev/null
+++ b/src/boost/libs/compute/example/qimage_blur.cpp
@@ -0,0 +1,145 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+
+#include <QtGlobal>
+#if QT_VERSION >= 0x050000
+#include <QtWidgets>
+#else
+#include <QtGui>
+#endif
+
+#ifndef Q_MOC_RUN
+#include <boost/compute/system.hpp>
+#include <boost/compute/image/image2d.hpp>
+#include <boost/compute/interop/qt.hpp>
+#include <boost/compute/utility/dim.hpp>
+#include <boost/compute/utility/source.hpp>
+#endif // Q_MOC_RUN
+
+namespace compute = boost::compute;
+
+inline void box_filter_image(const compute::image2d &input,
+ compute::image2d &output,
+ compute::uint_ box_height,
+ compute::uint_ box_width,
+ compute::command_queue &queue)
+{
+ using compute::dim;
+
+ const compute::context &context = queue.get_context();
+
+ // simple box filter kernel source
+ const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+ __kernel void box_filter(__read_only image2d_t input,
+ __write_only image2d_t output,
+ uint box_height,
+ uint box_width)
+ {
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ int h = get_image_height(input);
+ int w = get_image_width(input);
+ int k = box_width;
+ int l = box_height;
+
+ if(x < k/2 || y < l/2 || x >= w-(k/2) || y >= h-(l/2)){
+ write_imagef(output, (int2)(x, y), (float4)(0, 0, 0, 1));
+ }
+ else {
+ const sampler_t sampler = CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+
+ float4 sum = { 0, 0, 0, 0 };
+ for(int i = 0; i < k; i++){
+ for(int j = 0; j < l; j++){
+ sum += read_imagef(input, sampler, (int2)(x+i-k, y+j-l));
+ }
+ }
+ sum /= (float) k * l;
+ float4 value = (float4)( sum.x, sum.y, sum.z, 1.f );
+ write_imagef(output, (int2)(x, y), value);
+ }
+ }
+ );
+
+ // build box filter program
+ compute::program program =
+ compute::program::create_with_source(source, context);
+ program.build();
+
+ // setup box filter kernel
+ compute::kernel kernel(program, "box_filter");
+ kernel.set_arg(0, input);
+ kernel.set_arg(1, output);
+ kernel.set_arg(2, box_height);
+ kernel.set_arg(3, box_width);
+
+ // execute the box filter kernel
+ queue.enqueue_nd_range_kernel(kernel, dim(0, 0), input.size(), dim(1, 1));
+}
+
+// this example shows how to load an image using Qt, apply a simple
+// box blur filter, and then display it in a Qt window.
+int main(int argc, char *argv[])
+{
+ QApplication app(argc, argv);
+
+ // check command line
+ if(argc < 2){
+ std::cout << "usage: qimage_blur [FILENAME]" << std::endl;
+ return -1;
+ }
+
+ // load image using Qt
+ QString fileName = argv[1];
+ QImage qimage(fileName);
+
+ size_t height = qimage.height();
+ size_t width = qimage.width();
+ size_t bytes_per_line = qimage.bytesPerLine();
+
+ qDebug() << "height:" << height
+ << "width:" << width
+ << "bytes per line:" << bytes_per_line
+ << "depth:" << qimage.depth()
+ << "format:" << qimage.format();
+
+ // create compute context
+ compute::device gpu = compute::system::default_device();
+ compute::context context(gpu);
+ compute::command_queue queue(context, gpu);
+ std::cout << "device: " << gpu.name() << std::endl;
+
+ // get the opencl image format for the qimage
+ compute::image_format format =
+ compute::qt_qimage_format_to_image_format(qimage.format());
+
+ // create input and output images on the gpu
+ compute::image2d input_image(context, width, height, format);
+ compute::image2d output_image(context, width, height, format);
+
+ // copy host qimage to gpu image
+ compute::qt_copy_qimage_to_image2d(qimage, input_image, queue);
+
+ // apply box filter
+ box_filter_image(input_image, output_image, 7, 7, queue);
+
+ // copy gpu blurred image from to host qimage
+ compute::qt_copy_image2d_to_qimage(output_image, qimage, queue);
+
+ // show image as a pixmap
+ QLabel label;
+ label.setPixmap(QPixmap::fromImage(qimage));
+ label.show();
+
+ return app.exec();
+}
diff --git a/src/boost/libs/compute/example/random_walk.cpp b/src/boost/libs/compute/example/random_walk.cpp
new file mode 100644
index 00000000..13106d96
--- /dev/null
+++ b/src/boost/libs/compute/example/random_walk.cpp
@@ -0,0 +1,153 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/inclusive_scan.hpp>
+#include <boost/compute/algorithm/inclusive_scan.hpp>
+#include <boost/compute/interop/opencv/core.hpp>
+#include <boost/compute/interop/opencv/highgui.hpp>
+#include <boost/compute/random/default_random_engine.hpp>
+#include <boost/compute/random/uniform_real_distribution.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+// this example uses the random-number generation functions in Boost.Compute
+// to calculate a large number of random "steps" and then plots the final
+// random "walk" in a 2D image on the GPU and displays it with OpenCV
+int main()
+{
+ // number of random steps to take
+ size_t steps = 250000;
+
+ // height and width of image
+ size_t height = 800;
+ size_t width = 800;
+
+ // get default device and setup context
+ compute::device gpu = compute::system::default_device();
+ compute::context context(gpu);
+ compute::command_queue queue(context, gpu);
+
+ using compute::int2_;
+
+ // calaculate random values for each step
+ compute::vector<float> random_values(steps, context);
+ compute::default_random_engine random_engine(queue);
+ compute::uniform_real_distribution<float> random_distribution(0.f, 4.f);
+
+ random_distribution.generate(
+ random_values.begin(), random_values.end(), random_engine, queue
+ );
+
+ // calaculate coordinates for each step
+ compute::vector<int2_> coordinates(steps, context);
+
+ // function to convert random values to random directions (in 2D)
+ BOOST_COMPUTE_FUNCTION(int2_, take_step, (const float x),
+ {
+ if(x < 1.f){
+ // move right
+ return (int2)(1, 0);
+ }
+ if(x < 2.f){
+ // move up
+ return (int2)(0, 1);
+ }
+ if(x < 3.f){
+ // move left
+ return (int2)(-1, 0);
+ }
+ else {
+ // move down
+ return (int2)(0, -1);
+ }
+ });
+
+ // transform the random values into random steps
+ compute::transform(
+ random_values.begin(), random_values.end(), coordinates.begin(), take_step, queue
+ );
+
+ // set staring position
+ int2_ starting_position(width / 2, height / 2);
+ compute::copy_n(&starting_position, 1, coordinates.begin(), queue);
+
+ // scan steps to calculate position after each step
+ compute::inclusive_scan(
+ coordinates.begin(), coordinates.end(), coordinates.begin(), queue
+ );
+
+ // create output image
+ compute::image2d image(
+ context, width, height, compute::image_format(CL_RGBA, CL_UNSIGNED_INT8)
+ );
+
+ // program with two kernels, one to fill the image with white, and then
+ // one the draw to points calculated in coordinates on the image
+ const char draw_walk_source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+ __kernel void draw_walk(__global const int2 *coordinates,
+ __write_only image2d_t image)
+ {
+ const uint i = get_global_id(0);
+ const int2 coord = coordinates[i];
+
+ if(coord.x > 0 && coord.x < get_image_width(image) &&
+ coord.y > 0 && coord.y < get_image_height(image)){
+ uint4 black = { 0, 0, 0, 0 };
+ write_imageui(image, coord, black);
+ }
+ }
+
+ __kernel void fill_white(__write_only image2d_t image)
+ {
+ const int2 coord = { get_global_id(0), get_global_id(1) };
+
+ if(coord.x < get_image_width(image) &&
+ coord.y < get_image_height(image)){
+ uint4 white = { 255, 255, 255, 255 };
+ write_imageui(image, coord, white);
+ }
+ }
+ );
+
+ // build the program
+ compute::program draw_program =
+ compute::program::build_with_source(draw_walk_source, context);
+
+ // fill image with white
+ compute::kernel fill_kernel(draw_program, "fill_white");
+ fill_kernel.set_arg(0, image);
+
+ const size_t offset[] = { 0, 0 };
+ const size_t bounds[] = { width, height };
+
+ queue.enqueue_nd_range_kernel(fill_kernel, 2, offset, bounds, 0);
+
+ // draw random walk
+ compute::kernel draw_kernel(draw_program, "draw_walk");
+ draw_kernel.set_arg(0, coordinates);
+ draw_kernel.set_arg(1, image);
+ queue.enqueue_1d_range_kernel(draw_kernel, 0, coordinates.size(), 0);
+
+ // show image
+ compute::opencv_imshow("random walk", image, queue);
+
+ // wait and return
+ cv::waitKey(0);
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/resize_image.cpp b/src/boost/libs/compute/example/resize_image.cpp
new file mode 100644
index 00000000..d9e29edd
--- /dev/null
+++ b/src/boost/libs/compute/example/resize_image.cpp
@@ -0,0 +1,253 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+
+#include <QtGlobal>
+#if QT_VERSION >= 0x050000
+#include <QtWidgets>
+#else
+#include <QtGui>
+#endif
+#include <QtOpenGL>
+
+#include <boost/program_options.hpp>
+
+#ifndef Q_MOC_RUN
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/image/image2d.hpp>
+#include <boost/compute/image/image_sampler.hpp>
+#include <boost/compute/interop/qt.hpp>
+#include <boost/compute/interop/opengl.hpp>
+#include <boost/compute/utility/source.hpp>
+#endif // Q_MOC_RUN
+
+namespace compute = boost::compute;
+namespace po = boost::program_options;
+
+// opencl source code
+const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+ __kernel void resize_image(__read_only image2d_t input,
+ const sampler_t sampler,
+ __write_only image2d_t output)
+ {
+ const uint x = get_global_id(0);
+ const uint y = get_global_id(1);
+
+ const float w = get_image_width(output);
+ const float h = get_image_height(output);
+
+ float2 coord = { ((float) x / w) * get_image_width(input),
+ ((float) y / h) * get_image_height(input) };
+
+ float4 pixel = read_imagef(input, sampler, coord);
+ write_imagef(output, (int2)(x, h - y - 1), pixel);
+ };
+);
+
+class ImageWidget : public QGLWidget
+{
+ Q_OBJECT
+
+public:
+ ImageWidget(QString fileName, QWidget *parent = 0);
+ ~ImageWidget();
+
+ void initializeGL();
+ void resizeGL(int width, int height);
+ void paintGL();
+
+private:
+ QImage qt_image_;
+ compute::context context_;
+ compute::command_queue queue_;
+ compute::program program_;
+ compute::image2d image_;
+ compute::image_sampler sampler_;
+ GLuint gl_texture_;
+ compute::opengl_texture cl_texture_;
+};
+
+ImageWidget::ImageWidget(QString fileName, QWidget *parent)
+ : QGLWidget(parent),
+ qt_image_(fileName)
+{
+ gl_texture_ = 0;
+}
+
+ImageWidget::~ImageWidget()
+{
+}
+
+void ImageWidget::initializeGL()
+{
+ // setup opengl
+ glDisable(GL_LIGHTING);
+
+ // create the OpenGL/OpenCL shared context
+ context_ = compute::opengl_create_shared_context();
+
+ // get gpu device
+ compute::device gpu = context_.get_device();
+ std::cout << "device: " << gpu.name() << std::endl;
+
+ // setup command queue
+ queue_ = compute::command_queue(context_, gpu);
+
+ // allocate image on the device
+ compute::image_format format =
+ compute::qt_qimage_format_to_image_format(qt_image_.format());
+
+ image_ = compute::image2d(
+ context_, qt_image_.width(), qt_image_.height(), format, CL_MEM_READ_ONLY
+ );
+
+ // transfer image to the device
+ compute::qt_copy_qimage_to_image2d(qt_image_, image_, queue_);
+
+ // setup image sampler (use CL_FILTER_NEAREST to disable linear interpolation)
+ sampler_ = compute::image_sampler(
+ context_, false, CL_ADDRESS_NONE, CL_FILTER_LINEAR
+ );
+
+ // build resize program
+ program_ = compute::program::build_with_source(source, context_);
+}
+
+void ImageWidget::resizeGL(int width, int height)
+{
+#if QT_VERSION >= 0x050000
+ // scale height/width based on device pixel ratio
+ width /= windowHandle()->devicePixelRatio();
+ height /= windowHandle()->devicePixelRatio();
+#endif
+
+ // resize viewport
+ glViewport(0, 0, width, height);
+
+ // delete old texture
+ if(gl_texture_){
+ glDeleteTextures(1, &gl_texture_);
+ gl_texture_ = 0;
+ }
+
+ // generate new texture
+ glGenTextures(1, &gl_texture_);
+ glBindTexture(GL_TEXTURE_2D, gl_texture_);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+ glTexImage2D(
+ GL_TEXTURE_2D, 0, GL_RGBA, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0
+ );
+
+ // create opencl object for the texture
+ cl_texture_ = compute::opengl_texture(
+ context_, GL_TEXTURE_2D, 0, gl_texture_, CL_MEM_WRITE_ONLY
+ );
+}
+
+void ImageWidget::paintGL()
+{
+ float w = width();
+ float h = height();
+
+ glMatrixMode(GL_PROJECTION);
+ glLoadIdentity();
+ glOrtho(0.0, w, 0.0, h, -1.0, 1.0);
+ glMatrixMode(GL_MODELVIEW);
+ glLoadIdentity();
+
+ // setup the resize kernel
+ compute::kernel kernel(program_, "resize_image");
+ kernel.set_arg(0, image_);
+ kernel.set_arg(1, sampler_);
+ kernel.set_arg(2, cl_texture_);
+
+ // acquire the opengl texture so it can be used in opencl
+ compute::opengl_enqueue_acquire_gl_objects(1, &cl_texture_.get(), queue_);
+
+ // execute the resize kernel
+ const size_t global_work_offset[] = { 0, 0 };
+ const size_t global_work_size[] = { size_t(width()), size_t(height()) };
+
+ queue_.enqueue_nd_range_kernel(
+ kernel, 2, global_work_offset, global_work_size, 0
+ );
+
+ // release the opengl texture so it can be used by opengl
+ compute::opengl_enqueue_release_gl_objects(1, &cl_texture_.get(), queue_);
+
+ // ensure opencl is finished before rendering in opengl
+ queue_.finish();
+
+ // draw a single quad with the resized image texture
+ glEnable(GL_TEXTURE_2D);
+ glBindTexture(GL_TEXTURE_2D, gl_texture_);
+
+ glBegin(GL_QUADS);
+ glTexCoord2f(0, 0); glVertex2f(0, 0);
+ glTexCoord2f(0, 1); glVertex2f(0, h);
+ glTexCoord2f(1, 1); glVertex2f(w, h);
+ glTexCoord2f(1, 0); glVertex2f(w, 0);
+ glEnd();
+}
+
+// the resize image example demonstrates how to interactively resize a
+// 2D image and display it using OpenGL. a image sampler is used to perform
+// hardware-accelerated linear interpolation for the resized image.
+int main(int argc, char *argv[])
+{
+ // setup command line arguments
+ po::options_description options("options");
+ options.add_options()
+ ("help", "show usage instructions")
+ ("file", po::value<std::string>(), "image file name (e.g. /path/to/image.png)")
+ ;
+ po::positional_options_description positional_options;
+ positional_options.add("file", 1);
+
+ // parse command line
+ po::variables_map vm;
+ po::store(
+ po::command_line_parser(argc, argv)
+ .options(options)
+ .positional(positional_options)
+ .run(),
+ vm
+ );
+ po::notify(vm);
+
+ // check for file argument
+ if(vm.count("help") || !vm.count("file")){
+ std::cout << options << std::endl;
+ return -1;
+ }
+
+ // get file name
+ std::string file_name = vm["file"].as<std::string>();
+
+ // setup qt application
+ QApplication app(argc, argv);
+
+ // setup image widget
+ ImageWidget widget(QString::fromStdString(file_name));
+ widget.show();
+
+ // run qt application
+ return app.exec();
+}
+
+#include "resize_image.moc"
diff --git a/src/boost/libs/compute/example/simple_kernel.cpp b/src/boost/libs/compute/example/simple_kernel.cpp
new file mode 100644
index 00000000..4aa1872f
--- /dev/null
+++ b/src/boost/libs/compute/example/simple_kernel.cpp
@@ -0,0 +1,84 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <boost/compute/core.hpp>
+
+namespace compute = boost::compute;
+
+// this example demonstrates how to use the Boost.Compute classes to
+// setup and run a simple vector addition kernel on the GPU
+int main()
+{
+ // get the default device
+ compute::device device = compute::system::default_device();
+
+ // create a context for the device
+ compute::context context(device);
+
+ // setup input arrays
+ float a[] = { 1, 2, 3, 4 };
+ float b[] = { 5, 6, 7, 8 };
+
+ // make space for the output
+ float c[] = { 0, 0, 0, 0 };
+
+ // create memory buffers for the input and output
+ compute::buffer buffer_a(context, 4 * sizeof(float));
+ compute::buffer buffer_b(context, 4 * sizeof(float));
+ compute::buffer buffer_c(context, 4 * sizeof(float));
+
+ // source code for the add kernel
+ const char source[] =
+ "__kernel void add(__global const float *a,"
+ " __global const float *b,"
+ " __global float *c)"
+ "{"
+ " const uint i = get_global_id(0);"
+ " c[i] = a[i] + b[i];"
+ "}";
+
+ // create the program with the source
+ compute::program program =
+ compute::program::create_with_source(source, context);
+
+ // compile the program
+ program.build();
+
+ // create the kernel
+ compute::kernel kernel(program, "add");
+
+ // set the kernel arguments
+ kernel.set_arg(0, buffer_a);
+ kernel.set_arg(1, buffer_b);
+ kernel.set_arg(2, buffer_c);
+
+ // create a command queue
+ compute::command_queue queue(context, device);
+
+ // write the data from 'a' and 'b' to the device
+ queue.enqueue_write_buffer(buffer_a, 0, 4 * sizeof(float), a);
+ queue.enqueue_write_buffer(buffer_b, 0, 4 * sizeof(float), b);
+
+ // run the add kernel
+ queue.enqueue_1d_range_kernel(kernel, 0, 4, 0);
+
+ // transfer results back to the host array 'c'
+ queue.enqueue_read_buffer(buffer_c, 0, 4 * sizeof(float), c);
+
+ // print out results in 'c'
+ std::cout << "c: [" << c[0] << ", "
+ << c[1] << ", "
+ << c[2] << ", "
+ << c[3] << "]" << std::endl;
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/simple_moving_average.cpp b/src/boost/libs/compute/example/simple_moving_average.cpp
new file mode 100644
index 00000000..968db393
--- /dev/null
+++ b/src/boost/libs/compute/example/simple_moving_average.cpp
@@ -0,0 +1,139 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Benoit Dequidt <benoit.dequidt@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <cstdlib>
+
+#include <boost/compute/core.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/inclusive_scan.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/type_traits/type_name.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+/// warning precision is not precise due
+/// to the float error accumulation when size is large enough
+/// for more precision use double
+/// or a kahan sum else results can diverge
+/// from the CPU implementation
+compute::program make_sma_program(const compute::context& context)
+{
+ const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+ __kernel void SMA(__global const float *scannedValues, int size, __global float *output, int wSize)
+ {
+ const int gid = get_global_id(0);
+
+ float cumValues = 0.f;
+ int endIdx = gid + wSize/2;
+ int startIdx = gid -1 - wSize/2;
+
+ if(endIdx > size -1)
+ endIdx = size -1;
+
+ cumValues += scannedValues[endIdx];
+ if(startIdx < 0)
+ startIdx = -1;
+ else
+ cumValues -= scannedValues[startIdx];
+
+ output[gid] =(float)( cumValues / ( float )(endIdx - startIdx));
+ }
+ );
+
+ // create sma program
+ return compute::program::build_with_source(source,context);
+}
+
+bool check_results(const std::vector<float>& values, const std::vector<float>& smoothValues, unsigned int wSize)
+{
+ int size = values.size();
+ if(size != (int)smoothValues.size()) return false;
+
+ int semiWidth = wSize/2;
+
+ bool ret = true;
+ for(int idx = 0 ; idx < size ; ++idx)
+ {
+ int start = (std::max)(idx - semiWidth,0);
+ int end = (std::min)(idx + semiWidth,size-1);
+ float res = 0;
+ for(int j = start ; j <= end ; ++j)
+ {
+ res+= values[j];
+ }
+
+ res /= float(end - start +1);
+
+ if(std::abs(res-smoothValues[idx]) > 1e-3)
+ {
+ std::cout << "idx = " << idx << " -- expected = " << res << " -- result = " << smoothValues[idx] << std::endl;
+ ret = false;
+ }
+ }
+
+ return ret;
+}
+
+// generate a uniform law over [0,10]
+float myRand()
+{
+ static const double divisor = double(RAND_MAX)+1.;
+ return double(rand())/divisor * 10.;
+}
+
+int main()
+{
+ unsigned int size = 1024;
+ // wSize must be odd
+ unsigned int wSize = 21;
+ // get the default device
+ compute::device device = compute::system::default_device();
+ // create a context for the device
+ compute::context context(device);
+ // get the program
+ compute::program program = make_sma_program(context);
+
+ // create vector of random numbers on the host
+ std::vector<float> host_vector(size);
+ std::vector<float> host_result(size);
+ std::generate(host_vector.begin(), host_vector.end(), myRand);
+
+ compute::vector<float> a(size,context);
+ compute::vector<float> b(size,context);
+ compute::vector<float> c(size,context);
+ compute::command_queue queue(context, device);
+
+ compute::copy(host_vector.begin(),host_vector.end(),a.begin(),queue);
+
+ // scan values
+ compute::inclusive_scan(a.begin(),a.end(),b.begin(),queue);
+ // sma kernel
+ compute::kernel kernel(program, "SMA");
+ kernel.set_arg(0,b.get_buffer());
+ kernel.set_arg(1,(int)b.size());
+ kernel.set_arg(2,c.get_buffer());
+ kernel.set_arg(3,(int)wSize);
+
+ using compute::uint_;
+ uint_ tpb = 128;
+ uint_ workSize = size;
+ queue.enqueue_1d_range_kernel(kernel,0,workSize,tpb);
+
+ compute::copy(c.begin(),c.end(),host_result.begin(),queue);
+
+ bool res = check_results(host_vector,host_result,wSize);
+ std::string status = res ? "results are equivalent" : "GPU results differs from CPU one's";
+ std::cout << status << std::endl;
+
+ return 0;
+}
+
diff --git a/src/boost/libs/compute/example/sort_vector.cpp b/src/boost/libs/compute/example/sort_vector.cpp
new file mode 100644
index 00000000..39ec8ea7
--- /dev/null
+++ b/src/boost/libs/compute/example/sort_vector.cpp
@@ -0,0 +1,68 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/sort.hpp>
+#include <boost/compute/container/vector.hpp>
+
+namespace compute = boost::compute;
+
+int rand_int()
+{
+ return rand() % 100;
+}
+
+// this example demonstrates how to sort a vector of ints on the GPU
+int main()
+{
+ // create vector of random values on the host
+ std::vector<int> host_vector(10);
+ std::generate(host_vector.begin(), host_vector.end(), rand_int);
+
+ // print out input vector
+ std::cout << "input: [ ";
+ for(size_t i = 0; i < host_vector.size(); i++){
+ std::cout << host_vector[i];
+
+ if(i != host_vector.size() - 1){
+ std::cout << ", ";
+ }
+ }
+ std::cout << " ]" << std::endl;
+
+ // transfer the values to the device
+ compute::vector<int> device_vector = host_vector;
+
+ // sort the values on the device
+ compute::sort(device_vector.begin(), device_vector.end());
+
+ // transfer the values back to the host
+ compute::copy(device_vector.begin(),
+ device_vector.end(),
+ host_vector.begin());
+
+ // print out the sorted vector
+ std::cout << "output: [ ";
+ for(size_t i = 0; i < host_vector.size(); i++){
+ std::cout << host_vector[i];
+
+ if(i != host_vector.size() - 1){
+ std::cout << ", ";
+ }
+ }
+ std::cout << " ]" << std::endl;
+
+ return 0;
+}
diff --git a/src/boost/libs/compute/example/threefry_engine.cpp b/src/boost/libs/compute/example/threefry_engine.cpp
new file mode 100644
index 00000000..5cbf6c8e
--- /dev/null
+++ b/src/boost/libs/compute/example/threefry_engine.cpp
@@ -0,0 +1,43 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Muhammad Junaid Muzammil <mjunaidmuzammil@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://kylelutz.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+
+#include <boost/compute/random/threefry_engine.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/context.hpp>
+#include <boost/compute/device.hpp>
+#include <boost/compute/system.hpp>
+#include <iostream>
+
+int main()
+{
+ using boost::compute::uint_;
+ boost::compute::device device = boost::compute::system::default_device();
+ boost::compute::context context(device);
+ boost::compute::command_queue queue(context, device);
+ boost::compute::threefry_engine<> rng(queue);
+ boost::compute::vector<uint_> vector_ctr(20, context);
+
+ uint32_t ctr[20];
+ for(int i = 0; i < 10; i++) {
+ ctr[i*2] = i;
+ ctr[i*2+1] = 0;
+ }
+ boost::compute::copy(ctr, ctr+20, vector_ctr.begin(), queue);
+ rng.generate(vector_ctr.begin(), vector_ctr.end(), queue);
+ boost::compute::copy(vector_ctr.begin(), vector_ctr.end(), ctr, queue);
+
+ for(int i = 0; i < 10; i++) {
+ std::cout << std::hex << ctr[i*2] << " " << ctr[i*2+1] << std::endl;
+ }
+ return 0;
+}
+
diff --git a/src/boost/libs/compute/example/time_copy.cpp b/src/boost/libs/compute/example/time_copy.cpp
new file mode 100644
index 00000000..35168770
--- /dev/null
+++ b/src/boost/libs/compute/example/time_copy.cpp
@@ -0,0 +1,63 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+//[time_copy_example
+
+#include <vector>
+#include <cstdlib>
+#include <iostream>
+
+#include <boost/compute/event.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/async/future.hpp>
+#include <boost/compute/container/vector.hpp>
+
+namespace compute = boost::compute;
+
+int main()
+{
+ // get the default device
+ compute::device gpu = compute::system::default_device();
+
+ // create context for default device
+ compute::context context(gpu);
+
+ // create command queue with profiling enabled
+ compute::command_queue queue(
+ context, gpu, compute::command_queue::enable_profiling
+ );
+
+ // generate random data on the host
+ std::vector<int> host_vector(16000000);
+ std::generate(host_vector.begin(), host_vector.end(), rand);
+
+ // create a vector on the device
+ compute::vector<int> device_vector(host_vector.size(), context);
+
+ // copy data from the host to the device
+ compute::future<void> future = compute::copy_async(
+ host_vector.begin(), host_vector.end(), device_vector.begin(), queue
+ );
+
+ // wait for copy to finish
+ future.wait();
+
+ // get elapsed time from event profiling information
+ boost::chrono::milliseconds duration =
+ future.get_event().duration<boost::chrono::milliseconds>();
+
+ // print elapsed time in milliseconds
+ std::cout << "time: " << duration.count() << " ms" << std::endl;
+
+ return 0;
+}
+
+//]
diff --git a/src/boost/libs/compute/example/transform_sqrt.cpp b/src/boost/libs/compute/example/transform_sqrt.cpp
new file mode 100644
index 00000000..860063fc
--- /dev/null
+++ b/src/boost/libs/compute/example/transform_sqrt.cpp
@@ -0,0 +1,58 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+//[transform_sqrt_example
+
+#include <vector>
+#include <algorithm>
+
+#include <boost/compute/algorithm/transform.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/functional/math.hpp>
+
+namespace compute = boost::compute;
+
+int main()
+{
+ // get default device and setup context
+ compute::device device = compute::system::default_device();
+ compute::context context(device);
+ compute::command_queue queue(context, device);
+
+ // generate random data on the host
+ std::vector<float> host_vector(10000);
+ std::generate(host_vector.begin(), host_vector.end(), rand);
+
+ // create a vector on the device
+ compute::vector<float> device_vector(host_vector.size(), context);
+
+ // transfer data from the host to the device
+ compute::copy(
+ host_vector.begin(), host_vector.end(), device_vector.begin(), queue
+ );
+
+ // calculate the square-root of each element in-place
+ compute::transform(
+ device_vector.begin(),
+ device_vector.end(),
+ device_vector.begin(),
+ compute::sqrt<float>(),
+ queue
+ );
+
+ // copy values back to the host
+ compute::copy(
+ device_vector.begin(), device_vector.end(), host_vector.begin(), queue
+ );
+
+ return 0;
+}
+
+//]
diff --git a/src/boost/libs/compute/example/vector_addition.cpp b/src/boost/libs/compute/example/vector_addition.cpp
new file mode 100644
index 00000000..1f3aa92d
--- /dev/null
+++ b/src/boost/libs/compute/example/vector_addition.cpp
@@ -0,0 +1,57 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/transform.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/functional/operator.hpp>
+
+namespace compute = boost::compute;
+
+// this example demonstrates how to use Boost.Compute's STL
+// implementation to add two vectors on the GPU
+int main()
+{
+ // setup input arrays
+ float a[] = { 1, 2, 3, 4 };
+ float b[] = { 5, 6, 7, 8 };
+
+ // make space for the output
+ float c[] = { 0, 0, 0, 0 };
+
+ // create vectors and transfer data for the input arrays 'a' and 'b'
+ compute::vector<float> vector_a(a, a + 4);
+ compute::vector<float> vector_b(b, b + 4);
+
+ // create vector for the output array
+ compute::vector<float> vector_c(4);
+
+ // add the vectors together
+ compute::transform(
+ vector_a.begin(),
+ vector_a.end(),
+ vector_b.begin(),
+ vector_c.begin(),
+ compute::plus<float>()
+ );
+
+ // transfer results back to the host array 'c'
+ compute::copy(vector_c.begin(), vector_c.end(), c);
+
+ // print out results in 'c'
+ std::cout << "c: [" << c[0] << ", "
+ << c[1] << ", "
+ << c[2] << ", "
+ << c[3] << "]" << std::endl;
+
+ return 0;
+}