Adding upstream version 14.2.21.upstream/14.2.21 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 18:24:20 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 18:24:20 +0000
commit: 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree: e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/boost/libs/compute/example
parent: Initial commit. (diff)
download: ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz
ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip
38 files changed, 5022 insertions, 0 deletions
diff --git a/src/boost/libs/compute/example/CMakeLists.txt b/src/boost/libs/compute/example/CMakeLists.txt
new file mode 100644
index 00000000..a5708cb0
--- /dev/null
+++ b/src/boost/libs/compute/example/CMakeLists.txt
@@ -0,0 +1,166 @@
+# ---------------------------------------------------------------------------
+#  Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+#
+#  Distributed under the Boost Software License, Version 1.0
+#  See accompanying file LICENSE_1_0.txt or copy at
+#  http://www.boost.org/LICENSE_1_0.txt
+#
+# ---------------------------------------------------------------------------
+
+include_directories(../include)
+
+set(EXAMPLES
+  amd_cpp_kernel
+  black_scholes
+  copy_data
+  fizz_buzz
+  hello_world
+  host_sort
+  inline_ptx
+  longest_vector
+  list_devices
+  mapped_view
+  memory_limits
+  monte_carlo
+  point_centroid
+  price_cross
+  print_vector
+  sort_vector
+  simple_kernel
+  time_copy
+  transform_sqrt
+  vector_addition
+  simple_moving_average
+  matrix_transpose
+)
+
+# boost library link dependencies
+set(EXAMPLE_BOOST_COMPONENTS program_options)
+
+if (${BOOST_COMPUTE_USE_OFFLINE_CACHE})
+  set(EXAMPLE_BOOST_COMPONENTS ${EXAMPLE_BOOST_COMPONENTS} system filesystem)
+endif()
+
+if(${BOOST_COMPUTE_THREAD_SAFE} AND NOT ${BOOST_COMPUTE_USE_CPP11})
+  set(EXAMPLE_BOOST_COMPONENTS ${EXAMPLE_BOOST_COMPONENTS} system thread)
+endif()
+
+if(MSVC AND EXAMPLE_BOOST_COMPONENTS)
+  set(EXAMPLE_BOOST_COMPONENTS ${EXAMPLE_BOOST_COMPONENTS} chrono)
+endif()
+
+if(EXAMPLE_BOOST_COMPONENTS)
+  list(REMOVE_DUPLICATES EXAMPLE_BOOST_COMPONENTS)
+endif()
+find_package(Boost 1.54 REQUIRED COMPONENTS ${EXAMPLE_BOOST_COMPONENTS})
+include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
+
+foreach(EXAMPLE ${EXAMPLES})
+  add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
+  target_link_libraries(${EXAMPLE} ${OpenCL_LIBRARIES} ${Boost_LIBRARIES})
+
+  # add example program to list of tests (if testing is enabled)
+  if(${BOOST_COMPUTE_BUILD_TESTS})
+    add_test("example.${EXAMPLE}" ${EXAMPLE})
+  endif()
+endforeach()
+
+# opencl test example
+add_executable(opencl_test opencl_test.cpp)
+target_link_libraries(opencl_test ${OpenCL_LIBRARIES})
+
+# eigen examples
+if(${BOOST_COMPUTE_HAVE_EIGEN})
+  find_package(Eigen REQUIRED)
+  include_directories(SYSTEM ${EIGEN_INCLUDE_DIRS})
+  add_executable(batched_determinant batched_determinant.cpp)
+  target_link_libraries(batched_determinant ${OpenCL_LIBRARIES} ${Boost_LIBRARIES})
+endif()
+
+# opencv examples
+if(${BOOST_COMPUTE_HAVE_OPENCV})
+  find_package(OpenCV REQUIRED)
+  include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS})
+
+  set(OPENCV_EXAMPLES
+    k_means
+    opencv_flip
+    random_walk
+    opencv_optical_flow
+    opencv_convolution
+    opencv_sobel_filter
+    opencv_histogram
+  )
+
+  foreach(EXAMPLE ${OPENCV_EXAMPLES})
+    add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
+    target_link_libraries(${EXAMPLE} ${OpenCL_LIBRARIES} ${Boost_LIBRARIES} ${OpenCV_LIBS})
+  endforeach()
+endif()
+
+# opengl/vtk examples
+if(${BOOST_COMPUTE_HAVE_VTK})
+  find_package(VTK REQUIRED)
+  include(${VTK_USE_FILE})
+  add_executable(opengl_sphere opengl_sphere.cpp)
+  target_link_libraries(opengl_sphere ${OpenCL_LIBRARIES} ${Boost_LIBRARIES} ${VTK_LIBRARIES})
+  if(APPLE)
+    target_link_libraries(opengl_sphere "-framework OpenGL")
+  elseif(UNIX)
+    target_link_libraries(opengl_sphere GL)
+  endif()
+endif()
+
+# qt examples
+if(${BOOST_COMPUTE_HAVE_QT})
+
+  # look for Qt4 in the first place
+  find_package(Qt4 QUIET)
+
+  if(${QT4_FOUND})
+    # build with Qt4
+    find_package(Qt4 REQUIRED COMPONENTS QtCore QtGui QtOpenGL)
+    set(QT_USE_QTOPENGL TRUE)
+    include(${QT_USE_FILE})
+  else()
+
+    # look for Qt5
+    find_package(Qt5Widgets QUIET)
+
+    if(${Qt5Widgets_FOUND})
+      # build with Qt5
+      find_package(Qt5Core REQUIRED)
+      find_package(Qt5Widgets REQUIRED)
+      find_package(Qt5OpenGL REQUIRED)
+      include_directories(${Qt5OpenGL_INCLUDE_DIRS})
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${Qt5OpenGL_EXECUTABLE_COMPILE_FLAGS}")
+      set(QT_LIBRARIES ${Qt5OpenGL_LIBRARIES})
+    else()
+      # no valid Qt framework found
+      message(FATAL_ERROR "Error: Did not find Qt4 or Qt5")
+    endif()
+  endif()
+
+  # required by both versions
+  set(CMAKE_AUTOMOC TRUE)
+  include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+  # add examples
+  add_executable(qimage_blur qimage_blur.cpp)
+  target_link_libraries(qimage_blur ${OpenCL_LIBRARIES} ${Boost_LIBRARIES} ${QT_LIBRARIES})
+
+  set(QT_OPENGL_EXAMPLES
+    mandelbrot
+    nbody
+    resize_image
+  )
+  foreach(EXAMPLE ${QT_OPENGL_EXAMPLES})
+    add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
+    target_link_libraries(${EXAMPLE} ${OpenCL_LIBRARIES} ${Boost_LIBRARIES} ${QT_LIBRARIES})
+    if(APPLE)
+      target_link_libraries(${EXAMPLE} "-framework OpenGL")
+    elseif(UNIX)
+      target_link_libraries(${EXAMPLE} GL)
+    endif()
+  endforeach()
+endif()
diff --git a/src/boost/libs/compute/example/amd_cpp_kernel.cpp b/src/boost/libs/compute/example/amd_cpp_kernel.cpp
new file mode 100644
index 00000000..0207312f
--- /dev/null
+++ b/src/boost/libs/compute/example/amd_cpp_kernel.cpp
@@ -0,0 +1,116 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+// this example shows how to use the static c++ kernel language
+// extension (currently only supported by AMD) to compile and
+// execute a templated c++ kernel.
+// Using platform vendor info to decide if this is AMD platform
+int main()
+{
+    // get default device and setup context
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+
+    // check the platform vendor string
+    if(device.platform().vendor() != "Advanced Micro Devices, Inc."){
+        std::cerr << "error: static C++ kernel language is only "
+                  << "supported on AMD devices."
+                  << std::endl;
+        return 0;
+    }
+
+    // create input int values and copy them to the device
+    int int_data[] = { 1, 2, 3, 4};
+    compute::vector<int> int_vector(int_data, int_data + 4, queue);
+
+    // create input float values and copy them to the device
+    float float_data[] = { 2.0f, 4.0f, 6.0f, 8.0f };
+    compute::vector<float> float_vector(float_data, float_data + 4, queue);
+
+    // create kernel source with a templated function and templated kernel
+    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        // define our templated function which returns the square of its input
+        template<typename T>
+        inline T square(const T x)
+        {
+            return x * x;
+        }
+
+        // define our templated kernel which calls square on each value in data
+        template<typename T>
+        __kernel void square_kernel(__global T *data)
+        {
+            const uint i = get_global_id(0);
+            data[i] = square(data[i]);
+        }
+
+        // explicitly instantiate the square kernel for int's. this allows
+        // for it to be called from the host with the given mangled name.
+        template __attribute__((mangled_name(square_kernel_int)))
+        __kernel void square_kernel(__global int *data);
+
+        // also instantiate the square kernel for float's.
+        template __attribute__((mangled_name(square_kernel_float)))
+        __kernel void square_kernel(__global float *data);
+    );
+
+    // build the program. must enable the c++ static kernel language
+    // by passing the "-x clc++" compile option.
+    compute::program square_program =
+        compute::program::build_with_source(source, context, "-x clc++");
+
+    // create the square kernel for int's by using its mangled name declared
+    // in the explicit template instantiation.
+    compute::kernel square_int_kernel(square_program, "square_kernel_int");
+    square_int_kernel.set_arg(0, int_vector);
+
+    // execute the square int kernel
+    queue.enqueue_1d_range_kernel(square_int_kernel, 0, int_vector.size(), 4);
+
+    // print out the squared int values
+    std::cout << "int's: ";
+    compute::copy(
+        int_vector.begin(), int_vector.end(),
+        std::ostream_iterator<int>(std::cout, " "),
+        queue
+    );
+    std::cout << std::endl;
+
+    // now create the square kernel for float's
+    compute::kernel square_float_kernel(square_program, "square_kernel_float");
+    square_float_kernel.set_arg(0, float_vector);
+
+    // execute the square int kernel
+    queue.enqueue_1d_range_kernel(square_float_kernel, 0, float_vector.size(), 4);
+
+    // print out the squared float values
+    std::cout << "float's: ";
+    compute::copy(
+        float_vector.begin(), float_vector.end(),
+        std::ostream_iterator<float>(std::cout, " "),
+        queue
+    );
+    std::cout << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/batched_determinant.cpp b/src/boost/libs/compute/example/batched_determinant.cpp
new file mode 100644
index 00000000..0029151e
--- /dev/null
+++ b/src/boost/libs/compute/example/batched_determinant.cpp
@@ -0,0 +1,96 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <Eigen/Core>
+#include <Eigen/LU>
+
+#include <boost/compute/function.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/transform.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/types/fundamental.hpp>
+
+namespace compute = boost::compute;
+
+// this example shows how to compute the determinant of many 4x4 matrices
+// using a determinant function and the transform() algorithm. in OpenCL the
+// float16 type can be used to store a 4x4 matrix and the components are laid
+// out in the following order:
+//
+// M = [ s0 s4 s8 sc ]
+//     [ s1 s5 s9 sd ]
+//     [ s2 s6 sa se ]
+//     [ s3 s7 sb sf ]
+//
+// the input matrices are created using eigen's random matrix and then
+// used again at the end to verify the results of the determinant function.
+int main()
+{
+    // get default device and setup context
+    compute::device gpu = compute::system::default_device();
+    compute::context context(gpu);
+    compute::command_queue queue(context, gpu);
+    std::cout << "device: " << gpu.name() << std::endl;
+
+    size_t n = 1000;
+
+    // create random 4x4 matrices on the host
+    std::vector<Eigen::Matrix4f> matrices(n);
+    for(size_t i = 0; i < n; i++){
+        matrices[i] = Eigen::Matrix4f::Random();
+    }
+
+    // copy matrices to the device
+    using compute::float16_;
+    compute::vector<float16_> input(n, context);
+    compute::copy(
+        matrices.begin(), matrices.end(), input.begin(), queue
+    );
+
+    // function returning the determinant of a 4x4 matrix.
+    BOOST_COMPUTE_FUNCTION(float, determinant4x4, (const float16_ m),
+    {
+        return m.s0*m.s5*m.sa*m.sf + m.s0*m.s6*m.sb*m.sd + m.s0*m.s7*m.s9*m.se +
+               m.s1*m.s4*m.sb*m.se + m.s1*m.s6*m.s8*m.sf + m.s1*m.s7*m.sa*m.sc +
+               m.s2*m.s4*m.s9*m.sf + m.s2*m.s5*m.sb*m.sc + m.s2*m.s7*m.s8*m.sd +
+               m.s3*m.s4*m.sa*m.sd + m.s3*m.s5*m.s8*m.se + m.s3*m.s6*m.s9*m.sc -
+               m.s0*m.s5*m.sb*m.se - m.s0*m.s6*m.s9*m.sf - m.s0*m.s7*m.sa*m.sd -
+               m.s1*m.s4*m.sa*m.sf - m.s1*m.s6*m.sb*m.sc - m.s1*m.s7*m.s8*m.se -
+               m.s2*m.s4*m.sb*m.sd - m.s2*m.s5*m.s8*m.sf - m.s2*m.s7*m.s9*m.sc -
+               m.s3*m.s4*m.s9*m.se - m.s3*m.s5*m.sa*m.sc - m.s3*m.s6*m.s8*m.sd;
+    });
+
+    // calculate determinants on the gpu
+    compute::vector<float> determinants(n, context);
+    compute::transform(
+        input.begin(), input.end(), determinants.begin(), determinant4x4, queue
+    );
+
+    // check determinants
+    std::vector<float> host_determinants(n);
+    compute::copy(
+        determinants.begin(), determinants.end(), host_determinants.begin(), queue
+    );
+
+    for(size_t i = 0; i < n; i++){
+        float det = matrices[i].determinant();
+
+        if(std::abs(det - host_determinants[i]) > 1e-6){
+          std::cerr << "error: wrong determinant at " << i << " ("
+                    << host_determinants[i] << " != " << det << ")"
+                    << std::endl;
+            return -1;
+        }
+    }
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/black_scholes.cpp b/src/boost/libs/compute/example/black_scholes.cpp
new file mode 100644
index 00000000..13a0e097
--- /dev/null
+++ b/src/boost/libs/compute/example/black_scholes.cpp
@@ -0,0 +1,168 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <cstdlib>
+#include <iostream>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy_n.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+// return a random float between lo and hi
+float rand_float(float lo, float hi)
+{
+    float x = (float) std::rand() / (float) RAND_MAX;
+
+    return (1.0f - x) * lo + x * hi;
+}
+
+// this example demostrates a black-scholes option pricing kernel.
+int main()
+{
+    // number of options
+    const int N = 4000000;
+
+    // black-scholes parameters
+    const float risk_free_rate = 0.02f;
+    const float volatility = 0.30f;
+
+    // get default device and setup context
+    compute::device gpu = compute::system::default_device();
+    compute::context context(gpu);
+    compute::command_queue queue(context, gpu);
+    std::cout << "device: " << gpu.name() << std::endl;
+
+    // initialize option data on host
+    std::vector<float> stock_price_data(N);
+    std::vector<float> option_strike_data(N);
+    std::vector<float> option_years_data(N);
+
+    std::srand(5347);
+    for(int i = 0; i < N; i++){
+        stock_price_data[i] = rand_float(5.0f, 30.0f);
+        option_strike_data[i] = rand_float(1.0f, 100.0f);
+        option_years_data[i] = rand_float(0.25f, 10.0f);
+    }
+
+    // create memory buffers on the device
+    compute::vector<float> call_result(N, context);
+    compute::vector<float> put_result(N, context);
+    compute::vector<float> stock_price(N, context);
+    compute::vector<float> option_strike(N, context);
+    compute::vector<float> option_years(N, context);
+
+    // copy initial values to the device
+    compute::copy_n(stock_price_data.begin(), N, stock_price.begin(), queue);
+    compute::copy_n(option_strike_data.begin(), N, option_strike.begin(), queue);
+    compute::copy_n(option_years_data.begin(), N, option_years.begin(), queue);
+
+    // source code for black-scholes program
+    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        // approximation of the cumulative normal distribution function
+        static float cnd(float d)
+        {
+            const float A1 =  0.319381530f;
+            const float A2 = -0.356563782f;
+            const float A3 =  1.781477937f;
+            const float A4 = -1.821255978f;
+            const float A5 =  1.330274429f;
+            const float RSQRT2PI = 0.39894228040143267793994605993438f;
+
+            float K = 1.0f / (1.0f + 0.2316419f * fabs(d));
+            float cnd =
+                RSQRT2PI * exp(-0.5f * d * d) *
+                (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));
+
+            if(d > 0){
+                cnd = 1.0f - cnd;
+            }
+
+            return cnd;
+        }
+
+        // black-scholes option pricing kernel
+        __kernel void black_scholes(__global float *call_result,
+                                    __global float *put_result,
+                                    __global const float *stock_price,
+                                    __global const float *option_strike,
+                                    __global const float *option_years,
+                                    float risk_free_rate,
+                                    float volatility)
+        {
+            const uint opt = get_global_id(0);
+
+            float S = stock_price[opt];
+            float X = option_strike[opt];
+            float T = option_years[opt];
+            float R = risk_free_rate;
+            float V = volatility;
+
+            float sqrtT = sqrt(T);
+            float d1 = (log(S / X) + (R + 0.5f * V * V) * T) / (V * sqrtT);
+            float d2 = d1 - V * sqrtT;
+            float CNDD1 = cnd(d1);
+            float CNDD2 = cnd(d2);
+
+            float expRT = exp(-R * T);
+            call_result[opt] = S * CNDD1 - X * expRT * CNDD2;
+            put_result[opt] = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1);
+        }
+    );
+
+    // build black-scholes program
+    compute::program program = compute::program::create_with_source(source, context);
+    program.build();
+
+    // setup black-scholes kernel
+    compute::kernel kernel(program, "black_scholes");
+    kernel.set_arg(0, call_result);
+    kernel.set_arg(1, put_result);
+    kernel.set_arg(2, stock_price);
+    kernel.set_arg(3, option_strike);
+    kernel.set_arg(4, option_years);
+    kernel.set_arg(5, risk_free_rate);
+    kernel.set_arg(6, volatility);
+
+    // execute black-scholes kernel
+    queue.enqueue_1d_range_kernel(kernel, 0, N, 0);
+
+    // print out the first option's put and call prices
+    float call0, put0;
+    compute::copy_n(put_result.begin(), 1, &put0, queue);
+    compute::copy_n(call_result.begin(), 1, &call0, queue);
+
+    std::cout << "option 0 call price: " << call0 << std::endl;
+    std::cout << "option 0 put price: " << put0 << std::endl;
+
+    // due to the differences in the random-number generators between Operating Systems
+    // and/or compilers, we will get different "expected" results for this example
+#ifdef __APPLE__
+    double expected_call0 = 0.000249461;
+    double expected_put0 = 26.2798;
+#elif _MSC_VER
+    double expected_call0 = 8.21412;
+    double expected_put0 = 2.25904;
+#else
+    double expected_call0 = 0.0999f;
+    double expected_put0 = 43.0524f;
+#endif
+
+    // check option prices
+    if(std::abs(call0 - expected_call0) > 1e-4 || std::abs(put0 - expected_put0) > 1e-4){
+        std::cerr << "error: option prices are wrong" << std::endl;
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/copy_data.cpp b/src/boost/libs/compute/example/copy_data.cpp
new file mode 100644
index 00000000..0550287f
--- /dev/null
+++ b/src/boost/libs/compute/example/copy_data.cpp
@@ -0,0 +1,49 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+//[copy_data_example
+
+#include <vector>
+
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/container/vector.hpp>
+
+namespace compute = boost::compute;
+
+int main()
+{
+    // get default device and setup context
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+
+    // create data array on host
+    int host_data[] = { 1, 3, 5, 7, 9 };
+
+    // create vector on device
+    compute::vector<int> device_vector(5, context);
+
+    // copy from host to device
+    compute::copy(
+        host_data, host_data + 5, device_vector.begin(), queue
+    );
+
+    // create vector on host
+    std::vector<int> host_vector(5);
+
+    // copy data back to host
+    compute::copy(
+        device_vector.begin(), device_vector.end(), host_vector.begin(), queue
+    );
+
+    return 0;
+}
+
+//]
diff --git a/src/boost/libs/compute/example/fizz_buzz.cpp b/src/boost/libs/compute/example/fizz_buzz.cpp
new file mode 100644
index 00000000..2c69995f
--- /dev/null
+++ b/src/boost/libs/compute/example/fizz_buzz.cpp
@@ -0,0 +1,160 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/accumulate.hpp>
+#include <boost/compute/algorithm/exclusive_scan.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/utility/dim.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+const char fizz_buzz_source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+    // returns the length of the string for the number 'n'. This is used
+    // during the first pass when we calculate the amount of space needed
+    // for each string in the fizz-buzz sequence.
+    inline uint fizz_buzz_string_length(uint n)
+    {
+        if((n % 5 == 0) && (n % 3 == 0)){
+            return sizeof("fizzbuzz");
+        }
+        else if(n % 5 == 0){
+            return sizeof("fizz");
+        }
+        else if(n % 3 == 0){
+            return sizeof("buzz");
+        }
+        else {
+            uint digits = 0;
+            while(n){
+                n /= 10;
+                digits++;
+            }
+            return digits + 1;
+        }
+    }
+
+    // first-pass kernel which calculates the string length for each number
+    // and writes it to the string_lengths array. these will then be passed
+    // to exclusive_scan() to calculate the output offsets for each string.
+    __kernel void fizz_buzz_allocate_strings(__global uint *string_lengths)
+    {
+        const uint i = get_global_id(0);
+        const uint n = i + 1;
+
+        string_lengths[i] = fizz_buzz_string_length(n);
+    }
+
+    // copy the string 's' with length 'n' to 'result' (just like strncpy())
+    inline void copy_string(__constant const char *s, uint n, __global char *result)
+    {
+        while(n--){
+            result[n] = s[n];
+        }
+    }
+
+    // reverse the string [start, end).
+    inline void reverse_string(__global char *start, __global char *end)
+    {
+        while(start < end){
+          char tmp = *end;
+          *end = *start;
+          *start = tmp;
+          start++;
+          end--;
+        }
+    }
+
+    // second-pass kernel which copies the fizz-buzz string for each number to
+    // buffer using the previously calculated offsets.
+    __kernel void fizz_buzz_copy_strings(__global const uint *offsets, __global char *buffer)
+    {
+        const uint i = get_global_id(0);
+        const uint n = i + 1;
+        const uint offset = offsets[i];
+
+        if((n % 5 == 0) && (n % 3 == 0)){
+            copy_string("fizzbuzz\n", 9, buffer + offset);
+        }
+        else if(n % 5 == 0){
+            copy_string("fizz\n", 5, buffer + offset);
+        }
+        else if(n % 3 == 0){
+            copy_string("buzz\n", 5, buffer + offset);
+        }
+        else {
+            // convert number to string and write it to the output
+            __global char *number = buffer + offset;
+            uint n_ = n;
+            while(n_){
+                *number++ = (n_%10) + '0';
+                n_ /= 10;
+            }
+            reverse_string(buffer + offset, number - 1);
+            *number = '\n';
+        }
+    }
+);
+
+int main()
+{
+    using compute::dim;
+    using compute::uint_;
+
+    // fizz-buzz up to 100
+    size_t n = 100;
+
+    // get the default device
+    compute::device device = compute::system::default_device();
+    compute::context ctx(device);
+    compute::command_queue queue(ctx, device);
+
+    // compile the fizz-buzz program
+    compute::program fizz_buzz_program =
+        compute::program::create_with_source(fizz_buzz_source, ctx);
+    fizz_buzz_program.build();
+
+    // create a vector for the output string and computing offsets
+    compute::vector<char> output(ctx);
+    compute::vector<uint_> offsets(n, ctx);
+
+    // run the allocate kernel to calculate string lengths
+    compute::kernel allocate_kernel(fizz_buzz_program, "fizz_buzz_allocate_strings");
+    allocate_kernel.set_arg(0, offsets);
+    queue.enqueue_nd_range_kernel(allocate_kernel, dim(0), dim(n), dim(1));
+
+    // allocate space for the output string
+    output.resize(
+        compute::accumulate(offsets.begin(), offsets.end(), 0, queue)
+    );
+
+    // scan string lengths for each number to calculate the output offsets
+    compute::exclusive_scan(
+        offsets.begin(), offsets.end(), offsets.begin(), queue
+    );
+
+    // run the copy kernel to fill the output buffer
+    compute::kernel copy_kernel(fizz_buzz_program, "fizz_buzz_copy_strings");
+    copy_kernel.set_arg(0, offsets);
+    copy_kernel.set_arg(1, output);
+    queue.enqueue_nd_range_kernel(copy_kernel, dim(0), dim(n), dim(1));
+
+    // copy the string to the host and print it to stdout
+    std::string str;
+    str.resize(output.size());
+    compute::copy(output.begin(), output.end(), str.begin(), queue);
+    std::cout << str;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/hello_world.cpp b/src/boost/libs/compute/example/hello_world.cpp
new file mode 100644
index 00000000..ef45fdd6
--- /dev/null
+++ b/src/boost/libs/compute/example/hello_world.cpp
@@ -0,0 +1,30 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+//[hello_world_example
+
+#include <iostream>
+
+#include <boost/compute/core.hpp>
+
+namespace compute = boost::compute;
+
+int main()
+{
+    // get the default device
+    compute::device device = compute::system::default_device();
+
+    // print the device's name and platform
+    std::cout << "hello from " << device.name();
+    std::cout << " (platform: " << device.platform().name() << ")" << std::endl;
+
+    return 0;
+}
+//]
diff --git a/src/boost/libs/compute/example/host_sort.cpp b/src/boost/libs/compute/example/host_sort.cpp
new file mode 100644
index 00000000..b5ff52cf
--- /dev/null
+++ b/src/boost/libs/compute/example/host_sort.cpp
@@ -0,0 +1,56 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <vector>
+
+#include <boost/spirit/include/karma.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/sort.hpp>
+
+namespace compute = boost::compute;
+namespace karma = boost::spirit::karma;
+
+int rand_int()
+{
+    return rand() % 100;
+}
+
+// this example demonstrates how to sort a std::vector of ints on the GPU
+int main()
+{
+    // get default device and setup context
+    compute::device gpu = compute::system::default_device();
+    compute::context context(gpu);
+    compute::command_queue queue(context, gpu);
+    std::cout << "device: " << gpu.name() << std::endl;
+
+    // create vector of random values on the host
+    std::vector<int> vector(8);
+    std::generate(vector.begin(), vector.end(), rand_int);
+
+    // print input vector
+    std::cout << "input:  [ "
+              << karma::format(karma::int_ % ", ", vector)
+              << " ]"
+              << std::endl;
+
+    // sort vector
+    compute::sort(vector.begin(), vector.end(), queue);
+
+    // print sorted vector
+    std::cout << "output: [ "
+              << karma::format(karma::int_ % ", ", vector)
+              << " ]"
+              << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/inline_ptx.cpp b/src/boost/libs/compute/example/inline_ptx.cpp
new file mode 100644
index 00000000..fe4e32f0
--- /dev/null
+++ b/src/boost/libs/compute/example/inline_ptx.cpp
@@ -0,0 +1,72 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/transform.hpp>
+#include <boost/compute/container/vector.hpp>
+
+namespace compute = boost::compute;
+
+// this example shows how to embed PTX assembly instructions
+// directly into boost.compute functions and use them with the
+// transform() algorithm.
+int main()
+{
+    // get default device and setup context
+    compute::device gpu = compute::system::default_device();
+    compute::context context(gpu);
+    compute::command_queue queue(context, gpu);
+    std::cout << "device: " << gpu.name() << std::endl;
+
+    // check to ensure we have an nvidia device
+    if(gpu.vendor() != "NVIDIA Corporation"){
+        std::cerr << "error: inline PTX assembly is only supported "
+                  << "on NVIDIA devices."
+                  << std::endl;
+        return 0;
+    }
+
+    // create input values and copy them to the device
+    using compute::uint_;
+    uint_ data[] = { 0x00, 0x01, 0x11, 0xFF };
+    compute::vector<uint_> input(data, data + 4, queue);
+
+    // function returning the number of bits set (aka population count or
+    // popcount) using the "popc" inline ptx assembly instruction.
+    BOOST_COMPUTE_FUNCTION(uint_, nvidia_popc, (uint_ x),
+    {
+        uint count;
+        asm("popc.b32 %0, %1;" : "=r"(count) : "r"(x));
+        return count;
+    });
+
+    // calculate the popcount for each input value
+    compute::vector<uint_> output(input.size(), context);
+    compute::transform(
+        input.begin(), input.end(), output.begin(), nvidia_popc, queue
+    );
+
+    // copy results back to the host and print them out
+    std::vector<uint_> counts(output.size());
+    compute::copy(output.begin(), output.end(), counts.begin(), queue);
+
+    for(size_t i = 0; i < counts.size(); i++){
+        std::cout << "0x" << std::hex << data[i]
+                  << " has " << counts[i]
+                  << " bits set" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/k_means.cpp b/src/boost/libs/compute/example/k_means.cpp
new file mode 100644
index 00000000..cd291a9b
--- /dev/null
+++ b/src/boost/libs/compute/example/k_means.cpp
@@ -0,0 +1,229 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/image/image2d.hpp>
+#include <boost/compute/interop/opencv/core.hpp>
+#include <boost/compute/interop/opencv/highgui.hpp>
+#include <boost/compute/random/default_random_engine.hpp>
+#include <boost/compute/random/uniform_real_distribution.hpp>
+#include <boost/compute/utility/dim.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+using compute::dim;
+using compute::int_;
+using compute::float_;
+using compute::float2_;
+
+// the k-means example implements the k-means clustering algorithm
+int main()
+{
+    // number of clusters
+    size_t k = 6;
+
+    // number of points
+    size_t n_points = 4500;
+
+    // height and width of image
+    size_t height = 800;
+    size_t width = 800;
+
+    // get default device and setup context
+    compute::device gpu = compute::system::default_device();
+    compute::context context(gpu);
+    compute::command_queue queue(context, gpu);
+
+    // generate random, uniformily-distributed points
+    compute::default_random_engine random_engine(queue);
+    compute::uniform_real_distribution<float_> uniform_distribution(0, 800);
+
+    compute::vector<float2_> points(n_points, context);
+    uniform_distribution.generate(
+        compute::make_buffer_iterator<float_>(points.get_buffer(), 0),
+        compute::make_buffer_iterator<float_>(points.get_buffer(), n_points * 2),
+        random_engine,
+        queue
+    );
+
+    // initialize all points to cluster 0
+    compute::vector<int_> clusters(n_points, context);
+    compute::fill(clusters.begin(), clusters.end(), 0, queue);
+
+    // create initial means with the first k points
+    compute::vector<float2_> means(k, context);
+    compute::copy_n(points.begin(), k, means.begin(), queue);
+
+    // k-means clustering program source
+    const char k_means_source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        __kernel void assign_clusters(__global const float2 *points,
+                                      __global const float2 *means,
+                                      const int k,
+                                      __global int *clusters)
+        {
+            const uint gid = get_global_id(0);
+
+            const float2 point = points[gid];
+
+            // find the closest cluster
+            float current_distance = 0;
+            int closest_cluster = -1;
+
+            // find closest cluster mean to the point
+            for(int i = 0; i < k; i++){
+                const float2 mean = means[i];
+
+                int distance_to_mean = distance(point, mean);
+                if(closest_cluster == -1 || distance_to_mean < current_distance){
+                    current_distance = distance_to_mean;
+                    closest_cluster = i;
+                }
+            }
+
+            // write new cluster
+            clusters[gid] = closest_cluster;
+        }
+
+        __kernel void update_means(__global const float2 *points,
+                                   const uint n_points,
+                                   __global float2 *means,
+                                   __global const int *clusters)
+        {
+            const uint k = get_global_id(0);
+
+            float2 sum = { 0, 0 };
+            float count = 0;
+            for(uint i = 0; i < n_points; i++){
+                if(clusters[i] == k){
+                    sum += points[i];
+                    count += 1;
+                }
+            }
+
+            means[k] = sum / count;
+        }
+    );
+
+    // build the k-means program
+    compute::program k_means_program =
+        compute::program::build_with_source(k_means_source, context);
+
+    // setup the k-means kernels
+    compute::kernel assign_clusters_kernel(k_means_program, "assign_clusters");
+    assign_clusters_kernel.set_arg(0, points);
+    assign_clusters_kernel.set_arg(1, means);
+    assign_clusters_kernel.set_arg(2, int_(k));
+    assign_clusters_kernel.set_arg(3, clusters);
+
+    compute::kernel update_means_kernel(k_means_program, "update_means");
+    update_means_kernel.set_arg(0, points);
+    update_means_kernel.set_arg(1, int_(n_points));
+    update_means_kernel.set_arg(2, means);
+    update_means_kernel.set_arg(3, clusters);
+
+    // run the k-means algorithm
+    for(int iteration = 0; iteration < 25; iteration++){
+        queue.enqueue_1d_range_kernel(assign_clusters_kernel, 0, n_points, 0);
+        queue.enqueue_1d_range_kernel(update_means_kernel, 0, k, 0);
+    }
+
+    // create output image
+    compute::image2d image(
+        context, width, height, compute::image_format(CL_RGBA, CL_UNSIGNED_INT8)
+    );
+
+    // program with two kernels, one to fill the image with white, and then
+    // one the draw to points calculated in coordinates on the image
+    const char draw_walk_source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        __kernel void draw_points(__global const float2 *points,
+                                  __global const int *clusters,
+                                  __write_only image2d_t image)
+        {
+            const uint i = get_global_id(0);
+            const float2 coord = points[i];
+
+            // map cluster number to color
+            uint4 color = { 0, 0, 0, 0 };
+            switch(clusters[i]){
+              case 0:
+                  color = (uint4)(255, 0, 0, 255);
+                  break;
+              case 1:
+                  color = (uint4)(0, 255, 0, 255);
+                  break;
+              case 2:
+                  color = (uint4)(0, 0, 255, 255);
+                  break;
+              case 3:
+                  color = (uint4)(255, 255, 0, 255);
+                  break;
+              case 4:
+                  color = (uint4)(255, 0, 255, 255);
+                  break;
+              case 5:
+                  color = (uint4)(0, 255, 255, 255);
+                  break;
+            }
+
+            // draw a 3x3 pixel point
+            for(int x = -1; x <= 1; x++){
+                for(int y = -1; y <= 1; y++){
+                    if(coord.x + x > 0 && coord.x + x < get_image_width(image) &&
+                       coord.y + y > 0 && coord.y + y < get_image_height(image)){
+                        write_imageui(image, (int2)(coord.x, coord.y) + (int2)(x, y), color);
+                    }
+                }
+            }
+        }
+
+        __kernel void fill_gray(__write_only image2d_t image)
+        {
+            const int2 coord = { get_global_id(0), get_global_id(1) };
+
+            if(coord.x < get_image_width(image) && coord.y < get_image_height(image)){
+                uint4 gray = { 15, 15, 15, 15 };
+                write_imageui(image, coord, gray);
+            }
+        }
+    );
+
+    // build the program
+    compute::program draw_program =
+        compute::program::build_with_source(draw_walk_source, context);
+
+    // fill image with dark gray
+    compute::kernel fill_kernel(draw_program, "fill_gray");
+    fill_kernel.set_arg(0, image);
+
+    queue.enqueue_nd_range_kernel(
+        fill_kernel, dim(0, 0), dim(width, height), dim(1, 1)
+    );
+
+    // draw points colored according to cluster
+    compute::kernel draw_kernel(draw_program, "draw_points");
+    draw_kernel.set_arg(0, points);
+    draw_kernel.set_arg(1, clusters);
+    draw_kernel.set_arg(2, image);
+    queue.enqueue_1d_range_kernel(draw_kernel, 0, n_points, 0);
+
+    // show image
+    compute::opencv_imshow("k-means", image, queue);
+
+    // wait and return
+    cv::waitKey(0);
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/list_devices.cpp b/src/boost/libs/compute/example/list_devices.cpp
new file mode 100644
index 00000000..b7bdbc94
--- /dev/null
+++ b/src/boost/libs/compute/example/list_devices.cpp
@@ -0,0 +1,45 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <boost/compute/core.hpp>
+
+namespace compute = boost::compute;
+
+int main()
+{
+    std::vector<compute::platform> platforms = compute::system::platforms();
+
+    for(size_t i = 0; i < platforms.size(); i++){
+        const compute::platform &platform = platforms[i];
+
+        std::cout << "Platform '" << platform.name() << "'" << std::endl;
+
+        std::vector<compute::device> devices = platform.devices();
+        for(size_t j = 0; j < devices.size(); j++){
+            const compute::device &device = devices[j];
+
+            std::string type;
+            if(device.type() & compute::device::gpu)
+                type = "GPU Device";
+            else if(device.type() & compute::device::cpu)
+                type = "CPU Device";
+            else if(device.type() & compute::device::accelerator)
+                type = "Accelerator Device";
+            else
+                type = "Unknown Device";
+
+            std::cout << "  " << type << ": " << device.name() << std::endl;
+        }
+    }
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/longest_vector.cpp b/src/boost/libs/compute/example/longest_vector.cpp
new file mode 100644
index 00000000..faada332
--- /dev/null
+++ b/src/boost/libs/compute/example/longest_vector.cpp
@@ -0,0 +1,58 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <iterator>
+
+#include <boost/compute/algorithm/max_element.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/functional/geometry.hpp>
+#include <boost/compute/iterator/transform_iterator.hpp>
+#include <boost/compute/types/fundamental.hpp>
+
+namespace compute = boost::compute;
+
+// this example shows how to use the max_element() algorithm along with
+// a transform_iterator and the length() function to find the longest
+// 4-component vector in an array of vectors
+int main()
+{
+    using compute::float4_;
+
+    // vectors data
+    float data[] = { 1.0f, 2.0f, 3.0f, 0.0f,
+                     4.0f, 5.0f, 6.0f, 0.0f,
+                     7.0f, 8.0f, 9.0f, 0.0f,
+                     0.0f, 0.0f, 0.0f, 0.0f };
+
+    // create device vector with the vector data
+    compute::vector<float4_> vector(
+        reinterpret_cast<float4_ *>(data),
+        reinterpret_cast<float4_ *>(data) + 4
+    );
+
+    // find the longest vector
+    compute::vector<float4_>::const_iterator iter =
+        compute::max_element(
+            compute::make_transform_iterator(
+                vector.begin(), compute::length<float4_>()
+            ),
+            compute::make_transform_iterator(
+                vector.end(), compute::length<float4_>()
+            )
+        ).base();
+
+    // print the index of the longest vector
+    std::cout << "longest vector index: "
+              << std::distance(vector.begin(), iter)
+              << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/mandelbrot.cpp b/src/boost/libs/compute/example/mandelbrot.cpp
new file mode 100644
index 00000000..5ddb3687
--- /dev/null
+++ b/src/boost/libs/compute/example/mandelbrot.cpp
@@ -0,0 +1,224 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+
+#include <QtGlobal>
+#if QT_VERSION >= 0x050000
+#include <QtWidgets>
+#else
+#include <QtGui>
+#endif
+#include <QtOpenGL>
+
+#ifndef Q_MOC_RUN
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/interop/opengl.hpp>
+#include <boost/compute/utility/dim.hpp>
+#include <boost/compute/utility/source.hpp>
+#endif // Q_MOC_RUN
+
+namespace compute = boost::compute;
+
+// opencl source code
+const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+    // map value to color
+    float4 color(uint i)
+    {
+        uchar c = i;
+        uchar x = 35;
+        uchar y = 25;
+        uchar z = 15;
+        uchar max = 255;
+
+        if(i == 256)
+            return (float4)(0, 0, 0, 255);
+        else
+            return (float4)(max-x*i, max-y*i, max-z*i, max) / 255.f;
+    }
+
+    __kernel void mandelbrot(__write_only image2d_t image)
+    {
+        const uint x_coord = get_global_id(0);
+        const uint y_coord = get_global_id(1);
+        const uint width = get_global_size(0);
+        const uint height = get_global_size(1);
+
+        float x_origin = ((float) x_coord / width) * 3.25f - 2.0f;
+        float y_origin = ((float) y_coord / height) * 2.5f - 1.25f;
+
+        float x = 0.0f;
+        float y = 0.0f;
+
+        uint i = 0;
+        while(x*x + y*y <= 4.f && i < 256){
+            float tmp = x*x - y*y + x_origin;
+            y = 2*x*y + y_origin;
+            x = tmp;
+            i++;
+        }
+
+        int2 coord = { x_coord, y_coord };
+        write_imagef(image, coord, color(i));
+    };
+);
+
+class MandelbrotWidget : public QGLWidget
+{
+    Q_OBJECT
+
+public:
+    MandelbrotWidget(QWidget *parent = 0);
+    ~MandelbrotWidget();
+
+    void initializeGL();
+    void resizeGL(int width, int height);
+    void paintGL();
+    void keyPressEvent(QKeyEvent* event);
+
+private:
+    compute::context context_;
+    compute::command_queue queue_;
+    compute::program program_;
+    GLuint gl_texture_;
+    compute::opengl_texture cl_texture_;
+};
+
+MandelbrotWidget::MandelbrotWidget(QWidget *parent)
+    : QGLWidget(parent)
+{
+    gl_texture_ = 0;
+}
+
+MandelbrotWidget::~MandelbrotWidget()
+{
+}
+
+void MandelbrotWidget::initializeGL()
+{
+    // setup opengl
+    glDisable(GL_LIGHTING);
+
+    // create the OpenGL/OpenCL shared context
+    context_ = compute::opengl_create_shared_context();
+
+    // get gpu device
+    compute::device gpu = context_.get_device();
+    std::cout << "device: " << gpu.name() << std::endl;
+
+    // setup command queue
+    queue_ = compute::command_queue(context_, gpu);
+
+    // build mandelbrot program
+    program_ = compute::program::create_with_source(source, context_);
+    program_.build();
+}
+
+void MandelbrotWidget::resizeGL(int width, int height)
+{
+#if QT_VERSION >= 0x050000
+    // scale height/width based on device pixel ratio
+    width /= windowHandle()->devicePixelRatio();
+    height /= windowHandle()->devicePixelRatio();
+#endif
+
+    // resize viewport
+    glViewport(0, 0, width, height);
+
+    // delete old texture
+    if(gl_texture_){
+      glDeleteTextures(1, &gl_texture_);
+      gl_texture_ = 0;
+    }
+
+    // generate new texture
+    glGenTextures(1, &gl_texture_);
+    glBindTexture(GL_TEXTURE_2D, gl_texture_);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+    glTexImage2D(
+        GL_TEXTURE_2D, 0, GL_RGBA, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0
+    );
+
+    // create opencl object for the texture
+    cl_texture_ = compute::opengl_texture(
+        context_, GL_TEXTURE_2D, 0, gl_texture_, CL_MEM_WRITE_ONLY
+    );
+}
+
+void MandelbrotWidget::paintGL()
+{
+    using compute::dim;
+
+    float w = width();
+    float h = height();
+
+    glMatrixMode(GL_PROJECTION);
+    glLoadIdentity();
+    glOrtho(0.0, w, 0.0, h, -1.0, 1.0);
+    glMatrixMode(GL_MODELVIEW);
+    glLoadIdentity();
+
+    // setup the mandelbrot kernel
+    compute::kernel kernel(program_, "mandelbrot");
+    kernel.set_arg(0, cl_texture_);
+
+    // acquire the opengl texture so it can be used in opencl
+    compute::opengl_enqueue_acquire_gl_objects(1, &cl_texture_.get(), queue_);
+
+    // execute the mandelbrot kernel
+    queue_.enqueue_nd_range_kernel(
+        kernel, dim(0, 0), dim(width(), height()), dim(1, 1)
+    );
+
+    // release the opengl texture so it can be used by opengl
+    compute::opengl_enqueue_release_gl_objects(1, &cl_texture_.get(), queue_);
+
+    // ensure opencl is finished before rendering in opengl
+    queue_.finish();
+
+    // draw a single quad with the mandelbrot image texture
+    glEnable(GL_TEXTURE_2D);
+    glBindTexture(GL_TEXTURE_2D, gl_texture_);
+
+    glBegin(GL_QUADS);
+    glTexCoord2f(0, 0); glVertex2f(0, 0);
+    glTexCoord2f(0, 1); glVertex2f(0, h);
+    glTexCoord2f(1, 1); glVertex2f(w, h);
+    glTexCoord2f(1, 0); glVertex2f(w, 0);
+    glEnd();
+}
+
+void MandelbrotWidget::keyPressEvent(QKeyEvent* event)
+{
+    if(event->key() == Qt::Key_Escape) {
+        this->close();
+    }
+}
+
+// the mandelbrot example shows how to create a mandelbrot image in
+// OpenCL and render the image as a texture in OpenGL
+int main(int argc, char *argv[])
+{
+    QApplication app(argc, argv);
+
+    MandelbrotWidget widget;
+    widget.show();
+
+    return app.exec();
+}
+
+#include "mandelbrot.moc"
diff --git a/src/boost/libs/compute/example/mapped_view.cpp b/src/boost/libs/compute/example/mapped_view.cpp
new file mode 100644
index 00000000..1e05107a
--- /dev/null
+++ b/src/boost/libs/compute/example/mapped_view.cpp
@@ -0,0 +1,45 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/reduce.hpp>
+#include <boost/compute/container/mapped_view.hpp>
+
+namespace compute = boost::compute;
+
+// this example demonstrates how to use the mapped_view class to map
+// an array of numbers to device memory and use the reduce() algorithm
+// to calculate the sum.
+int main()
+{
+    // get default device and setup context
+    compute::device gpu = compute::system::default_device();
+    compute::context context(gpu);
+    compute::command_queue queue(context, gpu);
+    std::cout << "device: " << gpu.name() << std::endl;
+
+    // create data on host
+    int data[] = { 4, 2, 3, 7, 8, 9, 1, 6 };
+
+    // create mapped view on device
+    compute::mapped_view<int> view(data, 8, context);
+
+    // use reduce() to calculate sum on the device
+    int sum = 0;
+    compute::reduce(view.begin(), view.end(), &sum, queue);
+
+    // print the sum on the host
+    std::cout << "sum: " << sum << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/matrix_transpose.cpp b/src/boost/libs/compute/example/matrix_transpose.cpp
new file mode 100644
index 00000000..ee9b1e9d
--- /dev/null
+++ b/src/boost/libs/compute/example/matrix_transpose.cpp
@@ -0,0 +1,355 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Benoit Dequidt <benoit.dequidt@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <cstdlib>
+
+#include <boost/program_options.hpp>
+
+#include <boost/compute/core.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/type_traits/type_name.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+namespace po = boost::program_options;
+
+using compute::uint_;
+
+const uint_ TILE_DIM = 32;
+const uint_ BLOCK_ROWS = 8;
+
+// generate a copy kernel program
+compute::kernel make_copy_kernel(const compute::context& context)
+{
+    // source for the copy_kernel program
+    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        __kernel void copy_kernel(__global const float *src, __global float *dst)
+        {
+            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
+            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
+
+            uint width = get_num_groups(0) * TILE_DIM;
+
+            for(uint i = 0 ; i < TILE_DIM ; i+= BLOCK_ROWS){
+                dst[(y+i)*width +x] = src[(y+i)*width + x];
+            }
+        }
+    );
+
+    // setup compilation flags for the copy program
+    std::stringstream options;
+    options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
+
+    // create and build the copy program
+    compute::program program =
+        compute::program::build_with_source(source, context, options.str());
+
+    // create and return the copy kernel
+    return program.create_kernel("copy_kernel");
+}
+
+// generate a naive transpose kernel
+compute::kernel make_naive_transpose_kernel(const compute::context& context)
+{
+    // source for the naive_transpose kernel
+    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        __kernel void naive_transpose(__global const float *src, __global float *dst)
+        {
+            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
+            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
+
+            uint width = get_num_groups(0) * TILE_DIM;
+
+            for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
+                dst[x*width + y+i] = src[(y+i)*width + x];
+            }
+        }
+    );
+
+    // setup compilation flags for the naive_transpose program
+    std::stringstream options;
+    options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
+
+    // create and build the naive_transpose program
+    compute::program program =
+        compute::program::build_with_source(source, context, options.str());
+
+    // create and return the naive_transpose kernel
+    return program.create_kernel("naive_transpose");
+}
+
+// generates a coalesced transpose kernel
+compute::kernel make_coalesced_transpose_kernel(const compute::context& context)
+{
+    // source for the coalesced_transpose kernel
+    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        __kernel void coalesced_transpose(__global const float *src, __global float *dst)
+        {
+            __local float tile[TILE_DIM][TILE_DIM];
+
+            // compute indexes
+            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
+            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
+
+            uint width = get_num_groups(0) * TILE_DIM;
+
+            // load inside local memory
+            for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
+                tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
+            }
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            // transpose indexes
+            x = get_group_id(1) * TILE_DIM + get_local_id(0);
+            y = get_group_id(0) * TILE_DIM + get_local_id(1);
+
+            // write output from local memory
+            for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS){
+                dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
+            }
+        }
+    );
+
+    // setup compilation flags for the coalesced_transpose program
+    std::stringstream options;
+    options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
+
+    // create and build the coalesced_transpose program
+    compute::program program =
+        compute::program::build_with_source(source, context, options.str());
+
+    // create and return coalesced_transpose kernel
+    return program.create_kernel("coalesced_transpose");
+}
+
+// generate a coalesced withtout bank conflicts kernel
+compute::kernel make_coalesced_no_bank_conflicts_kernel(const compute::context& context)
+{
+    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        __kernel void coalesced_no_bank_conflicts(__global const float *src, __global float *dst)
+        {
+            // TILE_DIM+1 is here to avoid bank conflicts in local memory
+            __local float tile[TILE_DIM][TILE_DIM+1];
+
+            // compute indexes
+            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
+            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
+
+            uint width = get_num_groups(0) * TILE_DIM;
+
+            // load inside local memory
+            for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
+                tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
+            }
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            // transpose indexes
+            x = get_group_id(1) * TILE_DIM + get_local_id(0);
+            y = get_group_id(0) * TILE_DIM + get_local_id(1);
+
+            // write output from local memory
+            for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS){
+                dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
+            }
+        }
+    );
+
+    // setup compilation flags for the coalesced_no_bank_conflicts program
+    std::stringstream options;
+    options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
+
+    // create and build the coalesced_no_bank_conflicts program
+    compute::program program =
+        compute::program::build_with_source(source, context, options.str());
+
+    // create and return the coalesced_no_bank_conflicts kernel
+    return program.create_kernel("coalesced_no_bank_conflicts");
+}
+
+// compare 'expectedResult' to 'transposedMatrix'. prints an error message if not equal.
+bool check_transposition(const std::vector<float>& expectedResult,
+                         uint_ size,
+                         const std::vector<float>& transposedMatrix)
+{
+    for(uint_ i = 0 ; i < size ; ++i){
+        if(expectedResult[i] != transposedMatrix[i]){
+            std::cout << "idx = " << i << " , expected " << expectedResult[i]
+                      << " , got " << transposedMatrix[i] << std::endl;
+            std::cout << "FAILED" << std::endl;
+            return false;
+        }
+    }
+    return true;
+}
+
+// generate a matrix inside 'in' and do the tranposition inside 'out'
+void generate_matrix(std::vector<float>& in, std::vector<float>& out, uint_ rows, uint_ cols)
+{
+    // generate a matrix
+    for(uint_ i = 0 ; i < rows ; ++i){
+        for(uint_ j = 0 ; j < cols ; ++j){
+            in[i*cols + j] = i*cols + j;
+        }
+    }
+
+    // store transposed result
+    for(uint_ j = 0; j < cols ; ++j){
+        for(uint_ i = 0 ; i < rows ; ++i){
+            out[j*rows + i] = in[i*cols + j];
+        }
+    }
+}
+
+// neccessary for 64-bit integer on win32
+#ifdef _WIN32
+#define uint64_t unsigned __int64
+#endif
+
+int main(int argc, char *argv[])
+{
+    // setup command line arguments
+    po::options_description options("options");
+    options.add_options()
+        ("help", "show usage instructions")
+        ("rows", po::value<uint_>()->default_value(4096), "number of matrix rows")
+        ("cols", po::value<uint_>()->default_value(4096), "number of matrix columns")
+    ;
+
+    // parse command line
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, options), vm);
+    po::notify(vm);
+
+    // check command line arguments
+    if(vm.count("help")){
+        std::cout << options << std::endl;
+        return 0;
+    }
+
+    // get number rows and columns for the matrix
+    const uint_ rows = vm["rows"].as<uint_>();
+    const uint_ cols = vm["cols"].as<uint_>();
+
+    // get the default device
+    compute::device device = compute::system::default_device();
+
+    // print out device name and matrix information
+    std::cout << "Device: " << device.name() << std::endl;
+    std::cout << "Matrix Size: " << rows << "x" << cols << std::endl;
+    std::cout << "Grid Size: " << rows/TILE_DIM << "x" << cols/TILE_DIM << " blocks" << std::endl;
+    std::cout << "Local Size: " << TILE_DIM << "x" << BLOCK_ROWS << " threads" << std::endl;
+    std::cout << std::endl;
+
+    // On OSX this example does not work on CPU devices
+    #if defined(__APPLE__)
+    if(device.type() & compute::device::cpu) {
+        std::cout << "On OSX this example does not work on CPU devices" << std::endl;
+        return 0;
+    }
+    #endif
+
+    const size_t global_work_size[2] = {rows, cols*BLOCK_ROWS/TILE_DIM};
+    const size_t local_work_size[2] = {TILE_DIM, BLOCK_ROWS};
+
+    // setup input data on the host
+    const uint_ size = rows * cols;
+    std::vector<float> h_input(size);
+    std::vector<float> h_output(size);
+    std::vector<float> expectedResult(size);
+    generate_matrix(h_input, expectedResult, rows, cols);
+
+    // create a context for the device
+    compute::context context(device);
+
+    // device vectors
+    compute::vector<float> d_input(size, context);
+    compute::vector<float> d_output(size, context);
+
+    // command_queue with profiling
+    compute::command_queue queue(context, device, compute::command_queue::enable_profiling);
+
+    // copy input data
+    compute::copy(h_input.begin(), h_input.end(), d_input.begin(), queue);
+
+    // simple copy kernel
+    std::cout << "Testing copy_kernel:" << std::endl;
+    compute::kernel kernel = make_copy_kernel(context);
+    kernel.set_arg(0, d_input);
+    kernel.set_arg(1, d_output);
+
+    compute::event start;
+    start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
+    queue.finish();
+    uint64_t elapsed = start.duration<boost::chrono::nanoseconds>().count();
+
+    std::cout << "  Elapsed: " << elapsed << " ns" << std::endl;
+    std::cout << "  BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
+    compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);
+
+    check_transposition(h_input, rows*cols, h_output);
+    std::cout << std::endl;
+
+    // naive_transpose kernel
+    std::cout << "Testing naive_transpose:" << std::endl;
+    kernel = make_naive_transpose_kernel(context);
+    kernel.set_arg(0, d_input);
+    kernel.set_arg(1, d_output);
+
+    start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
+    queue.finish();
+    elapsed = start.duration<boost::chrono::nanoseconds>().count();
+    std::cout << "  Elapsed: " << elapsed << " ns" << std::endl;
+    std::cout << "  BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
+    compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);
+
+    check_transposition(expectedResult, rows*cols, h_output);
+    std::cout << std::endl;
+
+    // coalesced_transpose kernel
+    std::cout << "Testing coalesced_transpose:" << std::endl;
+    kernel = make_coalesced_transpose_kernel(context);
+    kernel.set_arg(0, d_input);
+    kernel.set_arg(1, d_output);
+
+    start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
+    queue.finish();
+    elapsed = start.duration<boost::chrono::nanoseconds>().count();
+    std::cout << "  Elapsed: " << elapsed << " ns" << std::endl;
+    std::cout << "  BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
+
+    compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);
+
+    check_transposition(expectedResult, rows*cols, h_output);
+    std::cout << std::endl;
+
+    // coalesced_no_bank_conflicts kernel
+    std::cout << "Testing coalesced_no_bank_conflicts:" << std::endl;
+
+    kernel = make_coalesced_no_bank_conflicts_kernel(context);
+    kernel.set_arg(0, d_input);
+    kernel.set_arg(1, d_output);
+
+    start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
+    queue.finish();
+    elapsed = start.duration<boost::chrono::nanoseconds>().count();
+    std::cout << "  Elapsed: " << elapsed << " ns" << std::endl;
+    std::cout << "  BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
+
+    compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);
+
+    check_transposition(expectedResult, rows*cols, h_output);
+    std::cout << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/memory_limits.cpp b/src/boost/libs/compute/example/memory_limits.cpp
new file mode 100644
index 00000000..8d0e4a7a
--- /dev/null
+++ b/src/boost/libs/compute/example/memory_limits.cpp
@@ -0,0 +1,37 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <boost/compute/core.hpp>
+
+namespace compute = boost::compute;
+
+int main()
+{
+    // get the default device
+    compute::device device = compute::system::default_device();
+
+    std::cout << "device: " << device.name() << std::endl;
+    std::cout << "  global memory size: "
+              << device.get_info<cl_ulong>(CL_DEVICE_GLOBAL_MEM_SIZE) / 1024 / 1024
+              << " MB"
+              << std::endl;
+    std::cout << "  local memory size: "
+              << device.get_info<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE) / 1024
+              << " KB"
+              << std::endl;
+    std::cout << "  constant memory size: "
+              << device.get_info<cl_ulong>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) / 1024
+              << " KB"
+              << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/monte_carlo.cpp b/src/boost/libs/compute/example/monte_carlo.cpp
new file mode 100644
index 00000000..8ae26209
--- /dev/null
+++ b/src/boost/libs/compute/example/monte_carlo.cpp
@@ -0,0 +1,73 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <boost/compute/function.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/count_if.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/random/default_random_engine.hpp>
+#include <boost/compute/types/fundamental.hpp>
+
+namespace compute = boost::compute;
+
+int main()
+{
+    // get default device and setup context
+    compute::device gpu = compute::system::default_device();
+    compute::context context(gpu);
+    compute::command_queue queue(context, gpu);
+
+    std::cout << "device: " << gpu.name() << std::endl;
+
+    using compute::uint_;
+    using compute::uint2_;
+
+
+#ifdef CI_BUILD // lower number of points for CI builds
+    size_t n = 2000;
+#else
+    // ten million random points
+    size_t n = 10000000;
+#endif
+
+    // generate random numbers
+    compute::default_random_engine rng(queue);
+    compute::vector<uint_> vector(n * 2, context);
+    rng.generate(vector.begin(), vector.end(), queue);
+
+    // function returing true if the point is within the unit circle
+    BOOST_COMPUTE_FUNCTION(bool, is_in_unit_circle, (const uint2_ point),
+    {
+        const float x = point.x / (float) UINT_MAX - 1;
+        const float y = point.y / (float) UINT_MAX - 1;
+
+        return (x*x + y*y) < 1.0f;
+    });
+
+    // iterate over vector<uint> as vector<uint2>
+    compute::buffer_iterator<uint2_> start =
+        compute::make_buffer_iterator<uint2_>(vector.get_buffer(), 0);
+    compute::buffer_iterator<uint2_> end =
+        compute::make_buffer_iterator<uint2_>(vector.get_buffer(), vector.size() / 2);
+
+    // count number of random points within the unit circle
+    size_t count = compute::count_if(start, end, is_in_unit_circle, queue);
+
+    // print out values
+    float count_f = static_cast<float>(count);
+    std::cout << "count: " << count << " / " << n << std::endl;
+    std::cout << "ratio: " << count_f / float(n) << std::endl;
+    std::cout << "pi = " << (count_f / float(n)) * 4.0f << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/nbody.cpp b/src/boost/libs/compute/example/nbody.cpp
new file mode 100644
index 00000000..9379c63b
--- /dev/null
+++ b/src/boost/libs/compute/example/nbody.cpp
@@ -0,0 +1,236 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Fabian Köhler <fabian2804@googlemail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#define GL_GLEXT_PROTOTYPES
+#ifdef __APPLE__
+#include <OpenGL/gl.h>
+#include <OpenGL/glext.h>
+#else
+#include <GL/gl.h>
+#include <GL/glext.h>
+#endif
+
+#include <QtGlobal>
+#if QT_VERSION >= 0x050000
+#include <QtWidgets>
+#else
+#include <QtGui>
+#endif
+#include <QtOpenGL>
+#include <QTimer>
+
+#include <boost/program_options.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include <boost/random/mersenne_twister.hpp>
+
+#ifndef Q_MOC_RUN
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/interop/opengl.hpp>
+#include <boost/compute/utility/source.hpp>
+#endif // Q_MOC_RUN
+
+namespace compute = boost::compute;
+namespace po = boost::program_options;
+
+using compute::uint_;
+using compute::float4_;
+
+const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+    __kernel void updateVelocity(__global const float4* position, __global float4* velocity, float dt, uint N)
+    {
+        uint gid = get_global_id(0);
+
+        float4 r = { 0.0f, 0.0f, 0.0f, 0.0f };
+        float f = 0.0f;
+        for(uint i = 0; i != gid; i++) {
+            if(i != gid) {
+                r = position[i]-position[gid];
+                f = length(r)+0.001f;
+                f *= f*f;
+                f = dt/f;
+                velocity[gid] += f*r;
+            }
+        }
+    }
+    __kernel void updatePosition(__global float4* position, __global const float4* velocity, float dt)
+    {
+        uint gid = get_global_id(0);
+
+        position[gid].xyz += dt*velocity[gid].xyz;
+    }
+);
+
+class NBodyWidget : public QGLWidget
+{
+    Q_OBJECT
+
+public:
+    NBodyWidget(std::size_t particles, float dt, QWidget* parent = 0);
+    ~NBodyWidget();
+
+    void initializeGL();
+    void resizeGL(int width, int height);
+    void paintGL();
+    void updateParticles();
+    void keyPressEvent(QKeyEvent* event);
+
+private:
+    QTimer* timer;
+
+    compute::context m_context;
+    compute::command_queue m_queue;
+    compute::program m_program;
+    compute::opengl_buffer m_position;
+    compute::vector<float4_>* m_velocity;
+    compute::kernel m_velocity_kernel;
+    compute::kernel m_position_kernel;
+
+    bool m_initial_draw;
+
+    const uint_ m_particles;
+    const float m_dt;
+};
+
+NBodyWidget::NBodyWidget(std::size_t particles, float dt, QWidget* parent)
+    : QGLWidget(parent), m_initial_draw(true), m_particles(particles), m_dt(dt)
+{
+    // create a timer to redraw as fast as possible
+    timer = new QTimer(this);
+    connect(timer, SIGNAL(timeout()), this, SLOT(updateGL()));
+    timer->start(1);
+}
+
+NBodyWidget::~NBodyWidget()
+{
+    delete m_velocity;
+
+    // delete the opengl buffer
+    GLuint vbo = m_position.get_opengl_object();
+    glDeleteBuffers(1, &vbo);
+}
+
+void NBodyWidget::initializeGL()
+{
+    // create context, command queue and program
+    m_context = compute::opengl_create_shared_context();
+    m_queue = compute::command_queue(m_context, m_context.get_device());
+    m_program = compute::program::create_with_source(source, m_context);
+    m_program.build();
+
+    // prepare random particle positions that will be transferred to the vbo
+    float4_* temp = new float4_[m_particles];
+    boost::random::uniform_real_distribution<float> dist(-0.5f, 0.5f);
+    boost::random::mt19937_64 gen;
+    for(size_t i = 0; i < m_particles; i++) {
+        temp[i][0] = dist(gen);
+        temp[i][1] = dist(gen);
+        temp[i][2] = dist(gen);
+        temp[i][3] = 1.0f;
+    }
+
+    // create an OpenGL vbo
+    GLuint vbo = 0;
+    glGenBuffers(1, &vbo);
+    glBindBuffer(GL_ARRAY_BUFFER, vbo);
+    glBufferData(GL_ARRAY_BUFFER, m_particles*sizeof(float4_), temp, GL_DYNAMIC_DRAW);
+
+    // create a OpenCL buffer from the vbo
+    m_position = compute::opengl_buffer(m_context, vbo);
+    delete[] temp;
+
+    // create buffer for velocities
+    m_velocity = new compute::vector<float4_>(m_particles, m_context);
+    compute::fill(m_velocity->begin(), m_velocity->end(), float4_(0.0f, 0.0f, 0.0f, 0.0f), m_queue);
+
+    // create compute kernels
+    m_velocity_kernel = m_program.create_kernel("updateVelocity");
+    m_velocity_kernel.set_arg(0, m_position);
+    m_velocity_kernel.set_arg(1, m_velocity->get_buffer());
+    m_velocity_kernel.set_arg(2, m_dt);
+    m_velocity_kernel.set_arg(3, m_particles);
+    m_position_kernel = m_program.create_kernel("updatePosition");
+    m_position_kernel.set_arg(0, m_position);
+    m_position_kernel.set_arg(1, m_velocity->get_buffer());
+    m_position_kernel.set_arg(2, m_dt);
+}
+void NBodyWidget::resizeGL(int width, int height)
+{
+    // update viewport
+    glViewport(0, 0, width, height);
+}
+void NBodyWidget::paintGL()
+{
+    // clear buffer
+    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+
+    // check if this is the first draw
+    if(m_initial_draw) {
+        // do not update particles
+        m_initial_draw = false;
+    } else {
+        // update particles
+        updateParticles();
+    }
+
+    // draw
+    glVertexPointer(4, GL_FLOAT, 0, 0);
+    glEnableClientState(GL_VERTEX_ARRAY);
+    glDrawArrays(GL_POINTS, 0, m_particles);
+    glFinish();
+}
+void NBodyWidget::updateParticles()
+{
+    // enqueue kernels to update particles and make sure that the command queue is finished
+    compute::opengl_enqueue_acquire_buffer(m_position, m_queue);
+    m_queue.enqueue_1d_range_kernel(m_velocity_kernel, 0, m_particles, 0).wait();
+    m_queue.enqueue_1d_range_kernel(m_position_kernel, 0, m_particles, 0).wait();
+    m_queue.finish();
+    compute::opengl_enqueue_release_buffer(m_position, m_queue);
+}
+void NBodyWidget::keyPressEvent(QKeyEvent* event)
+{
+    if(event->key() == Qt::Key_Escape) {
+        this->close();
+    }
+}
+
+int main(int argc, char** argv)
+{
+    // parse command line arguments
+    po::options_description options("options");
+    options.add_options()
+        ("help", "show usage")
+        ("particles", po::value<uint_>()->default_value(1000), "number of particles")
+        ("dt", po::value<float>()->default_value(0.00001f), "width of each integration step");
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, options), vm);
+    po::notify(vm);
+
+    if(vm.count("help") > 0) {
+        std::cout << options << std::endl;
+        return 0;
+    }
+
+    const uint_ particles = vm["particles"].as<uint_>();
+    const float dt = vm["dt"].as<float>();
+
+    QApplication app(argc, argv);
+    NBodyWidget nbody(particles, dt);
+
+    nbody.show();
+
+    return app.exec();
+}
+
+#include "nbody.moc"
diff --git a/src/boost/libs/compute/example/opencl_test.cpp b/src/boost/libs/compute/example/opencl_test.cpp
new file mode 100644
index 00000000..15b85202
--- /dev/null
+++ b/src/boost/libs/compute/example/opencl_test.cpp
@@ -0,0 +1,165 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+// See boost/compute/detail/diagnostic.hpp
+// GCC
+#if ((__GNUC__ * 100) + __GNUC_MINOR__) >= 402
+#define BOOST_COMPUTE_GCC_DIAG_STR(s) #s
+#define BOOST_COMPUTE_GCC_DIAG_JOINSTR(x,y) BOOST_COMPUTE_GCC_DIAG_STR(x ## y)
+# define BOOST_COMPUTE_GCC_DIAG_DO_PRAGMA(x) _Pragma (#x)
+# define BOOST_COMPUTE_GCC_DIAG_PRAGMA(x) BOOST_COMPUTE_GCC_DIAG_DO_PRAGMA(GCC diagnostic x)
+# if ((__GNUC__ * 100) + __GNUC_MINOR__) >= 406
+#  define BOOST_COMPUTE_GCC_DIAG_OFF(x) BOOST_COMPUTE_GCC_DIAG_PRAGMA(push) \
+      BOOST_COMPUTE_GCC_DIAG_PRAGMA(ignored BOOST_COMPUTE_GCC_DIAG_JOINSTR(-W,x))
+#  define BOOST_COMPUTE_GCC_DIAG_ON(x) BOOST_COMPUTE_GCC_DIAG_PRAGMA(pop)
+# else
+#  define BOOST_COMPUTE_GCC_DIAG_OFF(x) \
+      BOOST_COMPUTE_GCC_DIAG_PRAGMA(ignored BOOST_COMPUTE_GCC_DIAG_JOINSTR(-W,x))
+#  define BOOST_COMPUTE_GCC_DIAG_ON(x) \
+      BOOST_COMPUTE_GCC_DIAG_PRAGMA(warning BOOST_COMPUTE_GCC_DIAG_JOINSTR(-W,x))
+# endif
+#else // Ensure these macros do nothing for other compilers.
+# define BOOST_COMPUTE_GCC_DIAG_OFF(x)
+# define BOOST_COMPUTE_GCC_DIAG_ON(x)
+#endif
+
+// Clang
+#ifdef __clang__
+#  define BOOST_COMPUTE_CLANG_DIAG_STR(s) # s
+// stringize s to "no-sign-compare"
+#  define BOOST_COMPUTE_CLANG_DIAG_JOINSTR(x,y) BOOST_COMPUTE_CLANG_DIAG_STR(x ## y)
+//  join -W with no-unused-variable to "-Wno-sign-compare"
+#  define BOOST_COMPUTE_CLANG_DIAG_DO_PRAGMA(x) _Pragma (#x)
+// _Pragma is unary operator  #pragma ("")
+#  define BOOST_COMPUTE_CLANG_DIAG_PRAGMA(x) \
+      BOOST_COMPUTE_CLANG_DIAG_DO_PRAGMA(clang diagnostic x)
+#  define BOOST_COMPUTE_CLANG_DIAG_OFF(x) BOOST_COMPUTE_CLANG_DIAG_PRAGMA(push) \
+      BOOST_COMPUTE_CLANG_DIAG_PRAGMA(ignored BOOST_COMPUTE_CLANG_DIAG_JOINSTR(-W,x))
+// For example: #pragma clang diagnostic ignored "-Wno-sign-compare"
+#  define BOOST_COMPUTE_CLANG_DIAG_ON(x) BOOST_COMPUTE_CLANG_DIAG_PRAGMA(pop)
+// For example: #pragma clang diagnostic warning "-Wno-sign-compare"
+#else // Ensure these macros do nothing for other compilers.
+#  define BOOST_COMPUTE_CLANG_DIAG_OFF(x)
+#  define BOOST_COMPUTE_CLANG_DIAG_ON(x)
+#  define BOOST_COMPUTE_CLANG_DIAG_PRAGMA(x)
+#endif
+
+// MSVC
+#if defined(_MSC_VER)
+#  define BOOST_COMPUTE_MSVC_DIAG_DO_PRAGMA(x) __pragma(x)
+#  define BOOST_COMPUTE_MSVC_DIAG_PRAGMA(x) \
+          BOOST_COMPUTE_MSVC_DIAG_DO_PRAGMA(warning(x))
+#  define BOOST_COMPUTE_MSVC_DIAG_OFF(x) BOOST_COMPUTE_MSVC_DIAG_PRAGMA(push) \
+          BOOST_COMPUTE_MSVC_DIAG_PRAGMA(disable: x)
+#  define BOOST_COMPUTE_MSVC_DIAG_ON(x) BOOST_COMPUTE_MSVC_DIAG_PRAGMA(pop)
+#else // Ensure these macros do nothing for other compilers.
+#  define BOOST_COMPUTE_MSVC_DIAG_OFF(x)
+#  define BOOST_COMPUTE_MSVC_DIAG_ON(x)
+#endif
+
+#include <iostream>
+
+// include the proper opencl header for the system
+#if defined(__APPLE__)
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+// the opencl_test example displays the opencl platforms and devices found
+// on the system using the opencl api directly. if this test fails to compile
+// and/or run, there is a problem with the opencl implementation found on the
+// system. users should ensure this test runs successfuly before using any of
+// the boost.compute apis (which depend on a working opencl implementation).
+int main()
+{
+    // Suppress deprecated declarations warning
+    BOOST_COMPUTE_MSVC_DIAG_OFF(4996); // MSVC
+    BOOST_COMPUTE_GCC_DIAG_OFF(deprecated-declarations); // GCC
+    BOOST_COMPUTE_CLANG_DIAG_OFF(deprecated-declarations); // Clang
+
+    // query number of opencl platforms
+    cl_uint num_platforms = 0;
+    cl_int ret = clGetPlatformIDs(0, NULL, &num_platforms);
+    if(ret != CL_SUCCESS){
+        std::cerr << "failed to query platforms: " << ret << std::endl;
+        return -1;
+    }
+
+    // check that at least one platform was found
+    if(num_platforms == 0){
+        std::cerr << "found 0 platforms" << std::endl;
+        return 0;
+    }
+
+    // get platform ids
+    cl_platform_id *platforms = new cl_platform_id[num_platforms];
+    clGetPlatformIDs(num_platforms, platforms, NULL);
+
+    // iterate through each platform and query its devices
+    for(cl_uint i = 0; i < num_platforms; i++){
+        cl_platform_id platform = platforms[i];
+
+        // query number of opencl devices
+        cl_uint num_devices = 0;
+        ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+        if(ret != CL_SUCCESS){
+            std::cerr << "failed to lookup devices for platform " << i << std::endl;
+            continue;
+        }
+
+        // print number of devices found
+        std::cout << "platform " << i << " has " << num_devices << " devices:" << std::endl;
+
+        // get device ids for the platform
+        cl_device_id *devices = new cl_device_id[num_devices];
+        ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+        if(ret != CL_SUCCESS){
+            std::cerr << "failed to query platform devices" << std::endl;
+            delete[] devices;
+            continue;
+        }
+
+        // iterate through each device on the platform and print its name
+        for(cl_uint j = 0; j < num_devices; j++){
+            cl_device_id device = devices[j];
+
+            // get length of the device name string
+            size_t name_length = 0;
+            ret = clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &name_length);
+            if(ret != CL_SUCCESS){
+                std::cerr << "failed to query device name length for device " << j << std::endl;
+                continue;
+            }
+
+            // get the device name string
+            char *name = new char[name_length];
+            ret = clGetDeviceInfo(device, CL_DEVICE_NAME, name_length, name, NULL);
+            if(ret != CL_SUCCESS){
+                std::cerr << "failed to query device name string for device " << j << std::endl;
+                delete[] name;
+                continue;
+            }
+
+            // print out the device name
+            std::cout << "  device: " << name << std::endl;
+
+            delete[] name;
+        }
+        delete[] devices;
+    }
+    delete[] platforms;
+
+    BOOST_COMPUTE_CLANG_DIAG_ON(deprecated-declarations); // Clang
+    BOOST_COMPUTE_GCC_DIAG_ON(deprecated-declarations); // GCC
+    BOOST_COMPUTE_MSVC_DIAG_OFF(4996); // MSVC
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/opencv_convolution.cpp b/src/boost/libs/compute/example/opencv_convolution.cpp
new file mode 100644
index 00000000..7ba53436
--- /dev/null
+++ b/src/boost/libs/compute/example/opencv_convolution.cpp
@@ -0,0 +1,265 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Mageswaran.D <mageswaran1989@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <string>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/interop/opencv/core.hpp>
+#include <boost/compute/interop/opencv/highgui.hpp>
+#include <boost/compute/utility/source.hpp>
+
+#include <boost/program_options.hpp>
+
+namespace compute = boost::compute;
+namespace po = boost::program_options;
+
+// Create convolution program
+const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE (
+    __kernel void convolution(__read_only  image2d_t  sourceImage,
+                              __write_only image2d_t  outputImage,
+                              __constant float* filter,
+                              int filterWidth)
+    {
+        const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                                  CLK_ADDRESS_CLAMP_TO_EDGE   |
+                                  CLK_FILTER_NEAREST;
+
+        // Store each work-item's unique row and column
+        int x   = get_global_id(0);
+        int y   = get_global_id(1);
+
+        // Half the width of the filter is needed for indexing
+        // memory later
+        int halfWidth = (int)(filterWidth/2);
+
+        // All accesses to images return data as four-element vector
+        // (i.e., float4).
+        float4 sum = {0.0f, 0.0f, 0.0f, 0.0f};
+
+        // Iterator for the filter
+        int filterIdx = 0;
+
+        // Each work-item iterates around its local area based on the
+        // size of the filter
+        int2 coords;  // Coordinates for accessing the image
+
+        // Iterate the filter rows
+        for(int i = -halfWidth; i <= halfWidth; i++)
+        {
+            coords.y = y + i;
+
+            // Iterate over the filter columns
+            for(int j = -halfWidth; j <= halfWidth; j++)
+            {
+                coords.x = x + j;
+
+                float4 pixel;
+
+                // Read a pixel from the image.
+                // Work on a channel
+                pixel = read_imagef(sourceImage, sampler, coords);
+                sum.x += pixel.x * filter[filterIdx++];
+                //sum.y += pixel.y * filter[filterIdx++];
+                //sum.z += pixel.z * filter[filterIdx++];
+            }
+        }
+
+         barrier(CLK_GLOBAL_MEM_FENCE);
+        // Copy the data to the output image if the
+        // work-item is in bounds
+        if(y < get_image_height(sourceImage) &&
+                x < get_image_width(sourceImage))
+        {
+            coords.x = x;
+            coords.y = y;
+
+            //Same channel is copied in all three channels
+            //write_imagef(outputImage, coords,
+                        // (float4)(sum.x,sum.x,sum.x,1.0f));
+
+            write_imagef(outputImage, coords, sum);
+        }
+    }
+);
+
+// This example shows how to read two images or use camera
+// with OpenCV, transfer the frames to the GPU,
+// and apply a convolution written in OpenCL
+int main(int argc, char *argv[])
+{
+    ///////////////////////////////////////////////////////////////////////////
+
+    // setup the command line arguments
+    po::options_description desc;
+    desc.add_options()
+            ("help",  "show available options")
+            ("camera", po::value<int>()->default_value(-1),
+                                 "if not default camera, specify a camera id")
+            ("image", po::value<std::string>(), "path to image file");
+
+    // Parse the command lines
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    //check the command line arguments
+    if(vm.count("help"))
+    {
+        std::cout << desc << std::endl;
+        return 0;
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    //OpenCV variables
+    cv::Mat cv_mat;
+    cv::VideoCapture cap; //OpenCV camera handle.
+
+    //Filter Variables
+    float filter[9] =  {
+                -1.0,      0.0,      1.0,
+                -2.0,      0.0,      2.0,
+                -1.0,      0.0,      1.0,
+                };
+
+    // The convolution filter is 3x3
+    int filterWidth = 3;
+
+    //OpenCL variables
+    // Get default device and setup context
+    compute::device gpu = compute::system::default_device();
+    compute::context context(gpu);
+    compute::command_queue queue(context, gpu);
+    compute::buffer dev_filter(context, sizeof(filter),
+                               compute::memory_object::read_only |
+                               compute::memory_object::copy_host_ptr,
+                               filter);
+
+    compute::program filter_program =
+            compute::program::create_with_source(source, context);
+
+    try
+    {
+        filter_program.build();
+    }
+    catch(compute::opencl_error e)
+    {
+        std::cout<<"Build Error: "<<std::endl
+                 <<filter_program.build_log();
+    return -1;
+    }
+
+    // create fliter kernel and set arguments
+    compute::kernel filter_kernel(filter_program, "convolution");
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    //check for image paths
+    if(vm.count("image"))
+    {
+        // Read image with OpenCV
+        cv_mat = cv::imread(vm["image"].as<std::string>(),
+                                       CV_LOAD_IMAGE_COLOR);
+        if(!cv_mat.data){
+            std::cerr << "Failed to load image" << std::endl;
+            return -1;
+        }
+    }
+    else //by default use camera
+    {
+        //open camera
+        cap.open(vm["camera"].as<int>());
+        // read first frame
+        cap >> cv_mat;
+        if(!cv_mat.data){
+            std::cerr << "failed to capture frame" << std::endl;
+            return -1;
+        }
+    }
+
+    // Convert image to BGRA (OpenCL requires 16-byte aligned data)
+    cv::cvtColor(cv_mat, cv_mat, CV_BGR2BGRA);
+
+    // Transfer image/frame data to gpu
+    compute::image2d dev_input_image =
+            compute::opencv_create_image2d_with_mat(
+                cv_mat, compute::image2d::read_write, queue
+                );
+
+    // Create output image
+    // Be sure what will be your ouput image/frame size
+    compute::image2d dev_output_image(
+                context,
+                dev_input_image.width(),
+                dev_input_image.height(),
+                dev_input_image.format(),
+                compute::image2d::write_only
+                );
+
+    filter_kernel.set_arg(0, dev_input_image);
+    filter_kernel.set_arg(1, dev_output_image);
+    filter_kernel.set_arg(2, dev_filter);
+    filter_kernel.set_arg(3, filterWidth);
+
+    // run flip kernel
+    size_t origin[2] = { 0, 0 };
+    size_t region[2] = { dev_input_image.width(),
+                         dev_input_image.height() };
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    queue.enqueue_nd_range_kernel(filter_kernel, 2, origin, region, 0);
+
+    //check for image paths
+    if(vm.count("image"))
+    {
+        // show host image
+        cv::imshow("Original Image", cv_mat);
+
+        // show gpu image
+        compute::opencv_imshow("Convoluted Image", dev_output_image, queue);
+
+        // wait and return
+        cv::waitKey(0);
+    }
+    else
+    {
+        char key = '\0';
+        while(key != 27) //check for escape key
+        {
+            cap >> cv_mat;
+
+            // Convert image to BGRA (OpenCL requires 16-byte aligned data)
+            cv::cvtColor(cv_mat, cv_mat, CV_BGR2BGRA);
+
+            // Update the device image memory with current frame data
+            compute::opencv_copy_mat_to_image(cv_mat,
+                                              dev_input_image,queue);
+
+            // Run the kernel on the device
+            queue.enqueue_nd_range_kernel(filter_kernel, 2, origin, region, 0);
+
+            // Show host image
+            cv::imshow("Camera Frame", cv_mat);
+
+            // Show GPU image
+            compute::opencv_imshow("Convoluted Frame", dev_output_image, queue);
+
+            // wait
+            key = cv::waitKey(10);
+        }
+    }
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/opencv_flip.cpp b/src/boost/libs/compute/example/opencv_flip.cpp
new file mode 100644
index 00000000..8cc26ef2
--- /dev/null
+++ b/src/boost/libs/compute/example/opencv_flip.cpp
@@ -0,0 +1,101 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/interop/opencv/core.hpp>
+#include <boost/compute/interop/opencv/highgui.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+// this example shows how to read an image with OpenCV, transfer the
+// image to the GPU, and apply a simple flip filter written in OpenCL
+int main(int argc, char *argv[])
+{
+    // check command line
+    if(argc < 2){
+        std::cerr << "usage: " << argv[0] << " FILENAME" << std::endl;
+        return -1;
+    }
+
+    // read image with opencv
+    cv::Mat cv_image = cv::imread(argv[1], CV_LOAD_IMAGE_COLOR);
+    if(!cv_image.data){
+        std::cerr << "failed to load image" << std::endl;
+        return -1;
+    }
+
+    // get default device and setup context
+    compute::device gpu = compute::system::default_device();
+    compute::context context(gpu);
+    compute::command_queue queue(context, gpu);
+
+    // convert image to BGRA (OpenCL requires 16-byte aligned data)
+    cv::cvtColor(cv_image, cv_image, CV_BGR2BGRA);
+
+    // transfer image to gpu
+    compute::image2d input_image =
+        compute::opencv_create_image2d_with_mat(
+            cv_image, compute::image2d::read_write, queue
+        );
+
+    // create output image
+    compute::image2d output_image(
+        context,
+        input_image.width(),
+        input_image.height(),
+        input_image.format(),
+        compute::image2d::write_only
+    );
+
+    // create flip program
+    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        __kernel void flip_kernel(__read_only image2d_t input,
+                                  __write_only image2d_t output)
+        {
+            const sampler_t sampler = CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+            int height = get_image_height(input);
+            int2 input_coord = { get_global_id(0), get_global_id(1) };
+            int2 output_coord = { input_coord.x, height - input_coord.y - 1 };
+            float4 value = read_imagef(input, sampler, input_coord);
+            write_imagef(output, output_coord, value);
+        }
+    );
+
+    compute::program flip_program =
+        compute::program::create_with_source(source, context);
+    flip_program.build();
+
+    // create flip kernel and set arguments
+    compute::kernel flip_kernel(flip_program, "flip_kernel");
+    flip_kernel.set_arg(0, input_image);
+    flip_kernel.set_arg(1, output_image);
+
+    // run flip kernel
+    size_t origin[2] = { 0, 0 };
+    size_t region[2] = { input_image.width(), input_image.height() };
+    queue.enqueue_nd_range_kernel(flip_kernel, 2, origin, region, 0);
+
+    // show host image
+    cv::imshow("opencv image", cv_image);
+
+    // show gpu image
+    compute::opencv_imshow("filtered image", output_image, queue);
+
+    // wait and return
+    cv::waitKey(0);
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/opencv_histogram.cpp b/src/boost/libs/compute/example/opencv_histogram.cpp
new file mode 100644
index 00000000..e339030b
--- /dev/null
+++ b/src/boost/libs/compute/example/opencv_histogram.cpp
@@ -0,0 +1,228 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Mageswaran.D <mageswaran1989@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+//Code sample for calculating histogram using OpenCL and
+//displaying image histogram in OpenCV.
+
+#include <iostream>
+#include <string>
+
+#include <opencv2/imgproc/imgproc.hpp>
+#include <opencv2/highgui/highgui.hpp>
+
+#include <boost/compute/source.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/interop/opencv/core.hpp>
+#include <boost/compute/interop/opencv/highgui.hpp>
+#include <boost/program_options.hpp>
+
+namespace compute = boost::compute;
+namespace po = boost::program_options;
+
+// number of bins
+int histSize = 256;
+
+// Set the ranges ( for B,G,R) )
+// TryOut: consider the range in kernel calculation
+float range[] = { 0, 256 } ;
+const float* histRange = { range };
+
+// Create naive histogram program
+// Needs "cl_khr_local_int32_base_atomics" extension
+const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE (
+    __kernel void histogram(read_only image2d_t src_image,
+                            __global int* b_hist,
+                            __global int* g_hist,
+                            __global int* r_hist)
+            {
+                sampler_t sampler =( CLK_NORMALIZED_COORDS_FALSE |
+                                     CLK_FILTER_NEAREST |
+                                     CLK_ADDRESS_CLAMP_TO_EDGE);
+
+                int image_width = get_image_width(src_image);
+                int image_height = get_image_height(src_image);
+
+                int2 coords  = (int2)(get_global_id(0), get_global_id(1));
+                float4 pixel = read_imagef(src_image,sampler, coords);
+
+                //boundary condition
+                if ((coords.x < image_width) && (coords.y < image_height))
+                {
+                    uchar indx_x, indx_y, indx_z;
+                    indx_x = convert_uchar_sat(pixel.x * 255.0f);
+                    indx_y = convert_uchar_sat(pixel.y * 255.0f);
+                    indx_z = convert_uchar_sat(pixel.z * 255.0f);
+
+                    atomic_inc(&b_hist[(uint)indx_z]);
+                    atomic_inc(&g_hist[(uint)indx_y]);
+                    atomic_inc(&r_hist[(uint)indx_x]);
+                }
+            }
+);
+
+inline void showHistogramWindow(cv::Mat &b_hist, cv::Mat &g_hist, cv::Mat &r_hist,
+                         std::string window_name)
+{
+    // Draw the histograms for B, G and R
+    int hist_w = 1024;
+    int hist_h = 768;
+    int bin_w = cvRound((double)hist_w/histSize);
+
+    cv::Mat histImage(hist_h, hist_w, CV_8UC3, cv::Scalar(0,0,0));
+
+    // Normalize the result to [ 0, histImage.rows ]
+    cv::normalize(b_hist, b_hist, 0, histImage.rows, cv::NORM_MINMAX, -1, cv::Mat());
+    cv::normalize(g_hist, g_hist, 0, histImage.rows, cv::NORM_MINMAX, -1, cv::Mat());
+    cv::normalize(r_hist, r_hist, 0, histImage.rows, cv::NORM_MINMAX, -1, cv::Mat());
+
+    // Draw for each channel
+    for (int i = 1; i < histSize; i++ )
+    {
+        cv::line(histImage,
+                 cv::Point(bin_w*(i-1), hist_h - cvRound(b_hist.at<float>(i-1))),
+                 cv::Point(bin_w*(i), hist_h - cvRound(b_hist.at<float>(i))),
+                 cv::Scalar(255, 0, 0),
+                 2,
+                 8,
+                 0);
+
+        cv::line(histImage,
+                 cv::Point(bin_w*(i-1), hist_h - cvRound(g_hist.at<float>(i-1))),
+                 cv::Point(bin_w*(i), hist_h - cvRound(g_hist.at<float>(i))),
+                 cv::Scalar(0, 255, 0),
+                 2,
+                 8,
+                 0);
+
+        cv::line(histImage,
+                 cv::Point( bin_w*(i-1), hist_h - cvRound(r_hist.at<float>(i-1))),
+                 cv::Point( bin_w*(i), hist_h - cvRound(r_hist.at<float>(i)) ),
+                 cv::Scalar( 0, 0, 255),
+                 2,
+                 8,
+                 0);
+    }
+
+    // Display
+    cv::namedWindow(window_name, CV_WINDOW_AUTOSIZE );
+    cv::imshow(window_name, histImage );
+}
+
+//Get the device context
+//Create GPU array/vector
+//Copy the image & set up the kernel
+//Execute the kernel
+//Copy GPU data back to CPU cv::Mat data pointer
+//OpenCV conversion for convienient display
+void calculateHistogramUsingCL(cv::Mat src, compute::command_queue &queue)
+{
+    compute::context context = queue.get_context();
+
+    // Convert image to BGRA (OpenCL requires 16-byte aligned data)
+    cv::cvtColor(src, src, CV_BGR2BGRA);
+
+    //3 channels & 256 bins : alpha channel is ignored
+    compute::vector<int> gpu_b_hist(histSize, context);
+    compute::vector<int> gpu_g_hist(histSize, context);
+    compute::vector<int> gpu_r_hist(histSize, context);
+
+    // Transfer image to gpu
+    compute::image2d gpu_src =
+            compute::opencv_create_image2d_with_mat(
+                src, compute::image2d::read_only,
+                queue
+                );
+
+    compute::program histogram_program =
+            compute::program::create_with_source(source, context);
+    histogram_program.build();
+
+    // create histogram kernel and set arguments
+    compute::kernel histogram_kernel(histogram_program, "histogram");
+    histogram_kernel.set_arg(0, gpu_src);
+    histogram_kernel.set_arg(1, gpu_b_hist.get_buffer());
+    histogram_kernel.set_arg(2, gpu_g_hist.get_buffer());
+    histogram_kernel.set_arg(3, gpu_r_hist.get_buffer());
+
+    // run histogram kernel
+    // each kernel thread updating red, green & blue bins
+    size_t origin[2] = { 0, 0 };
+    size_t region[2] = { gpu_src.width(),
+                         gpu_src.height() };
+
+    queue.enqueue_nd_range_kernel(histogram_kernel, 2, origin, region, 0);
+
+    //Make sure kernel get executed and data copied back
+    queue.finish();
+
+    //create Mat and copy GPU bins to CPU memory
+    cv::Mat b_hist(256, 1, CV_32SC1);
+    compute::copy(gpu_b_hist.begin(), gpu_b_hist.end(), b_hist.data, queue);
+    cv::Mat g_hist(256, 1, CV_32SC1);
+    compute::copy(gpu_g_hist.begin(), gpu_g_hist.end(), g_hist.data, queue);
+    cv::Mat r_hist(256, 1, CV_32SC1);
+    compute::copy(gpu_r_hist.begin(), gpu_r_hist.end(), r_hist.data, queue);
+
+    b_hist.convertTo(b_hist, CV_32FC1); //converted for displaying
+    g_hist.convertTo(g_hist, CV_32FC1);
+    r_hist.convertTo(r_hist, CV_32FC1);
+
+    showHistogramWindow(b_hist, g_hist, r_hist, "Histogram");
+}
+
+int main( int argc, char** argv )
+{
+    // Get default device and setup context
+    compute::device gpu = compute::system::default_device();
+    compute::context context(gpu);
+    compute::command_queue queue(context, gpu);
+
+    cv::Mat src;
+
+    // setup the command line arguments
+    po::options_description desc;
+    desc.add_options()
+            ("help",  "show available options")
+            ("image", po::value<std::string>(), "path to image file");
+
+    // Parse the command lines
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    //check the command line arguments
+    if(vm.count("help"))
+    {
+        std::cout << desc << std::endl;
+        return 0;
+    }
+
+    //check for image paths
+    if(vm.count("image"))
+    {
+        // Read image with OpenCV
+        src = cv::imread(vm["image"].as<std::string>(),
+                                       CV_LOAD_IMAGE_COLOR);
+        if(!src.data){
+            std::cerr << "Failed to load image" << std::endl;
+            return -1;
+        }
+        calculateHistogramUsingCL(src, queue);
+        cv::imshow("Image", src);
+        cv::waitKey(0);
+    }
+    else
+    {
+        std::cout << desc << std::endl;
+        return 0;
+    }
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/opencv_optical_flow.cpp b/src/boost/libs/compute/example/opencv_optical_flow.cpp
new file mode 100644
index 00000000..87f330ae
--- /dev/null
+++ b/src/boost/libs/compute/example/opencv_optical_flow.cpp
@@ -0,0 +1,289 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Mageswaran.D <mageswaran1989@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <string>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/interop/opencv/core.hpp>
+#include <boost/compute/interop/opencv/highgui.hpp>
+#include <boost/compute/utility/source.hpp>
+
+#include <boost/program_options.hpp>
+
+namespace compute = boost::compute;
+namespace po = boost::program_options;
+
+// Create naive optical flow program
+const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE (
+    const sampler_t sampler = CLK_ADDRESS_CLAMP_TO_EDGE;
+
+    __kernel void optical_flow (
+                                read_only
+                                image2d_t current_image,
+                                read_only image2d_t previous_image,
+                                write_only image2d_t optical_flow,
+                                const float scale,
+                                const float offset,
+                                const float lambda,
+                                const float threshold )
+    {
+        int2 coords = (int2)(get_global_id(0), get_global_id(1));
+        float4 current_pixel    = read_imagef(current_image,
+                                              sampler,
+                                              coords);
+        float4 previous_pixel   = read_imagef(previous_image,
+                                              sampler,
+                                              coords);
+        int2 x1     = (int2)(offset, 0.f);
+        int2 y1     = (int2)(0.f, offset);
+
+        //get the difference
+        float4 curdif = previous_pixel - current_pixel;
+
+        //calculate the gradient
+        //Image 2 first
+        float4 gradx = read_imagef(previous_image,
+                                   sampler,
+                                   coords+x1) -
+                       read_imagef(previous_image,
+                                   sampler,
+                                   coords-x1);
+        //Image 1
+        gradx += read_imagef(current_image,
+                             sampler,
+                             coords+x1) -
+                 read_imagef(current_image,
+                             sampler,
+                             coords-x1);
+        //Image 2 first
+        float4 grady = read_imagef(previous_image,
+                                   sampler,
+                                   coords+y1) -
+                       read_imagef(previous_image,
+                                   sampler,
+                                   coords-y1);
+        //Image 1
+        grady += read_imagef(current_image,
+                             sampler,
+                             coords+y1) -
+                 read_imagef(current_image,
+                             sampler,
+                             coords-y1);
+
+        float4 sqr = (gradx*gradx) + (grady*grady) +
+                     (float4)(lambda,lambda, lambda, lambda);
+        float4 gradmag = sqrt(sqr);
+
+        ///////////////////////////////////////////////////
+        float4 vx = curdif * (gradx / gradmag);
+        float vxd = vx.x;//assumes greyscale
+
+        //format output for flowrepos, out(-x,+x,-y,+y)
+        float2 xout = (float2)(fmax(vxd,0.f),fabs(fmin(vxd,0.f)));
+        xout *= scale;
+        ///////////////////////////////////////////////////
+        float4 vy = curdif*(grady/gradmag);
+        float vyd = vy.x;//assumes greyscale
+
+        //format output for flowrepos, out(-x,+x,-y,+y)
+        float2 yout = (float2)(fmax(vyd,0.f),fabs(fmin(vyd,0.f)));
+        yout *= scale;
+        ///////////////////////////////////////////////////
+
+        float4 out = (float4)(xout, yout);
+        float cond = (float)isgreaterequal(length(out), threshold);
+        out *= cond;
+
+        write_imagef(optical_flow, coords, out);
+    }
+);
+
+// This example shows how to read two images or use camera
+// with OpenCV, transfer the frames to the GPU,
+// and apply a naive optical flow algorithm
+// written in OpenCL
+int main(int argc, char *argv[])
+{
+    // setup the command line arguments
+    po::options_description desc;
+    desc.add_options()
+            ("help",  "show available options")
+            ("camera", po::value<int>()->default_value(-1),
+                                 "if not default camera, specify a camera id")
+            ("image1", po::value<std::string>(), "path to image file 1")
+            ("image2", po::value<std::string>(), "path to image file 2");
+
+    // Parse the command lines
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    //check the command line arguments
+    if(vm.count("help"))
+    {
+        std::cout << desc << std::endl;
+        return 0;
+    }
+
+    //OpenCV variables
+    cv::Mat previous_cv_image;
+    cv::Mat current_cv_image;
+    cv::VideoCapture cap; //OpenCV camera handle
+
+    //check for image paths
+    if(vm.count("image1") && vm.count("image2"))
+    {
+        // Read image 1 with OpenCV
+        previous_cv_image = cv::imread(vm["image1"].as<std::string>(),
+                                       CV_LOAD_IMAGE_COLOR);
+        if(!previous_cv_image.data){
+            std::cerr << "Failed to load image" << std::endl;
+            return -1;
+        }
+
+        // Read image 2 with opencv
+        current_cv_image = cv::imread(vm["image2"].as<std::string>(),
+                                      CV_LOAD_IMAGE_COLOR);
+        if(!current_cv_image.data){
+            std::cerr << "Failed to load image" << std::endl;
+            return -1;
+        }
+    }
+    else //by default use camera
+    {
+        //open camera
+        cap.open(vm["camera"].as<int>());
+        // read first frame
+        cap >> previous_cv_image;
+        if(!previous_cv_image.data){
+            std::cerr << "failed to capture frame" << std::endl;
+            return -1;
+        }
+
+        // read second frame
+        cap >> current_cv_image;
+        if(!current_cv_image.data){
+            std::cerr << "failed to capture frame" << std::endl;
+            return -1;
+        }
+
+    }
+
+    // Get default device and setup context
+    compute::device gpu = compute::system::default_device();
+    compute::context context(gpu);
+    compute::command_queue queue(context, gpu);
+
+    // Convert image to BGRA (OpenCL requires 16-byte aligned data)
+    cv::cvtColor(previous_cv_image, previous_cv_image, CV_BGR2BGRA);
+    cv::cvtColor(current_cv_image, current_cv_image, CV_BGR2BGRA);
+
+    // Transfer image to gpu
+    compute::image2d dev_previous_image =
+            compute::opencv_create_image2d_with_mat(
+                previous_cv_image, compute::image2d::read_write, queue
+                );
+    // Transfer image to gpu
+    compute::image2d dev_current_image =
+            compute::opencv_create_image2d_with_mat(
+                current_cv_image, compute::image2d::read_write, queue
+                );
+
+    // Create output image
+    compute::image2d dev_output_image(
+                context,
+                dev_previous_image.width(),
+                dev_previous_image.height(),
+                dev_previous_image.format(),
+                compute::image2d::write_only
+                );
+
+    compute::program optical_program =
+            compute::program::create_with_source(source, context);
+    optical_program.build();
+
+    // create flip kernel and set arguments
+    compute::kernel optical_kernel(optical_program, "optical_flow");
+    float scale = 10;
+    float offset = 1;
+    float lambda = 0.0025;
+    float threshold = 1.0;
+
+    optical_kernel.set_arg(0, dev_previous_image);
+    optical_kernel.set_arg(1, dev_current_image);
+    optical_kernel.set_arg(2, dev_output_image);
+    optical_kernel.set_arg(3, scale);
+    optical_kernel.set_arg(4, offset);
+    optical_kernel.set_arg(5, lambda);
+    optical_kernel.set_arg(6, threshold);
+
+    // run flip kernel
+    size_t origin[2] = { 0, 0 };
+    size_t region[2] = { dev_previous_image.width(),
+                         dev_previous_image.height() };
+    queue.enqueue_nd_range_kernel(optical_kernel, 2, origin, region, 0);
+
+    //check for image paths
+    if(vm.count("image1") && vm.count("image2"))
+    {
+        // show host image
+        cv::imshow("Previous Frame", previous_cv_image);
+        cv::imshow("Current Frame", current_cv_image);
+
+        // show gpu image
+        compute::opencv_imshow("filtered image", dev_output_image, queue);
+
+        // wait and return
+        cv::waitKey(0);
+    }
+    else
+    {
+        char key = '\0';
+        while(key != 27) //check for escape key
+        {
+            cap >> current_cv_image;
+
+            // Convert image to BGRA (OpenCL requires 16-byte aligned data)
+            cv::cvtColor(current_cv_image, current_cv_image, CV_BGR2BGRA);
+
+            // Update the device image memory with current frame data
+            compute::opencv_copy_mat_to_image(previous_cv_image,
+                                              dev_previous_image,
+                                              queue);
+            compute::opencv_copy_mat_to_image(current_cv_image,
+                                              dev_current_image,
+                                              queue);
+
+            // Run the kernel on the device
+            queue.enqueue_nd_range_kernel(optical_kernel, 2, origin, region, 0);
+
+            // Show host image
+            cv::imshow("Previous Frame", previous_cv_image);
+            cv::imshow("Current Frame", current_cv_image);
+
+            // Show GPU image
+            compute::opencv_imshow("filtered image", dev_output_image, queue);
+
+            // Copy current frame container to previous frame container
+            current_cv_image.copyTo(previous_cv_image);
+
+            // wait
+            key = cv::waitKey(10);
+        }
+
+    }
+    return 0;
+}
+
diff --git a/src/boost/libs/compute/example/opencv_sobel_filter.cpp b/src/boost/libs/compute/example/opencv_sobel_filter.cpp
new file mode 100644
index 00000000..4fcfa206
--- /dev/null
+++ b/src/boost/libs/compute/example/opencv_sobel_filter.cpp
@@ -0,0 +1,254 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Mageswaran.D <mageswaran1989@gmail.com>
+//
+// Book Refered: OpenCL Programming Guide
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+//---------------------------------------------------------------------------//
+// About Sobel Filter:
+// * Edge Filter - distinguishes the differrent color region
+// * Finds the gradient in x and y-axes
+// * Three step process
+//   -> Find x-axis gradient with kernel/matrix
+//           Gx = [-1 0 +1]
+//                [-2 0 +2]
+//                [-1 0 +1]
+//   -> Find y-axis gradient with kernel/matrix
+//           Gy = [-1 -2 -1]
+//                [ 0  0  0]
+//                [+1 +2 +1]
+// * Gradient magnitude G = sqrt(Gx^2 + Gy^2)
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <string>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/interop/opencv/core.hpp>
+#include <boost/compute/interop/opencv/highgui.hpp>
+#include <boost/compute/utility/source.hpp>
+
+#include <boost/program_options.hpp>
+
+namespace compute = boost::compute;
+namespace po = boost::program_options;
+
+// Create sobel filter program
+const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE (
+    //For out of boundary pixels, edge pixel
+    // value is returned
+    const sampler_t sampler = CLK_ADDRESS_CLAMP_TO_EDGE |
+                              CLK_FILTER_NEAREST;
+    kernel void sobel_rgb(read_only image2d_t src, write_only image2d_t dst)
+    {
+        int x = (int)get_global_id(0);
+        int y = (int)get_global_id(1);
+
+        if (x >= get_image_width(src) || y >= get_image_height(src))
+                return;
+
+        //  [(x-1, y+1), (x, y+1), (x+1, y+1)]
+        //  [(x-1, y  ), (x, y  ), (x+1, y  )]
+        //  [(x-1, y-1), (x, y-1), (x+1, y-1)]
+
+        //  [p02, p12,   p22]
+        //  [p01, pixel, p21]
+        //  [p00, p10,   p20]
+
+        //Basically finding influence of neighbour pixels on current pixel
+        float4 p00 = read_imagef(src, sampler, (int2)(x - 1, y - 1));
+        float4 p10 = read_imagef(src, sampler, (int2)(x,     y - 1));
+        float4 p20 = read_imagef(src, sampler, (int2)(x + 1, y - 1));
+
+        float4 p01 = read_imagef(src, sampler, (int2)(x - 1, y));
+        //pixel that we are working on
+        float4 p21 = read_imagef(src, sampler, (int2)(x + 1, y));
+
+        float4 p02 = read_imagef(src, sampler, (int2)(x - 1, y + 1));
+        float4 p12 = read_imagef(src, sampler, (int2)(x,     y + 1));
+        float4 p22 = read_imagef(src, sampler, (int2)(x + 1, y + 1));
+
+        //Find Gx = kernel + 3x3 around current pixel
+        //           Gx = [-1 0 +1]     [p02, p12,   p22]
+        //                [-2 0 +2]  +  [p01, pixel, p21]
+        //                [-1 0 +1]     [p00, p10,   p20]
+        float3 gx = -p00.xyz + p20.xyz +
+                    2.0f * (p21.xyz - p01.xyz)
+                    -p02.xyz + p22.xyz;
+
+        //Find Gy = kernel + 3x3 around current pixel
+        //           Gy = [-1 -2 -1]     [p02, p12,   p22]
+        //                [ 0  0  0]  +  [p01, pixel, p21]
+        //                [+1 +2 +1]     [p00, p10,   p20]
+        float3 gy = p00.xyz + p20.xyz +
+                    2.0f * (- p12.xyz + p10.xyz) -
+                    p02.xyz - p22.xyz;
+        //Find G
+        float3 g = native_sqrt(gx * gx + gy * gy);
+
+        // we could also approximate this as g = fabs(gx) + fabs(gy)
+        write_imagef(dst, (int2)(x, y), (float4)(g.x, g.y, g.z, 1.0f));
+    }
+);
+
+// This example shows how to apply sobel filter on images or on camera frames
+// with OpenCV, transfer the frames to the GPU, and apply a sobel filter
+// written in OpenCL
+int main(int argc, char *argv[])
+{
+    ///////////////////////////////////////////////////////////////////////////
+
+    // setup the command line arguments
+    po::options_description desc;
+    desc.add_options()
+            ("help",  "show available options")
+            ("camera", po::value<int>()->default_value(-1),
+                                 "if not default camera, specify a camera id")
+            ("image", po::value<std::string>(), "path to image file");
+
+    // Parse the command lines
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    //check the command line arguments
+    if(vm.count("help"))
+    {
+        std::cout << desc << std::endl;
+        return 0;
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    //OpenCV variables
+    cv::Mat cv_mat;
+    cv::VideoCapture cap; //OpenCV camera handle.
+
+    //OpenCL variables
+    // Get default device and setup context
+    compute::device gpu = compute::system::default_device();
+    compute::context context(gpu);
+    compute::command_queue queue(context, gpu);
+    compute::program filter_program =
+            compute::program::create_with_source(source, context);
+
+    try
+    {
+        filter_program.build();
+    }
+    catch(compute::opencl_error e)
+    {
+        std::cout<<"Build Error: "<<std::endl
+                 <<filter_program.build_log();
+    }
+
+    // create fliter kernel and set arguments
+    compute::kernel filter_kernel(filter_program, "sobel_rgb");
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    //check for image paths
+    if(vm.count("image"))
+    {
+        // Read image with OpenCV
+        cv_mat = cv::imread(vm["image"].as<std::string>(),
+                                       CV_LOAD_IMAGE_COLOR);
+        if(!cv_mat.data){
+            std::cerr << "Failed to load image" << std::endl;
+            return -1;
+        }
+    }
+    else //by default use camera
+    {
+        //open camera
+        cap.open(vm["camera"].as<int>());
+        // read first frame
+        cap >> cv_mat;
+        if(!cv_mat.data){
+            std::cerr << "failed to capture frame" << std::endl;
+            return -1;
+        }
+    }
+
+    // Convert image to BGRA (OpenCL requires 16-byte aligned data)
+    cv::cvtColor(cv_mat, cv_mat, CV_BGR2BGRA);
+
+    // Transfer image/frame data to gpu
+    compute::image2d dev_input_image =
+            compute::opencv_create_image2d_with_mat(
+                cv_mat, compute::image2d::read_write, queue
+                );
+
+    // Create output image
+    // Be sure what will be your ouput image/frame size
+    compute::image2d dev_output_image(
+                context,
+                dev_input_image.width(),
+                dev_input_image.height(),
+                dev_input_image.format(),
+                compute::image2d::write_only
+                );
+
+    filter_kernel.set_arg(0, dev_input_image);
+    filter_kernel.set_arg(1, dev_output_image);
+
+
+    // run flip kernel
+    size_t origin[2] = { 0, 0 };
+    size_t region[2] = { dev_input_image.width(),
+                         dev_input_image.height() };
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    queue.enqueue_nd_range_kernel(filter_kernel, 2, origin, region, 0);
+
+    //check for image paths
+    if(vm.count("image"))
+    {
+        // show host image
+        cv::imshow("Original Image", cv_mat);
+
+        // show gpu image
+        compute::opencv_imshow("Filtered Image", dev_output_image, queue);
+
+        // wait and return
+        cv::waitKey(0);
+    }
+    else
+    {
+        char key = '\0';
+        while(key != 27) //check for escape key
+        {
+            cap >> cv_mat;
+
+            // Convert image to BGRA (OpenCL requires 16-byte aligned data)
+            cv::cvtColor(cv_mat, cv_mat, CV_BGR2BGRA);
+
+            // Update the device image memory with current frame data
+            compute::opencv_copy_mat_to_image(cv_mat,
+                                              dev_input_image,queue);
+
+            // Run the kernel on the device
+            queue.enqueue_nd_range_kernel(filter_kernel, 2, origin, region, 0);
+
+            // Show host image
+            cv::imshow("Camera Frame", cv_mat);
+
+            // Show GPU image
+            compute::opencv_imshow("Filtered RGB Frame", dev_output_image, queue);
+
+            // wait
+            key = cv::waitKey(10);
+        }
+    }
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/opengl_sphere.cpp b/src/boost/libs/compute/example/opengl_sphere.cpp
new file mode 100644
index 00000000..38999f34
--- /dev/null
+++ b/src/boost/libs/compute/example/opengl_sphere.cpp
@@ -0,0 +1,242 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+
+#include <GL/gl.h>
+
+#include <vtkActor.h>
+#include <vtkCamera.h>
+#include <vtkgl.h>
+#include <vtkInteractorStyleSwitch.h>
+#include <vtkMapper.h>
+#include <vtkObjectFactory.h>
+#include <vtkOpenGLExtensionManager.h>
+#include <vtkOpenGLRenderWindow.h>
+#include <vtkProperty.h>
+#include <vtkRenderer.h>
+#include <vtkRenderWindow.h>
+#include <vtkRenderWindowInteractor.h>
+#include <vtkSmartPointer.h>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/iota.hpp>
+#include <boost/compute/interop/opengl.hpp>
+#include <boost/compute/interop/vtk.hpp>
+#include <boost/compute/utility/dim.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+// tesselates a sphere with radius, phi_slices, and theta_slices. returns
+// a shared opencl/opengl buffer containing the vertex data.
+compute::opengl_buffer tesselate_sphere(float radius,
+                                        size_t phi_slices,
+                                        size_t theta_slices,
+                                        compute::command_queue &queue)
+{
+    using compute::dim;
+
+    const compute::context &context = queue.get_context();
+
+    const size_t vertex_count = phi_slices * theta_slices;
+
+    // create opengl buffer
+    GLuint vbo;
+    vtkgl::GenBuffersARB(1, &vbo);
+    vtkgl::BindBufferARB(vtkgl::ARRAY_BUFFER, vbo);
+    vtkgl::BufferDataARB(vtkgl::ARRAY_BUFFER,
+                         sizeof(float) * 4 * vertex_count,
+                         NULL,
+                         vtkgl::STREAM_DRAW);
+    vtkgl::BindBufferARB(vtkgl::ARRAY_BUFFER, 0);
+
+    // create shared opengl/opencl buffer
+    compute::opengl_buffer vertex_buffer(context, vbo);
+
+    // tesselate_sphere kernel source
+    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        __kernel void tesselate_sphere(float radius,
+                                       uint phi_slices,
+                                       uint theta_slices,
+                                       __global float4 *vertex_buffer)
+        {
+            const uint phi_i = get_global_id(0);
+            const uint theta_i = get_global_id(1);
+
+            const float phi = phi_i * 2.f * M_PI_F / phi_slices;
+            const float theta = theta_i * 2.f * M_PI_F / theta_slices;
+
+            float4 v;
+            v.x = radius * cos(theta) * cos(phi);
+            v.y = radius * cos(theta) * sin(phi);
+            v.z = radius * sin(theta);
+            v.w = 1.f;
+
+            vertex_buffer[phi_i*phi_slices+theta_i] = v;
+        }
+    );
+
+    // build tesselate_sphere program
+    compute::program program =
+        compute::program::create_with_source(source, context);
+    program.build();
+
+    // setup tesselate_sphere kernel
+    compute::kernel kernel(program, "tesselate_sphere");
+    kernel.set_arg<compute::float_>(0, radius);
+    kernel.set_arg<compute::uint_>(1, phi_slices);
+    kernel.set_arg<compute::uint_>(2, theta_slices);
+    kernel.set_arg(3, vertex_buffer);
+
+    // acqurire buffer so that it is accessible to OpenCL
+    compute::opengl_enqueue_acquire_buffer(vertex_buffer, queue);
+
+    // execute tesselate_sphere kernel
+    queue.enqueue_nd_range_kernel(
+        kernel, dim(0, 0), dim(phi_slices, theta_slices), dim(1, 1)
+    );
+
+    // release buffer so that it is accessible to OpenGL
+    compute::opengl_enqueue_release_buffer(vertex_buffer, queue);
+
+    return vertex_buffer;
+}
+
+// simple vtkMapper subclass to render the tesselated sphere on the gpu.
+class gpu_sphere_mapper : public vtkMapper
+{
+public:
+    vtkTypeMacro(gpu_sphere_mapper, vtkMapper)
+
+    static gpu_sphere_mapper* New()
+    {
+        return new gpu_sphere_mapper;
+    }
+
+    void Render(vtkRenderer *renderer, vtkActor *actor)
+    {
+        if(!m_initialized){
+            Initialize(renderer, actor);
+            m_initialized = true;
+        }
+
+        if(!m_tesselated){
+            m_vertex_count = m_phi_slices * m_theta_slices;
+
+            // tesselate sphere
+            m_vertex_buffer = tesselate_sphere(
+                m_radius, m_phi_slices, m_theta_slices, m_command_queue
+            );
+
+            // ensure tesselation is finished (seems to be required on AMD)
+            m_command_queue.finish();
+
+            // set tesselated flag to true
+            m_tesselated = true;
+        }
+
+        // draw sphere
+        glEnableClientState(GL_VERTEX_ARRAY);
+        vtkgl::BindBufferARB(vtkgl::ARRAY_BUFFER, m_vertex_buffer.get_opengl_object());
+        glVertexPointer(4, GL_FLOAT, sizeof(float)*4, 0);
+        glDrawArrays(GL_POINTS, 0, m_vertex_count);
+    }
+
+    void Initialize(vtkRenderer *renderer, vtkActor *actor)
+    {
+        // initialize opengl extensions
+        vtkOpenGLExtensionManager *extensions =
+            static_cast<vtkOpenGLRenderWindow *>(renderer->GetRenderWindow())
+                ->GetExtensionManager();
+        extensions->LoadExtension("GL_ARB_vertex_buffer_object");
+
+        // initialize opencl/opengl shared context
+        m_context = compute::opengl_create_shared_context();
+        compute::device device = m_context.get_device();
+        std::cout << "device: " << device.name() << std::endl;
+
+        // create command queue for the gpu device
+        m_command_queue = compute::command_queue(m_context, device);
+    }
+
+    double* GetBounds()
+    {
+        static double bounds[6];
+        bounds[0] = -m_radius; bounds[1] = m_radius;
+        bounds[2] = -m_radius; bounds[3] = m_radius;
+        bounds[4] = -m_radius; bounds[5] = m_radius;
+        return bounds;
+    }
+
+protected:
+    gpu_sphere_mapper()
+    {
+        m_radius = 5.0f;
+        m_phi_slices = 100;
+        m_theta_slices = 100;
+        m_initialized = false;
+        m_tesselated = false;
+    }
+
+private:
+    float m_radius;
+    int m_phi_slices;
+    int m_theta_slices;
+    int m_vertex_count;
+    bool m_initialized;
+    bool m_tesselated;
+    compute::context m_context;
+    compute::command_queue m_command_queue;
+    compute::opengl_buffer m_vertex_buffer;
+};
+
+int main(int argc, char *argv[])
+{
+    // create gpu sphere mapper
+    vtkSmartPointer<gpu_sphere_mapper> mapper =
+        vtkSmartPointer<gpu_sphere_mapper>::New();
+
+    // create actor for gpu sphere mapper
+    vtkSmartPointer<vtkActor> actor =
+        vtkSmartPointer<vtkActor>::New();
+    actor->GetProperty()->LightingOff();
+    actor->GetProperty()->SetInterpolationToFlat();
+    actor->SetMapper(mapper);
+
+    // create render window
+    vtkSmartPointer<vtkRenderer> renderer =
+        vtkSmartPointer<vtkRenderer>::New();
+    renderer->SetBackground(.1, .2, .31);
+    vtkSmartPointer<vtkRenderWindow> renderWindow =
+        vtkSmartPointer<vtkRenderWindow>::New();
+    renderWindow->SetSize(800, 600);
+    renderWindow->AddRenderer(renderer);
+    vtkSmartPointer<vtkRenderWindowInteractor> renderWindowInteractor =
+        vtkSmartPointer<vtkRenderWindowInteractor>::New();
+    vtkInteractorStyleSwitch *interactorStyle =
+        vtkInteractorStyleSwitch::SafeDownCast(
+            renderWindowInteractor->GetInteractorStyle()
+        );
+    interactorStyle->SetCurrentStyleToTrackballCamera();
+    renderWindowInteractor->SetRenderWindow(renderWindow);
+    renderer->AddActor(actor);
+
+    // render
+    renderer->ResetCamera();
+    vtkCamera *camera = renderer->GetActiveCamera();
+    camera->Elevation(-90.0);
+    renderWindowInteractor->Initialize();
+    renderWindow->Render();
+    renderWindowInteractor->Start();
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/point_centroid.cpp b/src/boost/libs/compute/example/point_centroid.cpp
new file mode 100644
index 00000000..e5691f7c
--- /dev/null
+++ b/src/boost/libs/compute/example/point_centroid.cpp
@@ -0,0 +1,68 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+//[point_centroid_example
+
+#include <iostream>
+
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/accumulate.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/types/fundamental.hpp>
+
+namespace compute = boost::compute;
+
+// the point centroid example calculates and displays the
+// centroid of a set of 3D points stored as float4's
+int main()
+{
+    using compute::float4_;
+
+    // get default device and setup context
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+
+    // point coordinates
+    float points[] = { 1.0f, 2.0f, 3.0f, 0.0f,
+                       -2.0f, -3.0f, 4.0f, 0.0f,
+                       1.0f, -2.0f, 2.5f, 0.0f,
+                       -7.0f, -3.0f, -2.0f, 0.0f,
+                       3.0f, 4.0f, -5.0f, 0.0f };
+
+    // create vector for five points
+    compute::vector<float4_> vector(5, context);
+
+    // copy point data to the device
+    compute::copy(
+        reinterpret_cast<float4_ *>(points),
+        reinterpret_cast<float4_ *>(points) + 5,
+        vector.begin(),
+        queue
+    );
+
+    // calculate sum
+    float4_ sum = compute::accumulate(
+        vector.begin(), vector.end(), float4_(0, 0, 0, 0), queue
+    );
+
+    // calculate centroid
+    float4_ centroid;
+    for(size_t i = 0; i < 3; i++){
+        centroid[i] = sum[i] / 5.0f;
+    }
+
+    // print centroid
+    std::cout << "centroid: " << centroid << std::endl;
+
+    return 0;
+}
+
+//]
diff --git a/src/boost/libs/compute/example/price_cross.cpp b/src/boost/libs/compute/example/price_cross.cpp
new file mode 100644
index 00000000..95196494
--- /dev/null
+++ b/src/boost/libs/compute/example/price_cross.cpp
@@ -0,0 +1,87 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/copy_n.hpp>
+#include <boost/compute/algorithm/find_if.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/iterator/zip_iterator.hpp>
+
+namespace compute = boost::compute;
+
+// this example shows how to use the find_if() algorithm to detect the
+// point at which two vectors of prices (such as stock prices) cross.
+int main()
+{
+    // get default device and setup context
+    compute::device gpu = compute::system::default_device();
+    compute::context context(gpu);
+    compute::command_queue queue(context, gpu);
+
+    // prices #1 (from 10.0 to 11.0)
+    std::vector<float> prices1;
+    for(float i = 10.0; i <= 11.0; i += 0.1){
+        prices1.push_back(i);
+    }
+
+    // prices #2 (from 11.0 to 10.0)
+    std::vector<float> prices2;
+    for(float i = 11.0; i >= 10.0; i -= 0.1){
+        prices2.push_back(i);
+    }
+
+    // create gpu vectors
+    compute::vector<float> gpu_prices1(prices1.size(), context);
+    compute::vector<float> gpu_prices2(prices2.size(), context);
+
+    // copy prices to gpu
+    compute::copy(prices1.begin(), prices1.end(), gpu_prices1.begin(), queue);
+    compute::copy(prices2.begin(), prices2.end(), gpu_prices2.begin(), queue);
+
+    // function returning true if the second price is less than the first price
+    BOOST_COMPUTE_FUNCTION(bool, check_price_cross, (boost::tuple<float, float> prices),
+    {
+        // first price
+        const float first = boost_tuple_get(prices, 0);
+
+        // second price
+        const float second = boost_tuple_get(prices, 1);
+
+        // return true if second price is less than first
+        return second < first;
+    });
+
+    // find cross point (should be 10.5)
+    compute::vector<float>::iterator iter = boost::get<0>(
+        compute::find_if(
+            compute::make_zip_iterator(
+                boost::make_tuple(gpu_prices1.begin(), gpu_prices2.begin())
+            ),
+            compute::make_zip_iterator(
+                boost::make_tuple(gpu_prices1.end(), gpu_prices2.end())
+            ),
+            check_price_cross,
+            queue
+        ).get_iterator_tuple()
+    );
+
+    // print out result
+    int index = std::distance(gpu_prices1.begin(), iter);
+    std::cout << "price cross at index: " << index << std::endl;
+
+    float value;
+    compute::copy_n(iter, 1, &value, queue);
+    std::cout << "value: " << value << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/print_vector.cpp b/src/boost/libs/compute/example/print_vector.cpp
new file mode 100644
index 00000000..9ec5c6c7
--- /dev/null
+++ b/src/boost/libs/compute/example/print_vector.cpp
@@ -0,0 +1,45 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/iota.hpp>
+#include <boost/compute/container/vector.hpp>
+
+namespace compute = boost::compute;
+
+// this example demonstrates how to print the values in a vector
+int main()
+{
+    // get default device and setup context
+    compute::device gpu = compute::system::default_device();
+    compute::context context(gpu);
+    compute::command_queue queue(context, gpu);
+    std::cout << "device: " << gpu.name() << std::endl;
+
+    // create vector on the device and fill with the sequence 1..10
+    compute::vector<int> vector(10, context);
+    compute::iota(vector.begin(), vector.end(), 1, queue);
+
+//[print_vector_example
+    std::cout << "vector: [ ";
+    boost::compute::copy(
+        vector.begin(), vector.end(),
+        std::ostream_iterator<int>(std::cout, ", "),
+        queue
+    );
+    std::cout << "]" << std::endl;
+//]
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/qimage_blur.cpp b/src/boost/libs/compute/example/qimage_blur.cpp
new file mode 100644
index 00000000..cbfa3145
--- /dev/null
+++ b/src/boost/libs/compute/example/qimage_blur.cpp
@@ -0,0 +1,145 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+
+#include <QtGlobal>
+#if QT_VERSION >= 0x050000
+#include <QtWidgets>
+#else
+#include <QtGui>
+#endif
+
+#ifndef Q_MOC_RUN
+#include <boost/compute/system.hpp>
+#include <boost/compute/image/image2d.hpp>
+#include <boost/compute/interop/qt.hpp>
+#include <boost/compute/utility/dim.hpp>
+#include <boost/compute/utility/source.hpp>
+#endif // Q_MOC_RUN
+
+namespace compute = boost::compute;
+
+inline void box_filter_image(const compute::image2d &input,
+                             compute::image2d &output,
+                             compute::uint_ box_height,
+                             compute::uint_ box_width,
+                             compute::command_queue &queue)
+{
+    using compute::dim;
+
+    const compute::context &context = queue.get_context();
+
+    // simple box filter kernel source
+    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        __kernel void box_filter(__read_only image2d_t input,
+                                 __write_only image2d_t output,
+                                 uint box_height,
+                                 uint box_width)
+        {
+            int x = get_global_id(0);
+            int y = get_global_id(1);
+            int h = get_image_height(input);
+            int w = get_image_width(input);
+            int k = box_width;
+            int l = box_height;
+
+            if(x < k/2 || y < l/2 || x >= w-(k/2) || y >= h-(l/2)){
+                write_imagef(output, (int2)(x, y), (float4)(0, 0, 0, 1));
+            }
+            else {
+                const sampler_t sampler = CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+
+                float4 sum = { 0, 0, 0, 0 };
+                for(int i = 0; i < k; i++){
+                    for(int j = 0; j < l; j++){
+                        sum += read_imagef(input, sampler, (int2)(x+i-k, y+j-l));
+                    }
+                }
+                sum /= (float) k * l;
+                float4 value = (float4)( sum.x, sum.y, sum.z, 1.f );
+                write_imagef(output, (int2)(x, y), value);
+            }
+        }
+    );
+
+    // build box filter program
+    compute::program program =
+        compute::program::create_with_source(source, context);
+    program.build();
+
+    // setup box filter kernel
+    compute::kernel kernel(program, "box_filter");
+    kernel.set_arg(0, input);
+    kernel.set_arg(1, output);
+    kernel.set_arg(2, box_height);
+    kernel.set_arg(3, box_width);
+
+    // execute the box filter kernel
+    queue.enqueue_nd_range_kernel(kernel, dim(0, 0), input.size(), dim(1, 1));
+}
+
+// this example shows how to load an image using Qt, apply a simple
+// box blur filter, and then display it in a Qt window.
+int main(int argc, char *argv[])
+{
+    QApplication app(argc, argv);
+
+    // check command line
+    if(argc < 2){
+        std::cout << "usage: qimage_blur [FILENAME]" << std::endl;
+        return -1;
+    }
+
+    // load image using Qt
+    QString fileName = argv[1];
+    QImage qimage(fileName);
+
+    size_t height = qimage.height();
+    size_t width = qimage.width();
+    size_t bytes_per_line = qimage.bytesPerLine();
+
+    qDebug() << "height:" << height
+             << "width:" << width
+             << "bytes per line:" << bytes_per_line
+             << "depth:" << qimage.depth()
+             << "format:" << qimage.format();
+
+    // create compute context
+    compute::device gpu = compute::system::default_device();
+    compute::context context(gpu);
+    compute::command_queue queue(context, gpu);
+    std::cout << "device: " << gpu.name() << std::endl;
+
+    // get the opencl image format for the qimage
+    compute::image_format format =
+        compute::qt_qimage_format_to_image_format(qimage.format());
+
+    // create input and output images on the gpu
+    compute::image2d input_image(context, width, height, format);
+    compute::image2d output_image(context, width, height, format);
+
+    // copy host qimage to gpu image
+    compute::qt_copy_qimage_to_image2d(qimage, input_image, queue);
+
+    // apply box filter
+    box_filter_image(input_image, output_image, 7, 7, queue);
+
+    // copy gpu blurred image from to host qimage
+    compute::qt_copy_image2d_to_qimage(output_image, qimage, queue);
+
+    // show image as a pixmap
+    QLabel label;
+    label.setPixmap(QPixmap::fromImage(qimage));
+    label.show();
+
+    return app.exec();
+}
diff --git a/src/boost/libs/compute/example/random_walk.cpp b/src/boost/libs/compute/example/random_walk.cpp
new file mode 100644
index 00000000..13106d96
--- /dev/null
+++ b/src/boost/libs/compute/example/random_walk.cpp
@@ -0,0 +1,153 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/inclusive_scan.hpp>
+#include <boost/compute/algorithm/inclusive_scan.hpp>
+#include <boost/compute/interop/opencv/core.hpp>
+#include <boost/compute/interop/opencv/highgui.hpp>
+#include <boost/compute/random/default_random_engine.hpp>
+#include <boost/compute/random/uniform_real_distribution.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+// this example uses the random-number generation functions in Boost.Compute
+// to calculate a large number of random "steps" and then plots the final
+// random "walk" in a 2D image on the GPU and displays it with OpenCV
+int main()
+{
+    // number of random steps to take
+    size_t steps = 250000;
+
+    // height and width of image
+    size_t height = 800;
+    size_t width = 800;
+
+    // get default device and setup context
+    compute::device gpu = compute::system::default_device();
+    compute::context context(gpu);
+    compute::command_queue queue(context, gpu);
+
+    using compute::int2_;
+
+    // calaculate random values for each step
+    compute::vector<float> random_values(steps, context);
+    compute::default_random_engine random_engine(queue);
+    compute::uniform_real_distribution<float> random_distribution(0.f, 4.f);
+
+    random_distribution.generate(
+        random_values.begin(), random_values.end(), random_engine, queue
+    );
+
+    // calaculate coordinates for each step
+    compute::vector<int2_> coordinates(steps, context);
+
+    // function to convert random values to random directions (in 2D)
+    BOOST_COMPUTE_FUNCTION(int2_, take_step, (const float x),
+    {
+        if(x < 1.f){
+            // move right
+            return (int2)(1, 0);
+        }
+        if(x < 2.f){
+            // move up
+            return (int2)(0, 1);
+        }
+        if(x < 3.f){
+            // move left
+            return (int2)(-1, 0);
+        }
+        else {
+            // move down
+            return (int2)(0, -1);
+        }
+    });
+
+    // transform the random values into random steps
+    compute::transform(
+        random_values.begin(), random_values.end(), coordinates.begin(), take_step, queue
+    );
+
+    // set staring position
+    int2_ starting_position(width / 2, height / 2);
+    compute::copy_n(&starting_position, 1, coordinates.begin(), queue);
+
+    // scan steps to calculate position after each step
+    compute::inclusive_scan(
+        coordinates.begin(), coordinates.end(), coordinates.begin(), queue
+    );
+
+    // create output image
+    compute::image2d image(
+        context, width, height, compute::image_format(CL_RGBA, CL_UNSIGNED_INT8)
+    );
+
+    // program with two kernels, one to fill the image with white, and then
+    // one the draw to points calculated in coordinates on the image
+    const char draw_walk_source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        __kernel void draw_walk(__global const int2 *coordinates,
+                                __write_only image2d_t image)
+        {
+            const uint i = get_global_id(0);
+            const int2 coord = coordinates[i];
+
+            if(coord.x > 0 && coord.x < get_image_width(image) &&
+               coord.y > 0 && coord.y < get_image_height(image)){
+                uint4 black = { 0, 0, 0, 0 };
+                write_imageui(image, coord, black);
+            }
+        }
+
+        __kernel void fill_white(__write_only image2d_t image)
+        {
+            const int2 coord = { get_global_id(0), get_global_id(1) };
+
+            if(coord.x < get_image_width(image) &&
+               coord.y < get_image_height(image)){
+                uint4 white = { 255, 255, 255, 255 };
+                write_imageui(image, coord, white);
+            }
+        }
+    );
+
+    // build the program
+    compute::program draw_program =
+        compute::program::build_with_source(draw_walk_source, context);
+
+    // fill image with white
+    compute::kernel fill_kernel(draw_program, "fill_white");
+    fill_kernel.set_arg(0, image);
+
+    const size_t offset[] = { 0, 0 };
+    const size_t bounds[] = { width, height };
+
+    queue.enqueue_nd_range_kernel(fill_kernel, 2, offset, bounds, 0);
+
+    // draw random walk
+    compute::kernel draw_kernel(draw_program, "draw_walk");
+    draw_kernel.set_arg(0, coordinates);
+    draw_kernel.set_arg(1, image);
+    queue.enqueue_1d_range_kernel(draw_kernel, 0, coordinates.size(), 0);
+
+    // show image
+    compute::opencv_imshow("random walk", image, queue);
+
+    // wait and return
+    cv::waitKey(0);
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/resize_image.cpp b/src/boost/libs/compute/example/resize_image.cpp
new file mode 100644
index 00000000..d9e29edd
--- /dev/null
+++ b/src/boost/libs/compute/example/resize_image.cpp
@@ -0,0 +1,253 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+
+#include <QtGlobal>
+#if QT_VERSION >= 0x050000
+#include <QtWidgets>
+#else
+#include <QtGui>
+#endif
+#include <QtOpenGL>
+
+#include <boost/program_options.hpp>
+
+#ifndef Q_MOC_RUN
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/image/image2d.hpp>
+#include <boost/compute/image/image_sampler.hpp>
+#include <boost/compute/interop/qt.hpp>
+#include <boost/compute/interop/opengl.hpp>
+#include <boost/compute/utility/source.hpp>
+#endif // Q_MOC_RUN
+
+namespace compute = boost::compute;
+namespace po = boost::program_options;
+
+// opencl source code
+const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+    __kernel void resize_image(__read_only image2d_t input,
+                               const sampler_t sampler,
+                               __write_only image2d_t output)
+    {
+        const uint x = get_global_id(0);
+        const uint y = get_global_id(1);
+
+        const float w = get_image_width(output);
+        const float h = get_image_height(output);
+
+        float2 coord = { ((float) x / w) * get_image_width(input),
+                         ((float) y / h) * get_image_height(input) };
+
+        float4 pixel = read_imagef(input, sampler, coord);
+        write_imagef(output, (int2)(x, h - y - 1), pixel);
+    };
+);
+
+class ImageWidget : public QGLWidget
+{
+    Q_OBJECT
+
+public:
+    ImageWidget(QString fileName, QWidget *parent = 0);
+    ~ImageWidget();
+
+    void initializeGL();
+    void resizeGL(int width, int height);
+    void paintGL();
+
+private:
+    QImage qt_image_;
+    compute::context context_;
+    compute::command_queue queue_;
+    compute::program program_;
+    compute::image2d image_;
+    compute::image_sampler sampler_;
+    GLuint gl_texture_;
+    compute::opengl_texture cl_texture_;
+};
+
+ImageWidget::ImageWidget(QString fileName, QWidget *parent)
+    : QGLWidget(parent),
+      qt_image_(fileName)
+{
+    gl_texture_ = 0;
+}
+
+ImageWidget::~ImageWidget()
+{
+}
+
+void ImageWidget::initializeGL()
+{
+    // setup opengl
+    glDisable(GL_LIGHTING);
+
+    // create the OpenGL/OpenCL shared context
+    context_ = compute::opengl_create_shared_context();
+
+    // get gpu device
+    compute::device gpu = context_.get_device();
+    std::cout << "device: " << gpu.name() << std::endl;
+
+    // setup command queue
+    queue_ = compute::command_queue(context_, gpu);
+
+    // allocate image on the device
+    compute::image_format format =
+        compute::qt_qimage_format_to_image_format(qt_image_.format());
+
+    image_ = compute::image2d(
+        context_, qt_image_.width(), qt_image_.height(), format, CL_MEM_READ_ONLY
+    );
+
+    // transfer image to the device
+    compute::qt_copy_qimage_to_image2d(qt_image_, image_, queue_);
+
+    // setup image sampler (use CL_FILTER_NEAREST to disable linear interpolation)
+    sampler_ = compute::image_sampler(
+        context_, false, CL_ADDRESS_NONE, CL_FILTER_LINEAR
+    );
+
+    // build resize program
+    program_ = compute::program::build_with_source(source, context_);
+}
+
+void ImageWidget::resizeGL(int width, int height)
+{
+#if QT_VERSION >= 0x050000
+    // scale height/width based on device pixel ratio
+    width /= windowHandle()->devicePixelRatio();
+    height /= windowHandle()->devicePixelRatio();
+#endif
+
+    // resize viewport
+    glViewport(0, 0, width, height);
+
+    // delete old texture
+    if(gl_texture_){
+      glDeleteTextures(1, &gl_texture_);
+      gl_texture_ = 0;
+    }
+
+    // generate new texture
+    glGenTextures(1, &gl_texture_);
+    glBindTexture(GL_TEXTURE_2D, gl_texture_);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+    glTexImage2D(
+        GL_TEXTURE_2D, 0, GL_RGBA, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0
+    );
+
+    // create opencl object for the texture
+    cl_texture_ = compute::opengl_texture(
+        context_, GL_TEXTURE_2D, 0, gl_texture_, CL_MEM_WRITE_ONLY
+    );
+}
+
+void ImageWidget::paintGL()
+{
+    float w = width();
+    float h = height();
+
+    glMatrixMode(GL_PROJECTION);
+    glLoadIdentity();
+    glOrtho(0.0, w, 0.0, h, -1.0, 1.0);
+    glMatrixMode(GL_MODELVIEW);
+    glLoadIdentity();
+
+    // setup the resize kernel
+    compute::kernel kernel(program_, "resize_image");
+    kernel.set_arg(0, image_);
+    kernel.set_arg(1, sampler_);
+    kernel.set_arg(2, cl_texture_);
+
+    // acquire the opengl texture so it can be used in opencl
+    compute::opengl_enqueue_acquire_gl_objects(1, &cl_texture_.get(), queue_);
+
+    // execute the resize kernel
+    const size_t global_work_offset[] = { 0, 0 };
+    const size_t global_work_size[] = { size_t(width()), size_t(height()) };
+
+    queue_.enqueue_nd_range_kernel(
+        kernel, 2, global_work_offset, global_work_size, 0
+    );
+
+    // release the opengl texture so it can be used by opengl
+    compute::opengl_enqueue_release_gl_objects(1, &cl_texture_.get(), queue_);
+
+    // ensure opencl is finished before rendering in opengl
+    queue_.finish();
+
+    // draw a single quad with the resized image texture
+    glEnable(GL_TEXTURE_2D);
+    glBindTexture(GL_TEXTURE_2D, gl_texture_);
+
+    glBegin(GL_QUADS);
+    glTexCoord2f(0, 0); glVertex2f(0, 0);
+    glTexCoord2f(0, 1); glVertex2f(0, h);
+    glTexCoord2f(1, 1); glVertex2f(w, h);
+    glTexCoord2f(1, 0); glVertex2f(w, 0);
+    glEnd();
+}
+
+// the resize image example demonstrates how to interactively resize a
+// 2D image and display it using OpenGL. a image sampler is used to perform
+// hardware-accelerated linear interpolation for the resized image.
+int main(int argc, char *argv[])
+{
+    // setup command line arguments
+    po::options_description options("options");
+    options.add_options()
+        ("help", "show usage instructions")
+        ("file", po::value<std::string>(), "image file name (e.g. /path/to/image.png)")
+    ;
+    po::positional_options_description positional_options;
+    positional_options.add("file", 1);
+
+    // parse command line
+    po::variables_map vm;
+    po::store(
+        po::command_line_parser(argc, argv)
+            .options(options)
+            .positional(positional_options)
+            .run(),
+        vm
+    );
+    po::notify(vm);
+
+    // check for file argument
+    if(vm.count("help") || !vm.count("file")){
+        std::cout << options << std::endl;
+        return -1;
+    }
+
+    // get file name
+    std::string file_name = vm["file"].as<std::string>();
+
+    // setup qt application
+    QApplication app(argc, argv);
+
+    // setup image widget
+    ImageWidget widget(QString::fromStdString(file_name));
+    widget.show();
+
+    // run qt application
+    return app.exec();
+}
+
+#include "resize_image.moc"
diff --git a/src/boost/libs/compute/example/simple_kernel.cpp b/src/boost/libs/compute/example/simple_kernel.cpp
new file mode 100644
index 00000000..4aa1872f
--- /dev/null
+++ b/src/boost/libs/compute/example/simple_kernel.cpp
@@ -0,0 +1,84 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <boost/compute/core.hpp>
+
+namespace compute = boost::compute;
+
+// this example demonstrates how to use the Boost.Compute classes to
+// setup and run a simple vector addition kernel on the GPU
+int main()
+{
+    // get the default device
+    compute::device device = compute::system::default_device();
+
+    // create a context for the device
+    compute::context context(device);
+
+    // setup input arrays
+    float a[] = { 1, 2, 3, 4 };
+    float b[] = { 5, 6, 7, 8 };
+
+    // make space for the output
+    float c[] = { 0, 0, 0, 0 };
+
+    // create memory buffers for the input and output
+    compute::buffer buffer_a(context, 4 * sizeof(float));
+    compute::buffer buffer_b(context, 4 * sizeof(float));
+    compute::buffer buffer_c(context, 4 * sizeof(float));
+
+    // source code for the add kernel
+    const char source[] =
+        "__kernel void add(__global const float *a,"
+        "                  __global const float *b,"
+        "                  __global float *c)"
+        "{"
+        "    const uint i = get_global_id(0);"
+        "    c[i] = a[i] + b[i];"
+        "}";
+
+    // create the program with the source
+    compute::program program =
+        compute::program::create_with_source(source, context);
+
+    // compile the program
+    program.build();
+
+    // create the kernel
+    compute::kernel kernel(program, "add");
+
+    // set the kernel arguments
+    kernel.set_arg(0, buffer_a);
+    kernel.set_arg(1, buffer_b);
+    kernel.set_arg(2, buffer_c);
+
+    // create a command queue
+    compute::command_queue queue(context, device);
+
+    // write the data from 'a' and 'b' to the device
+    queue.enqueue_write_buffer(buffer_a, 0, 4 * sizeof(float), a);
+    queue.enqueue_write_buffer(buffer_b, 0, 4 * sizeof(float), b);
+
+    // run the add kernel
+    queue.enqueue_1d_range_kernel(kernel, 0, 4, 0);
+
+    // transfer results back to the host array 'c'
+    queue.enqueue_read_buffer(buffer_c, 0, 4 * sizeof(float), c);
+
+    // print out results in 'c'
+    std::cout << "c: [" << c[0] << ", "
+                        << c[1] << ", "
+                        << c[2] << ", "
+                        << c[3] << "]" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/simple_moving_average.cpp b/src/boost/libs/compute/example/simple_moving_average.cpp
new file mode 100644
index 00000000..968db393
--- /dev/null
+++ b/src/boost/libs/compute/example/simple_moving_average.cpp
@@ -0,0 +1,139 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Benoit Dequidt <benoit.dequidt@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <cstdlib>
+
+#include <boost/compute/core.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/inclusive_scan.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/type_traits/type_name.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+/// warning precision is not precise due
+/// to the float error accumulation when size is large enough
+/// for more precision use double
+/// or a kahan sum else results can diverge
+/// from the CPU implementation
+compute::program make_sma_program(const compute::context& context)
+{
+    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        __kernel void SMA(__global const float *scannedValues, int size, __global float *output, int wSize)
+        {
+            const int gid = get_global_id(0);
+
+            float cumValues = 0.f;
+            int endIdx = gid + wSize/2;
+            int startIdx = gid -1 - wSize/2;
+
+            if(endIdx > size -1)
+                endIdx = size -1;
+
+            cumValues += scannedValues[endIdx];
+            if(startIdx < 0)
+                startIdx = -1;
+            else
+                cumValues -= scannedValues[startIdx];
+
+            output[gid] =(float)( cumValues / ( float )(endIdx - startIdx));
+        }
+   );
+
+    // create sma program
+    return compute::program::build_with_source(source,context);
+}
+
+bool check_results(const std::vector<float>& values, const std::vector<float>& smoothValues, unsigned int wSize)
+{
+    int size = values.size();
+    if(size != (int)smoothValues.size()) return false;
+
+    int semiWidth = wSize/2;
+
+    bool ret = true;
+    for(int idx = 0 ; idx < size ; ++idx)
+    {
+        int start = (std::max)(idx - semiWidth,0);
+        int end = (std::min)(idx + semiWidth,size-1);
+        float res = 0;
+        for(int j = start ; j <= end ; ++j)
+        {
+            res+= values[j];
+        }
+
+        res /= float(end - start +1);
+
+        if(std::abs(res-smoothValues[idx]) > 1e-3)
+        {
+            std::cout << "idx = " << idx << " -- expected = " << res << " -- result = " << smoothValues[idx] << std::endl;
+            ret = false;
+        }
+    }
+
+    return ret;
+}
+
+// generate a uniform law over [0,10]
+float myRand()
+{
+    static const double divisor = double(RAND_MAX)+1.;
+    return double(rand())/divisor * 10.;
+}
+
+int main()
+{
+    unsigned int size = 1024;
+    // wSize must be odd
+    unsigned int wSize = 21;
+    // get the default device
+    compute::device device = compute::system::default_device();
+    // create a context for the device
+    compute::context context(device);
+    // get the program
+    compute::program program = make_sma_program(context);
+
+    // create vector of random numbers on the host
+    std::vector<float> host_vector(size);
+    std::vector<float> host_result(size);
+    std::generate(host_vector.begin(), host_vector.end(), myRand);
+
+    compute::vector<float> a(size,context);
+    compute::vector<float> b(size,context);
+    compute::vector<float> c(size,context);
+    compute::command_queue queue(context, device);
+
+    compute::copy(host_vector.begin(),host_vector.end(),a.begin(),queue);
+
+    // scan values
+    compute::inclusive_scan(a.begin(),a.end(),b.begin(),queue);
+    // sma kernel
+    compute::kernel kernel(program, "SMA");
+    kernel.set_arg(0,b.get_buffer());
+    kernel.set_arg(1,(int)b.size());
+    kernel.set_arg(2,c.get_buffer());
+    kernel.set_arg(3,(int)wSize);
+
+    using compute::uint_;
+    uint_ tpb = 128;
+    uint_ workSize = size;
+    queue.enqueue_1d_range_kernel(kernel,0,workSize,tpb);
+
+    compute::copy(c.begin(),c.end(),host_result.begin(),queue);
+
+    bool res = check_results(host_vector,host_result,wSize);
+    std::string status = res ? "results are equivalent" : "GPU results differs from CPU one's";
+    std::cout << status << std::endl;
+
+    return 0;
+}
+
diff --git a/src/boost/libs/compute/example/sort_vector.cpp b/src/boost/libs/compute/example/sort_vector.cpp
new file mode 100644
index 00000000..39ec8ea7
--- /dev/null
+++ b/src/boost/libs/compute/example/sort_vector.cpp
@@ -0,0 +1,68 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/sort.hpp>
+#include <boost/compute/container/vector.hpp>
+
+namespace compute = boost::compute;
+
+int rand_int()
+{
+    return rand() % 100;
+}
+
+// this example demonstrates how to sort a vector of ints on the GPU
+int main()
+{
+    // create vector of random values on the host
+    std::vector<int> host_vector(10);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+
+    // print out input vector
+    std::cout << "input:  [ ";
+    for(size_t i = 0; i < host_vector.size(); i++){
+        std::cout << host_vector[i];
+
+        if(i != host_vector.size() - 1){
+            std::cout << ", ";
+        }
+    }
+    std::cout << " ]" << std::endl;
+
+    // transfer the values to the device
+    compute::vector<int> device_vector = host_vector;
+
+    // sort the values on the device
+    compute::sort(device_vector.begin(), device_vector.end());
+
+    // transfer the values back to the host
+    compute::copy(device_vector.begin(),
+                  device_vector.end(),
+                  host_vector.begin());
+
+    // print out the sorted vector
+    std::cout << "output: [ ";
+    for(size_t i = 0; i < host_vector.size(); i++){
+        std::cout << host_vector[i];
+
+        if(i != host_vector.size() - 1){
+            std::cout << ", ";
+        }
+    }
+    std::cout << " ]" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/example/threefry_engine.cpp b/src/boost/libs/compute/example/threefry_engine.cpp
new file mode 100644
index 00000000..5cbf6c8e
--- /dev/null
+++ b/src/boost/libs/compute/example/threefry_engine.cpp
@@ -0,0 +1,43 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Muhammad Junaid Muzammil <mjunaidmuzammil@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://kylelutz.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+
+#include <boost/compute/random/threefry_engine.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/context.hpp>
+#include <boost/compute/device.hpp>
+#include <boost/compute/system.hpp>
+#include <iostream>
+
+int main() 
+{
+    using boost::compute::uint_;
+    boost::compute::device device = boost::compute::system::default_device();
+    boost::compute::context context(device);
+    boost::compute::command_queue queue(context, device);
+    boost::compute::threefry_engine<> rng(queue);
+    boost::compute::vector<uint_> vector_ctr(20, context);
+    
+    uint32_t ctr[20];
+    for(int i = 0; i < 10; i++) {
+        ctr[i*2] = i;
+        ctr[i*2+1] = 0;
+    }
+    boost::compute::copy(ctr, ctr+20, vector_ctr.begin(), queue);
+    rng.generate(vector_ctr.begin(), vector_ctr.end(), queue);
+    boost::compute::copy(vector_ctr.begin(), vector_ctr.end(), ctr, queue);
+
+    for(int i = 0; i < 10; i++) {
+        std::cout << std::hex << ctr[i*2] << " " << ctr[i*2+1] << std::endl;
+    }
+    return 0;
+}
+
diff --git a/src/boost/libs/compute/example/time_copy.cpp b/src/boost/libs/compute/example/time_copy.cpp
new file mode 100644
index 00000000..35168770
--- /dev/null
+++ b/src/boost/libs/compute/example/time_copy.cpp
@@ -0,0 +1,63 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+//[time_copy_example
+
+#include <vector>
+#include <cstdlib>
+#include <iostream>
+
+#include <boost/compute/event.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/async/future.hpp>
+#include <boost/compute/container/vector.hpp>
+
+namespace compute = boost::compute;
+
+int main()
+{
+    // get the default device
+    compute::device gpu = compute::system::default_device();
+
+    // create context for default device
+    compute::context context(gpu);
+
+    // create command queue with profiling enabled
+    compute::command_queue queue(
+        context, gpu, compute::command_queue::enable_profiling
+    );
+
+    // generate random data on the host
+    std::vector<int> host_vector(16000000);
+    std::generate(host_vector.begin(), host_vector.end(), rand);
+
+    // create a vector on the device
+    compute::vector<int> device_vector(host_vector.size(), context);
+
+    // copy data from the host to the device
+    compute::future<void> future = compute::copy_async(
+        host_vector.begin(), host_vector.end(), device_vector.begin(), queue
+    );
+
+    // wait for copy to finish
+    future.wait();
+
+    // get elapsed time from event profiling information
+    boost::chrono::milliseconds duration =
+        future.get_event().duration<boost::chrono::milliseconds>();
+
+    // print elapsed time in milliseconds
+    std::cout << "time: " << duration.count() << " ms" << std::endl;
+
+    return 0;
+}
+
+//]
diff --git a/src/boost/libs/compute/example/transform_sqrt.cpp b/src/boost/libs/compute/example/transform_sqrt.cpp
new file mode 100644
index 00000000..860063fc
--- /dev/null
+++ b/src/boost/libs/compute/example/transform_sqrt.cpp
@@ -0,0 +1,58 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+//[transform_sqrt_example
+
+#include <vector>
+#include <algorithm>
+
+#include <boost/compute/algorithm/transform.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/functional/math.hpp>
+
+namespace compute = boost::compute;
+
+int main()
+{
+    // get default device and setup context
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+
+    // generate random data on the host
+    std::vector<float> host_vector(10000);
+    std::generate(host_vector.begin(), host_vector.end(), rand);
+
+    // create a vector on the device
+    compute::vector<float> device_vector(host_vector.size(), context);
+
+    // transfer data from the host to the device
+    compute::copy(
+        host_vector.begin(), host_vector.end(), device_vector.begin(), queue
+    );
+
+    // calculate the square-root of each element in-place
+    compute::transform(
+        device_vector.begin(),
+        device_vector.end(),
+        device_vector.begin(),
+        compute::sqrt<float>(),
+        queue
+    );
+
+    // copy values back to the host
+    compute::copy(
+        device_vector.begin(), device_vector.end(), host_vector.begin(), queue
+    );
+
+    return 0;
+}
+
+//]
diff --git a/src/boost/libs/compute/example/vector_addition.cpp b/src/boost/libs/compute/example/vector_addition.cpp
new file mode 100644
index 00000000..1f3aa92d
--- /dev/null
+++ b/src/boost/libs/compute/example/vector_addition.cpp
@@ -0,0 +1,57 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/transform.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/functional/operator.hpp>
+
+namespace compute = boost::compute;
+
+// this example demonstrates how to use Boost.Compute's STL
+// implementation to add two vectors on the GPU
+int main()
+{
+    // setup input arrays
+    float a[] = { 1, 2, 3, 4 };
+    float b[] = { 5, 6, 7, 8 };
+
+    // make space for the output
+    float c[] = { 0, 0, 0, 0 };
+
+    // create vectors and transfer data for the input arrays 'a' and 'b'
+    compute::vector<float> vector_a(a, a + 4);
+    compute::vector<float> vector_b(b, b + 4);
+
+    // create vector for the output array
+    compute::vector<float> vector_c(4);
+
+    // add the vectors together
+    compute::transform(
+        vector_a.begin(),
+        vector_a.end(),
+        vector_b.begin(),
+        vector_c.begin(),
+        compute::plus<float>()
+    );
+
+    // transfer results back to the host array 'c'
+    compute::copy(vector_c.begin(), vector_c.end(), c);
+
+    // print out results in 'c'
+    std::cout << "c: [" << c[0] << ", "
+                        << c[1] << ", "
+                        << c[2] << ", "
+                        << c[3] << "]" << std::endl;
+
+    return 0;
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 18:24:20 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 18:24:20 +0000
commit	483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree	e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/boost/libs/compute/example
parent	Initial commit. (diff)
download	ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip