Adding upstream version 16.2.11+ds.upstream/16.2.11+ds upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
commit: 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree: 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/boost/libs/compute/perf
parent: Initial commit. (diff)
download: ceph-upstream/16.2.11+ds.tar.xz
ceph-upstream/16.2.11+ds.zip
110 files changed, 7351 insertions, 0 deletions
diff --git a/src/boost/libs/compute/perf/CMakeLists.txt b/src/boost/libs/compute/perf/CMakeLists.txt
new file mode 100644
index 000000000..b04429e9d
--- /dev/null
+++ b/src/boost/libs/compute/perf/CMakeLists.txt
@@ -0,0 +1,210 @@
+# ---------------------------------------------------------------------------
+#  Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+#
+#  Distributed under the Boost Software License, Version 1.0
+#  See accompanying file LICENSE_1_0.txt or copy at
+#  http://www.boost.org/LICENSE_1_0.txt
+#
+# ---------------------------------------------------------------------------
+
+include_directories(../include)
+
+# Boost components required by every benchmark
+set(PERF_BOOST_COMPONENTS system timer chrono program_options)
+if(BOOST_COMPUTE_USE_OFFLINE_CACHE)
+  list(APPEND PERF_BOOST_COMPONENTS filesystem)
+endif()
+
+# if() is given variable names, so an unset option is false, not a parse error
+if(BOOST_COMPUTE_THREAD_SAFE AND NOT BOOST_COMPUTE_USE_CPP11)
+  list(APPEND PERF_BOOST_COMPONENTS thread)
+elseif(BOOST_COMPUTE_HAVE_BOLT AND BOOST_COMPUTE_USE_CPP11)
+  list(APPEND PERF_BOOST_COMPONENTS thread)
+endif()
+if(BOOST_COMPUTE_HAVE_BOLT AND BOOST_COMPUTE_USE_CPP11)
+  list(APPEND PERF_BOOST_COMPONENTS date_time)
+endif()
+
+if(PERF_BOOST_COMPONENTS)
+  list(REMOVE_DUPLICATES PERF_BOOST_COMPONENTS)
+endif()
+find_package(Boost 1.54 REQUIRED COMPONENTS ${PERF_BOOST_COMPONENTS})
+include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
+
+set(BENCHMARKS
+  accumulate
+  bernoulli_distribution
+  binary_find
+  cart_to_polar
+  comparison_sort
+  copy_if
+  copy_to_device
+  count
+  discrete_distribution
+  erase_remove
+  exclusive_scan
+  fill
+  find
+  find_end
+  includes
+  inner_product
+  is_permutation
+  is_sorted
+  max_element
+  merge
+  next_permutation
+  nth_element
+  partial_sum
+  partition
+  partition_point
+  prev_permutation
+  reverse
+  reverse_copy
+  rotate
+  rotate_copy
+  host_sort
+  random_number_engine
+  reduce_by_key
+  saxpy
+  search
+  search_n
+  set_difference
+  set_intersection
+  set_symmetric_difference
+  set_union
+  sort
+  sort_by_key
+  sort_float
+  stable_partition
+  uniform_int_distribution
+  unique
+  unique_copy
+)
+# one perf_<name> executable per entry, built from perf_<name>.cpp
+foreach(BENCHMARK ${BENCHMARKS})
+  set(PERF_TARGET perf_${BENCHMARK})
+  add_executable(${PERF_TARGET} perf_${BENCHMARK}.cpp)
+  target_link_libraries(${PERF_TARGET} ${OpenCL_LIBRARIES} ${Boost_LIBRARIES})
+endforeach()
+
+# STL benchmarks (for comparison)
+set(STL_BENCHMARKS
+  stl_accumulate
+  stl_count
+  stl_find
+  stl_find_end
+  stl_includes
+  stl_inner_product
+  stl_max_element
+  stl_merge
+  stl_next_permutation
+  stl_partial_sum
+  stl_partition
+  stl_prev_permutation
+  stl_reverse
+  stl_reverse_copy
+  stl_rotate
+  stl_rotate_copy
+  stl_saxpy
+  stl_search
+  stl_search_n
+  stl_set_difference
+  stl_set_intersection
+  stl_set_symmetric_difference
+  stl_set_union
+  stl_sort
+  stl_stable_partition
+  stl_unique
+  stl_unique_copy
+)
+
+# STL benchmarks that are only added when BOOST_COMPUTE_USE_CPP11 is on
+if(BOOST_COMPUTE_USE_CPP11)
+  list(APPEND
+    STL_BENCHMARKS
+    stl_is_permutation
+    stl_partition_point
+  )
+endif()
+# these targets link Boost only (no OpenCL libraries)
+foreach(BENCHMARK ${STL_BENCHMARKS})
+  set(PERF_TARGET perf_${BENCHMARK})
+  add_executable(${PERF_TARGET} perf_${BENCHMARK}.cpp)
+  target_link_libraries(${PERF_TARGET} ${Boost_LIBRARIES})
+endforeach()
+
+# CUDA/Thrust benchmarks (for comparison), built only with BOOST_COMPUTE_HAVE_CUDA
+if(BOOST_COMPUTE_HAVE_CUDA)
+  find_package(CUDA 5.0 REQUIRED)
+
+  set(CUDA_BENCHMARKS
+    thrust_accumulate
+    thrust_count
+    thrust_exclusive_scan
+    thrust_find
+    thrust_inner_product
+    thrust_merge
+    thrust_partial_sum
+    thrust_partition
+    thrust_reduce_by_key
+    thrust_reverse
+    thrust_reverse_copy
+    thrust_rotate
+    thrust_saxpy
+    thrust_set_difference
+    thrust_sort
+    thrust_unique
+  )
+  # each perf_<name>.cu source is built through cuda_add_executable()
+  foreach(BENCHMARK ${CUDA_BENCHMARKS})
+    set(PERF_TARGET perf_${BENCHMARK})
+    cuda_add_executable(${PERF_TARGET} perf_${BENCHMARK}.cu)
+    target_link_libraries(${PERF_TARGET} ${CUDA_LIBRARIES} ${Boost_LIBRARIES})
+  endforeach()
+endif()
+
+# Intel TBB benchmarks (for comparison), built only with BOOST_COMPUTE_HAVE_TBB
+if(BOOST_COMPUTE_HAVE_TBB)
+  find_package(TBB REQUIRED)
+  include_directories(SYSTEM ${TBB_INCLUDE_DIRS})
+
+  set(TBB_BENCHMARKS
+    tbb_accumulate
+    tbb_merge
+    tbb_sort
+  )
+  # one perf_<name> executable per entry, linked against TBB and Boost
+  foreach(BENCHMARK ${TBB_BENCHMARKS})
+    set(PERF_TARGET perf_${BENCHMARK})
+    add_executable(${PERF_TARGET} perf_${BENCHMARK}.cpp)
+    target_link_libraries(${PERF_TARGET} ${TBB_LIBRARIES} ${Boost_LIBRARIES})
+  endforeach()
+endif()
+
+# Bolt C++ template library benchmarks (for comparison); these need C++11
+if(BOOST_COMPUTE_HAVE_BOLT AND BOOST_COMPUTE_USE_CPP11)
+  find_package(Bolt REQUIRED)
+  include_directories(SYSTEM ${BOLT_INCLUDE_DIRS})
+
+  set(BOLT_BENCHMARKS
+    bolt_accumulate
+    bolt_count
+    bolt_exclusive_scan
+    bolt_fill
+    bolt_inner_product
+    bolt_max_element
+    bolt_merge
+    bolt_partial_sum
+    bolt_reduce_by_key
+    bolt_saxpy
+    bolt_sort
+  )
+  # one perf_<name> executable per entry, linked against OpenCL, Bolt and Boost
+  foreach(BENCHMARK ${BOLT_BENCHMARKS})
+    set(PERF_TARGET perf_${BENCHMARK})
+    add_executable(${PERF_TARGET} perf_${BENCHMARK}.cpp)
+    target_link_libraries(${PERF_TARGET} ${OpenCL_LIBRARIES} ${BOLT_LIBRARIES} ${Boost_LIBRARIES})
+  endforeach()
+elseif(BOOST_COMPUTE_HAVE_BOLT AND NOT BOOST_COMPUTE_USE_CPP11)
+  message(WARNING "BOOST_COMPUTE_USE_CPP11 must be ON for building Bolt C++ Template Library performance tests.")
+endif()
diff --git a/src/boost/libs/compute/perf/perf.hpp b/src/boost/libs/compute/perf/perf.hpp
new file mode 100644
index 000000000..cce0328cd
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf.hpp
@@ -0,0 +1,109 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef PERF_HPP
+#define PERF_HPP
+
+// this header contains general purpose functions and variables used by
+// the boost.compute performance benchmarks.
+
+#include <vector>
+#include <cstdlib>
+#include <algorithm>
+
+#include <boost/lexical_cast.hpp>
+#include <boost/timer/timer.hpp>
+
+static size_t PERF_N = 1024;     // input size; replaced by argv[1] if given
+static size_t PERF_TRIALS = 3;   // trial count; replaced by argv[2] if given
+
+// overrides PERF_N / PERF_TRIALS from the positional command line arguments
+inline void perf_parse_args(int argc, char *argv[])
+{
+    if(argc > 1){
+        PERF_N = boost::lexical_cast<size_t>(argv[1]);
+    }
+    // the trial count is only read when a second argument is present
+    if(argc > 2){
+        PERF_TRIALS = boost::lexical_cast<size_t>(argv[2]);
+    }
+}
+
+// returns a vector of 'size' elements, each filled by one call to rand()
+template<class T>
+std::vector<T> generate_random_vector(const size_t size)
+{
+    std::vector<T> values(size);
+    std::generate(values.begin(), values.end(), rand);
+    return values;
+}
+
+// a simple timer wrapper which records multiple wall-clock time entries
+class perf_timer
+{
+public:
+    typedef boost::timer::nanosecond_type nanosecond_type;
+    // leaves the underlying timer stopped until start() is called
+    perf_timer()
+    {
+        timer.stop();
+    }
+    // starts the underlying timer for the next entry
+    void start()
+    {
+        timer.start();
+    }
+    // stops the timer and appends its elapsed wall time as a new entry
+    void stop()
+    {
+        timer.stop();
+        times.push_back(timer.elapsed().wall);
+    }
+    // number of entries recorded so far
+    size_t trials() const
+    {
+        return times.size();
+    }
+    // discards every recorded entry
+    void clear()
+    {
+        times.clear();
+    }
+    // most recent entry, or 0 when nothing has been recorded
+    nanosecond_type last_time() const
+    {
+        return times.empty() ? 0 : times.back();
+    }
+    // smallest entry, or 0 when nothing has been recorded (e.g. zero trials)
+    nanosecond_type min_time() const
+    {
+        return times.empty() ? 0 : *std::min_element(times.begin(), times.end());
+    }
+    // largest entry, or 0 when nothing has been recorded (e.g. zero trials)
+    nanosecond_type max_time() const
+    {
+        return times.empty() ? 0 : *std::max_element(times.begin(), times.end());
+    }
+    // public state: the wrapped timer and the recorded wall times (ns)
+    boost::timer::cpu_timer timer;
+    std::vector<boost::timer::nanosecond_type> times;
+};
+
+// computes the throughput in MB/s (1 MB = 1024 * 1024 bytes) reached when
+// 'count' items of type 'T' are processed in 'time' nanoseconds
+template<class T>
+double perf_rate(const size_t count, perf_timer::nanosecond_type time)
+{
+    const double megabytes = double(count * sizeof(T)) / 1024 / 1024;
+    const double seconds = time / 1e9;
+    return megabytes / seconds;
+}
+
+#endif // PERF_HPP
diff --git a/src/boost/libs/compute/perf/perf.py b/src/boost/libs/compute/perf/perf.py
new file mode 100755
index 000000000..c7b33f635
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf.py
@@ -0,0 +1,238 @@
+#!/usr/bin/python
+
+# Copyright (c) 2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+# Distributed under the Boost Software License, Version 1.0
+# See accompanying file LICENSE_1_0.txt or copy at
+# http://www.boost.org/LICENSE_1_0.txt
+#
+# See http://boostorg.github.com/compute for more information.
+
+# driver script for boost.compute benchmarking. will run a
+# benchmark for a given function (e.g. accumulate, sort).
+
+import os
+import sys
+import subprocess
+
+try:
+    import pylab
+except Exception:
+    # plotting is optional; Ctrl-C / SystemExit are no longer swallowed here
+    print('pylab not found, no ploting...')
+
+def run_perf_process(name, size, backend = ""):
+    """Run ./perf/perf_[<backend>_]<name> with `size`; return its "time:" value, or 0 on failure."""
+    if not backend:
+        proc = "perf_%s" % name
+    else:
+        proc = "perf_%s_%s" % (backend, name)
+    filename = "./perf/" + proc
+
+    if not os.path.isfile(filename):
+        print("Error: failed to find ", filename, " for running")
+        return 0
+    try:
+        output = subprocess.check_output([filename, str(int(size))])
+    except Exception:
+        # best-effort: a failed run is reported as 0, but unlike a bare
+        # `except:` this lets KeyboardInterrupt/SystemExit stop the sweep
+        return 0
+    t = 0
+    for line in output.decode('utf8').split("\n"):
+        if line.startswith("time:"):
+            t = float(line.split(":")[1].split()[0])
+    return t
+
+class Report:
+    """Collects (size, time) samples per series name for one benchmark."""
+
+    def __init__(self, name):
+        # benchmark name; shown in display() headings and as the plot title
+        self.name = name
+        # series name -> list of (size, time) tuples, in insertion order
+        self.samples = {}
+
+    def add_sample(self, name, size, time):
+        """Append one (size, time) measurement to the series `name`."""
+        series = self.samples.setdefault(name, [])
+        series.append((size, time))
+
+    def display(self):
+        """Print every series as a heading followed by 'size,time' rows."""
+        for name, series in self.samples.items():
+            print('=== %s with %s ===' % (self.name, name))
+            print('size,time (ms)')
+
+            for entry in series:
+                print('%d,%f' % entry)
+
+    def plot_time(self, name):
+        """Draw a log-log time-vs-size curve for the series `name`.
+
+        Samples whose time is 0 are left out; nothing is drawn when the
+        series is unknown or no sample remains.
+        """
+        if name not in self.samples:
+            return
+
+        # keep only the samples with a non-zero time
+        valid = [entry for entry in self.samples[name] if entry[1] != 0]
+        if not valid:
+            return
+
+        sizes = [entry[0] for entry in valid]
+        times = [entry[1] for entry in valid]
+
+        # the curve is labelled with the series name for the legend
+        pylab.loglog(sizes, times, marker='o', label=name)
+        pylab.xlabel("Size")
+        pylab.ylabel("Time (ms)")
+        pylab.title(self.name)
+
+    def plot_rate(self, name):
+        """Draw a log-log rate-vs-size curve for the series `name`.
+
+        The rate is size / (time * 1e-3); samples whose time is 0 are left
+        out, and nothing is drawn when no sample remains.
+        """
+        if name not in self.samples:
+            return
+
+        # keep only the samples with a non-zero time
+        valid = [entry for entry in self.samples[name] if entry[1] != 0]
+        if not valid:
+            return
+
+        sizes = [entry[0] for entry in valid]
+        rates = [float(entry[0]) / (float(entry[1]) * 1e-3) for entry in valid]
+
+        # the curve is labelled with the series name for the legend
+        pylab.loglog(sizes, rates, marker='o', label=name)
+        pylab.xlabel("Size")
+        pylab.ylabel("Rate (values/s)")
+        pylab.title(self.name)
+
+def run_benchmark(name, sizes, vs=()):
+    """Time benchmark `name` at every size, plus each backend in `vs` listed for it; return a Report."""
+    report = Report(name)
+    for size in sizes:
+        time = run_perf_process(name, size)
+        report.add_sample("compute", size, time)
+
+    # benchmark names that have a perf_<backend>_<name> counterpart
+    competitors = {
+        "thrust" : [
+            "accumulate",
+            "count",
+            "exclusive_scan",
+            "find",
+            "inner_product",
+            "merge",
+            "partial_sum",
+            "partition",
+            "reduce_by_key",
+            "reverse",
+            "reverse_copy",
+            "rotate",
+            "saxpy",
+            "sort",
+            "unique"
+        ],
+        "bolt" : [
+            "accumulate",
+            "count",
+            "exclusive_scan",
+            "fill",
+            "inner_product",
+            "max_element",
+            "merge",
+            "partial_sum",
+            "reduce_by_key",
+            "saxpy",
+            "sort"
+        ],
+        "tbb": [
+            "accumulate",
+            "merge",
+            "sort"
+        ],
+        "stl": [
+            "accumulate",
+            "count",
+            "find",
+            "find_end",
+            "includes",
+            "inner_product",
+            "is_permutation",
+            "max_element",
+            "merge",
+            "next_permutation",
+            "nth_element",
+            "partial_sum",
+            "partition",
+            "partition_point",
+            "prev_permutation",
+            "reverse",
+            "reverse_copy",
+            "rotate",
+            "rotate_copy",
+            "saxpy",
+            "search",
+            "search_n",
+            "set_difference",
+            "set_intersection",
+            "set_symmetric_difference",
+            "set_union",
+            "sort",
+            "stable_partition",
+            "unique",
+            "unique_copy"
+        ]
+    }
+
+    for other in vs:
+        if other not in competitors:
+            continue
+        if name not in competitors[other]:
+            continue
+
+        for size in sizes:
+            time = run_perf_process(name, size, other)
+            report.add_sample(other, size, time)
+
+    return report
+
+if __name__ == '__main__':
+    # first command-line argument selects the benchmark (default: sort)
+    test = sys.argv[1] if len(sys.argv) >= 2 else "sort"
+    print('running %s perf test' % test)
+
+    # input sizes: 2^1 through 2^25, ascending
+    sizes = sorted(2 ** x for x in range(1, 26))
+
+    competitors = ["bolt", "tbb", "thrust", "stl"]
+    report = run_benchmark(test, sizes, competitors)
+
+    # --plot-time wins when both plot flags are present
+    if "--plot-time" in sys.argv:
+        plot = "time"
+    elif "--plot-rate" in sys.argv:
+        plot = "rate"
+    else:
+        plot = None
+
+    # "compute" is drawn first, then each competitor in list order
+    series_names = ["compute"] + competitors
+    if plot == "time":
+        for series in series_names:
+            report.plot_time(series)
+    elif plot == "rate":
+        for series in series_names:
+            report.plot_rate(series)
+
+    # without a plot flag the samples are printed as text instead
+    if plot is None:
+        report.display()
+    else:
+        pylab.legend(loc='upper left')
+        pylab.show()
diff --git a/src/boost/libs/compute/perf/perf_accumulate.cpp b/src/boost/libs/compute/perf/perf_accumulate.cpp
new file mode 100644
index 000000000..bd4276e62
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_accumulate.cpp
@@ -0,0 +1,140 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/program_options.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/accumulate.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+namespace po = boost::program_options;
+namespace compute = boost::compute;
+
+int rand_int() // pseudo-random integer in [0, 25], derived from rand()
+{
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+// times 'trials' runs of compute::accumulate() and returns the fastest (ns)
+template<class T>
+double perf_accumulate(const compute::vector<T>& data, const size_t trials,
+                       compute::command_queue& queue)
+{
+    perf_timer timer;
+    for(size_t run = 0; run != trials; ++run){
+        timer.start();
+        compute::accumulate(data.begin(), data.end(), T(0), queue);
+        queue.finish(); // finish the queue before the timer is stopped
+        timer.stop();
+    }
+    return timer.min_time();
+}
+
+// tries every (tpb, vpt) pair for the "__boost_reduce_on_gpu_" + type name
+// cache entry and stores the pair with the lowest perf_accumulate() time
+template<class T>
+void tune_accumulate(const compute::vector<T>& data, const size_t trials,
+                     compute::command_queue& queue)
+{
+    boost::shared_ptr<compute::detail::parameter_cache> cache =
+        compute::detail::parameter_cache::get_global_cache(queue.get_device());
+    const std::string key =
+        std::string("__boost_reduce_on_gpu_") + compute::type_name<T>();
+
+    // candidate values for the "tpb" and "vpt" parameters
+    const compute::uint_ tpbs[] = { 4, 8, 16, 32, 64, 128, 256, 512, 1024 };
+    const compute::uint_ vpts[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
+    const size_t tpb_count = sizeof(tpbs) / sizeof(*tpbs);
+    const size_t vpt_count = sizeof(vpts) / sizeof(*vpts);
+
+    compute::uint_ best_tpb = 0;
+    compute::uint_ best_vpt = 0;
+    double best_time = (std::numeric_limits<double>::max)();
+
+    for(size_t i = 0; i != tpb_count; ++i){
+        cache->set(key, "tpb", tpbs[i]);
+        for(size_t j = 0; j != vpt_count; ++j){
+            cache->set(key, "vpt", vpts[j]);
+            try {
+                const double elapsed = perf_accumulate(data, trials, queue);
+                if(elapsed < best_time){
+                    best_time = elapsed;
+                    best_tpb = tpbs[i];
+                    best_vpt = vpts[j];
+                }
+            }
+            catch(compute::opencl_error&){
+                // parameter pair not usable on this device; skip it
+            }
+        }
+    }
+    cache->set(key, "tpb", best_tpb);
+    cache->set(key, "vpt", best_vpt);
+}
+
+int main(int argc, char *argv[])
+{
+    // setup command line arguments
+    po::options_description options("options");
+    options.add_options()
+        ("help", "show usage instructions")
+        ("size", po::value<size_t>()->default_value(8192), "input size")
+        ("trials", po::value<size_t>()->default_value(3), "number of trials to run")
+        ("tune", "run tuning procedure")
+    ;
+    po::positional_options_description positional_options;
+    positional_options.add("size", 1);
+    // parse command line ("size" may also be given as the first positional)
+    po::variables_map vm;
+    po::store(
+        po::command_line_parser(argc, argv)
+            .options(options).positional(positional_options).run(),
+        vm
+    );
+    po::notify(vm);
+
+    // --help: print the option summary and exit before touching any device
+    if(vm.count("help")){
+        std::cout << options << std::endl;
+        return 0;
+    }
+    const size_t size = vm["size"].as<size_t>();
+    const size_t trials = vm["trials"].as<size_t>();
+    std::cout << "size: " << size << std::endl;
+
+    // setup context and queue for the default device
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // create vector of random numbers on the host
+    std::vector<int> host_data(size);
+    std::generate(host_data.begin(), host_data.end(), rand_int);
+
+    // create vector on the device and copy the data
+    compute::vector<int> device_data(host_data.begin(), host_data.end(), queue);
+
+    // run tuning procedure (if requested)
+    if(vm.count("tune")){
+        tune_accumulate(device_data, trials, queue);
+    }
+    // run benchmark; the result is in nanoseconds and printed in milliseconds
+    double t = perf_accumulate(device_data, trials, queue);
+    std::cout << "time: " << t / 1e6 << " ms" << std::endl;
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_bernoulli_distribution.cpp b/src/boost/libs/compute/perf/perf_bernoulli_distribution.cpp
new file mode 100644
index 000000000..bffbe58f1
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_bernoulli_distribution.cpp
@@ -0,0 +1,46 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/random/default_random_engine.hpp>
+#include <boost/compute/random/bernoulli_distribution.hpp>
+
+#include "perf.hpp"
+
+namespace compute = boost::compute;
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+    // context and command queue on the default device
+    compute::device dev = compute::system::default_device();
+    compute::context context(dev);
+    compute::command_queue queue(context, dev);
+
+    // device-side output buffer of PERF_N elements
+    compute::vector<bool> samples(PERF_N, context);
+    compute::default_random_engine engine(queue);
+    compute::bernoulli_distribution<float> bernoulli(0.5);
+
+    // a single generate() call is timed (PERF_TRIALS is not used here)
+    perf_timer timer;
+    timer.start();
+    bernoulli.generate(samples.begin(), samples.end(), engine, queue);
+    queue.finish();
+    timer.stop();
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_binary_find.cpp b/src/boost/libs/compute/perf/perf_binary_find.cpp
new file mode 100644
index 000000000..ee7c46317
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_binary_find.cpp
@@ -0,0 +1,71 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/lambda.hpp>
+#include <boost/compute/algorithm/detail/binary_find.hpp>
+#include <boost/compute/algorithm/partition.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int() // pseudo-random integer in [0, 25], derived from rand()
+{
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    namespace compute = boost::compute;
+    using compute::_1;
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // context and command queue on the default device
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // host-side input: PERF_N values produced by rand_int()
+    std::vector<int> host_values(PERF_N);
+    std::generate(host_values.begin(), host_values.end(), rand_int);
+
+    // upload the input into a device vector of the same length
+    compute::vector<int> device_values(PERF_N, context);
+    compute::copy(
+        host_values.begin(), host_values.end(), device_values.begin(), queue
+    );
+
+    // partition the device data with the predicate (value < 20)
+    compute::partition(
+        device_values.begin(), device_values.end(), _1 < 20, queue
+    );
+
+    // finish all setup work so that it is not included in the timings
+    queue.finish();
+    perf_timer timer;
+    for(size_t run = 0; run != PERF_TRIALS; ++run){
+        timer.start();
+        compute::detail::binary_find(
+            device_values.begin(), device_values.end(), _1 >= 20, queue
+        );
+        queue.finish();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_bolt_accumulate.cpp b/src/boost/libs/compute/perf/perf_bolt_accumulate.cpp
new file mode 100644
index 000000000..5a6b9b9c8
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_bolt_accumulate.cpp
@@ -0,0 +1,51 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+#include <vector>
+
+#include <bolt/cl/copy.h>
+#include <bolt/cl/device_vector.h>
+#include <bolt/cl/reduce.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    bolt::cl::control control = bolt::cl::control::getDefault();
+    ::cl::Device device = control.getDevice();
+    std::cout << "device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
+
+    // host-side input filled by generate_random_vector()
+    std::vector<int> input = generate_random_vector<int>(PERF_N);
+
+    // device-side vector of the same length
+    bolt::cl::device_vector<int> device_input(PERF_N);
+
+    // upload the host input to the device
+    bolt::cl::copy(input.begin(), input.end(), device_input.begin());
+
+    // only the reduce call is timed; the last result is printed below
+    perf_timer timer;
+    int sum = 0;
+    for(size_t run = 0; run != PERF_TRIALS; ++run){
+        timer.start();
+        sum = bolt::cl::reduce(device_input.begin(), device_input.end());
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "sum: " << sum << std::endl;
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_bolt_count.cpp b/src/boost/libs/compute/perf/perf_bolt_count.cpp
new file mode 100644
index 000000000..a23c5ac3b
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_bolt_count.cpp
@@ -0,0 +1,57 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+#include <vector>
+
+#include <bolt/cl/count.h>
+#include <bolt/cl/copy.h>
+#include <bolt/cl/device_vector.h>
+
+#include "perf.hpp"
+
+int rand_int() // pseudo-random integer in [0, 25], derived from rand()
+{
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    bolt::cl::control control = bolt::cl::control::getDefault();
+    ::cl::Device device = control.getDevice();
+    std::cout << "device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
+
+    // host-side input: PERF_N values produced by rand_int()
+    std::vector<int> host_values(PERF_N);
+    std::generate(host_values.begin(), host_values.end(), rand_int);
+
+    // device-side vector of the same length
+    bolt::cl::device_vector<int> device_values(PERF_N);
+
+    // upload the host input to the device
+    bolt::cl::copy(host_values.begin(), host_values.end(), device_values.begin());
+
+    // only the count call (occurrences of the value 4) is timed
+    perf_timer timer;
+    size_t matches = 0;
+    for(size_t run = 0; run != PERF_TRIALS; ++run){
+        timer.start();
+        matches = bolt::cl::count(control, device_values.begin(), device_values.end(), 4);
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "count: " << matches << std::endl;
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_bolt_exclusive_scan.cpp b/src/boost/libs/compute/perf/perf_bolt_exclusive_scan.cpp
new file mode 100644
index 000000000..a60e4655f
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_bolt_exclusive_scan.cpp
@@ -0,0 +1,52 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+#include <vector>
+
+#include <bolt/cl/scan.h>
+#include <bolt/cl/copy.h>
+#include <bolt/cl/device_vector.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    bolt::cl::control control = bolt::cl::control::getDefault();
+    ::cl::Device device = control.getDevice();
+    std::cout << "device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
+
+    // host-side input filled by generate_random_vector()
+    std::vector<int> host_values = generate_random_vector<int>(PERF_N);
+
+    // device-side vector of the same length
+    bolt::cl::device_vector<int> device_values(PERF_N);
+
+    perf_timer timer;
+    for(size_t run = 0; run != PERF_TRIALS; ++run){
+        // the scan writes into its own input, so re-upload it (untimed) first
+        bolt::cl::copy(host_values.begin(), host_values.end(), device_values.begin());
+
+        timer.start();
+        bolt::cl::exclusive_scan(device_values.begin(), device_values.end(), device_values.begin());
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    // download the scanned values back into the host vector
+    bolt::cl::copy(device_values.begin(), device_values.end(), host_values.begin());
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_bolt_fill.cpp b/src/boost/libs/compute/perf/perf_bolt_fill.cpp
new file mode 100644
index 000000000..50b6e85e0
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_bolt_fill.cpp
@@ -0,0 +1,43 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+#include <vector>
+
+#include <bolt/cl/fill.h>
+#include <bolt/cl/copy.h>
+#include <bolt/cl/device_vector.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    bolt::cl::control control = bolt::cl::control::getDefault();
+    ::cl::Device device = control.getDevice();
+    std::cout << "device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
+
+    // device vector of PERF_N elements, constructed with the value 0
+    bolt::cl::device_vector<int> device_values(PERF_N, 0);
+
+    perf_timer timer;
+    for(size_t run = 0; run != PERF_TRIALS; ++run){
+        timer.start();
+        bolt::cl::fill(device_values.begin(), device_values.end(), int(run));
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_bolt_inner_product.cpp b/src/boost/libs/compute/perf/perf_bolt_inner_product.cpp
new file mode 100644
index 000000000..4c9652e6e
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_bolt_inner_product.cpp
@@ -0,0 +1,56 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+#include <vector>
+
+#include <bolt/cl/inner_product.h>
+#include <bolt/cl/copy.h>
+#include <bolt/cl/device_vector.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // report which OpenCL device Bolt's default control refers to
+    bolt::cl::control default_control = bolt::cl::control::getDefault();
+    ::cl::Device cl_device = default_control.getDevice();
+    std::cout << "device: " << cl_device.getInfo<CL_DEVICE_NAME>() << std::endl;
+
+    // two random operand vectors of PERF_N ints on the host
+    std::vector<int> lhs_host = generate_random_vector<int>(PERF_N);
+    std::vector<int> rhs_host = generate_random_vector<int>(PERF_N);
+
+    // matching storage on the device
+    bolt::cl::device_vector<int> lhs_device(PERF_N);
+    bolt::cl::device_vector<int> rhs_device(PERF_N);
+
+    // upload both operands before any timing starts
+    bolt::cl::copy(lhs_host.begin(), lhs_host.end(), lhs_device.begin());
+    bolt::cl::copy(rhs_host.begin(), rhs_host.end(), rhs_device.begin());
+
+    // only the inner product is timed; the last trial's value is printed
+    perf_timer timer;
+    int dot = 0;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        timer.start();
+        dot = bolt::cl::inner_product(
+            lhs_device.begin(), lhs_device.end(), rhs_device.begin(), 0
+        );
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "product: " << dot << std::endl;
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_bolt_max_element.cpp b/src/boost/libs/compute/perf/perf_bolt_max_element.cpp
new file mode 100644
index 000000000..238b1ba2f
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_bolt_max_element.cpp
@@ -0,0 +1,69 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+#include <vector>
+
+#include <bolt/cl/copy.h>
+#include <bolt/cl/device_vector.h>
+#include <bolt/cl/max_element.h>
+
+#include "perf.hpp"
+
+// Returns rand() reduced modulo 10000000, i.e. a value in [0, 10000000).
+int rand_int() {
+    return int(rand() % 10000000);
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // report which OpenCL device Bolt's default control refers to
+    bolt::cl::control default_control = bolt::cl::control::getDefault();
+    ::cl::Device cl_device = default_control.getDevice();
+    std::cout << "device: " << cl_device.getInfo<CL_DEVICE_NAME>() << std::endl;
+
+    typedef bolt::cl::device_vector<int> device_ints;
+
+    // random input on the host, mirrored into a device vector of equal size
+    std::vector<int> input = generate_random_vector<int>(PERF_N);
+    device_ints d_input(PERF_N);
+    bolt::cl::copy(input.begin(), input.end(), d_input.begin());
+
+    // only the search is timed; the iterator from the last trial is kept
+    perf_timer timer;
+    device_ints::iterator largest = d_input.begin();
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        timer.start();
+        largest = bolt::cl::max_element(d_input.begin(), d_input.end());
+        timer.stop();
+    }
+
+    // value the last trial's iterator refers to
+    const int device_max = *largest;
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "max: " << device_max << std::endl;
+
+    // cross-check against std::max_element on the host copy
+    const int host_max = *std::max_element(input.begin(), input.end());
+    if(device_max == host_max){
+        return 0;
+    }
+
+    std::cout << "ERROR: "
+              << "device_max (" << device_max << ") "
+              << "!= "
+              << "host_max (" << host_max << ")"
+              << std::endl;
+    return -1;
+}
diff --git a/src/boost/libs/compute/perf/perf_bolt_merge.cpp b/src/boost/libs/compute/perf/perf_bolt_merge.cpp
new file mode 100644
index 000000000..c5ee12ac1
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_bolt_merge.cpp
@@ -0,0 +1,60 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+#include <vector>
+
+#include <bolt/cl/merge.h>
+#include <bolt/cl/copy.h>
+#include <bolt/cl/device_vector.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    bolt::cl::control ctrl = bolt::cl::control::getDefault();
+    ::cl::Device device = ctrl.getDevice();
+    std::cout << "device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
+
+    // two random host ranges of floor(PERF_N / 2) and ceil(PERF_N / 2) elements
+    std::vector<int> host_vec1 = generate_random_vector<int>(PERF_N / 2);
+    std::vector<int> host_vec2 = generate_random_vector<int>(PERF_N - PERF_N / 2);
+    // merge expects both of its inputs to be sorted
+    std::sort(host_vec1.begin(), host_vec1.end());
+    std::sort(host_vec2.begin(), host_vec2.end());
+
+    // inputs are sized like their host ranges so the PERF_N-element output fits
+    bolt::cl::device_vector<int> device_vec1(host_vec1.size());
+    bolt::cl::device_vector<int> device_vec2(host_vec2.size());
+    bolt::cl::device_vector<int> device_vec3(PERF_N);
+
+    // transfer data to the device
+    bolt::cl::copy(host_vec1.begin(), host_vec1.end(), device_vec1.begin());
+    bolt::cl::copy(host_vec2.begin(), host_vec2.end(), device_vec2.begin());
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        bolt::cl::merge(
+            device_vec1.begin(), device_vec1.end(),
+            device_vec2.begin(), device_vec2.end(),
+            device_vec3.begin()
+        );
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_bolt_partial_sum.cpp b/src/boost/libs/compute/perf/perf_bolt_partial_sum.cpp
new file mode 100644
index 000000000..2f9c830e5
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_bolt_partial_sum.cpp
@@ -0,0 +1,53 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+#include <vector>
+
+#include <bolt/cl/scan.h>
+#include <bolt/cl/copy.h>
+#include <bolt/cl/device_vector.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // report which OpenCL device Bolt's default control refers to
+    bolt::cl::control default_control = bolt::cl::control::getDefault();
+    ::cl::Device cl_device = default_control.getDevice();
+    std::cout << "device: " << cl_device.getInfo<CL_DEVICE_NAME>() << std::endl;
+
+    // random host input and a device vector of the same length
+    std::vector<int> values = generate_random_vector<int>(PERF_N);
+    bolt::cl::device_vector<int> d_values(PERF_N);
+
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        // the scan writes over its own input, so every trial re-uploads the
+        // original values first; the upload stays outside the timed region
+        bolt::cl::copy(values.begin(), values.end(), d_values.begin());
+
+        timer.start();
+        bolt::cl::inclusive_scan(
+            d_values.begin(), d_values.end(), d_values.begin()
+        );
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    // download the scanned values, overwriting the host input
+    bolt::cl::copy(d_values.begin(), d_values.end(), values.begin());
+    return 0;
+}
+
diff --git a/src/boost/libs/compute/perf/perf_bolt_reduce_by_key.cpp b/src/boost/libs/compute/perf/perf_bolt_reduce_by_key.cpp
new file mode 100644
index 000000000..e76684985
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_bolt_reduce_by_key.cpp
@@ -0,0 +1,100 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+#include <vector>
+
+#include <bolt/cl/copy.h>
+#include <bolt/cl/device_vector.h>
+#include <bolt/cl/reduce_by_key.h>
+
+#include "perf.hpp"
+
+// Returns rand() scaled into [0, 25]; 25 is reached only when rand() == RAND_MAX.
+int rand_int() {
+    return int(25.0 * (rand() / double(RAND_MAX)));
+}
+
+// Key generator: each call moves on to the next key with probability 1/avgValuesNoPerKey.
+struct unique_key {
+  int current;            // key handed out by the most recent call
+  int avgValuesNoPerKey;  // reciprocal of the per-call advance probability
+
+  unique_key() : current(0), avgValuesNoPerKey(512) {}
+
+  int operator()()
+  {
+      // draw from [0, 1] and step to the next key on a hit
+      const double draw = rand() / double(RAND_MAX);
+      const double advance_probability = 1.0 / avgValuesNoPerKey;
+      if(draw <= advance_probability){
+          ++current;
+      }
+      return current;
+  }
+} UniqueKey;
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    bolt::cl::control ctrl = bolt::cl::control::getDefault();
+    ::cl::Device device = ctrl.getDevice();
+    std::cout << "device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
+
+    // create vector of keys (runs of equal, rising values) and random values
+    std::vector<int> host_keys(PERF_N);
+    std::vector<int> host_values(PERF_N);
+    std::generate(host_keys.begin(), host_keys.end(), UniqueKey);
+    std::generate(host_values.begin(), host_values.end(), rand_int);
+
+    // create device vectors for data
+    bolt::cl::device_vector<int> device_keys(PERF_N);
+    bolt::cl::device_vector<int> device_values(PERF_N);
+
+    // transfer data to the device
+    bolt::cl::copy(host_keys.begin(), host_keys.end(), device_keys.begin());
+    bolt::cl::copy(host_values.begin(), host_values.end(), device_values.begin());
+
+    // create device vectors for the results (one slot per input element)
+    bolt::cl::device_vector<int> device_keys_results(PERF_N);
+    bolt::cl::device_vector<int> device_values_results(PERF_N);
+
+    typedef bolt::cl::device_vector<int>::iterator iterType;
+    bolt::cl::pair<iterType, iterType> result = {
+        device_keys_results.begin(),
+        device_values_results.begin()
+    };
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        result = bolt::cl::reduce_by_key(device_keys.begin(),
+                                         device_keys.end(),
+                                         device_values.begin(),
+                                         device_keys_results.begin(),
+                                         device_values_results.begin());
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    // consecutive keys differ by at most one, so the group count is last - first + 1;
+    // the first key may already be 1, and an empty input has no groups at all
+    size_t expected_size = host_keys.empty() ? 0 : static_cast<size_t>(host_keys.back() - host_keys.front() + 1);
+    size_t result_size = bolt::cl::distance(device_keys_results.begin(), result.first);
+    if(result_size != expected_size){
+        std::cout << "ERROR: " << "wrong number of keys" << std::endl;
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_bolt_saxpy.cpp b/src/boost/libs/compute/perf/perf_bolt_saxpy.cpp
new file mode 100644
index 000000000..201f683ab
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_bolt_saxpy.cpp
@@ -0,0 +1,76 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <algorithm>
+#include <vector>
+
+#include <bolt/cl/copy.h>
+#include <bolt/cl/device_vector.h>
+#include <bolt/cl/transform.h>
+
+#include "perf.hpp"
+
+BOLT_FUNCTOR(saxpy_functor, // binary functor returning a * x + y for a fixed factor a
+    struct saxpy_functor
+    {
+        float _a; // scale factor applied to the first operand
+        saxpy_functor(float a) : _a(a) {};
+        // combines one element from each of the two input ranges
+        float operator() (const float &x, const float &y) const
+        {
+            return _a * x + y;
+        };
+    };
+)
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // report which OpenCL device Bolt's default control refers to
+    bolt::cl::control default_control = bolt::cl::control::getDefault();
+    ::cl::Device cl_device = default_control.getDevice();
+    std::cout << "device: " << cl_device.getInfo<CL_DEVICE_NAME>() << std::endl;
+
+    // host operands, filled from rand() (converted to float on assignment)
+    std::vector<float> host_x(PERF_N);
+    std::vector<float> host_y(PERF_N);
+    std::generate(host_x.begin(), host_x.end(), rand);
+    std::generate(host_y.begin(), host_y.end(), rand);
+
+    // device operands of the same length
+    bolt::cl::device_vector<float> device_x(PERF_N);
+    bolt::cl::device_vector<float> device_y(PERF_N);
+
+    // upload both operands before any timing starts
+    bolt::cl::copy(host_x.begin(), host_x.end(), device_x.begin());
+    bolt::cl::copy(host_y.begin(), host_y.end(), device_y.begin());
+
+    // y <- 2.5 * x + y is written back into y, so y accumulates across trials
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        timer.start();
+        bolt::cl::transform(
+            device_x.begin(), device_x.end(), // first input range
+            device_y.begin(),                 // second input
+            device_y.begin(),                 // output (same storage as second input)
+            saxpy_functor(2.5f)
+        );
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    // download both vectors; host_y receives the accumulated result
+    bolt::cl::copy(device_x.begin(), device_x.end(), host_x.begin());
+    bolt::cl::copy(device_y.begin(), device_y.end(), host_y.begin());
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_bolt_sort.cpp b/src/boost/libs/compute/perf/perf_bolt_sort.cpp
new file mode 100644
index 000000000..2508ec069
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_bolt_sort.cpp
@@ -0,0 +1,50 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <cstdlib>
+#include <vector>
+
+#include <bolt/cl/sort.h>
+#include <bolt/cl/copy.h>
+#include <bolt/cl/device_vector.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // report which OpenCL device Bolt's default control refers to
+    ::cl::Device cl_device = bolt::cl::control::getDefault().getDevice();
+    std::cout << "device: " << cl_device.getInfo<CL_DEVICE_NAME>() << std::endl;
+
+    // random host input and a device vector of the same length
+    std::vector<int> values = generate_random_vector<int>(PERF_N);
+    bolt::cl::device_vector<int> d_values(PERF_N);
+
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        // sorting happens in place, so every trial re-uploads the unsorted
+        // input first; the upload stays outside the timed region
+        bolt::cl::copy(values.begin(), values.end(), d_values.begin());
+
+        timer.start();
+        bolt::cl::sort(d_values.begin(), d_values.end());
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    // download the sorted values, overwriting the host input
+    bolt::cl::copy(d_values.begin(), d_values.end(), values.begin());
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_cart_to_polar.cpp b/src/boost/libs/compute/perf/perf_cart_to_polar.cpp
new file mode 100644
index 000000000..e0d333eb5
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_cart_to_polar.cpp
@@ -0,0 +1,158 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#define _USE_MATH_DEFINES
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/copy_n.hpp>
+#include <boost/compute/algorithm/transform.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+namespace compute = boost::compute;
+
+using compute::float2_;
+
+// Returns rand() scaled into the range [0, 1000] as a float.
+float rand_float() {
+    return 1000.f * (float(rand()) / float(RAND_MAX));
+}
+
+void serial_cartesian_to_polar(const float *input, size_t n, float *output)
+{
+    // each of the n points occupies two consecutive floats; both inputs of a
+    // point are read before its outputs are written
+    for(size_t point = 0; point < n; ++point){
+        const size_t first = point * 2;
+        const float x = input[first];
+        const float y = input[first + 1];
+
+        output[first] = std::sqrt(x*x + y*y);                 // magnitude
+        output[first + 1] = std::atan2(y, x) * 180.f / M_PI;  // angle in degrees
+    }
+}
+
+void serial_polar_to_cartesian(const float *input, size_t n, float *output)
+{
+    // inverse of serial_cartesian_to_polar(): n interleaved (magnitude, angle)
+    // pairs with the angle in degrees become interleaved (x, y) pairs
+    for(size_t i = 0; i < n; i++){
+        float magnitude = input[i*2+0];
+        // the polar form stores degrees; std::cos/std::sin expect radians
+        float angle = input[i*2+1] * float(M_PI) / 180.f;
+
+        output[i*2+0] = magnitude * std::cos(angle);
+        output[i*2+1] = magnitude * std::sin(angle);
+    }
+}
+
+// converts cartesian (x, y) to polar (magnitude, angle in degrees); kernel source
+BOOST_COMPUTE_FUNCTION(float2_, cartesian_to_polar, (float2_ p),
+{
+    float x = p.x;
+    float y = p.y;
+    // M_PI_F is the single-precision pi; M_PI needs double support on the device
+    float magnitude = sqrt(x*x + y*y);
+    float angle = atan2(y, x) * 180.f / M_PI_F;
+
+    return (float2)(magnitude, angle);
+});
+
+// converts polar (magnitude, angle in degrees) to cartesian (x, y); inverse of
+// cartesian_to_polar. NOTE: the body is kernel source, so each statement needs its ';'
+BOOST_COMPUTE_FUNCTION(float2_, polar_to_cartesian, (float2_ p),
+{
+    float magnitude = p.x;
+    float angle = p.y * M_PI_F / 180.f; // degrees -> radians for cos()/sin()
+    float x = magnitude * cos(angle);
+    float y = magnitude * sin(angle);
+
+    return (float2)(x, y);
+});
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // setup context and queue for the default device
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // create PERF_N random points on the host, stored as interleaved x, y floats
+    std::vector<float> host_vector(PERF_N*2);
+    std::generate(host_vector.begin(), host_vector.end(), rand_float);
+
+    // create vector on the device and copy the data
+    compute::vector<float2_> device_vector(PERF_N, context);
+    compute::copy_n(
+        reinterpret_cast<float2_ *>(&host_vector[0]),
+        PERF_N,
+        device_vector.begin(),
+        queue
+    );
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        compute::transform(
+            device_vector.begin(),
+            device_vector.end(),
+            device_vector.begin(),
+            cartesian_to_polar,
+            queue
+        );
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    // repeat the conversion on the host: in place and once per trial, like the device
+    t.clear();
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        serial_cartesian_to_polar(&host_vector[0], PERF_N, &host_vector[0]);
+        t.stop();
+    }
+    std::cout << "host time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    // fetch the device result as plain floats for the comparison below
+    std::vector<float> device_data(PERF_N*2);
+    compute::copy(
+        device_vector.begin(),
+        device_vector.end(),
+        reinterpret_cast<float2_ *>(&device_data[0]),
+        queue
+    );
+    // compare every float (two per point), not only the first PERF_N of them
+    for(size_t i = 0; i < host_vector.size(); i++){
+        float host_value = host_vector[i];
+        float device_value = device_data[i];
+
+        if(std::abs(device_value - host_value) > 1e-3){
+            std::cout << "ERROR: "
+                      << "value at " << i << " "
+                      << "device_value (" << device_value << ") "
+                      << "!= "
+                      << "host_value (" << host_value << ")"
+                      << std::endl;
+            return -1;
+        }
+    }
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_comparison_sort.cpp b/src/boost/libs/compute/perf/perf_comparison_sort.cpp
new file mode 100644
index 000000000..fa63303fc
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_comparison_sort.cpp
@@ -0,0 +1,86 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/sort.hpp>
+#include <boost/compute/algorithm/is_sorted.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+namespace po = boost::program_options;
+namespace compute = boost::compute;
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // context and command queue on the system's default device
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    using compute::int_;
+
+    // unsorted input: PERF_N values taken from rand()
+    std::vector<int_> unsorted(PERF_N);
+    std::generate(unsorted.begin(), unsorted.end(), rand);
+
+    // device storage; it is (re)filled at the start of every trial
+    compute::vector<int_> device_values(PERF_N, context);
+
+    // "less than" predicate for int_ values, passed to sort() and is_sorted()
+    BOOST_COMPUTE_FUNCTION(bool, comp, (int_ a, int_ b),
+    {
+        return a < b;
+    });
+
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        // restore the unsorted input and wait for the upload to complete,
+        // so that only the sort falls inside the timed region
+        compute::copy(
+            unsorted.begin(),
+            unsorted.end(),
+            device_values.begin(),
+            queue
+        );
+        queue.finish();
+
+        timer.start();
+        compute::sort(
+            device_values.begin(), device_values.end(),
+            comp, queue
+        );
+        queue.finish();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    // the last trial must have left the device data ordered under comp
+    const bool sorted = compute::is_sorted(
+        device_values.begin(), device_values.end(),
+        comp, queue
+    );
+    if(sorted){
+        return 0;
+    }
+
+    std::cout << "ERROR: is_sorted() returned false" << std::endl;
+    return -1;
+}
diff --git a/src/boost/libs/compute/perf/perf_copy_if.cpp b/src/boost/libs/compute/perf/perf_copy_if.cpp
new file mode 100644
index 000000000..b2c162f64
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_copy_if.cpp
@@ -0,0 +1,122 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <boost/compute/core.hpp>
+#include <boost/compute/closure.hpp>
+#include <boost/compute/algorithm/copy_if.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/random/default_random_engine.hpp>
+#include <boost/compute/random/uniform_int_distribution.hpp>
+#include <boost/compute/random/uniform_real_distribution.hpp>
+
+#include "perf.hpp"
+
+namespace compute = boost::compute;
+
+void test_copy_if_odd(compute::command_queue &queue)
+{
+    // source and destination vectors of PERF_N ints in the queue's context
+    const compute::context &context = queue.get_context();
+    compute::vector<int> source(PERF_N, context);
+    compute::vector<int> destination(PERF_N, context);
+
+    // fill the source with integers drawn from the range 1 to 10
+    compute::default_random_engine rng(queue);
+    compute::uniform_int_distribution<int> one_to_ten(1, 10);
+    one_to_ten.generate(source.begin(), source.end(), rng, queue);
+
+    BOOST_COMPUTE_FUNCTION(bool, is_odd, (int x),
+    {
+        return x & 1;
+    });
+
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        timer.start();
+        compute::vector<int>::iterator copied_end = compute::copy_if(
+            source.begin(), source.end(), destination.begin(), is_odd, queue
+        );
+        queue.finish();
+        timer.stop();
+        // untimed sanity check: about half of the values should be odd
+        float ratio = float(std::distance(destination.begin(), copied_end)) / PERF_N;
+        if(PERF_N > 1000 && (ratio < 0.45f || ratio > 0.55f)){
+            std::cerr << "error: ratio is " << ratio << std::endl;
+            std::cerr << "error: ratio should be around 45-55%" << std::endl;
+        }
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+}
+
+void test_copy_if_in_sphere(compute::command_queue &queue)
+{
+    using boost::compute::float4_;
+    typedef compute::vector<float4_> point_vector;
+
+    // candidate points and the points that pass the predicate
+    const compute::context &context = queue.get_context();
+    point_vector input_points(PERF_N, context);
+    point_vector output_points(PERF_N, context);
+
+    // treat the point buffer as PERF_N * 4 floats between -radius and +radius
+    float radius = 5.0f;
+    compute::default_random_engine rng(queue);
+    compute::uniform_real_distribution<float> component(-radius, +radius);
+    component.generate(
+        compute::make_buffer_iterator<float>(input_points.get_buffer(), 0),
+        compute::make_buffer_iterator<float>(input_points.get_buffer(), PERF_N * 4),
+        rng,
+        queue
+    );
+
+    // predicate which returns true if the point lies within the sphere
+    BOOST_COMPUTE_CLOSURE(bool, is_in_sphere, (float4_ point), (radius),
+    {
+        // ignore fourth component
+        point.w = 0;
+
+        return length(point) < radius;
+    });
+
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        timer.start();
+        point_vector::iterator kept_end = compute::copy_if(
+            input_points.begin(), input_points.end(),
+            output_points.begin(),
+            is_in_sphere,
+            queue
+        );
+        queue.finish();
+        timer.stop();
+        // untimed sanity check: sphere volume / cube volume = pi / 6, about 52%
+        float ratio = float(std::distance(output_points.begin(), kept_end)) / PERF_N;
+        if(PERF_N > 1000 && (ratio < 0.5f || ratio > 0.6f)){
+            std::cerr << "error: ratio is " << ratio << std::endl;
+            std::cerr << "error: ratio should be around 50-60%" << std::endl;
+        }
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    // context and command queue on the system's default device
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // NOTE(review): test_copy_if_in_sphere() is defined above but never run
+    test_copy_if_odd(queue);
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_copy_to_device.cpp b/src/boost/libs/compute/perf/perf_copy_to_device.cpp
new file mode 100644
index 000000000..0cda3d0f5
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_copy_to_device.cpp
@@ -0,0 +1,55 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <vector>
+#include <cstdlib>
+#include <iostream>
+
+#include <boost/compute.hpp>
+
+int main(int argc, char *argv[])
+{
+    namespace compute = boost::compute;
+
+    // number of ints to copy: first command-line argument, 1000 if absent
+    const size_t size = (argc >= 2) ? boost::lexical_cast<size_t>(argv[1]) : 1000;
+
+    // queue on the default device with profiling switched on
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue::properties
+        properties = compute::command_queue::enable_profiling;
+    compute::command_queue queue(context, device, properties);
+
+    // random host data and a device vector of the same length
+    std::vector<int> host_vector(size);
+    std::generate(host_vector.begin(), host_vector.end(), rand);
+    compute::vector<int> device_vector(host_vector.size(), context);
+
+    // start the asynchronous upload, then block until it has completed
+    compute::future<void> upload = compute::copy_async(
+        host_vector.begin(),
+        host_vector.end(),
+        device_vector.begin(),
+        queue
+    );
+    upload.wait();
+
+    // duration of the copy event in nanoseconds
+    const size_t elapsed_ns =
+        upload.get_event().duration<boost::chrono::nanoseconds>().count();
+    std::cout << "time: " << elapsed_ns / 1e6 << " ms" << std::endl;
+
+    // bytes per nanosecond times 1000 equals (decimal) megabytes per second
+    const float rate = (float(size * sizeof(int)) / elapsed_ns) * 1000.f;
+    std::cout << "rate: " << rate << " MB/s" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_count.cpp b/src/boost/libs/compute/perf/perf_count.cpp
new file mode 100644
index 000000000..a47eb6150
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_count.cpp
@@ -0,0 +1,77 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/count.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int() { // pseudo-random integer in the closed range [0, 25]
+    const double unit_fraction = rand() / double(RAND_MAX); // within [0, 1]
+    return static_cast<int>(unit_fraction * 25.0); // truncates toward zero
+}
+
+int main(int argc, char *argv[])
+{
+    namespace compute = boost::compute;
+
+    // command-line handling is delegated to perf_parse_args()
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // default device with its context and command queue
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // host input: PERF_N values produced by rand_int()
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+
+    // device input: the same values, uploaded before any timing starts
+    compute::vector<int> device_vector(PERF_N, context);
+    compute::copy(
+        host_vector.begin(), host_vector.end(), device_vector.begin(), queue
+    );
+
+    // value whose occurrences are counted on both the device and the host
+    const int needle = 4;
+
+    // time count() once per trial; the result of the last trial is kept
+    size_t count = 0;
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        t.start();
+        count = compute::count(
+            device_vector.begin(), device_vector.end(), needle, queue
+        );
+        queue.finish(); // wait for the queued work before stopping the timer
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "count: " << count << std::endl;
+
+    // cross-check against std::count() on the host copy
+    size_t host_count =
+        std::count(host_vector.begin(), host_vector.end(), needle);
+    if(count == host_count){
+        return 0;
+    }
+
+    std::cout << "ERROR: " << "device_count (" << count << ") " << "!= "
+              << "host_count (" << host_count << ")" << std::endl;
+    return -1;
+}
diff --git a/src/boost/libs/compute/perf/perf_discrete_distribution.cpp b/src/boost/libs/compute/perf/perf_discrete_distribution.cpp
new file mode 100644
index 000000000..f6679eb1a
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_discrete_distribution.cpp
@@ -0,0 +1,48 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/random/default_random_engine.hpp>
+#include <boost/compute/random/discrete_distribution.hpp>
+
+#include "perf.hpp"
+
+namespace compute = boost::compute;
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // context and command queue for the default device
+    compute::device dev = compute::system::default_device();
+    compute::context ctx(dev);
+    compute::command_queue queue(ctx, dev);
+
+    // destination for PERF_N samples; both outcomes get the same weight
+    compute::vector<compute::uint_> samples(PERF_N, ctx);
+    int weights[] = {1, 1};
+    compute::default_random_engine rng(queue);
+    compute::discrete_distribution<compute::uint_> dist(weights, weights + 2);
+
+    // single timed run: generate the samples, then finish the queue
+    perf_timer t;
+    t.start();
+    dist.generate(samples.begin(), samples.end(), rng, queue);
+    queue.finish();
+    t.stop();
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_erase_remove.cpp b/src/boost/libs/compute/perf/perf_erase_remove.cpp
new file mode 100644
index 000000000..f567247fe
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_erase_remove.cpp
@@ -0,0 +1,61 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/remove.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int() { // pseudo-random integer in the closed range [0, 10]
+    const double unit_fraction = rand() / double(RAND_MAX); // within [0, 1]
+    return static_cast<int>(unit_fraction * 10.0); // truncates toward zero
+}
+
+int main(int argc, char *argv[])
+{
+    namespace compute = boost::compute;
+    typedef compute::vector<int> device_ints;
+
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // default device with its context and command queue
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // PERF_N host values produced by rand_int()
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        // every trial uploads its own copy before the timer starts
+        device_ints work(host_vector.begin(), host_vector.end(), queue);
+
+        // timed section: erase-remove of every element equal to 4
+        t.start();
+        device_ints::iterator kept_end = compute::remove(
+            work.begin(), work.end(), 4, queue
+        );
+        work.erase(kept_end, work.end(), queue);
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_exclusive_scan.cpp b/src/boost/libs/compute/perf/perf_exclusive_scan.cpp
new file mode 100644
index 000000000..e6c651356
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_exclusive_scan.cpp
@@ -0,0 +1,97 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Benoit
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/exclusive_scan.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int() { // pseudo-random integer in the closed range [0, 25]
+    const double unit_fraction = rand() / double(RAND_MAX); // within [0, 1]
+    return static_cast<int>(unit_fraction * 25.0); // truncates toward zero
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // setup context and queue for the default device
+    boost::compute::device device = boost::compute::system::default_device();
+    boost::compute::context context(device);
+    boost::compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // create vector of random numbers on the host
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+
+    // create input and result vectors on the device and copy the data
+    boost::compute::vector<int> device_vector(PERF_N, context);
+    boost::compute::vector<int> device_res(PERF_N, context);
+    boost::compute::copy(
+        host_vector.begin(), host_vector.end(), device_vector.begin(), queue
+    );
+
+    // scan vector
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        // re-upload the input before each timed run (kept out of the timing)
+        boost::compute::copy(
+            host_vector.begin(), host_vector.end(),
+            device_vector.begin(), queue
+        );
+
+        t.start();
+        boost::compute::exclusive_scan(
+            device_vector.begin(), device_vector.end(),
+            device_res.begin(), queue
+        );
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    // nothing to verify for an empty input: there is no last element to read
+    if(host_vector.empty()){
+        return 0;
+    }
+
+    // verify the scan by comparing its last element with a host reference
+    std::partial_sum(
+        host_vector.begin(), host_vector.end(), host_vector.begin()
+    );
+
+    int device_sum = device_res.back();
+
+    // an exclusive scan is the inclusive scan shifted one position to the
+    // right behind a leading 0, so its last element is the second to last
+    // inclusive sum; with a single element only that leading 0 remains
+    int host_sum = 0;
+    if(host_vector.size() >= 2){
+        host_sum = host_vector[host_vector.size() - 2];
+    }
+
+    if(device_sum != host_sum){
+        std::cout << "ERROR: "
+                  << "device_sum (" << device_sum << ") " << "!= "
+                  << "host_sum (" << host_sum << ")" << std::endl;
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_fill.cpp b/src/boost/libs/compute/perf/perf_fill.cpp
new file mode 100644
index 000000000..9f35b6b0f
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_fill.cpp
@@ -0,0 +1,43 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/fill.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // default device with its context and command queue
+    boost::compute::device dev = boost::compute::system::default_device();
+    boost::compute::context ctx(dev);
+    boost::compute::command_queue queue(ctx, dev);
+    std::cout << "device: " << dev.name() << std::endl;
+
+    // PERF_N device-side ints, all starting out as 0
+    boost::compute::vector<int> vec(PERF_N, 0, queue);
+    // each timed trial overwrites the whole vector with its trial number
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        const int fill_value = int(trial);
+        t.start();
+        boost::compute::fill(vec.begin(), vec.end(), fill_value, queue);
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_find.cpp b/src/boost/libs/compute/perf/perf_find.cpp
new file mode 100644
index 000000000..3cfb75ad6
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_find.cpp
@@ -0,0 +1,88 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/find.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+// Inclusive upper bound of the values produced by rand_int().
+int rand_int_max = 25;
+
+int rand_int() { // pseudo-random integer in [0, rand_int_max]
+    const double unit_fraction = rand() / double(RAND_MAX); // within [0, 1]
+    return static_cast<int>(unit_fraction * rand_int_max);
+}
+
+int main(int argc, char *argv[])
+{
+    namespace compute = boost::compute;
+
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // default device with its context and command queue
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // PERF_N host values produced by rand_int()
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+
+    // upload the same values to the device before timing starts
+    compute::vector<int> device_vector(PERF_N, context);
+    compute::copy(
+        host_vector.begin(), host_vector.end(), device_vector.begin(), queue
+    );
+
+    // rand_int() never exceeds rand_int_max, so this value is absent and the
+    // search cannot succeed (worst-case scenario)
+    int wanted = rand_int_max + 1;
+
+    // iterator returned by the most recent device-side search
+    compute::vector<int>::iterator device_result_it;
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        t.start();
+        device_result_it = compute::find(
+            device_vector.begin(), device_vector.end(), wanted, queue
+        );
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    // compare the index found on the device with std::find() on the host
+    std::vector<int>::iterator host_result_it =
+        std::find(host_vector.begin(), host_vector.end(), wanted);
+    size_t host_result_index =
+        std::distance(host_vector.begin(), host_result_it);
+    size_t device_result_index = device_result_it.get_index();
+
+    // equal indices mean the device search agrees with the host
+    if(device_result_index == host_result_index){
+        return 0;
+    }
+
+    std::cout << "ERROR: "
+              << "device_result_index (" << device_result_index << ") "
+              << "!= "
+              << "host_result_index (" << host_result_index << ")"
+              << std::endl;
+    return -1;
+}
diff --git a/src/boost/libs/compute/perf/perf_find_end.cpp b/src/boost/libs/compute/perf/perf_find_end.cpp
new file mode 100644
index 000000000..e20a7e8d8
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_find_end.cpp
@@ -0,0 +1,65 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/find_end.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int() { // pseudo-random integer in the closed range [0, 25]
+    const double unit_fraction = rand() / double(RAND_MAX); // within [0, 1]
+    return static_cast<int>(unit_fraction * 25.0); // truncates toward zero
+}
+
+int main(int argc, char *argv[])
+{
+    namespace compute = boost::compute;
+
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // default device with its context and command queue
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // data to search: PERF_N host values from rand_int(), copied to the device
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+    compute::vector<int> device_vector(PERF_N, context);
+    compute::copy(
+        host_vector.begin(), host_vector.end(), device_vector.begin(), queue
+    );
+
+    // six-element pattern to look for, also stored on the device
+    int pattern[] = {2, 6, 6, 7, 8, 4};
+    compute::vector<int> pattern_vector(pattern, pattern + 6, queue);
+
+    // time find_end() once per trial; the returned iterator is not used
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        t.start();
+        compute::find_end(
+            device_vector.begin(), device_vector.end(),
+            pattern_vector.begin(), pattern_vector.end(), queue
+        );
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_host_sort.cpp b/src/boost/libs/compute/perf/perf_host_sort.cpp
new file mode 100644
index 000000000..d34b1c525
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_host_sort.cpp
@@ -0,0 +1,65 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/timer/timer.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/sort.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    namespace compute = boost::compute;
+
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // default device with its context and command queue
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // PERF_N rand() values; both sorts work on their own copy of them
+    std::vector<int> random_vector(PERF_N);
+    std::generate(random_vector.begin(), random_vector.end(), rand);
+
+    // input copy for the device sort, made before the timer exists
+    std::vector<int> gpu_vector(random_vector);
+
+    // compute::sort() on host iterators; cpu_timer runs from construction
+    boost::timer::cpu_timer t;
+    compute::sort(gpu_vector.begin(), gpu_vector.end(), queue);
+    queue.finish();
+    std::cout << "time: " << t.elapsed().wall / 1e6 << " ms" << std::endl;
+
+    // std::sort() on the host, timed with the same (restarted) timer
+    std::vector<int> host_vector(random_vector);
+    t.start();
+    std::sort(host_vector.begin(), host_vector.end());
+    std::cout << "host time: " << t.elapsed().wall / 1e6 << " ms" << std::endl;
+
+    // both sorts must produce the same sequence
+    const bool same_order =
+        std::equal(gpu_vector.begin(), gpu_vector.end(), host_vector.begin());
+    if(same_order){
+        return 0;
+    }
+
+    std::cerr << "ERROR: sorted vectors not the same" << std::endl;
+    return -1;
+}
diff --git a/src/boost/libs/compute/perf/perf_includes.cpp b/src/boost/libs/compute/perf/perf_includes.cpp
new file mode 100644
index 000000000..0418a5d78
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_includes.cpp
@@ -0,0 +1,68 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/includes.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int() { // pseudo-random integer in the closed range [0, 25]
+    const double unit_fraction = rand() / double(RAND_MAX); // within [0, 1]
+    return static_cast<int>(unit_fraction * 25.0); // truncates toward zero
+}
+
+int main(int argc, char *argv[])
+{
+    namespace compute = boost::compute;
+
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // default device with its context and command queue
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // PERF_N host values from rand_int(), sorted with std::sort()
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+    std::sort(host_vector.begin(), host_vector.end());
+
+    // two device vectors, each holding a copy of the sorted host data
+    compute::vector<int> device_vector(PERF_N, context);
+    compute::copy(
+        host_vector.begin(), host_vector.end(), device_vector.begin(), queue
+    );
+    compute::vector<int> device_vector2(PERF_N, context);
+    compute::copy(
+        host_vector.begin(), host_vector.end(), device_vector2.begin(), queue
+    );
+
+    // time includes() once per trial; its return value is not used
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        t.start();
+        compute::includes(
+            device_vector.begin(), device_vector.end(),
+            device_vector2.begin(), device_vector2.end(), queue
+        );
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_inner_product.cpp b/src/boost/libs/compute/perf/perf_inner_product.cpp
new file mode 100644
index 000000000..112a4bc62
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_inner_product.cpp
@@ -0,0 +1,74 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/inner_product.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int() { // pseudo-random integer in the closed range [0, 25]
+    const double unit_fraction = rand() / double(RAND_MAX); // within [0, 1]
+    return static_cast<int>(unit_fraction * 25.0); // truncates toward zero
+}
+
+int main(int argc, char *argv[])
+{
+    namespace compute = boost::compute;
+
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // default device with its context and command queue
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // two host operands of PERF_N rand_int() values (h1 is filled first)
+    std::vector<int> h1(PERF_N);
+    std::vector<int> h2(PERF_N);
+    std::generate(h1.begin(), h1.end(), rand_int);
+    std::generate(h2.begin(), h2.end(), rand_int);
+
+    // device-side copies of both operands
+    compute::vector<int> d1(PERF_N, context);
+    compute::vector<int> d2(PERF_N, context);
+    compute::copy(h1.begin(), h1.end(), d1.begin(), queue);
+    compute::copy(h2.begin(), h2.end(), d2.begin(), queue);
+
+    // time inner_product() once per trial, keeping the last result
+    int product = 0;
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        t.start();
+        product = compute::inner_product(
+            d1.begin(), d1.end(), d2.begin(), int(0), queue
+        );
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    // cross-check against std::inner_product() on the host operands
+    int host_product =
+        std::inner_product(h1.begin(), h1.end(), h2.begin(), int(0));
+    if(product == host_product){
+        return 0;
+    }
+    std::cout << "ERROR: " << "device_product (" << product << ") " << "!= "
+              << "host_product (" << host_product << ")" << std::endl;
+    return -1;
+}
diff --git a/src/boost/libs/compute/perf/perf_is_permutation.cpp b/src/boost/libs/compute/perf/perf_is_permutation.cpp
new file mode 100644
index 000000000..39c72bf1e
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_is_permutation.cpp
@@ -0,0 +1,66 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/is_permutation.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int() { // pseudo-random integer in the closed range [0, 25]
+    const double unit_fraction = rand() / double(RAND_MAX); // within [0, 1]
+    return static_cast<int>(unit_fraction * 25.0); // truncates toward zero
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // default device with its context and command queue
+    boost::compute::device dev = boost::compute::system::default_device();
+    boost::compute::context ctx(dev);
+    boost::compute::command_queue queue(ctx, dev);
+    std::cout << "device: " << dev.name() << std::endl;
+
+    // PERF_N host values produced by rand_int()
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+
+    // two device vectors: the host data as-is and the same data reversed
+    boost::compute::vector<int> original_order(PERF_N, ctx);
+    boost::compute::copy(
+        host_vector.begin(), host_vector.end(), original_order.begin(), queue
+    );
+    boost::compute::vector<int> reversed_order(PERF_N, ctx);
+    boost::compute::copy(
+        host_vector.rbegin(), host_vector.rend(), reversed_order.begin(), queue
+    );
+
+    // time is_permutation() once per trial; its return value is not used
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        t.start();
+        boost::compute::is_permutation(
+            original_order.begin(), original_order.end(),
+            reversed_order.begin(), reversed_order.end(), queue
+        );
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_is_sorted.cpp b/src/boost/libs/compute/perf/perf_is_sorted.cpp
new file mode 100644
index 000000000..f16a31726
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_is_sorted.cpp
@@ -0,0 +1,63 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/is_sorted.hpp>
+#include <boost/compute/algorithm/reverse.hpp>
+#include <boost/compute/algorithm/sort.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // setup context and queue for the default device
+    boost::compute::device device = boost::compute::system::default_device();
+    boost::compute::context context(device);
+    boost::compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // create random host data, upload it, then sort and reverse it on the device
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand);
+    boost::compute::vector<int> device_vector(
+        host_vector.begin(), host_vector.end(), queue
+    );
+    boost::compute::sort(device_vector.begin(), device_vector.end(), queue);
+    boost::compute::reverse(device_vector.begin(), device_vector.end(), queue);
+
+    // descending data only counts as sorted when all of its values are equal
+    const bool expect_sorted = host_vector.empty() ||
+        *std::min_element(host_vector.begin(), host_vector.end()) ==
+        *std::max_element(host_vector.begin(), host_vector.end());
+    bool failed = false; // remembered so that the exit status reports it
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        bool sorted = boost::compute::is_sorted(
+            device_vector.begin(), device_vector.end(), queue
+        );
+        queue.finish();
+        t.stop();
+        if(sorted && !expect_sorted){
+            std::cerr << "ERROR: is_sorted() returned true" << std::endl;
+            failed = true;
+        }
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    return failed ? -1 : 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_max_element.cpp b/src/boost/libs/compute/perf/perf_max_element.cpp
new file mode 100644
index 000000000..0e47c67ef
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_max_element.cpp
@@ -0,0 +1,93 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Rastko Anicic <anicic.rastko@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/max_element.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int() { // pseudo-random integer in the range [0, 9999999]
+    const int modulus = 10000000;
+    return rand() % modulus; // already an int, so no cast is required
+}
+
+int main(int argc, char *argv[])
+{
+    namespace compute = boost::compute;
+    typedef compute::vector<int>::iterator device_iterator;
+    typedef std::vector<int>::iterator host_iterator;
+
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // default device with its context and command queue
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // PERF_N host values produced by rand_int()
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+
+    // upload the same values to the device before timing starts
+    compute::vector<int> device_vector(PERF_N, context);
+    compute::copy(
+        host_vector.begin(), host_vector.end(), device_vector.begin(), queue
+    );
+
+    // initialised to begin(); overwritten by every trial
+    device_iterator device_max_iter = device_vector.begin();
+
+    // time max_element() once per trial, keeping the last iterator
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        t.start();
+        device_max_iter = compute::max_element(
+            device_vector.begin(), device_vector.end(), queue
+        );
+        queue.finish();
+        t.stop();
+    }
+
+    // fetch the maximum itself from the device through the iterator
+    int device_max = device_max_iter.read(queue);
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "max: " << device_max << std::endl;
+
+    // first check: the value must match std::max_element() on the host
+    host_iterator host_max_iter =
+        std::max_element(host_vector.begin(), host_vector.end());
+    int host_max = *host_max_iter;
+    if(device_max != host_max){
+        std::cout << "ERROR: "
+                  << "device_max (" << device_max << ") " << "!= "
+                  << "host_max (" << host_max << ")" << std::endl;
+        return -1;
+    }
+
+    // second check: both maxima must sit at the same position
+    size_t host_max_idx = std::distance(host_vector.begin(), host_max_iter);
+    size_t device_max_idx =
+        std::distance(device_vector.begin(), device_max_iter);
+    if(device_max_idx != host_max_idx){
+        std::cout << "ERROR: "
+                  << "device_max index (" << device_max_idx << ") " << "!= "
+                  << "host_max index (" << host_max_idx << ")" << std::endl;
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_merge.cpp b/src/boost/libs/compute/perf/perf_merge.cpp
new file mode 100644
index 000000000..58ea836f1
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_merge.cpp
@@ -0,0 +1,69 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <cmath>
+#include <vector>
+#include <algorithm>
+#include <iostream>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/merge.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    namespace compute = boost::compute;
+    // perf_parse_args() comes from perf.hpp; presumably it sets PERF_N/PERF_TRIALS
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // context and queue on the default device
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // two sorted host inputs holding PERF_N elements between them
+    std::vector<int> lhs = generate_random_vector<int>(std::floor(PERF_N / 2.0));
+    std::vector<int> rhs = generate_random_vector<int>(std::ceil(PERF_N / 2.0));
+    std::sort(lhs.begin(), lhs.end());
+    std::sort(rhs.begin(), rhs.end());
+
+    // device copies of both inputs plus an output buffer for the merged range
+    compute::vector<int> gpu_lhs(lhs.begin(), lhs.end(), queue);
+    compute::vector<int> gpu_rhs(rhs.begin(), rhs.end(), queue);
+    compute::vector<int> gpu_merged(PERF_N, context);
+
+    // timed region: the merge call plus queue.finish()
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        timer.start();
+        compute::merge(gpu_lhs.begin(), gpu_lhs.end(),
+                       gpu_rhs.begin(), gpu_rhs.end(),
+                       gpu_merged.begin(), queue);
+        queue.finish();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    // read the device result back and compare it with std::merge on the host
+    std::vector<int> device_result(PERF_N);
+    compute::copy(gpu_merged.begin(), gpu_merged.end(), device_result.begin(), queue);
+    queue.finish();
+    std::vector<int> expected(PERF_N);
+    std::merge(lhs.begin(), lhs.end(), rhs.begin(), rhs.end(), expected.begin());
+    if(!std::equal(device_result.begin(), device_result.end(), expected.begin())){
+        std::cerr << "ERROR: merged ranges different" << std::endl;
+        return -1;
+    }
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_next_permutation.cpp b/src/boost/libs/compute/perf/perf_next_permutation.cpp
new file mode 100644
index 000000000..62f0bbb0d
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_next_permutation.cpp
@@ -0,0 +1,65 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/next_permutation.hpp>
+#include <boost/compute/algorithm/prev_permutation.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+// Pseudo-random int in [0, 25]: rand() scaled by 25/RAND_MAX, then truncated.
+int rand_int() {
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    // Benchmark: times boost::compute::next_permutation() on PERF_N ints over
+    // PERF_TRIALS trials and prints t.min_time().
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // setup context and queue for the default device
+    boost::compute::device device = boost::compute::system::default_device();
+    boost::compute::context context(device);
+    boost::compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // create vector of random numbers on the host, sorted in descending order
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+    std::sort(host_vector.begin(), host_vector.end(), std::greater<int>());
+
+    // create vector on the device and copy the data
+    boost::compute::vector<int> device_vector(PERF_N, context);
+    boost::compute::copy(host_vector.begin(), host_vector.end(),
+                         device_vector.begin(), queue);
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        boost::compute::next_permutation(device_vector.begin(), device_vector.end(), queue);
+        queue.finish();
+        t.stop();
+        // untimed: undo the step so every trial starts from the same input, and
+        // drain the queue so this restore cannot spill into the next sample
+        boost::compute::prev_permutation(device_vector.begin(), device_vector.end(), queue);
+        queue.finish();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_nth_element.cpp b/src/boost/libs/compute/perf/perf_nth_element.cpp
new file mode 100644
index 000000000..07627cb42
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_nth_element.cpp
@@ -0,0 +1,60 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/nth_element.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+// Pseudo-random int in [0, 25]: rand() scaled by 25/RAND_MAX, then truncated.
+int rand_int() {
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // setup context and queue for the default device
+    boost::compute::device device = boost::compute::system::default_device();
+    boost::compute::context context(device);
+    boost::compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // create vector of random numbers on the host
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+
+    // create vector on the device; it is (re)filled at the start of every trial
+    boost::compute::vector<int> device_vector(PERF_N, context);
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        // untimed: nth_element() rearranges the range in place, so restore the
+        // original input; otherwise later trials run on already-arranged data
+        boost::compute::copy(host_vector.begin(), host_vector.end(),
+                             device_vector.begin(), queue);
+        t.start();
+        boost::compute::nth_element(device_vector.begin(), device_vector.begin()+(PERF_N/2),
+                                    device_vector.end(), queue);
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_partial_sum.cpp b/src/boost/libs/compute/perf/perf_partial_sum.cpp
new file mode 100644
index 000000000..1c02b4c47
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_partial_sum.cpp
@@ -0,0 +1,97 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/partial_sum.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+// Pseudo-random int in [0, 25]: rand() scaled by 25/RAND_MAX, then truncated.
+int rand_int() {
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    namespace compute = boost::compute;
+    typedef compute::int_ value_type;
+
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // context and command queue bound to the default device
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // host input: PERF_N values produced by rand_int()
+    std::vector<value_type> input(PERF_N);
+    std::generate(input.begin(), input.end(), rand_int);
+
+    // device buffers: one holding the input, one receiving the running sums
+    compute::vector<value_type> device_input(PERF_N, context);
+    compute::vector<value_type> device_sums(PERF_N, context);
+    compute::copy(
+        input.begin(), input.end(),
+        device_input.begin(),
+        queue
+    );
+
+    // benchmark loop; the upload at the top of each trial happens before
+    // timer.start(), so it is not part of the measured interval
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        compute::copy(
+            input.begin(), input.end(),
+            device_input.begin(),
+            queue
+        );
+
+        timer.start();
+        compute::partial_sum(
+            device_input.begin(), device_input.end(),
+            device_sums.begin(),
+            queue
+        );
+        queue.finish();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    // host reference: turn the input into its own prefix sums, in place
+    std::partial_sum(
+        input.begin(),
+        input.end(),
+        input.begin()
+    );
+
+    // only the final element (the grand total) is compared with the device
+    const int device_sum = device_sums.back();
+    const int host_sum = input.back();
+    if(device_sum == host_sum){
+        return 0;
+    }
+
+    // mismatch: report both totals and fail
+    std::cout << "ERROR: "
+              << "device_sum (" << device_sum << ") "
+              << "!= "
+              << "host_sum (" << host_sum << ")"
+              << std::endl;
+    return -1;
+}
diff --git a/src/boost/libs/compute/perf/perf_partition.cpp b/src/boost/libs/compute/perf/perf_partition.cpp
new file mode 100644
index 000000000..b765d7360
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_partition.cpp
@@ -0,0 +1,66 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/partition.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+// Pseudo-random int in [0, 25]: rand() scaled by 25/RAND_MAX, then truncated.
+int rand_int() {
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    namespace compute = boost::compute;
+    using compute::_1;
+
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // default device, plus a context and command queue on it
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // host-side input produced by rand_int()
+    std::vector<int> input(PERF_N);
+    std::generate(input.begin(), input.end(), rand_int);
+
+    // device-side working buffer, initially a copy of the input
+    compute::vector<int> working(PERF_N, context);
+    compute::copy(input.begin(), input.end(), working.begin(), queue);
+
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        // untimed: upload the unpartitioned input again, because partition()
+        // rearranges the working buffer in place
+        compute::copy(input.begin(), input.end(), working.begin(), queue);
+
+        // timed: partition on the predicate (element < 10)
+        timer.start();
+        compute::partition(
+            working.begin(), working.end(), _1 < 10, queue
+        );
+        queue.finish();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_partition_point.cpp b/src/boost/libs/compute/perf/perf_partition_point.cpp
new file mode 100644
index 000000000..91261aa1d
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_partition_point.cpp
@@ -0,0 +1,68 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/lambda.hpp>
+#include <boost/compute/algorithm/partition.hpp>
+#include <boost/compute/algorithm/partition_point.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+// Pseudo-random int in [0, 25]: rand() scaled by 25/RAND_MAX, then truncated.
+int rand_int() {
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    namespace compute = boost::compute;
+    using compute::_1;
+
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // default device, plus a context and command queue on it
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // host-side input produced by rand_int()
+    std::vector<int> input(PERF_N);
+    std::generate(input.begin(), input.end(), rand_int);
+
+    // device-side buffer holding a copy of the input
+    compute::vector<int> working(PERF_N, context);
+    compute::copy(input.begin(), input.end(), working.begin(), queue);
+
+    // untimed preparation: partition the buffer with the same predicate
+    // (element < 20) that partition_point() is queried with below
+    compute::partition(working.begin(), working.end(), _1 < 20, queue);
+
+    // timed region: the partition_point() query plus queue.finish()
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        timer.start();
+        compute::partition_point(
+            working.begin(), working.end(), _1 < 20, queue
+        );
+        queue.finish();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_prev_permutation.cpp b/src/boost/libs/compute/perf/perf_prev_permutation.cpp
new file mode 100644
index 000000000..bb7d76d26
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_prev_permutation.cpp
@@ -0,0 +1,65 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/next_permutation.hpp>
+#include <boost/compute/algorithm/prev_permutation.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+// Pseudo-random int in [0, 25]: rand() scaled by 25/RAND_MAX, then truncated.
+int rand_int() {
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    // Benchmark: times boost::compute::prev_permutation() on PERF_N ints over
+    // PERF_TRIALS trials and prints t.min_time().
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // setup context and queue for the default device
+    boost::compute::device device = boost::compute::system::default_device();
+    boost::compute::context context(device);
+    boost::compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // create vector of random numbers on the host, sorted in ascending order
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+    std::sort(host_vector.begin(), host_vector.end());
+
+    // create vector on the device and copy the data
+    boost::compute::vector<int> device_vector(PERF_N, context);
+    boost::compute::copy(host_vector.begin(), host_vector.end(),
+                         device_vector.begin(), queue);
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        boost::compute::prev_permutation(device_vector.begin(), device_vector.end(), queue);
+        queue.finish();
+        t.stop();
+        // untimed: undo the step so every trial starts from the same input, and
+        // drain the queue so this restore cannot spill into the next sample
+        boost::compute::next_permutation(device_vector.begin(), device_vector.end(), queue);
+        queue.finish();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_random_number_engine.cpp b/src/boost/libs/compute/perf/perf_random_number_engine.cpp
new file mode 100644
index 000000000..db25d437e
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_random_number_engine.cpp
@@ -0,0 +1,101 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2015 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/core.hpp>
+#include <boost/compute/random.hpp>
+
+#include "perf.hpp"
+
+namespace compute = boost::compute;
+namespace po = boost::program_options;
+
+template<class Engine>
+void perf_random_number_engine(const size_t size,
+                               const size_t trials,
+                               compute::command_queue& queue)
+{
+    typedef typename Engine::result_type value_type;
+
+    // the engine is built from the queue before the size line is printed
+    Engine engine(queue);
+    std::cout << "size = " << size << std::endl;
+
+    // device buffer of `size` elements that every trial overwrites
+    compute::vector<value_type> values(size, queue.get_context());
+
+    // timed region: generate() over the whole buffer plus queue.finish()
+    perf_timer timer;
+    for(size_t trial = 0; trial < trials; ++trial){
+        timer.start();
+        engine.generate(values.begin(), values.end(), queue);
+        queue.finish();
+        timer.stop();
+    }
+
+    // report timer.min_time() and the rate perf_rate() derives from it
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "rate: " << perf_rate<value_type>(size, timer.min_time()) << " MB/s" << std::endl;
+}
+
+int main(int argc, char *argv[])
+{
+    // declare the options; "size", "trials" and "engine" all carry defaults
+    po::options_description options("options");
+    options.add_options()
+        ("help", "show usage instructions")
+        ("size", po::value<size_t>()->default_value(8192), "number of values")
+        ("trials", po::value<size_t>()->default_value(3), "number of trials")
+        ("engine", po::value<std::string>()->default_value("default_random_engine"), "random number engine")
+    ;
+    // parse; invalid input throws po::error - report it instead of terminating
+    po::variables_map vm;
+    try {
+        po::store(po::parse_command_line(argc, argv, options), vm);
+        po::notify(vm);
+    } catch(const po::error& e){
+        std::cerr << "error: " << e.what() << std::endl;
+        return -1;
+    }
+    if(vm.count("help")) {
+        std::cout << options << std::endl;
+        return 0;
+    }
+
+    // setup context and queue for the default device
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+
+    // get command line options
+    const size_t size = vm["size"].as<size_t>();
+    const size_t trials = vm["trials"].as<size_t>();
+    const std::string& engine = vm["engine"].as<std::string>();
+
+    // run benchmark for the engine selected by name
+    if(engine == "default_random_engine"){
+        perf_random_number_engine<compute::default_random_engine>(size, trials, queue);
+    } else if(engine == "mersenne_twister_engine"){
+        perf_random_number_engine<compute::mt19937>(size, trials, queue);
+    } else if(engine == "linear_congruential_engine"){
+        perf_random_number_engine<compute::linear_congruential_engine<> >(size, trials, queue);
+    } else if(engine == "threefry_engine"){
+        perf_random_number_engine<compute::threefry_engine<> >(size, trials, queue);
+    } else {
+        std::cerr << "error: unknown random number engine '" << engine << "'" << std::endl;
+        return -1;
+    }
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_reduce_by_key.cpp b/src/boost/libs/compute/perf/perf_reduce_by_key.cpp
new file mode 100644
index 000000000..c88d450e9
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_reduce_by_key.cpp
@@ -0,0 +1,114 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/fill.hpp>
+#include <boost/compute/algorithm/reduce_by_key.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+// Pseudo-random int in [0, 25]: rand() scaled by 25/RAND_MAX, then truncated.
+int rand_int() {
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+// Generator yielding a non-decreasing key sequence: each call bumps the key
+// with probability of about 1/avgValuesNoPerKey and returns the current key,
+// so consecutive results differ by at most one.
+struct unique_key {
+  int current;
+  int avgValuesNoPerKey;
+
+  unique_key() : current(0), avgValuesNoPerKey(512) {}
+
+  int operator()()
+  {
+      const double threshold = 1.0 / static_cast<double>(avgValuesNoPerKey);
+      const double draw = rand() / double(RAND_MAX);
+      if(draw <= threshold)
+          ++current;
+      return current;
+  }
+} UniqueKey;
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // setup context and queue for the default device
+    boost::compute::device device = boost::compute::system::default_device();
+    boost::compute::context context(device);
+    boost::compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // create keys (non-decreasing, steps of at most one) and random values
+    std::vector<int> host_keys(PERF_N);
+    std::vector<int> host_values(PERF_N);
+    std::generate(host_keys.begin(), host_keys.end(), UniqueKey);
+    std::generate(host_values.begin(), host_values.end(), rand_int);
+
+    // create vectors for keys and values on the device and copy the data
+    boost::compute::vector<int> device_keys(PERF_N, context);
+    boost::compute::vector<int> device_values(PERF_N, context);
+    boost::compute::copy(
+        host_keys.begin(), host_keys.end(), device_keys.begin(), queue
+    );
+    boost::compute::copy(
+        host_values.begin(), host_values.end(), device_values.begin(), queue
+    );
+
+    // vectors for the results
+    boost::compute::vector<int> device_keys_results(PERF_N, context);
+    boost::compute::vector<int> device_values_results(PERF_N, context);
+
+    typedef boost::compute::vector<int>::iterator iterType;
+    std::pair<iterType, iterType> result(
+        device_keys_results.begin(),
+        device_values_results.begin()
+    );
+
+    // reduce by key; queue.finish() keeps still-queued device work in the sample
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        result = boost::compute::reduce_by_key(device_keys.begin(),
+                                               device_keys.end(),
+                                               device_values.begin(),
+                                               device_keys_results.begin(),
+                                               device_values_results.begin(),
+                                               queue);
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    // verify the number of reduced keys: every value between the first and last
+    // key occurs, the first key may be 1 rather than 0, and empty input has none
+    size_t result_size = std::distance(device_keys_results.begin(), result.first);
+    size_t expected_size = host_keys.empty()
+        ? 0 : static_cast<size_t>(host_keys.back() - host_keys.front() + 1);
+    if(result_size != expected_size){
+        std::cout << "ERROR: "
+                  << "wrong number of keys: got " << result_size
+                  << ", expected " << expected_size
+                  << std::endl;
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_reverse.cpp b/src/boost/libs/compute/perf/perf_reverse.cpp
new file mode 100644
index 000000000..64369a78e
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_reverse.cpp
@@ -0,0 +1,60 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/reverse.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+// Pseudo-random int in [0, 25]: rand() scaled by 25/RAND_MAX, then truncated.
+int rand_int() {
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    namespace compute = boost::compute;
+
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // default device, plus a context and command queue on it
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // host-side input produced by rand_int()
+    std::vector<int> input(PERF_N);
+    std::generate(input.begin(), input.end(), rand_int);
+
+    // device-side buffer holding a copy of the input
+    compute::vector<int> working(PERF_N, context);
+    compute::copy(input.begin(), input.end(), working.begin(), queue);
+
+    // the buffer is reversed in place and never re-uploaded, so each trial
+    // operates on whatever the previous trial left behind
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        timer.start();
+        compute::reverse(working.begin(), working.end(), queue);
+        queue.finish();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_reverse_copy.cpp b/src/boost/libs/compute/perf/perf_reverse_copy.cpp
new file mode 100644
index 000000000..5ce01c9b2
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_reverse_copy.cpp
@@ -0,0 +1,65 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/reverse_copy.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+// Pseudo-random int in [0, 25]: rand() scaled by 25/RAND_MAX, then truncated.
+int rand_int() {
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    namespace compute = boost::compute;
+
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // default device, plus a context and command queue on it
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // host-side input produced by rand_int()
+    std::vector<int> input(PERF_N);
+    std::generate(input.begin(), input.end(), rand_int);
+
+    // device-side source buffer holding a copy of the input
+    compute::vector<int> source(PERF_N, context);
+    compute::copy(input.begin(), input.end(), source.begin(), queue);
+
+    // device-side destination buffer of the same length
+    compute::vector<int> reversed(PERF_N, context);
+
+    // timed region: reverse_copy() from source into reversed, plus finish()
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        timer.start();
+        compute::reverse_copy(
+            source.begin(), source.end(),
+            reversed.begin(), queue
+        );
+        queue.finish();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_rotate.cpp b/src/boost/libs/compute/perf/perf_rotate.cpp
new file mode 100644
index 000000000..49f85d577
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_rotate.cpp
@@ -0,0 +1,60 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/rotate.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+// Pseudo-random int in [0, 25]: rand() scaled by 25/RAND_MAX, then truncated.
+int rand_int() {
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    namespace compute = boost::compute;
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // default device, plus a context and command queue on it
+    compute::device device = compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // host-side input produced by rand_int()
+    std::vector<int> input(PERF_N);
+    std::generate(input.begin(), input.end(), rand_int);
+
+    // device-side buffer holding a copy of the input
+    compute::vector<int> working(PERF_N, context);
+    compute::copy(input.begin(), input.end(), working.begin(), queue);
+
+    // timed region: rotate() about the midpoint (begin + PERF_N/2), plus finish()
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        timer.start();
+        compute::rotate(
+            working.begin(), working.begin() + (PERF_N / 2), working.end(), queue
+        );
+        queue.finish();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_rotate_copy.cpp b/src/boost/libs/compute/perf/perf_rotate_copy.cpp
new file mode 100644
index 000000000..97111ef57
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_rotate_copy.cpp
@@ -0,0 +1,62 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/rotate_copy.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // scale rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>(25.0 * (static_cast<double>(rand()) / RAND_MAX));
+}
+
+int main(int argc, char *argv[])
+{
+    namespace bc = boost::compute;
+
+    // parse the benchmark arguments, then report the problem size
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // use the default device, with its own context and command queue
+    bc::device gpu = bc::system::default_device();
+    bc::context ctx(gpu);
+    bc::command_queue queue(ctx, gpu);
+    std::cout << "device: " << gpu.name() << std::endl;
+
+    // random host input in [0, 25], copied into the source device vector
+    std::vector<int> input(PERF_N);
+    std::generate(input.begin(), input.end(), rand_int);
+    bc::vector<int> src(PERF_N, ctx);
+    bc::copy(input.begin(), input.end(), src.begin(), queue);
+
+    // destination device vector of the same length for the rotated copy
+    bc::vector<int> dst(PERF_N, ctx);
+
+    // time rotate_copy() about the midpoint of the source vector
+    perf_timer timer;
+    for(size_t i = 0; i < PERF_TRIALS; ++i){
+        timer.start();
+        bc::rotate_copy(src.begin(), src.begin() + (PERF_N / 2), src.end(), dst.begin(), queue);
+        queue.finish();
+        timer.stop();
+    }
+
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_saxpy.cpp b/src/boost/libs/compute/perf/perf_saxpy.cpp
new file mode 100644
index 000000000..99ffc55df
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_saxpy.cpp
@@ -0,0 +1,162 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+
+#include <boost/compute/lambda.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/transform.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+namespace po = boost::program_options;
+namespace compute = boost::compute;
+
+float rand_float()
+{   // scale rand() from [0, RAND_MAX] onto [0, 1000] in single precision
+    return 1000.f * (static_cast<float>(rand()) / static_cast<float>(RAND_MAX));
+}
+
+// Times result = alpha * x + y on the device; returns perf_timer::min_time().
+template<class T>
+double perf_saxpy(const compute::vector<T>& x,
+                  const compute::vector<T>& y,
+                  const T alpha,
+                  const size_t trials,
+                  compute::command_queue& queue)
+{
+    using compute::lambda::_1;
+    using compute::lambda::_2;
+
+    // device vector that receives the output, one element per element of x
+    compute::vector<T> out(x.size(), queue.get_context());
+
+    perf_timer timer;
+    for(size_t i = 0; i < trials; ++i){
+        // zero the output and drain the queue before the timer starts
+        compute::fill(out.begin(), out.end(), T(0), queue);
+        queue.finish();
+
+        timer.start();
+        compute::transform(
+            x.begin(), x.end(), y.begin(), out.begin(), alpha * _1 + _2, queue
+        );
+        queue.finish();
+        timer.stop();
+    }
+
+    return timer.min_time();
+}
+
+// Times perf_saxpy() for every tpb/vpt pair below and stores the fastest pair
+// under the "__boost_copy_kernel_<sizeof(T)>" key of the device parameter cache.
+template<class T>
+void tune_saxpy(const compute::vector<T>& x, const compute::vector<T>& y,
+                const T alpha, const size_t trials, compute::command_queue& queue)
+{
+    boost::shared_ptr<compute::detail::parameter_cache>
+        params = compute::detail::parameter_cache::get_global_cache(queue.get_device());
+
+    const std::string cache_key =
+        std::string("__boost_copy_kernel_") + boost::lexical_cast<std::string>(sizeof(T));
+    const compute::uint_ tpbs[] = { 4, 8, 16, 32, 64, 128, 256, 512, 1024 };
+    const compute::uint_ vpts[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
+
+    double min_time = (std::numeric_limits<double>::max)();
+    compute::uint_ best_tpb = 0, best_vpt = 0; // 0 = no working pair found yet
+
+    for(size_t i = 0; i < sizeof(tpbs) / sizeof(*tpbs); i++){
+        params->set(cache_key, "tpb", tpbs[i]);
+        for(size_t j = 0; j < sizeof(vpts) / sizeof(*vpts); j++){
+            params->set(cache_key, "vpt", vpts[j]);
+            try {
+                const double t = perf_saxpy(x, y, alpha, trials, queue);
+                if(t < min_time){
+                    best_tpb = tpbs[i];
+                    best_vpt = vpts[j];
+                    min_time = t;
+                }
+            }
+            catch(compute::opencl_error&){
+                // invalid parameters for this device, skip
+            }
+        }
+    }
+
+    // every pair failed: do not store the 0/0 placeholders as "optimal"
+    if(best_tpb == 0){
+        std::cerr << "WARNING: no valid saxpy tuning parameters found" << std::endl;
+        return;
+    }
+    params->set(cache_key, "tpb", best_tpb);
+    params->set(cache_key, "vpt", best_vpt);
+}
+
+int main(int argc, char *argv[])
+{
+    // setup command line arguments
+    po::options_description options("options");
+    options.add_options()
+        ("help", "show usage instructions")
+        ("size", po::value<size_t>()->default_value(8192), "input size")
+        ("trials", po::value<size_t>()->default_value(3), "number of trials to run")
+        ("tune", "run tuning procedure")
+        ("alpha", po::value<double>()->default_value(2.5), "saxpy alpha value")
+    ;
+    po::positional_options_description positional_options;
+    positional_options.add("size", 1);
+
+    // parse command line (the first positional argument is taken as --size)
+    po::variables_map vm;
+    po::store(po::command_line_parser(argc, argv)
+                  .options(options).positional(positional_options).run(), vm);
+    po::notify(vm);
+
+    // honor --help: print the option summary and exit before using the device
+    if(vm.count("help")){
+        std::cout << options << std::endl;
+        return 0;
+    }
+
+    const size_t size = vm["size"].as<size_t>();
+    const size_t trials = vm["trials"].as<size_t>();
+    const float alpha = static_cast<float>(vm["alpha"].as<double>()); // explicit narrowing
+    std::cout << "size: " << size << std::endl;
+
+    // setup context and queue for the default device
+    compute::device device = boost::compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // create vectors of random numbers on the host and copy them to the device
+    std::vector<float> host_x(size), host_y(size);
+    std::generate(host_x.begin(), host_x.end(), rand_float);
+    std::generate(host_y.begin(), host_y.end(), rand_float);
+    compute::vector<float> x(host_x.begin(), host_x.end(), queue);
+    compute::vector<float> y(host_y.begin(), host_y.end(), queue);
+
+    // run tuning procedure (if requested)
+    if(vm.count("tune")){
+        tune_saxpy(x, y, alpha, trials, queue);
+    }
+
+    // run benchmark
+    double t = perf_saxpy(x, y, alpha, trials, queue);
+    std::cout << "time: " << t / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_search.cpp b/src/boost/libs/compute/perf/perf_search.cpp
new file mode 100644
index 000000000..b76e97553
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_search.cpp
@@ -0,0 +1,65 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/search.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // scale rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>(25.0 * (static_cast<double>(rand()) / RAND_MAX));
+}
+
+int main(int argc, char *argv[])
+{
+    namespace bc = boost::compute;
+
+    // parse the benchmark arguments, then report the problem size
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // use the default device, with its own context and command queue
+    bc::device gpu = bc::system::default_device();
+    bc::context ctx(gpu);
+    bc::command_queue queue(ctx, gpu);
+    std::cout << "device: " << gpu.name() << std::endl;
+
+    // text to search: random host values in [0, 25], copied to the device
+    std::vector<int> input(PERF_N);
+    std::generate(input.begin(), input.end(), rand_int);
+    bc::vector<int> haystack(PERF_N, ctx);
+    bc::copy(input.begin(), input.end(), haystack.begin(), queue);
+
+    // pattern to look for: a fixed six-element sequence uploaded to the device
+    int needle_values[] = {2, 6, 6, 7, 8, 4};
+    const size_t needle_len = sizeof(needle_values) / sizeof(*needle_values);
+    bc::vector<int> needle(needle_values, needle_values + needle_len, queue);
+
+    // time search(); the iterator it returns is not used by the benchmark
+    perf_timer timer;
+    for(size_t i = 0; i < PERF_TRIALS; ++i){
+        timer.start();
+        bc::search(
+            haystack.begin(), haystack.end(), needle.begin(), needle.end(), queue
+        );
+        queue.finish();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_search_n.cpp b/src/boost/libs/compute/perf/perf_search_n.cpp
new file mode 100644
index 000000000..31fa0adb9
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_search_n.cpp
@@ -0,0 +1,61 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/search_n.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // scale rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>(25.0 * (static_cast<double>(rand()) / RAND_MAX));
+}
+
+int main(int argc, char *argv[])
+{
+    namespace bc = boost::compute;
+
+    // parse the benchmark arguments, then report the problem size
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // use the default device, with its own context and command queue
+    bc::device gpu = bc::system::default_device();
+    bc::context ctx(gpu);
+    bc::command_queue queue(ctx, gpu);
+    std::cout << "device: " << gpu.name() << std::endl;
+
+    // random host input in [0, 25], copied into a device vector
+    std::vector<int> input(PERF_N);
+    std::generate(input.begin(), input.end(), rand_int);
+    bc::vector<int> data(PERF_N, ctx);
+    bc::copy(input.begin(), input.end(), data.begin(), queue);
+
+    // time search_n() with count 5 and value 2; the iterator it returns is
+    // not used by the benchmark
+    perf_timer timer;
+    for(size_t i = 0; i < PERF_TRIALS; ++i){
+        timer.start();
+        bc::search_n(data.begin(), data.end(), 5, 2, queue);
+        queue.finish();
+        timer.stop();
+    }
+
+    // print min_time() scaled by 1e-6, labelled as milliseconds
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_set_difference.cpp b/src/boost/libs/compute/perf/perf_set_difference.cpp
new file mode 100644
index 000000000..b1b28573d
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_set_difference.cpp
@@ -0,0 +1,75 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/set_difference.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // scale rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>(25.0 * (static_cast<double>(rand()) / RAND_MAX));
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // setup context and queue for the default device
+    boost::compute::device device = boost::compute::system::default_device();
+    boost::compute::context context(device);
+    boost::compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // split PERF_N in two with integer arithmetic (equal to floor/ceil of
+    // PERF_N / 2.0); the second input gets the extra element when PERF_N is odd
+    const size_t first_half = PERF_N / 2;
+    const size_t second_half = PERF_N - first_half;
+
+    // create sorted vectors of random numbers on the host
+    std::vector<int> v1(first_half);
+    std::vector<int> v2(second_half);
+    std::generate(v1.begin(), v1.end(), rand_int);
+    std::generate(v2.begin(), v2.end(), rand_int);
+    std::sort(v1.begin(), v1.end());
+    std::sort(v2.begin(), v2.end());
+
+    // create vectors on the device and copy the data
+    boost::compute::vector<int> gpu_v1(first_half, context);
+    boost::compute::vector<int> gpu_v2(second_half, context);
+    boost::compute::copy(v1.begin(), v1.end(), gpu_v1.begin(), queue);
+    boost::compute::copy(v2.begin(), v2.end(), gpu_v2.begin(), queue);
+
+    // output buffer; the end iterator starts at begin() rather than unset
+    boost::compute::vector<int> gpu_v3(PERF_N, context);
+    boost::compute::vector<int>::iterator gpu_v3_end = gpu_v3.begin();
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        gpu_v3_end = boost::compute::set_difference(
+            gpu_v1.begin(), gpu_v1.end(), gpu_v2.begin(), gpu_v2.end(),
+            gpu_v3.begin(), queue
+        );
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "size: " << std::distance(gpu_v3.begin(), gpu_v3_end) << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_set_intersection.cpp b/src/boost/libs/compute/perf/perf_set_intersection.cpp
new file mode 100644
index 000000000..dbfeb42f1
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_set_intersection.cpp
@@ -0,0 +1,75 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/set_intersection.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // scale rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>(25.0 * (static_cast<double>(rand()) / RAND_MAX));
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // setup context and queue for the default device
+    boost::compute::device device = boost::compute::system::default_device();
+    boost::compute::context context(device);
+    boost::compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // split PERF_N in two with integer arithmetic (equal to floor/ceil of
+    // PERF_N / 2.0); the second input gets the extra element when PERF_N is odd
+    const size_t first_half = PERF_N / 2;
+    const size_t second_half = PERF_N - first_half;
+
+    // create sorted vectors of random numbers on the host
+    std::vector<int> v1(first_half);
+    std::vector<int> v2(second_half);
+    std::generate(v1.begin(), v1.end(), rand_int);
+    std::generate(v2.begin(), v2.end(), rand_int);
+    std::sort(v1.begin(), v1.end());
+    std::sort(v2.begin(), v2.end());
+
+    // create vectors on the device and copy the data
+    boost::compute::vector<int> gpu_v1(first_half, context);
+    boost::compute::vector<int> gpu_v2(second_half, context);
+    boost::compute::copy(v1.begin(), v1.end(), gpu_v1.begin(), queue);
+    boost::compute::copy(v2.begin(), v2.end(), gpu_v2.begin(), queue);
+
+    // output buffer; the end iterator starts at begin() rather than unset
+    boost::compute::vector<int> gpu_v3(PERF_N, context);
+    boost::compute::vector<int>::iterator gpu_v3_end = gpu_v3.begin();
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        gpu_v3_end = boost::compute::set_intersection(
+            gpu_v1.begin(), gpu_v1.end(), gpu_v2.begin(), gpu_v2.end(),
+            gpu_v3.begin(), queue
+        );
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "size: " << std::distance(gpu_v3.begin(), gpu_v3_end) << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_set_symmetric_difference.cpp b/src/boost/libs/compute/perf/perf_set_symmetric_difference.cpp
new file mode 100644
index 000000000..9449c585f
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_set_symmetric_difference.cpp
@@ -0,0 +1,75 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/set_symmetric_difference.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // scale rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>(25.0 * (static_cast<double>(rand()) / RAND_MAX));
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // setup context and queue for the default device
+    boost::compute::device device = boost::compute::system::default_device();
+    boost::compute::context context(device);
+    boost::compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // split PERF_N in two with integer arithmetic (equal to floor/ceil of
+    // PERF_N / 2.0); the second input gets the extra element when PERF_N is odd
+    const size_t first_half = PERF_N / 2;
+    const size_t second_half = PERF_N - first_half;
+
+    // create sorted vectors of random numbers on the host
+    std::vector<int> v1(first_half);
+    std::vector<int> v2(second_half);
+    std::generate(v1.begin(), v1.end(), rand_int);
+    std::generate(v2.begin(), v2.end(), rand_int);
+    std::sort(v1.begin(), v1.end());
+    std::sort(v2.begin(), v2.end());
+
+    // create vectors on the device and copy the data
+    boost::compute::vector<int> gpu_v1(first_half, context);
+    boost::compute::vector<int> gpu_v2(second_half, context);
+    boost::compute::copy(v1.begin(), v1.end(), gpu_v1.begin(), queue);
+    boost::compute::copy(v2.begin(), v2.end(), gpu_v2.begin(), queue);
+
+    // output buffer; the end iterator starts at begin() rather than unset
+    boost::compute::vector<int> gpu_v3(PERF_N, context);
+    boost::compute::vector<int>::iterator gpu_v3_end = gpu_v3.begin();
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        gpu_v3_end = boost::compute::set_symmetric_difference(
+            gpu_v1.begin(), gpu_v1.end(), gpu_v2.begin(), gpu_v2.end(),
+            gpu_v3.begin(), queue
+        );
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "size: " << std::distance(gpu_v3.begin(), gpu_v3_end) << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_set_union.cpp b/src/boost/libs/compute/perf/perf_set_union.cpp
new file mode 100644
index 000000000..3a336cb60
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_set_union.cpp
@@ -0,0 +1,75 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/set_union.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // scale rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>(25.0 * (static_cast<double>(rand()) / RAND_MAX));
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // setup context and queue for the default device
+    boost::compute::device device = boost::compute::system::default_device();
+    boost::compute::context context(device);
+    boost::compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // split PERF_N in two with integer arithmetic (equal to floor/ceil of
+    // PERF_N / 2.0); the second input gets the extra element when PERF_N is odd
+    const size_t first_half = PERF_N / 2;
+    const size_t second_half = PERF_N - first_half;
+
+    // create sorted vectors of random numbers on the host
+    std::vector<int> v1(first_half);
+    std::vector<int> v2(second_half);
+    std::generate(v1.begin(), v1.end(), rand_int);
+    std::generate(v2.begin(), v2.end(), rand_int);
+    std::sort(v1.begin(), v1.end());
+    std::sort(v2.begin(), v2.end());
+
+    // create vectors on the device and copy the data
+    boost::compute::vector<int> gpu_v1(first_half, context);
+    boost::compute::vector<int> gpu_v2(second_half, context);
+    boost::compute::copy(v1.begin(), v1.end(), gpu_v1.begin(), queue);
+    boost::compute::copy(v2.begin(), v2.end(), gpu_v2.begin(), queue);
+
+    // output buffer; the end iterator starts at begin() rather than unset
+    boost::compute::vector<int> gpu_v3(PERF_N, context);
+    boost::compute::vector<int>::iterator gpu_v3_end = gpu_v3.begin();
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        gpu_v3_end = boost::compute::set_union(
+            gpu_v1.begin(), gpu_v1.end(), gpu_v2.begin(), gpu_v2.end(),
+            gpu_v3.begin(), queue
+        );
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "size: " << std::distance(gpu_v3.begin(), gpu_v3_end) << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_sort.cpp b/src/boost/libs/compute/perf/perf_sort.cpp
new file mode 100644
index 000000000..458203edd
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_sort.cpp
@@ -0,0 +1,130 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/sort.hpp>
+#include <boost/compute/algorithm/is_sorted.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+namespace po = boost::program_options;
+namespace compute = boost::compute;
+
+// Times compute::sort() on a device copy of `data`; returns perf_timer::min_time().
+template<class T>
+double perf_sort(const std::vector<T>& data, const size_t trials, compute::command_queue& queue)
+{
+    compute::vector<T> device_data(data.size(), queue.get_context());
+    perf_timer timer;
+    for(size_t i = 0; i < trials; ++i){
+        // re-upload the unsorted input before the timer starts
+        compute::copy(data.begin(), data.end(), device_data.begin(), queue);
+        timer.start();
+        compute::sort(device_data.begin(), device_data.end(), queue);
+        queue.finish();
+        timer.stop();
+
+        // report (but do not abort on) an unsorted result
+        if(!compute::is_sorted(device_data.begin(), device_data.end(), queue)){
+            std::cerr << "ERROR: is_sorted() returned false" << std::endl;
+        }
+    }
+    return timer.min_time();
+}
+
+// Times perf_sort() for each candidate "tpb" value and stores the fastest one
+// under the "__boost_radix_sort_<type name>" key of the device parameter cache.
+template<class T>
+void tune_sort(const std::vector<T>& data, const size_t trials, compute::command_queue& queue)
+{
+    boost::shared_ptr<compute::detail::parameter_cache>
+        params = compute::detail::parameter_cache::get_global_cache(queue.get_device());
+
+    const std::string cache_key = std::string("__boost_radix_sort_") + compute::type_name<T>();
+    const compute::uint_ tpbs[] = { 32, 64, 128, 256, 512, 1024 };
+    double min_time = (std::numeric_limits<double>::max)();
+    compute::uint_ best_tpb = 0; // 0 = no working value found yet
+
+    for(size_t i = 0; i < sizeof(tpbs) / sizeof(*tpbs); i++){
+        params->set(cache_key, "tpb", tpbs[i]);
+        try {
+            const double t = perf_sort(data, trials, queue);
+            if(t < min_time){
+                best_tpb = tpbs[i];
+                min_time = t;
+            }
+        }
+        catch(compute::opencl_error&){
+            // invalid work group size for this device, skip
+        }
+    }
+
+    // every value failed: do not store the 0 placeholder as "optimal"
+    if(best_tpb == 0){
+        std::cerr << "WARNING: no valid sort tuning parameter found" << std::endl;
+        return;
+    }
+    params->set(cache_key, "tpb", best_tpb);
+}
+
+int main(int argc, char *argv[])
+{
+    // setup command line arguments
+    po::options_description options("options");
+    options.add_options()
+        ("help", "show usage instructions")
+        ("size", po::value<size_t>()->default_value(8192), "input size")
+        ("trials", po::value<size_t>()->default_value(3), "number of trials to run")
+        ("tune", "run tuning procedure")
+    ;
+    po::positional_options_description positional_options;
+    positional_options.add("size", 1);
+    // parse command line (the first positional argument is taken as --size)
+    po::variables_map vm;
+    po::store(po::command_line_parser(argc, argv)
+                  .options(options).positional(positional_options).run(), vm);
+    po::notify(vm);
+    // honor --help: print the option summary and exit before using the device
+    if(vm.count("help")){
+        std::cout << options << std::endl;
+        return 0;
+    }
+
+    const size_t size = vm["size"].as<size_t>();
+    const size_t trials = vm["trials"].as<size_t>();
+    std::cout << "size: " << size << std::endl;
+
+    // setup context and queue for the default device
+    compute::device device = boost::compute::system::default_device();
+    compute::context context(device);
+    compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // create vector of random numbers on the host
+    std::vector<unsigned int> data(size);
+    std::generate(data.begin(), data.end(), rand);
+
+    // run tuning procedure (if requested)
+    if(vm.count("tune")){
+        tune_sort(data, trials, queue);
+    }
+
+    // run sort benchmark
+    double t = perf_sort(data, trials, queue);
+    std::cout << "time: " << t / 1e6 << " ms" << std::endl;
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_sort_by_key.cpp b/src/boost/libs/compute/perf/perf_sort_by_key.cpp
new file mode 100644
index 000000000..57c3fc837
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_sort_by_key.cpp
@@ -0,0 +1,79 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/sort_by_key.hpp>
+#include <boost/compute/algorithm/is_sorted.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/types/fundamental.hpp>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    namespace bc = boost::compute;
+    using bc::int_;
+    using bc::long_;
+
+    // parse the benchmark arguments, then report the problem size
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // use the default device, with its own context and command queue
+    bc::device gpu = bc::system::default_device();
+    bc::context ctx(gpu);
+    bc::command_queue queue(ctx, gpu);
+    std::cout << "device: " << gpu.name() << std::endl;
+
+    // random keys on the host; each value starts as a copy of its key, so a
+    // correct sort by key leaves the values in sorted order too
+    std::vector<int_> keys(PERF_N);
+    std::generate(keys.begin(), keys.end(), rand);
+    std::vector<long_> values(PERF_N);
+    std::copy(keys.begin(), keys.end(), values.begin());
+
+    // device-side vectors, refilled from the host at the start of every trial
+    bc::vector<int_> gpu_keys(PERF_N, ctx);
+    bc::vector<long_> gpu_values(PERF_N, ctx);
+
+    perf_timer timer;
+    for(size_t i = 0; i < PERF_TRIALS; ++i){
+        // upload the unsorted input before the timer starts
+        bc::copy(keys.begin(), keys.end(), gpu_keys.begin(), queue);
+        bc::copy(values.begin(), values.end(), gpu_values.begin(), queue);
+
+        // timed section: sort_by_key() followed by queue.finish()
+        timer.start();
+        bc::sort_by_key(gpu_keys.begin(), gpu_keys.end(), gpu_values.begin(), queue);
+        queue.finish();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    // check the keys left on the device by the last trial
+    const bool keys_sorted = bc::is_sorted(gpu_keys.begin(), gpu_keys.end(), queue);
+    if(!keys_sorted){
+        std::cout << "ERROR: is_sorted() returned false for the keys" << std::endl;
+        return -1;
+    }
+
+    // check the values left on the device by the last trial
+    const bool values_sorted = bc::is_sorted(gpu_values.begin(), gpu_values.end(), queue);
+    if(!values_sorted){
+        std::cout << "ERROR: is_sorted() returned false for the values" << std::endl;
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_sort_float.cpp b/src/boost/libs/compute/perf/perf_sort_float.cpp
new file mode 100644
index 000000000..1b2d5f5e4
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_sort_float.cpp
@@ -0,0 +1,72 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/is_sorted.hpp>
+#include <boost/compute/algorithm/sort.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+// Produces a pseudo-random float: rand() is scaled by RAND_MAX, shifted down
+// by 0.5 and multiplied by 100000.
+float rand_float()
+{
+    const float unit = rand() / float(RAND_MAX);
+    return (unit - 0.5f) * 100000.0f;
+}
+
+// Benchmarks boost::compute::sort() on PERF_N random floats.
+//
+// Like the other benchmarks in this directory, the sort is repeated
+// PERF_TRIALS times and the timer's min_time() is reported. The unsorted
+// host data is re-uploaded before every trial (outside the timed region) so
+// that no trial sorts data that a previous trial already sorted.
+//
+// Returns 0 on success, -1 if the device vector is not sorted afterwards.
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // setup context and queue for the default device
+    boost::compute::device device = boost::compute::system::default_device();
+    boost::compute::context context(device);
+    boost::compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // create vector of random numbers on the host
+    std::vector<float> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_float);
+
+    // create vector on the device; it is filled at the start of each trial
+    boost::compute::vector<float> device_vector(PERF_N, context);
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        // restore the unsorted input before timing
+        boost::compute::copy(
+            host_vector.begin(),
+            host_vector.end(),
+            device_vector.begin(),
+            queue
+        );
+
+        // sort vector
+        t.start();
+        boost::compute::sort(
+            device_vector.begin(),
+            device_vector.end(),
+            queue
+        );
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    // verify vector is sorted
+    if(!boost::compute::is_sorted(device_vector.begin(),
+                                  device_vector.end(),
+                                  queue)){
+        std::cout << "ERROR: is_sorted() returned false" << std::endl;
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stable_partition.cpp b/src/boost/libs/compute/perf/perf_stable_partition.cpp
new file mode 100644
index 000000000..f7ef10637
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stable_partition.cpp
@@ -0,0 +1,62 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/stable_partition.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+// Scales rand() by RAND_MAX and 25.0, then truncates to int (range [0, 25]).
+int rand_int()
+{
+    const double unit = rand() / double(RAND_MAX);
+    return static_cast<int>(unit * 25.0);
+}
+
+// Benchmarks boost::compute::stable_partition() with the predicate `_1 < 10`
+// on PERF_N random integers using the default device.
+//
+// The unpartitioned host data is re-uploaded before every trial (outside the
+// timed region). Without this, every trial after the first would partition
+// data that is already partitioned, and min_time() would report that case.
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // setup context and queue for the default device
+    boost::compute::device device = boost::compute::system::default_device();
+    boost::compute::context context(device);
+    boost::compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // create vector of random numbers on the host
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+
+    // create vector on the device; it is filled at the start of each trial
+    boost::compute::vector<int> device_vector(PERF_N, context);
+
+    using boost::compute::_1;
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        // restore the unpartitioned input before timing
+        boost::compute::copy(
+            host_vector.begin(), host_vector.end(), device_vector.begin(), queue
+        );
+
+        t.start();
+        boost::compute::stable_partition(
+            device_vector.begin(), device_vector.end(), _1 < 10, queue
+        );
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_accumulate.cpp b/src/boost/libs/compute/perf/perf_stl_accumulate.cpp
new file mode 100644
index 000000000..c28d2d03b
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_accumulate.cpp
@@ -0,0 +1,43 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "perf.hpp"
+
+// Scales rand() by RAND_MAX and 25.0, then truncates to int (range [0, 25]).
+int rand_int()
+{
+    const double unit = rand() / double(RAND_MAX);
+    return static_cast<int>(unit * 25.0);
+}
+
+// Times std::accumulate() over PERF_N random host integers for PERF_TRIALS
+// runs, then prints the timer's min_time() and the last computed sum.
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // fill a host vector with random numbers
+    std::vector<int> values(PERF_N);
+    std::generate(values.begin(), values.end(), rand_int);
+
+    perf_timer timer;
+    int total = 0;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        timer.start();
+        total = std::accumulate(values.begin(), values.end(), 0);
+        timer.stop();
+    }
+
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "sum: " << total << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_count.cpp b/src/boost/libs/compute/perf/perf_stl_count.cpp
new file mode 100644
index 000000000..9df13f861
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_count.cpp
@@ -0,0 +1,45 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "perf.hpp"
+
+// Scales rand() by RAND_MAX and 25.0, then truncates to int (range [0, 25]).
+int rand_int()
+{
+    const double unit = rand() / double(RAND_MAX);
+    return static_cast<int>(unit * 25.0);
+}
+
+// Times std::count() for the value 4 over PERF_N random host integers for
+// PERF_TRIALS runs, then prints the timer's min_time() and the last count.
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // fill a host vector with random numbers
+    std::vector<int> values(PERF_N);
+    std::generate(values.begin(), values.end(), rand_int);
+
+    // number of elements equal to four, as found by the most recent run
+    size_t fours = 0;
+
+    perf_timer timer;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        timer.start();
+        fours = std::count(values.begin(), values.end(), 4);
+        timer.stop();
+    }
+
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "count: " << fours << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_find.cpp b/src/boost/libs/compute/perf/perf_stl_find.cpp
new file mode 100644
index 000000000..b5d3eed2a
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_find.cpp
@@ -0,0 +1,58 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "perf.hpp"
+
+// Scale factor used by rand_int(); also its largest possible return value.
+int rand_int_max = 25;
+
+// Scales rand() by RAND_MAX and rand_int_max, then truncates to int.
+int rand_int()
+{
+    const double unit = rand() / double(RAND_MAX);
+    return static_cast<int>(unit * rand_int_max);
+}
+
+// Times std::find() searching PERF_N random host integers for a value that
+// rand_int() cannot produce, so every search scans the whole vector.
+//
+// Returns 0 on success, -1 if the search unexpectedly finds the value.
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // create vector of random numbers on the host
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+
+    // trying to find element that isn't in vector (worst-case scenario)
+    int wanted = rand_int_max + 1;
+
+    // result; starts at end() so the verification below never reads an
+    // uninitialized iterator, even if no trial runs
+    std::vector<int>::iterator host_result_it = host_vector.end();
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        host_result_it = std::find(host_vector.begin(), host_vector.end(), wanted);
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    // verify
+    if(host_result_it != host_vector.end()){
+        std::cout << "ERROR: "
+                  << "host_result_iterator != "
+                  << "host_vector.end()"
+                  << std::endl;
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_find_end.cpp b/src/boost/libs/compute/perf/perf_stl_find_end.cpp
new file mode 100644
index 000000000..cb1233b18
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_find_end.cpp
@@ -0,0 +1,44 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "perf.hpp"
+
+// Scales rand() by RAND_MAX and 25.0, then truncates to int (range [0, 25]).
+int rand_int()
+{
+    const double unit = rand() / double(RAND_MAX);
+    return static_cast<int>(unit * 25.0);
+}
+
+// Times std::find_end() searching PERF_N random host integers for a fixed
+// six-element pattern.
+//
+// The search result is stored and printed, as the accumulate/count/
+// max_element benchmarks do with their results, so the compiler cannot
+// discard the timed call as dead code.
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // create vector of random numbers on the host
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+
+    int pattern[] = {2, 6, 6, 7, 8, 4};
+    // derive the length from the array instead of hard-coding it
+    const size_t pattern_size = sizeof(pattern) / sizeof(pattern[0]);
+
+    // start of the last occurrence of the pattern, or end() if there is none
+    std::vector<int>::iterator last_match = host_vector.end();
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        last_match = std::find_end(host_vector.begin(), host_vector.end(),
+                                   pattern, pattern + pattern_size);
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    // offset equals the vector size when the pattern does not occur
+    std::cout << "last match offset: "
+              << (last_match - host_vector.begin()) << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_includes.cpp b/src/boost/libs/compute/perf/perf_stl_includes.cpp
new file mode 100644
index 000000000..b71dbf3cb
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_includes.cpp
@@ -0,0 +1,48 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <vector>
+#include <algorithm>
+#include <iostream>
+
+#include "perf.hpp"
+
+// Scales rand() by RAND_MAX and 25.0, then truncates to int (range [0, 25]).
+int rand_int()
+{
+    const double unit = rand() / double(RAND_MAX);
+    return static_cast<int>(unit * 25.0);
+}
+
+// Times std::includes() on two identical sorted vectors of PERF_N random
+// host integers.
+//
+// The boolean result is stored and printed, as the accumulate/count/
+// max_element benchmarks do with their results, so the compiler cannot
+// discard the timed call as dead code.
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    std::vector<int> v1(PERF_N);
+    std::generate(v1.begin(), v1.end(), rand_int);
+
+    std::vector<int> v2(v1);
+
+    // std::includes requires both ranges to be sorted
+    std::sort(v1.begin(), v1.end());
+    std::sort(v2.begin(), v2.end());
+
+    bool result = false;
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        result = std::includes(
+            v1.begin(), v1.end(),
+            v2.begin(), v2.end()
+        );
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "includes: " << result << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_inner_product.cpp b/src/boost/libs/compute/perf/perf_stl_inner_product.cpp
new file mode 100644
index 000000000..884f06c96
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_inner_product.cpp
@@ -0,0 +1,46 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "perf.hpp"
+
+// Scales rand() by RAND_MAX and 25.0, then truncates to int (range [0, 25]).
+int rand_int()
+{
+    const double unit = rand() / double(RAND_MAX);
+    return static_cast<int>(unit * 25.0);
+}
+
+// Times std::inner_product() of two vectors of PERF_N random host integers
+// for PERF_TRIALS runs, then prints the timer's min_time() and the result.
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // first operand, then second operand (generation order matters for rand())
+    std::vector<int> lhs(PERF_N);
+    std::vector<int> rhs(PERF_N);
+    std::generate(lhs.begin(), lhs.end(), rand_int);
+    std::generate(rhs.begin(), rhs.end(), rand_int);
+
+    perf_timer timer;
+    int dot = 0;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        timer.start();
+        dot = std::inner_product(lhs.begin(), lhs.end(), rhs.begin(), 0);
+        timer.stop();
+    }
+
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "product: " << dot << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_is_permutation.cpp b/src/boost/libs/compute/perf/perf_stl_is_permutation.cpp
new file mode 100644
index 000000000..1384e8e93
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_is_permutation.cpp
@@ -0,0 +1,45 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "perf.hpp"
+
+// Scales rand() by RAND_MAX and 25.0, then truncates to int (range [0, 25]).
+int rand_int()
+{
+    const double unit = rand() / double(RAND_MAX);
+    return static_cast<int>(unit * 25.0);
+}
+
+// Times std::is_permutation() comparing PERF_N random host integers against
+// a reversed copy of the same data.
+//
+// The boolean result is stored and printed, as the accumulate/count/
+// max_element benchmarks do with their results, so the compiler cannot
+// discard the timed call as dead code.
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // create vector of random numbers on the host
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+
+    // second vector holds the same elements in reverse order
+    std::vector<int> host_vector2(PERF_N);
+    std::copy(host_vector.rbegin(), host_vector.rend(), host_vector2.begin());
+
+    bool result = false;
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        result = std::is_permutation(host_vector.begin(), host_vector.end(),
+                                     host_vector2.begin());
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "is_permutation: " << result << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_max_element.cpp b/src/boost/libs/compute/perf/perf_stl_max_element.cpp
new file mode 100644
index 000000000..3fa61267e
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_max_element.cpp
@@ -0,0 +1,43 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Rastko Anicic <anicic.rastko@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "perf.hpp"
+
+// Returns rand() reduced modulo 10000000.
+int rand_int()
+{
+    const int raw = rand();
+    return raw % 10000000;
+}
+
+// Times std::max_element() over PERF_N random host integers for PERF_TRIALS
+// runs, then prints the timer's min_time() and the maximum found.
+//
+// For an empty vector std::max_element() returns end(), which must not be
+// dereferenced; in that case `max` keeps its initial value of 0.
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // create vector of random numbers on the host
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+
+    int max = 0;
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        std::vector<int>::iterator max_it =
+            std::max_element(host_vector.begin(), host_vector.end());
+        // guard against dereferencing end() when the vector is empty
+        if(max_it != host_vector.end()){
+            max = *max_it;
+        }
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "max: " << max << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_merge.cpp b/src/boost/libs/compute/perf/perf_stl_merge.cpp
new file mode 100644
index 000000000..0a842a04e
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_merge.cpp
@@ -0,0 +1,38 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <vector>
+#include <algorithm>
+#include <iostream>
+
+#include "perf.hpp"
+
+// Times std::merge() of two sorted random vectors whose sizes add up to
+// PERF_N into a destination vector of PERF_N elements.
+//
+// The halves are sized with integer arithmetic: PERF_N / 2 is the rounded-
+// down half and PERF_N - PERF_N / 2 the rounded-up half. This avoids
+// std::floor/std::ceil (this file does not include <cmath>) and the
+// conversion of the sizes through double.
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    std::vector<int> v1 = generate_random_vector<int>(PERF_N / 2);
+    std::vector<int> v2 = generate_random_vector<int>(PERF_N - PERF_N / 2);
+    std::vector<int> v3(PERF_N);
+
+    // std::merge requires both inputs to be sorted
+    std::sort(v1.begin(), v1.end());
+    std::sort(v2.begin(), v2.end());
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        std::merge(v1.begin(), v1.end(), v2.begin(), v2.end(), v3.begin());
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_next_permutation.cpp b/src/boost/libs/compute/perf/perf_stl_next_permutation.cpp
new file mode 100644
index 000000000..221489751
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_next_permutation.cpp
@@ -0,0 +1,43 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "perf.hpp"
+
+// Scales rand() by RAND_MAX and 25.0, then truncates to int (range [0, 25]).
+int rand_int()
+{
+    const double unit = rand() / double(RAND_MAX);
+    return static_cast<int>(unit * 25.0);
+}
+
+// Times std::next_permutation() on PERF_N random host integers sorted in
+// descending order. After each timed call, std::prev_permutation() (untimed)
+// undoes the step so every run starts from the same arrangement.
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // random host data, arranged from largest to smallest
+    std::vector<int> values(PERF_N);
+    std::generate(values.begin(), values.end(), rand_int);
+    std::sort(values.begin(), values.end(), std::greater<int>());
+
+    perf_timer timer;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        timer.start();
+        std::next_permutation(values.begin(), values.end());
+        timer.stop();
+
+        // untimed: step back to the starting arrangement
+        std::prev_permutation(values.begin(), values.end());
+    }
+
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_partial_sum.cpp b/src/boost/libs/compute/perf/perf_stl_partial_sum.cpp
new file mode 100644
index 000000000..533defb29
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_partial_sum.cpp
@@ -0,0 +1,51 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+
+#include "perf.hpp"
+
+// Scales rand() by RAND_MAX and 25.0, then truncates to int (range [0, 25]).
+int rand_int()
+{
+    const double unit = rand() / double(RAND_MAX);
+    return static_cast<int>(unit * 25.0);
+}
+
+// Times std::partial_sum() over PERF_N host integers. The input is refilled
+// with random numbers before each run, outside the timed region.
+int main(int argc, char *argv[])
+{
+    using boost::compute::int_;
+
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // input values and the destination for their running sums
+    std::vector<int_> input(PERF_N);
+    std::vector<int_> sums(PERF_N);
+
+    perf_timer timer;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        // untimed: fresh random input for this run
+        std::generate(input.begin(), input.end(), rand_int);
+
+        timer.start();
+        std::partial_sum(input.begin(), input.end(), sums.begin());
+        timer.stop();
+    }
+
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_partition.cpp b/src/boost/libs/compute/perf/perf_stl_partition.cpp
new file mode 100644
index 000000000..56aadd0d7
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_partition.cpp
@@ -0,0 +1,46 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "perf.hpp"
+
+// Scales rand() by RAND_MAX and 25.0, then truncates to int (range [0, 25]).
+int rand_int()
+{
+    const double unit = rand() / double(RAND_MAX);
+    return static_cast<int>(unit * 25.0);
+}
+
+// Predicate: true when `value` is strictly below 10.
+bool less_than_10(int value)
+{
+    const int threshold = 10;
+    return value < threshold;
+}
+
+// Times std::partition() with the less_than_10 predicate on PERF_N random
+// host integers.
+//
+// std::partition() reorders its input in place, so the original random
+// order is restored before every trial (outside the timed region). Without
+// this, every trial after the first would partition data that is already
+// partitioned, and min_time() would report that case.
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // create vector of random numbers on the host
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+
+    // pristine copy used to reset the input between trials
+    const std::vector<int> unpartitioned(host_vector);
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        // same size, so this only overwrites the elements
+        host_vector = unpartitioned;
+
+        t.start();
+        std::partition(host_vector.begin(), host_vector.end(), less_than_10);
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_partition_point.cpp b/src/boost/libs/compute/perf/perf_stl_partition_point.cpp
new file mode 100644
index 000000000..94b1c2634
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_partition_point.cpp
@@ -0,0 +1,48 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "perf.hpp"
+
+// Scales rand() by RAND_MAX and 25.0, then truncates to int (range [0, 25]).
+int rand_int()
+{
+    const double unit = rand() / double(RAND_MAX);
+    return static_cast<int>(unit * 25.0);
+}
+
+// Predicate: true when `value` is strictly below 20.
+bool less_than_20(int value)
+{
+    const int threshold = 20;
+    return value < threshold;
+}
+
+// Times std::partition_point() with the less_than_20 predicate on PERF_N
+// random host integers that were partitioned by the same predicate.
+//
+// The returned iterator is stored and its offset printed, as the
+// accumulate/count/max_element benchmarks do with their results, so the
+// compiler cannot discard the timed call as dead code.
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // create vector of random numbers on the host
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+    // std::partition_point requires input partitioned by the predicate
+    std::partition(host_vector.begin(), host_vector.end(),
+                    less_than_20);
+
+    // first element for which the predicate is false, or end()
+    std::vector<int>::iterator point = host_vector.end();
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        point = std::partition_point(host_vector.begin(), host_vector.end(),
+                                     less_than_20);
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "partition point: "
+              << (point - host_vector.begin()) << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_prev_permutation.cpp b/src/boost/libs/compute/perf/perf_stl_prev_permutation.cpp
new file mode 100644
index 000000000..f246ba2aa
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_prev_permutation.cpp
@@ -0,0 +1,43 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "perf.hpp"
+
+// Scales rand() by RAND_MAX and 25.0, then truncates to int (range [0, 25]).
+int rand_int()
+{
+    const double unit = rand() / double(RAND_MAX);
+    return static_cast<int>(unit * 25.0);
+}
+
+// Times std::prev_permutation() on PERF_N random host integers sorted in
+// ascending order. After each timed call, std::next_permutation() (untimed)
+// undoes the step so every run starts from the same arrangement.
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // random host data, arranged from smallest to largest
+    std::vector<int> values(PERF_N);
+    std::generate(values.begin(), values.end(), rand_int);
+    std::sort(values.begin(), values.end());
+
+    perf_timer timer;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        timer.start();
+        std::prev_permutation(values.begin(), values.end());
+        timer.stop();
+
+        // untimed: step forward to the starting arrangement
+        std::next_permutation(values.begin(), values.end());
+    }
+
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_reverse.cpp b/src/boost/libs/compute/perf/perf_stl_reverse.cpp
new file mode 100644
index 000000000..b2ee2ef6d
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_reverse.cpp
@@ -0,0 +1,41 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "perf.hpp"
+
+// Scales rand() by RAND_MAX and 25.0, then truncates to int (range [0, 25]).
+int rand_int()
+{
+    const double unit = rand() / double(RAND_MAX);
+    return static_cast<int>(unit * 25.0);
+}
+
+// Times in-place std::reverse() of PERF_N random host integers for
+// PERF_TRIALS runs and prints the timer's min_time().
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // fill a host vector with random numbers
+    std::vector<int> values(PERF_N);
+    std::generate(values.begin(), values.end(), rand_int);
+
+    perf_timer timer;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        timer.start();
+        std::reverse(values.begin(), values.end());
+        timer.stop();
+    }
+
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_reverse_copy.cpp b/src/boost/libs/compute/perf/perf_stl_reverse_copy.cpp
new file mode 100644
index 000000000..1397e9a74
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_reverse_copy.cpp
@@ -0,0 +1,45 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "perf.hpp"
+
+// Scales rand() by RAND_MAX and 25.0, then truncates to int (range [0, 25]).
+int rand_int()
+{
+    const double unit = rand() / double(RAND_MAX);
+    return static_cast<int>(unit * 25.0);
+}
+
+// Times std::reverse_copy() of PERF_N random host integers into a second
+// vector for PERF_TRIALS runs and prints the timer's min_time().
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // source: random numbers on the host
+    std::vector<int> source(PERF_N);
+    std::generate(source.begin(), source.end(), rand_int);
+
+    // destination for the reversed data
+    std::vector<int> reversed(PERF_N);
+
+    perf_timer timer;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        timer.start();
+        std::reverse_copy(source.begin(), source.end(), reversed.begin());
+        timer.stop();
+    }
+
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_rotate.cpp b/src/boost/libs/compute/perf/perf_stl_rotate.cpp
new file mode 100644
index 000000000..f90acef2a
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_rotate.cpp
@@ -0,0 +1,41 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "perf.hpp"
+
+// Scales rand() by RAND_MAX and 25.0, then truncates to int (range [0, 25]).
+int rand_int()
+{
+    const double unit = rand() / double(RAND_MAX);
+    return static_cast<int>(unit * 25.0);
+}
+
+// Times in-place std::rotate() of PERF_N random host integers around the
+// element at index PERF_N/2 for PERF_TRIALS runs and prints min_time().
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // fill a host vector with random numbers
+    std::vector<int> values(PERF_N);
+    std::generate(values.begin(), values.end(), rand_int);
+
+    perf_timer timer;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        timer.start();
+        std::rotate(
+            values.begin(),
+            values.begin() + (PERF_N / 2),
+            values.end()
+        );
+        timer.stop();
+    }
+
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_rotate_copy.cpp b/src/boost/libs/compute/perf/perf_stl_rotate_copy.cpp
new file mode 100644
index 000000000..516d11c2f
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_rotate_copy.cpp
@@ -0,0 +1,43 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // map rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>((rand() / static_cast<double>(RAND_MAX)) * 25.0);
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // source holds PERF_N values from rand_int(); dest receives the rotation
+    std::vector<int> source(PERF_N);
+    std::generate(source.begin(), source.end(), rand_int);
+    std::vector<int> dest(PERF_N);
+
+    perf_timer timer;
+    for(size_t i = 0; i < PERF_TRIALS; ++i){
+        timer.start();
+        std::rotate_copy(source.begin(), source.begin() + (PERF_N / 2),
+                         source.end(), dest.begin());
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_saxpy.cpp b/src/boost/libs/compute/perf/perf_stl_saxpy.cpp
new file mode 100644
index 000000000..8ab33535c
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_saxpy.cpp
@@ -0,0 +1,52 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "perf.hpp"
+
+float rand_float()
+{   // single-precision ratio rand() / RAND_MAX, scaled by 1000
+    return 1000.f * (static_cast<float>(rand()) / static_cast<float>(RAND_MAX));
+}
+
+// y <- alpha * x + y
+void serial_saxpy(size_t n, float alpha, const float *x, float *y)
+{   // in-place update over the first n elements: y[k] = alpha * x[k] + y[k]
+    for(size_t k = 0; k != n; ++k){
+        y[k] += alpha * x[k];
+    }
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    const float alpha = 2.5f; // scale factor applied to x in every trial
+
+    // both operands are filled from rand_float(), x first and then y
+    std::vector<float> x(PERF_N);
+    std::vector<float> y(PERF_N);
+    std::generate(x.begin(), x.end(), rand_float);
+    std::generate(y.begin(), y.end(), rand_float);
+    perf_timer timer;
+    for(size_t i = 0; i < PERF_TRIALS; ++i){
+        timer.start();
+        serial_saxpy(PERF_N, alpha, &x[0], &y[0]); // y accumulates across trials
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_search.cpp b/src/boost/libs/compute/perf/perf_stl_search.cpp
new file mode 100644
index 000000000..8166d35e6
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_search.cpp
@@ -0,0 +1,44 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // map rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>((rand() / static_cast<double>(RAND_MAX)) * 25.0);
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // haystack: PERF_N values drawn from rand_int()
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+
+    // needle; its end is derived from the array so the two cannot drift apart
+    const int pattern[] = {2, 6, 6, 7, 8, 4};
+    const int *pattern_end = pattern + sizeof(pattern) / sizeof(pattern[0]);
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        std::search(host_vector.begin(), host_vector.end(), pattern, pattern_end);
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_search_n.cpp b/src/boost/libs/compute/perf/perf_stl_search_n.cpp
new file mode 100644
index 000000000..76a6bb071
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_search_n.cpp
@@ -0,0 +1,41 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // map rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>((rand() / static_cast<double>(RAND_MAX)) * 25.0);
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // haystack: PERF_N values drawn from rand_int()
+    std::vector<int> haystack(PERF_N);
+    std::generate(haystack.begin(), haystack.end(), rand_int);
+
+    perf_timer timer; // each trial looks for a run of five consecutive 2s
+    for(size_t i = 0; i < PERF_TRIALS; ++i){
+        timer.start();
+        std::search_n(haystack.begin(), haystack.end(), 5, 2);
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_set_difference.cpp b/src/boost/libs/compute/perf/perf_stl_set_difference.cpp
new file mode 100644
index 000000000..c5d0802db
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_set_difference.cpp
@@ -0,0 +1,54 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <vector>
+#include <algorithm>
+#include <iostream>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // map rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>((rand() / static_cast<double>(RAND_MAX)) * 25.0);
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // split PERF_N across the two inputs using integer arithmetic: v1 gets the
+    // rounded-down half and v2 the rest, so <cmath> floor/ceil are not needed
+    std::vector<int> v1(PERF_N / 2);
+    std::vector<int> v2(PERF_N - PERF_N / 2);
+    std::generate(v1.begin(), v1.end(), rand_int);
+    std::generate(v2.begin(), v2.end(), rand_int);
+
+    // both inputs are sorted before being handed to the set algorithm
+    std::sort(v1.begin(), v1.end());
+    std::sort(v2.begin(), v2.end());
+
+    std::vector<int> v3(PERF_N);
+    std::vector<int>::iterator v3_end = v3.begin(); // valid even with zero trials
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        v3_end = std::set_difference(v1.begin(), v1.end(),
+                                     v2.begin(), v2.end(),
+                                     v3.begin());
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    // the second "size" line is the number of elements written to v3
+    std::cout << "size: " << std::distance(v3.begin(), v3_end) << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_set_intersection.cpp b/src/boost/libs/compute/perf/perf_stl_set_intersection.cpp
new file mode 100644
index 000000000..6aa3493b9
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_set_intersection.cpp
@@ -0,0 +1,54 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <vector>
+#include <algorithm>
+#include <iostream>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // map rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>((rand() / static_cast<double>(RAND_MAX)) * 25.0);
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // split PERF_N across the two inputs using integer arithmetic: v1 gets the
+    // rounded-down half and v2 the rest, so <cmath> floor/ceil are not needed
+    std::vector<int> v1(PERF_N / 2);
+    std::vector<int> v2(PERF_N - PERF_N / 2);
+    std::generate(v1.begin(), v1.end(), rand_int);
+    std::generate(v2.begin(), v2.end(), rand_int);
+
+    // both inputs are sorted before being handed to the set algorithm
+    std::sort(v1.begin(), v1.end());
+    std::sort(v2.begin(), v2.end());
+
+    std::vector<int> v3(PERF_N);
+    std::vector<int>::iterator v3_end = v3.begin(); // valid even with zero trials
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        v3_end = std::set_intersection(v1.begin(), v1.end(),
+                                       v2.begin(), v2.end(),
+                                       v3.begin());
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    // the second "size" line is the number of elements written to v3
+    std::cout << "size: " << std::distance(v3.begin(), v3_end) << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_set_symmetric_difference.cpp b/src/boost/libs/compute/perf/perf_stl_set_symmetric_difference.cpp
new file mode 100644
index 000000000..c22c70a7e
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_set_symmetric_difference.cpp
@@ -0,0 +1,54 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <vector>
+#include <algorithm>
+#include <iostream>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // map rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>((rand() / static_cast<double>(RAND_MAX)) * 25.0);
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // split PERF_N across the two inputs using integer arithmetic: v1 gets the
+    // rounded-down half and v2 the rest, so <cmath> floor/ceil are not needed
+    std::vector<int> v1(PERF_N / 2);
+    std::vector<int> v2(PERF_N - PERF_N / 2);
+    std::generate(v1.begin(), v1.end(), rand_int);
+    std::generate(v2.begin(), v2.end(), rand_int);
+
+    // both inputs are sorted before being handed to the set algorithm
+    std::sort(v1.begin(), v1.end());
+    std::sort(v2.begin(), v2.end());
+
+    std::vector<int> v3(PERF_N);
+    std::vector<int>::iterator v3_end = v3.begin(); // valid even with zero trials
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        v3_end = std::set_symmetric_difference(v1.begin(), v1.end(),
+                                               v2.begin(), v2.end(),
+                                               v3.begin());
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    // the second "size" line is the number of elements written to v3
+    std::cout << "size: " << std::distance(v3.begin(), v3_end) << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_set_union.cpp b/src/boost/libs/compute/perf/perf_stl_set_union.cpp
new file mode 100644
index 000000000..9d4fe3a85
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_set_union.cpp
@@ -0,0 +1,54 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <vector>
+#include <algorithm>
+#include <iostream>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // map rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>((rand() / static_cast<double>(RAND_MAX)) * 25.0);
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // split PERF_N across the two inputs using integer arithmetic: v1 gets the
+    // rounded-down half and v2 the rest, so <cmath> floor/ceil are not needed
+    std::vector<int> v1(PERF_N / 2);
+    std::vector<int> v2(PERF_N - PERF_N / 2);
+    std::generate(v1.begin(), v1.end(), rand_int);
+    std::generate(v2.begin(), v2.end(), rand_int);
+
+    // both inputs are sorted before being handed to the set algorithm
+    std::sort(v1.begin(), v1.end());
+    std::sort(v2.begin(), v2.end());
+
+    std::vector<int> v3(PERF_N);
+    std::vector<int>::iterator v3_end = v3.begin(); // valid even with zero trials
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        v3_end = std::set_union(v1.begin(), v1.end(),
+                                v2.begin(), v2.end(),
+                                v3.begin());
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    // the second "size" line is the number of elements written to v3
+    std::cout << "size: " << std::distance(v3.begin(), v3_end) << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_sort.cpp b/src/boost/libs/compute/perf/perf_stl_sort.cpp
new file mode 100644
index 000000000..c9d4294b5
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_sort.cpp
@@ -0,0 +1,33 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    std::vector<int> values;
+    perf_timer timer;
+    for(size_t i = 0; i < PERF_TRIALS; ++i){
+        values = generate_random_vector<int>(PERF_N); // reassigned before timing
+        timer.start();
+        std::sort(values.begin(), values.end());
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_stable_partition.cpp b/src/boost/libs/compute/perf/perf_stl_stable_partition.cpp
new file mode 100644
index 000000000..ee4993cc9
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_stable_partition.cpp
@@ -0,0 +1,47 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // map rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>((rand() / static_cast<double>(RAND_MAX)) * 25.0);
+}
+
+bool less_than_10(int value)
+{   // predicate handed to std::stable_partition: true for values below 10
+    return !(value >= 10);
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    std::vector<int> host_vector(PERF_N);
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        // refill with random numbers outside the timed region: the previous
+        // trial left the vector partitioned, which would skew min_time()
+        std::generate(host_vector.begin(), host_vector.end(), rand_int);
+        t.start();
+        std::stable_partition(host_vector.begin(), host_vector.end(), less_than_10);
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_unique.cpp b/src/boost/libs/compute/perf/perf_stl_unique.cpp
new file mode 100644
index 000000000..f4f97b4b4
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_unique.cpp
@@ -0,0 +1,41 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // map rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>((rand() / static_cast<double>(RAND_MAX)) * 25.0);
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // host buffer of PERF_N ints; refilled from rand_int() before every trial
+    std::vector<int> data(PERF_N);
+
+    perf_timer timer;
+    for(size_t i = 0; i < PERF_TRIALS; ++i){
+        std::generate(data.begin(), data.end(), rand_int);
+        timer.start();
+        std::unique(data.begin(), data.end());
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_stl_unique_copy.cpp b/src/boost/libs/compute/perf/perf_stl_unique_copy.cpp
new file mode 100644
index 000000000..77705fa27
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_stl_unique_copy.cpp
@@ -0,0 +1,44 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // map rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>((rand() / static_cast<double>(RAND_MAX)) * 25.0);
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // source is refilled from rand_int() before every trial;
+    // dest has room for all PERF_N elements
+    std::vector<int> source(PERF_N);
+    std::vector<int> dest(PERF_N);
+
+    perf_timer timer;
+    for(size_t i = 0; i < PERF_TRIALS; ++i){
+        std::generate(source.begin(), source.end(), rand_int);
+        timer.start();
+        std::unique_copy(source.begin(), source.end(), dest.begin());
+        timer.stop();
+    }
+    // min_time() is divided by 1e6 and printed with an "ms" label
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_tbb_accumulate.cpp b/src/boost/libs/compute/perf/perf_tbb_accumulate.cpp
new file mode 100644
index 000000000..319ad8af4
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_tbb_accumulate.cpp
@@ -0,0 +1,75 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_reduce.h>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // map rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>((rand() / static_cast<double>(RAND_MAX)) * 25.0);
+}
+
+template<class T>
+struct Sum { // reduction body passed to tbb::parallel_reduce by ParallelSum
+    T value; // running total of every sub-range this body has processed
+    Sum() : value(0) {}
+    Sum( Sum&, tbb::split ) : value(0) {} // a split-off body starts from zero
+    void operator()( const tbb::blocked_range<T*>& range ) {
+        T partial = value; // accumulate locally, store once at the end
+        for( T* it = range.begin(), *last = range.end(); it != last; ++it ) {
+            partial += *it;
+        }
+        value = partial;
+    }
+    void join( Sum& other ) { value += other.value; }
+};
+
+template<class T>
+T ParallelSum( T array[], size_t n ) {
+    // reduce [array, array + n) with a Sum<T> body and hand back its total
+    const tbb::blocked_range<T*> whole( array, array + n );
+    Sum<T> body;
+    tbb::parallel_reduce( whole, body );
+    return body.value;
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // host-side input: PERF_N values drawn from rand_int()
+    std::vector<int> data(PERF_N);
+    std::generate(data.begin(), data.end(), rand_int);
+
+    int sum = 0;
+    perf_timer timer;
+    for(size_t i = 0; i < PERF_TRIALS; ++i){
+        timer.start();
+        sum = ParallelSum<int>(&data[0], data.size());
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "sum: " << sum << std::endl;
+
+    // cross-check the parallel result against a serial std::accumulate
+    const int expected = std::accumulate(data.begin(), data.end(), int(0));
+    if(sum == expected){
+        return 0;
+    }
+    std::cerr << "ERROR: sum (" << sum << ") != (" << expected << ")" << std::endl;
+    return -1;
+}
diff --git a/src/boost/libs/compute/perf/perf_tbb_merge.cpp b/src/boost/libs/compute/perf/perf_tbb_merge.cpp
new file mode 100644
index 000000000..a7aa814e0
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_tbb_merge.cpp
@@ -0,0 +1,95 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <tbb/parallel_for.h>
+
+#include "perf.hpp"
+
+// example from: http://www.threadingbuildingblocks.org/docs/help/reference/algorithms/parallel_for_func.htm
+using namespace tbb;
+
+template<typename Iterator>
+struct ParallelMergeRange { // splittable range type passed to parallel_for
+    static size_t grainsize; // divisible only while the shorter input exceeds this
+    Iterator begin1, end1; // [begin1,end1) is 1st sequence to be merged
+    Iterator begin2, end2; // [begin2,end2) is 2nd sequence to be merged
+    Iterator out;               // where to put merged sequence
+    bool empty()   const {return (end1-begin1)+(end2-begin2)==0;}
+    bool is_divisible() const {
+        // lengths of valid ranges are non-negative; the explicit cast avoids
+        // comparing a signed iterator difference with the unsigned grainsize
+        return static_cast<size_t>( (std::min)( end1-begin1, end2-begin2 ) ) > grainsize;
+    }
+    ParallelMergeRange( ParallelMergeRange& r, split ) {
+        if( r.end1-r.begin1 < r.end2-r.begin2 ) { // make sequence 1 the longer one
+            std::swap(r.begin1,r.begin2);
+            std::swap(r.end1,r.end2);
+        }
+        Iterator m1 = r.begin1 + (r.end1-r.begin1)/2; // midpoint of sequence 1
+        Iterator m2 = std::lower_bound( r.begin2, r.end2, *m1 ); // matching cut in sequence 2
+        begin1 = m1; // the new range takes the upper parts of both sequences ...
+        begin2 = m2;
+        end1 = r.end1;
+        end2 = r.end2;
+        out = r.out + (m1-r.begin1) + (m2-r.begin2); // ... written after r's share of the output
+        r.end1 = m1; // r keeps the lower parts
+        r.end2 = m2;
+    }
+    ParallelMergeRange( Iterator begin1_, Iterator end1_,
+                        Iterator begin2_, Iterator end2_, Iterator out_ ) :
+        begin1(begin1_), end1(end1_), begin2(begin2_), end2(end2_), out(out_)
+    {}
+};
+
+template<typename Iterator>
+size_t ParallelMergeRange<Iterator>::grainsize = 1000;
+
+template<typename Iterator>
+struct ParallelMergeBody { // per-range work: a serial std::merge into chunk.out
+    void operator()( ParallelMergeRange<Iterator>& chunk ) const {
+        std::merge( chunk.begin1, chunk.end1, chunk.begin2, chunk.end2, chunk.out );
+    }
+};
+
+template<typename Iterator>
+void ParallelMerge( Iterator begin1, Iterator end1, Iterator begin2, Iterator end2, Iterator out ) {
+    // merge [begin1,end1) and [begin2,end2) into out via parallel_for,
+    // using ParallelMergeRange as the range and simple_partitioner to split it
+    const ParallelMergeRange<Iterator> whole( begin1, end1, begin2, end2, out );
+    ParallelMergeBody<Iterator> body;
+    parallel_for( whole, body, simple_partitioner() );
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    std::vector<int> left = generate_random_vector<int>(PERF_N / 2);
+    std::vector<int> right = generate_random_vector<int>(PERF_N / 2);
+    std::vector<int> merged(PERF_N);
+
+    std::sort(left.begin(), left.end()); // both inputs are sorted before merging
+    std::sort(right.begin(), right.end());
+
+    perf_timer timer;
+    for(size_t i = 0; i < PERF_TRIALS; ++i){
+        timer.start();
+        ParallelMerge(left.begin(), left.end(), right.begin(), right.end(), merged.begin());
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_tbb_sort.cpp b/src/boost/libs/compute/perf/perf_tbb_sort.cpp
new file mode 100644
index 000000000..2f79b5b28
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_tbb_sort.cpp
@@ -0,0 +1,35 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <vector>
+
+#include <tbb/parallel_sort.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    std::vector<int> values(PERF_N);
+
+    perf_timer timer;
+    for(size_t i = 0; i < PERF_TRIALS; ++i){
+        values = generate_random_vector<int>(PERF_N); // reassigned before timing
+        timer.start();
+        tbb::parallel_sort(values.begin(), values.end());
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_thrust_accumulate.cu b/src/boost/libs/compute/perf/perf_thrust_accumulate.cu
new file mode 100644
index 000000000..76b723212
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_thrust_accumulate.cu
@@ -0,0 +1,45 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/host_vector.h>
+#include <thrust/reduce.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    thrust::host_vector<int> host_input = generate_random_vector<int>(PERF_N);
+
+    // copy the host data into a device vector
+    thrust::device_vector<int> device_input = host_input;
+
+    int sum = 0;
+    perf_timer timer;
+    for(size_t i = 0; i < PERF_TRIALS; ++i){
+        timer.start();
+        sum = thrust::reduce(device_input.begin(), device_input.end());
+        cudaDeviceSynchronize(); // synchronize before stopping the timer
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "sum: " << sum << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_thrust_count.cu b/src/boost/libs/compute/perf/perf_thrust_count.cu
new file mode 100644
index 000000000..d69df9018
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_thrust_count.cu
@@ -0,0 +1,49 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <thrust/count.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+#include "perf.hpp"
+
+int rand_int()
+{   // map rand() from [0, RAND_MAX] onto [0, 25], truncating toward zero
+    return static_cast<int>((rand() / static_cast<double>(RAND_MAX)) * 25.0);
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // fill a host vector from rand_int(), then copy it into a device vector
+    thrust::host_vector<int> host_input(PERF_N);
+    thrust::generate(host_input.begin(), host_input.end(), rand_int);
+
+    thrust::device_vector<int> device_input = host_input;
+
+    size_t matches = 0;
+    perf_timer timer;
+    for(size_t i = 0; i < PERF_TRIALS; ++i){
+        timer.start();
+        matches = thrust::count(device_input.begin(), device_input.end(), 4);
+        cudaDeviceSynchronize(); // synchronize before stopping the timer
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "count: " << matches << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_thrust_exclusive_scan.cu b/src/boost/libs/compute/perf/perf_thrust_exclusive_scan.cu
new file mode 100644
index 000000000..df1367a27
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_thrust_exclusive_scan.cu
@@ -0,0 +1,48 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Benoit
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/host_vector.h>
+#include <thrust/scan.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    thrust::host_vector<int> host_data = generate_random_vector<int>(PERF_N);
+
+    // copy the host data into a device vector
+    thrust::device_vector<int> device_data = host_data;
+
+    perf_timer timer;
+    for(size_t i = 0; i < PERF_TRIALS; ++i){
+        // the scan writes over its own input, so restore it before timing
+        device_data = host_data;
+        timer.start();
+        thrust::exclusive_scan(device_data.begin(), device_data.end(), device_data.begin());
+        cudaDeviceSynchronize();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    // copy the scanned values back into the host vector
+    thrust::copy(device_data.begin(), device_data.end(), host_data.begin());
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_thrust_find.cu b/src/boost/libs/compute/perf/perf_thrust_find.cu
new file mode 100644
index 000000000..e14826046
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_thrust_find.cu
@@ -0,0 +1,65 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <thrust/find.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+#include "perf.hpp"
+
+// Inclusive upper bound of the values produced by rand_int().
+int rand_int_max = 25;
+
+int rand_int()
+{
+    return static_cast<int>(rand_int_max * (rand() / double(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // host-side input: PERF_N values drawn from rand_int()
+    thrust::host_vector<int> host_data(PERF_N);
+    thrust::generate(host_data.begin(), host_data.end(), rand_int);
+
+    thrust::device_vector<int> haystack(host_data);
+
+    // rand_int() never exceeds rand_int_max, so this value is absent and
+    // find() cannot stop early
+    const int needle = rand_int_max + 1;
+
+    thrust::device_vector<int>::iterator hit;
+
+    perf_timer timer;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        timer.start();
+        hit = thrust::find(haystack.begin(), haystack.end(), needle);
+        cudaDeviceSynchronize();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    // anything other than end() means find() reported an element that is not there
+    if(hit != haystack.end()){
+        std::cout << "ERROR: "
+                  << "device_result_iterator != "
+                  << "v.end()"
+                  << std::endl;
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_thrust_inner_product.cu b/src/boost/libs/compute/perf/perf_thrust_inner_product.cu
new file mode 100644
index 000000000..6d01fc53a
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_thrust_inner_product.cu
@@ -0,0 +1,49 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <iterator>
+#include <algorithm>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/inner_product.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    // fill x first, then y, so the rand() sequence is consumed in the same order
+    thrust::host_vector<int> lhs_host(PERF_N);
+    std::generate(lhs_host.begin(), lhs_host.end(), rand);
+    thrust::host_vector<int> rhs_host(PERF_N);
+    std::generate(rhs_host.begin(), rhs_host.end(), rand);
+
+    // upload both operands
+    thrust::device_vector<int> lhs(lhs_host);
+    thrust::device_vector<int> rhs(rhs_host);
+
+    perf_timer timer;
+    int dot = 0;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        timer.start();
+        dot = thrust::inner_product(lhs.begin(), lhs.end(),
+                                    rhs.begin(), 0);
+        cudaDeviceSynchronize();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "product: " << dot << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_thrust_merge.cu b/src/boost/libs/compute/perf/perf_thrust_merge.cu
new file mode 100644
index 000000000..f269c939e
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_thrust_merge.cu
@@ -0,0 +1,63 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <iterator>
+#include <algorithm>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/merge.h>
+#include <thrust/sort.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    // integer floor/ceil halves of PERF_N; std::floor/std::ceil would need <cmath>
+    thrust::host_vector<int> v1(PERF_N / 2);
+    thrust::host_vector<int> v2(PERF_N - PERF_N / 2);
+    std::generate(v1.begin(), v1.end(), rand);
+    std::generate(v2.begin(), v2.end(), rand);
+    std::sort(v1.begin(), v1.end());
+    std::sort(v2.begin(), v2.end());
+    // upload the two sorted inputs; gpu_v3 receives all PERF_N merged values
+    thrust::device_vector<int> gpu_v1 = v1;
+    thrust::device_vector<int> gpu_v2 = v2;
+    thrust::device_vector<int> gpu_v3(PERF_N);
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        thrust::merge(
+            gpu_v1.begin(), gpu_v1.end(),
+            gpu_v2.begin(), gpu_v2.end(),
+            gpu_v3.begin()
+        );
+        cudaDeviceSynchronize();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    // verify the device result against std::merge run on the same host inputs
+    thrust::host_vector<int> check_v3 = gpu_v3;
+    thrust::host_vector<int> v3(PERF_N);
+    std::merge(v1.begin(), v1.end(), v2.begin(), v2.end(), v3.begin());
+    bool ok = std::equal(check_v3.begin(), check_v3.end(), v3.begin());
+    if(!ok){
+        std::cerr << "ERROR: merged ranges different" << std::endl;
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_thrust_partial_sum.cu b/src/boost/libs/compute/perf/perf_thrust_partial_sum.cu
new file mode 100644
index 000000000..e30e80b5b
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_thrust_partial_sum.cu
@@ -0,0 +1,48 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/host_vector.h>
+#include <thrust/scan.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    thrust::host_vector<int> host_values = generate_random_vector<int>(PERF_N);
+
+    // device buffer that the scan overwrites in place
+    thrust::device_vector<int> device_values(host_values);
+
+    perf_timer stopwatch;
+    for(size_t pass = 0; pass < PERF_TRIALS; ++pass){
+        device_values = host_values; // fresh input for every pass, assigned before timing
+
+        stopwatch.start();
+        thrust::inclusive_scan(device_values.begin(), device_values.end(), device_values.begin());
+        cudaDeviceSynchronize();
+        stopwatch.stop();
+    }
+    std::cout << "time: " << stopwatch.min_time() / 1e6 << " ms" << std::endl;
+
+    // copy the prefix sums over the host vector
+    thrust::copy(device_values.begin(), device_values.end(), host_values.begin());
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_thrust_partition.cu b/src/boost/libs/compute/perf/perf_thrust_partition.cu
new file mode 100644
index 000000000..5c89014c1
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_thrust_partition.cu
@@ -0,0 +1,60 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/host_vector.h>
+#include <thrust/partition.h>
+
+#include "perf.hpp"
+
+int rand_int() // pseudo-random value in [0, 25]
+{
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+struct less_than_ten : public thrust::unary_function<int, bool> // <Argument, Result>
+{
+    __device__ bool operator()(int x) const // predicate: true when x is below 10
+    {
+        return x < 10;
+    }
+};
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    thrust::host_vector<int> source(PERF_N);
+    std::generate(source.begin(), source.end(), rand_int);
+
+    thrust::device_vector<int> work(PERF_N);
+    less_than_ten below_ten;
+
+    perf_timer timer;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        // restore the unpartitioned input before the timed section
+        work = source;
+
+        timer.start();
+        thrust::partition(work.begin(), work.end(), below_ten);
+        cudaDeviceSynchronize();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_thrust_reduce_by_key.cu b/src/boost/libs/compute/perf/perf_thrust_reduce_by_key.cu
new file mode 100644
index 000000000..a445c1375
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_thrust_reduce_by_key.cu
@@ -0,0 +1,92 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/host_vector.h>
+#include <thrust/reduce.h>
+
+#include "perf.hpp"
+
+int rand_int() // random value in [0, 25], used for the reduced values
+{
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+struct unique_key {
+  int current;            // key returned by the most recent call
+  int avgValuesNoPerKey;  // a new key starts with probability 1/avgValuesNoPerKey
+
+  unique_key()
+      : current(0), avgValuesNoPerKey(512)
+  {
+  }
+
+  int operator()()
+  {
+      // yields a non-decreasing key sequence that grows in steps of one
+      const double advance_probability = 1.0 / avgValuesNoPerKey;
+      const double draw = rand() / double(RAND_MAX);
+      if(draw <= advance_probability) ++current;
+      return current;
+  }
+} UniqueKey;
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    // create vector of keys and random values
+    thrust::host_vector<int> host_keys(PERF_N);
+    thrust::host_vector<int> host_values(PERF_N);
+    std::generate(host_keys.begin(), host_keys.end(), UniqueKey);
+    std::generate(host_values.begin(), host_values.end(), rand_int);
+
+    // transfer data to the device
+    thrust::device_vector<int> device_keys = host_keys;
+    thrust::device_vector<int> device_values = host_values;
+
+    // create device vectors for the results
+    thrust::device_vector<int> device_keys_results(PERF_N);
+    thrust::device_vector<int> device_values_results(PERF_N);
+
+    typedef thrust::device_vector<int>::iterator iterType;
+    thrust::pair<iterType, iterType> result;
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        result = thrust::reduce_by_key(device_keys.begin(),
+                                       device_keys.end(),
+                                       device_values.begin(),
+                                       device_keys_results.begin(),
+                                       device_values_results.begin());
+        cudaDeviceSynchronize();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    // keys are consecutive integers from host_keys[0] to host_keys[PERF_N-1]; the
+    // first key is 1 rather than 0 when the generator advances on its first call
+    size_t result_size = thrust::distance(device_keys_results.begin(), result.first);
+    if(result_size != static_cast<size_t>(host_keys[PERF_N-1] - host_keys[0] + 1)){
+        std::cout << "ERROR: "
+                  << "wrong number of keys"
+                  << std::endl;
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_thrust_reverse.cu b/src/boost/libs/compute/perf/perf_thrust_reverse.cu
new file mode 100644
index 000000000..1927ca6a5
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_thrust_reverse.cu
@@ -0,0 +1,48 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/host_vector.h>
+#include <thrust/reverse.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    thrust::host_vector<int> original = generate_random_vector<int>(PERF_N);
+
+    // starts out empty; every trial assigns the host data to it
+    thrust::device_vector<int> reversed;
+
+    perf_timer timer;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        reversed = original;
+
+        timer.start();
+        thrust::reverse(reversed.begin(), reversed.end());
+        cudaDeviceSynchronize();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    // copy the reversed sequence over the host vector
+    thrust::copy(reversed.begin(), reversed.end(), original.begin());
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_thrust_reverse_copy.cu b/src/boost/libs/compute/perf/perf_thrust_reverse_copy.cu
new file mode 100644
index 000000000..af1a044d1
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_thrust_reverse_copy.cu
@@ -0,0 +1,47 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/host_vector.h>
+#include <thrust/reverse.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    thrust::host_vector<int> host_input = generate_random_vector<int>(PERF_N);
+
+    // upload the input once; the loop below never writes to it, so it is
+    // not refreshed between trials
+    thrust::device_vector<int> source(host_input);
+
+    // destination for the reversed sequence
+    thrust::device_vector<int> reversed(PERF_N);
+
+    perf_timer timer;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        timer.start();
+        thrust::reverse_copy(source.begin(), source.end(), reversed.begin());
+        cudaDeviceSynchronize();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_thrust_rotate.cu b/src/boost/libs/compute/perf/perf_thrust_rotate.cu
new file mode 100644
index 000000000..108bb99b5
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_thrust_rotate.cu
@@ -0,0 +1,51 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    thrust::host_vector<int> h_vec = generate_random_vector<int>(PERF_N);
+
+    // device copy of the input; it is (re)filled at the start of every trial
+    thrust::device_vector<int> d_vec;
+
+    size_t rotate_distance = PERF_N / 2;
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        d_vec = h_vec;
+        // no thrust::rotate(): emulate a left rotation with copies (tmp's allocation is timed too)
+        t.start();
+        thrust::device_vector<int> tmp(d_vec.begin(), d_vec.begin() + rotate_distance);
+        thrust::copy(d_vec.begin() + rotate_distance, d_vec.end(), d_vec.begin());
+        // the saved head goes to the tail; end() - distance is also right when PERF_N is odd
+        thrust::copy(tmp.begin(), tmp.end(), d_vec.end() - rotate_distance);
+        cudaDeviceSynchronize();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    // transfer data back to host
+    thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_thrust_saxpy.cu b/src/boost/libs/compute/perf/perf_thrust_saxpy.cu
new file mode 100644
index 000000000..aa35a1918
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_thrust_saxpy.cu
@@ -0,0 +1,63 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <iterator>
+#include <algorithm>
+
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/host_vector.h>
+#include <thrust/transform.h>
+
+#include "perf.hpp"
+
+struct saxpy_functor : public thrust::binary_function<float,float,float>
+{ // Binary functor computing a * x + y with a scale factor fixed at construction.
+    const float a;
+    // the constructor stores _a as the scale factor
+    saxpy_functor(float _a) : a(_a) {}
+    // callable from both host and device code
+    __host__ __device__
+    float operator()(const float& x, const float& y) const
+    {
+        return a * x + y;
+    }
+};
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    // float storage matches saxpy_functor's float operands and result
+    thrust::host_vector<float> host_x(PERF_N);
+    thrust::host_vector<float> host_y(PERF_N);
+    std::generate(host_x.begin(), host_x.end(), rand);
+    std::generate(host_y.begin(), host_y.end(), rand);
+    // transfer data to the device
+    thrust::device_vector<float> device_x = host_x;
+    thrust::device_vector<float> device_y = host_y;
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        thrust::transform(device_x.begin(), device_x.end(), device_y.begin(), device_y.begin(), saxpy_functor(2.5f));
+        cudaDeviceSynchronize();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    // transfer data back to host
+    thrust::copy(device_x.begin(), device_x.end(), host_x.begin());
+    thrust::copy(device_y.begin(), device_y.end(), host_y.begin());
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_thrust_set_difference.cu b/src/boost/libs/compute/perf/perf_thrust_set_difference.cu
new file mode 100644
index 000000000..3465f2148
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_thrust_set_difference.cu
@@ -0,0 +1,61 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <iterator>
+#include <algorithm>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/set_operations.h>
+#include <thrust/sort.h>
+
+#include "perf.hpp"
+
+int rand_int() // values in [0, 25]
+{
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    // integer floor/ceil halves of PERF_N; std::floor/std::ceil would need <cmath>
+    thrust::host_vector<int> v1(PERF_N / 2);
+    thrust::host_vector<int> v2(PERF_N - PERF_N / 2);
+    std::generate(v1.begin(), v1.end(), rand_int);
+    std::generate(v2.begin(), v2.end(), rand_int);
+    std::sort(v1.begin(), v1.end());
+    std::sort(v2.begin(), v2.end());
+    // upload the sorted inputs; gpu_v3 is sized for PERF_N output elements
+    thrust::device_vector<int> gpu_v1 = v1;
+    thrust::device_vector<int> gpu_v2 = v2;
+    thrust::device_vector<int> gpu_v3(PERF_N);
+
+    thrust::device_vector<int>::iterator gpu_v3_end;
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        t.start();
+        gpu_v3_end = thrust::set_difference(
+            gpu_v1.begin(), gpu_v1.end(),
+            gpu_v2.begin(), gpu_v2.end(),
+            gpu_v3.begin()
+        );
+        cudaDeviceSynchronize();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "size: " << thrust::distance(gpu_v3.begin(), gpu_v3_end) << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_thrust_sort.cu b/src/boost/libs/compute/perf/perf_thrust_sort.cu
new file mode 100644
index 000000000..b2d909396
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_thrust_sort.cu
@@ -0,0 +1,48 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/host_vector.h>
+#include <thrust/sort.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    thrust::host_vector<int> unsorted = generate_random_vector<int>(PERF_N);
+
+    // empty until the first trial assigns the host data to it
+    thrust::device_vector<int> keys;
+
+    perf_timer timer;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        keys = unsorted; // every trial sorts the same unsorted input
+
+        timer.start();
+        thrust::sort(keys.begin(), keys.end());
+        cudaDeviceSynchronize();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    // copy the sorted keys over the host vector
+    thrust::copy(keys.begin(), keys.end(), unsorted.begin());
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_thrust_unique.cu b/src/boost/libs/compute/perf/perf_thrust_unique.cu
new file mode 100644
index 000000000..6030f2914
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_thrust_unique.cu
@@ -0,0 +1,50 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/host_vector.h>
+#include <thrust/unique.h>
+
+#include "perf.hpp"
+
+int rand_int() // small value range [0, 25], so the input holds many repeats
+{
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    thrust::host_vector<int> source(PERF_N);
+    std::generate(source.begin(), source.end(), rand_int);
+
+    thrust::device_vector<int> work(PERF_N);
+
+    perf_timer timer;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        work = source; // unique() rewrites the range, so reload it before timing
+
+        timer.start();
+        thrust::unique(work.begin(), work.end());
+        cudaDeviceSynchronize();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_uniform_int_distribution.cpp b/src/boost/libs/compute/perf/perf_uniform_int_distribution.cpp
new file mode 100644
index 000000000..57973a817
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_uniform_int_distribution.cpp
@@ -0,0 +1,46 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/random/default_random_engine.hpp>
+#include <boost/compute/random/uniform_int_distribution.hpp>
+
+#include "perf.hpp"
+
+namespace compute = boost::compute;
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    compute::device dev = compute::system::default_device();
+    compute::context ctx(dev);
+    compute::command_queue cmd_queue(ctx, dev);
+
+    compute::vector<compute::uint_> samples(PERF_N, ctx);
+
+    compute::uniform_int_distribution<compute::uint_> zero_or_one(0, 1);
+    compute::default_random_engine engine(cmd_queue);
+    // one timed run only: generate into the device vector, then finish the queue
+    perf_timer timer;
+    timer.start();
+    zero_or_one.generate(samples.begin(), samples.end(), engine, cmd_queue);
+    cmd_queue.finish();
+    timer.stop();
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_unique.cpp b/src/boost/libs/compute/perf/perf_unique.cpp
new file mode 100644
index 000000000..b25801f18
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_unique.cpp
@@ -0,0 +1,60 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/unique.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int() // pseudo-random value in [0, 25]
+{
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // default device plus a context and command queue bound to it
+    boost::compute::device dev = boost::compute::system::default_device();
+    boost::compute::context ctx(dev);
+    boost::compute::command_queue cmd_queue(ctx, dev);
+    std::cout << "device: " << dev.name() << std::endl;
+
+    // host-side input produced by rand_int()
+    std::vector<int> input(PERF_N);
+    std::generate(input.begin(), input.end(), rand_int);
+
+    // device buffer; it is filled from the host at the top of every trial
+    boost::compute::vector<int> work(PERF_N, ctx);
+
+    perf_timer timer;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        boost::compute::copy(input.begin(), input.end(),
+                             work.begin(),
+                             cmd_queue);
+        timer.start();
+        boost::compute::unique(work.begin(),
+                               work.end(),
+                               cmd_queue);
+        cmd_queue.finish();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perf_unique_copy.cpp b/src/boost/libs/compute/perf/perf_unique_copy.cpp
new file mode 100644
index 000000000..d7ff98af7
--- /dev/null
+++ b/src/boost/libs/compute/perf/perf_unique_copy.cpp
@@ -0,0 +1,61 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/unique_copy.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int() // pseudo-random value in [0, 25]
+{
+    return static_cast<int>(25.0 * (rand() / double(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // default device plus a context and command queue bound to it
+    boost::compute::device dev = boost::compute::system::default_device();
+    boost::compute::context ctx(dev);
+    boost::compute::command_queue cmd_queue(ctx, dev);
+    std::cout << "device: " << dev.name() << std::endl;
+
+    // host-side input produced by rand_int()
+    std::vector<int> input(PERF_N);
+    std::generate(input.begin(), input.end(), rand_int);
+
+    // device-side source and destination, both PERF_N elements
+    boost::compute::vector<int> source(PERF_N, ctx);
+    boost::compute::vector<int> deduplicated(PERF_N, ctx);
+
+    perf_timer timer;
+    for(size_t run = 0; run < PERF_TRIALS; ++run){
+        boost::compute::copy(input.begin(), input.end(),
+                             source.begin(),
+                             cmd_queue);
+        timer.start();
+        boost::compute::unique_copy(source.begin(), source.end(),
+                                    deduplicated.begin(),
+                                    cmd_queue);
+        cmd_queue.finish();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/src/boost/libs/compute/perf/perfdoc.py b/src/boost/libs/compute/perf/perfdoc.py
new file mode 100755
index 000000000..e9c60362f
--- /dev/null
+++ b/src/boost/libs/compute/perf/perfdoc.py
@@ -0,0 +1,70 @@
+#!/usr/bin/python
+
+# Copyright (c) 2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+# Distributed under the Boost Software License, Version 1.0
+# See accompanying file LICENSE_1_0.txt or copy at
+# http://www.boost.org/LICENSE_1_0.txt
+#
+# See http://boostorg.github.com/compute for more information.
+
+import os
+import sys
+import pylab
+
+from perf import run_benchmark
+
+fignum = 0
+
+def plot_to_file(report, filename):
+    """Plot each run in `report` on a log-log chart and save it to `filename`.
+
+    Runs missing from `run_to_label` are labelled with their raw name.
+    """
+    global fignum
+    fignum += 1  # draw every report on its own, freshly numbered figure
+    pylab.figure(fignum)
+
+    run_to_label = {
+        "stl" : "C++ STL",
+        "thrust" : "Thrust",
+        "compute" : "Boost.Compute",
+        "bolt" : "Bolt"
+    }
+
+    for run in sorted(report.samples.keys()):
+        points = list(report.samples[run])  # (size, time) pairs, read once
+        x = [point[0] for point in points]
+        y = [point[1] for point in points]
+        pylab.loglog(x, y, marker='o', label=run_to_label.get(run, run))
+
+    pylab.xlabel("Size")
+    pylab.ylabel("Time (ms)")
+    pylab.legend(loc='upper left')
+    pylab.savefig(filename)
+
+if __name__ == '__main__':  # script entry: benchmark each algorithm, save one plot each
+    sizes = [pow(2, x) for x in range(10, 26)]  # input sizes 2**10 .. 2**25
+    algorithms = [
+        "accumulate",
+        "count",
+        "inner_product",
+        "merge",
+        "partial_sum",
+        "partition",
+        "reverse",
+        "rotate",
+        "saxpy",
+        "sort",
+        "unique",
+    ]
+
+    # Tolerate only an existing directory; any real mkdir failure (permissions,
+    # a plain file named perf_plots) now stops the script before benchmarking.
+    if not os.path.isdir("perf_plots"):
+        os.mkdir("perf_plots")
+
+    for algorithm in algorithms:
+        print("running '%s'" % (algorithm))
+        report = run_benchmark(algorithm, sizes, ["stl", "thrust", "bolt"])
+        plot_to_file(report, "perf_plots/%s_time_plot.png" % algorithm)
+
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:45:59 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:45:59 +0000
commit	19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree	42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/boost/libs/compute/perf
parent	Initial commit. (diff)
download	ceph-upstream/16.2.11+ds.tar.xz ceph-upstream/16.2.11+ds.zip