Hi, Am trying to sort a vector on GPU (GeForce GT 750M) using example code which generates SEGV. However, it runs fine on Iris Pro (Intel(R) Core(TM) i7-4850HQ CPU). We observe SEGV when size is 10 million. When the size is 5 million, following exception is thrown: boost/1_61_0/include/boost/compute/command_queue.hpp(453): Throw in function boost::compute::event boost::compute::command_queue::enqueue_write_buffer(const boost::compute::buffer &, size_t, size_t, const void *, const boost::compute::wait_list &) Dynamic exception type: boost::exception_detail::clone_impl<boost::exception_detail::error_info_injector< boost::compute::opencl_error> > std::exception::what: Invalid Value Another observation: if the size of vector is 50 million the sorting works fine, though the timings are worse than Iris Pro. Also, when is size is 100 million, the binary causes the OS to crash. Compiler details: clang++ --version Apple LLVM version 7.3.0 (clang-703.0.31) Target: x86_64-apple-darwin15.5.0 Thread model: posix OS: System Version: OS X 10.11.5 (15F34) Kernel Version: Darwin 15.5.0 Regards, Prashant ----------------------------Cut here------------------------------------- #include <iostream> #include <vector> #include <algorithm> #include <boost/foreach.hpp> #include <boost/compute/core.hpp> #include <boost/compute/platform.hpp> #include <boost/compute/algorithm.hpp> #include <boost/compute/container/vector.hpp> #include <boost/compute/functional/math.hpp> #include <boost/compute/types/builtin.hpp> #include <boost/compute/function.hpp> #include <boost/chrono/include.hpp> #include <boost/exception/all.hpp> namespace compute = boost::compute; int main(int argc, char* argv[]) { if (argc != 2) { std::cout << "Usage: " << argv[0] << " <size> " << std::endl; return 0; } // generate random data on the host std::vector<float> host_vector(atoi(argv[1])); std::generate(host_vector.begin(), host_vector.end(), rand); std::cout << "===============CPU==================\n"; for (size_t k=0; k<5; k++) { std::vector<float> host_copy_vector(host_vector); auto start = std::chrono::high_resolution_clock::now(); std::sort(host_copy_vector.begin(), host_copy_vector.end()); auto duration = std::chrono::duration_cast<std::chrono::milliseconds> (std::chrono::high_resolution_clock::now() - start); std::cout << "time: iteration ("<< k << ") : " << duration.count() << " ms" << std::endl; } std::vector<compute::platform> platforms = compute::system::platforms(); for(size_t i = 0; i < platforms.size(); i++){ const compute::platform &platform = platforms[i]; std::cout << "Platform '" << platform.name() << "'" << std::endl; std::vector<compute::device> devices = platform.devices(); for(size_t j = 0; j < devices.size(); j++){ const compute::device &device = devices[j]; std::string type; if(device.type() & compute::device::gpu) type = "GPU Device"; else if(device.type() & compute::device::cpu) type = "CPU Device"; else if(device.type() & compute::device::accelerator) type = "Accelerator Device"; else type = "Unknown Device"; if (type != "GPU Device") { std::cout << "Ignoring non GPU devices.\n"; continue; } std::cout << "====\n"; std::cout << " " << type << ": " << device.name() << std::endl; std::cout << "====\n"; compute::context context(device); compute::command_queue queue(context, device); for (size_t k=0; k<5; k++) { compute::vector<float> device_vector(host_vector.size(), context); // copy data from the host to the device compute::copy( host_vector.begin(), host_vector.end(), device_vector.begin(), queue ); auto start = std::chrono::high_resolution_clock::now(); try { compute::sort(device_vector.begin(), device_vector.end(), queue); } catch (boost::exception & e) { std::cerr << diagnostic_information(e); break; } auto duration = std::chrono::duration_cast<std::chrono::milliseconds> (std::chrono::high_resolution_clock::now() - start); std::cout << "time: iteration ("<< k << ") : " << duration.count() << " ms" << std::endl; } std::cout << "====\n"; } } return 0; }