[boost] [compute] [OS X 10.11.5] [GeForce GT 750M] Segmentation fault

23 May 2016

      Hi,
   I am trying to sort a vector on the GPU (GeForce GT 750M) using the example code below, which crashes with a segmentation fault (SEGV).
   However, it runs fine on the Iris Pro (Intel(R) Core(TM) i7-4850HQ CPU).

   We observe the SEGV when the vector size is 10 million.
   When the size is 5 million, the following exception is thrown instead:
boost/1_61_0/include/boost/compute/command_queue.hpp(453): Throw in function
 boost::compute::event boost::compute::command_queue::enqueue_write_buffer(const 
 boost::compute::buffer &, size_t, size_t, const void *, const boost::compute::wait_list
 &) Dynamic exception type: 
 boost::exception_detail::clone_impl<boost::exception_detail::error_info_injector<
   boost::compute::opencl_error> > std::exception::what: Invalid Value

    Another observation: if the size of the vector is 50 million, the sorting works fine,
though the timings are worse than on the Iris Pro.
    Also, when the size is 100 million, the binary causes the OS to crash.

   Compiler details:
    clang++ --version
    Apple LLVM version 7.3.0 (clang-703.0.31)
    Target: x86_64-apple-darwin15.5.0
    Thread model: posix

   OS:
    System Version:	OS X 10.11.5 (15F34)
    Kernel Version:	Darwin 15.5.0
Regards,
Prashant

----------------------------Cut here-------------------------------------
#include <algorithm>
#include <cerrno>
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <limits>
#include <string>
#include <vector>

#include <boost/foreach.hpp>
#include <boost/compute/core.hpp>
#include <boost/compute/platform.hpp>
#include <boost/compute/algorithm.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/functional/math.hpp>
#include <boost/compute/types/builtin.hpp>
#include <boost/compute/function.hpp>
#include <boost/chrono/include.hpp>
#include <boost/exception/all.hpp>

namespace compute = boost::compute;

int main(int argc, char* argv[])
{
    if (argc != 2) {
        std::cout << "Usage: " << argv[0] << " <size> " << std::endl;
        return 0;
    }

    // generate random data on the host
    std::vector<float> host_vector(atoi(argv[1]));
    std::generate(host_vector.begin(), host_vector.end(), rand);

    std::cout << 
"===============CPU==================\n";
    for (size_t k=0; k<5; k++)
    {
        std::vector<float> host_copy_vector(host_vector);
        auto start = std::chrono::high_resolution_clock::now();
        std::sort(host_copy_vector.begin(), host_copy_vector.end());

        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>
(std::chrono::high_resolution_clock::now() - start);
        std::cout << "time: iteration ("<< k << ") : " << duration.count() << " ms" << 
std::endl;
    }

    std::vector<compute::platform> platforms = compute::system::platforms();

    for(size_t i = 0; i < platforms.size(); i++){
        const compute::platform &platform = platforms[i];
        std::cout << "Platform '" << platform.name() << "'" << std::endl;
        std::vector<compute::device> devices = platform.devices();

        for(size_t j = 0; j < devices.size(); j++){
            const compute::device &device = devices[j];

            std::string type;
            if(device.type() & compute::device::gpu)
                type = "GPU Device";
            else if(device.type() & compute::device::cpu)
                type = "CPU Device";
            else if(device.type() & compute::device::accelerator)
                type = "Accelerator Device";
            else
                type = "Unknown Device";

            if (type != "GPU Device") {
                std::cout << "Ignoring non GPU devices.\n";
                continue;
            }

            std::cout << 
"====\n";
            std::cout << "  " << type << ": " << device.name() << std::endl;
            std::cout << 
"====\n";
            compute::context context(device);
            compute::command_queue queue(context, device);

            for (size_t k=0; k<5; k++)
            {
                compute::vector<float> device_vector(host_vector.size(), context);

                // copy data from the host to the device
                compute::copy(
                    host_vector.begin(), host_vector.end(), device_vector.begin(), queue
                );

                auto start = std::chrono::high_resolution_clock::now();
                try {
                  compute::sort(device_vector.begin(), device_vector.end(), queue);
                } catch (boost::exception & e) {
                  std::cerr << diagnostic_information(e);
                  break;
                }

                auto duration = std::chrono::duration_cast<std::chrono::milliseconds>
(std::chrono::high_resolution_clock::now() - start);
                std::cout << "time: iteration ("<< k << ") : " << duration.count() << " ms" <<
std::endl;
            }

            std::cout << 
"====\n";
        }
    }

    return 0;
}