The solution discussed here worked well for me. This solution uses built-in cuda functions and is very simple to implement.
The relevant code is copied below:
#include <stdio.h>
#include <stdlib.h>
__global__ void foo(int *ptr)
{
*ptr = 7;
}
int main(void)
{
foo<<<1,1>>>(0);
// make the host block until the device is finished with foo
cudaDeviceSynchronize();
// check for error
cudaError_t error = cudaGetLastError();
if(error != cudaSuccess)
{
// print the CUDA error message and exit
printf("CUDA error: %s\n", cudaGetErrorString(error));
exit(-1);
}
return 0;
}