#include <cstdio>
#include <cstdlib>

// Abort with a readable message if a CUDA runtime call fails. Without this,
// an earlier failure only shows up as garbage output or as a confusing error
// on a later, unrelated call.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            std::exit(EXIT_FAILURE);                                         \
        }                                                                    \
    } while (0)

// Device function with C linkage. No definition is visible in this file, so
// it has to be provided by another translation unit / device object at link
// time. Its exact semantics are not visible here.
extern "C" __device__ void add_u32(
    ulonglong4 *out_c,
    ulonglong4 in_a,
    ulonglong4 in_b
);

// Status strings living in constant memory; the kernel copies one of them
// into the caller's buffer.
__constant__ char ok[] = "ok";
// NOTE(review): not_ok is never referenced in this file -- confirm whether
// the kernel was meant to select it when the add_u32 result is unexpected.
__constant__ char not_ok[] = "not ok";

// Calls add_u32 once with fixed operands and then writes the "ok" string
// (including its terminating NUL) to buf.
//
// Launch layout: main launches this as <<<1, 1>>>. Every launched thread
// would write the same bytes to the same address.
// Preconditions: buf must point to at least sizeof(ok) bytes of
// device-accessible memory.
__global__ void kernel(char *buf) {
    ulonglong4 a = {0, 1, 2, 3};
    ulonglong4 b = {1, 1, 1, 1};
    ulonglong4 c = {1, 2, 3, 4};

    add_u32(&c, a, b);

    // NOTE(review): c is not inspected after the call, so "ok" is reported
    // unconditionally -- confirm whether a comparison was intended.
    memcpy(buf, ok, sizeof(ok));
}

// Host driver: allocates a small device buffer, runs the kernel once, copies
// the buffer back and prints it as a C string. Exits with a failure status
// if any CUDA call reports an error.
int main() {
    const size_t kBufBytes = 32;

    // Zero-initialised so the buffer is always a valid (empty) C string.
    char h_buf[kBufBytes] = {0};
    char *d_buf = NULL;

    CUDA_CHECK(cudaMalloc(&d_buf, kBufBytes));
    // The kernel only writes sizeof(ok) bytes but the whole buffer is copied
    // back; clear it so no uninitialised device memory is read.
    CUDA_CHECK(cudaMemset(d_buf, 0, kBufBytes));

    kernel<<<1, 1>>>(d_buf);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    CUDA_CHECK(cudaMemcpy(h_buf, d_buf, kBufBytes, cudaMemcpyDeviceToHost));
    // Defensive: guarantee termination regardless of what the device wrote.
    h_buf[kBufBytes - 1] = '\0';

    printf("%s\n", h_buf);

    CUDA_CHECK(cudaFree(d_buf));
    return 0;
}