#include #include extern "C" __device__ void add_u16( ulonglong2 *out_c, ulonglong2 in_a, ulonglong2 in_b ); extern "C" __device__ void sub_u16( ulonglong2 *out_c, ulonglong2 in_a, ulonglong2 in_b ); extern "C" __device__ void add_u32( ulonglong4 *out_c, ulonglong4 in_a, ulonglong4 in_b ); extern "C" __device__ void sub_u32( ulonglong4 *out_c, ulonglong4 in_a, ulonglong4 in_b ); __device__ void mul_u16( ulonglong2 *out_c, ulonglong2 in_a, ulonglong2 in_b ) { uint64_t ax_ay = in_a.x + in_a.y; uint64_t bx_by = in_b.x + in_b.y; uint64_t axbx = in_a.x * in_b.x; uint64_t ayby = in_a.y * in_b.y; out_c->x = ax_ay * bx_by - axbx - ayby; out_c->y = ayby; } __device__ void mul_u32( ulonglong4 *out_c, ulonglong4 in_a, ulonglong4 in_b ) { auto ax = (ulonglong2 *)&in_a.x; auto ay = (ulonglong2 *)&in_a.z; auto bx = (ulonglong2 *)&in_b.x; auto by = (ulonglong2 *)&in_b.z; ulonglong2 ax_ay, bx_by, paren, axbx, ayby; add_u16(&ax_ay, *ax, *ay); add_u16(&bx_by, *bx, *by); mul_u16(&paren, ax_ay, bx_by); mul_u16(&axbx, *ax, *bx); mul_u16(&ayby, *ay, *by); sub_u16(&paren, paren, axbx); sub_u16(&paren, paren, ayby); out_c->x = paren.x; out_c->y = paren.y; out_c->z = ayby.x; out_c->w = ayby.y; } __device__ bool equ_u16(ulonglong2 a, ulonglong2 b) { return a.x == b.x && a.y == b.y; } __device__ bool equ_u32(ulonglong4 a, ulonglong4 b) { return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; } __device__ void print_u16(ulonglong2 a) { printf("0x%016llx.%016llx\n", a.x, a.y); } __device__ void print_u32(ulonglong4 a) { printf("0x%016llx.%016llx.%016llx.%016llx\n", a.x, a.y, a.z, a.w); } #define U8_MAX 0xFFFFFFFFFFFFFFFF #define _U16_MAX {U8_MAX, U8_MAX} #define _U32_MAX {U8_MAX, U8_MAX, U8_MAX, U8_MAX} __global__ void test(bool *passed) { *passed = false; { ulonglong4 a = _U32_MAX; ulonglong4 b = {0, 0, 0, 1}; ulonglong4 c = {0, 0, 0, 0}; add_u32(&a, a, b); if (!equ_u32(a, c)) { printf("add_u32 "); print_u32(a); return; } } { ulonglong4 a = {0, 0, 0, 0}; ulonglong4 b = {0, 0, 0, 1}; ulonglong4 c = _U32_MAX; sub_u32(&a, a, b); if (!equ_u32(a, c)) { printf("sub_32 "); print_u32(a); return; } } { ulonglong2 a = _U16_MAX; ulonglong2 b = {0, U8_MAX}; ulonglong2 c = {U8_MAX, 1}; mul_u16(&a, a, b); if (!equ_u16(a, c)) { printf("mul_16 "); print_u16(a); return; } } { ulonglong4 a = _U32_MAX; ulonglong4 b = {0, 0, U8_MAX, U8_MAX}; ulonglong4 c = {U8_MAX, U8_MAX, 0, 1}; mul_u32(&a, a, b); if (!equ_u32(a, c)) { printf("mul_32 "); print_u32(a); return; } } *passed = true; } int main() { bool test_passed, *d_test_passed; cudaMalloc(&d_test_passed, sizeof(bool)); test<<<1, 1>>>(d_test_passed); cudaDeviceSynchronize(); cudaMemcpy(&test_passed, d_test_passed, sizeof(bool), cudaMemcpyDeviceToHost); cudaFree(d_test_passed); if (!test_passed) { printf("test not passed\n"); return 1; } return 0; }