// Device-side self-test for multi-word unsigned integer arithmetic.
//
// Naming used throughout this file: the numeric suffix is the operand width in
// bytes. U8_MAX is one 64-bit word, "u16" values are ulonglong2 (two words)
// and "u32" values are ulonglong4 (four words).
//
// Limb order, as established by cmp_u32 and by the expected values in test():
// .x is the most significant 64-bit word and the last member (.y for
// ulonglong2, .w for ulonglong4) is the least significant.

// NOTE(review): the two original #include targets were lost before this file
// reached review (the line read "#include #include"). <cstdio> is required for
// printf; <cuda_runtime.h> is an assumption -- confirm against the original.
#include <cstdio>
#include <cuda_runtime.h>

// Arithmetic primitives implemented outside this translation unit.
// NOTE(review): judging by the expectations in test(), add/sub/mul_lo wrap
// modulo 2^(8*bytes) and mul_u16 yields the full product split into high and
// low halves -- confirm against the actual implementation.
extern "C" __device__ void add_u16(
    ulonglong2 *out_c,
    ulonglong2 in_a,
    ulonglong2 in_b
);

extern "C" __device__ void sub_u16(
    ulonglong2 *out_c,
    ulonglong2 in_a,
    ulonglong2 in_b
);

extern "C" __device__ void add_u32(
    ulonglong4 *out_c,
    ulonglong4 in_a,
    ulonglong4 in_b
);

extern "C" __device__ void sub_u32(
    ulonglong4 *out_c,
    ulonglong4 in_a,
    ulonglong4 in_b
);

extern "C" __device__ void mul_lo_u16(
    ulonglong2 *out_c,
    ulonglong2 in_a,
    ulonglong2 in_b
);

extern "C" __device__ void mul_u16(
    ulonglong2 *out_c_hi,
    ulonglong2 *out_c_lo,
    ulonglong2 in_a,
    ulonglong2 in_b
);

// Returns true when both 64-bit words of a and b are equal.
__device__ bool equ_u16(ulonglong2 a, ulonglong2 b) {
    return a.x == b.x && a.y == b.y;
}

// Returns true when all four 64-bit words of a and b are equal.
__device__ bool equ_u32(ulonglong4 a, ulonglong4 b) {
    return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w;
}

// Three-way comparison of a and b, examining the words in the order
// x, y, z, w. Returns -1 if a < b, 1 if a > b and 0 if all words are equal.
__device__ int cmp_u32(ulonglong4 a, ulonglong4 b) {
    if (a.x != b.x) return a.x < b.x ? -1 : 1;
    if (a.y != b.y) return a.y < b.y ? -1 : 1;
    if (a.z != b.z) return a.z < b.z ? -1 : 1;
    if (a.w != b.w) return a.w < b.w ? -1 : 1;
    return 0;
}

// Writes the low four words of in_a * in_b to *out_c.
//
// Each operand is split into a high half (x, y) and a low half (z, w):
//     in_a = a_hi * 2^128 + a_lo,    in_b = b_hi * 2^128 + b_lo
// The result is assembled as
//     low half  (z, w) = low half of a_lo * b_lo
//     high half (x, y) = (a_hi + a_lo) * (b_hi + b_lo)
//                        - a_hi * b_hi - lo(a_lo * b_lo) + hi(a_lo * b_lo)
// i.e. the cross terms a_hi * b_lo + a_lo * b_hi are obtained from a single
// product of sums instead of two separate multiplications.
// NOTE(review): this is only correct if the extern helpers wrap modulo 2^128
// -- confirm.
//
// The halves are copied by value rather than by reinterpreting &in_a.x /
// &in_a.z as ulonglong2*, so the routine does not depend on type punning.
__device__ void mul_lo_u32(
    ulonglong4 *out_c,
    ulonglong4 in_a,
    ulonglong4 in_b
) {
    const ulonglong2 a_hi = {in_a.x, in_a.y};
    const ulonglong2 a_lo = {in_a.z, in_a.w};
    const ulonglong2 b_hi = {in_b.x, in_b.y};
    const ulonglong2 b_lo = {in_b.z, in_b.w};

    ulonglong2 hi_hi;            // low half of a_hi * b_hi
    ulonglong2 lo_lo_hi;         // high half of a_lo * b_lo
    ulonglong2 lo_lo_lo;         // low half of a_lo * b_lo
    ulonglong2 a_sum;            // a_hi + a_lo
    ulonglong2 b_sum;            // b_hi + b_lo
    ulonglong2 upper;            // accumulates the high half of the result

    mul_lo_u16(&hi_hi, a_hi, b_hi);
    mul_u16(&lo_lo_hi, &lo_lo_lo, a_lo, b_lo);

    add_u16(&a_sum, a_hi, a_lo);
    add_u16(&b_sum, b_hi, b_lo);
    mul_lo_u16(&upper, a_sum, b_sum);

    // Strip the two unwanted terms of the product of sums, then add the
    // carry-out of the low-half product.
    sub_u16(&upper, upper, hi_hi);
    sub_u16(&upper, upper, lo_lo_lo);
    add_u16(&upper, upper, lo_lo_hi);

    out_c->x = upper.x;
    out_c->y = upper.y;
    out_c->z = lo_lo_lo.x;
    out_c->w = lo_lo_lo.y;
}

// Prints both words of a in hexadecimal, .x first, followed by a newline.
__device__ void print_u16(ulonglong2 a) {
    printf("0x%016llx.%016llx\n", a.x, a.y);
}

// Prints all four words of a in hexadecimal, .x first, followed by a newline.
__device__ void print_u32(ulonglong4 a) {
    printf("0x%016llx.%016llx.%016llx.%016llx\n", a.x, a.y, a.z, a.w);
}

// All-ones values for one, two and four 64-bit words.
#define U8_MAX 0xFFFFFFFFFFFFFFFF
#define U16_MAX {U8_MAX, U8_MAX}
#define U32_MAX {U8_MAX, U8_MAX, U8_MAX, U8_MAX}

// Runs every arithmetic check and stores the overall verdict in *passed
// (true only if every check matched). On a mismatch the name of the failing
// operation and the value(s) actually produced are printed.
//
// main() launches this kernel as <<<1, 1>>>; *passed is written without any
// synchronization between threads.
__global__ void test(bool *passed) {
    *passed = true;

    // add_u32: all-ones + 1 is expected to wrap to zero.
    {
        ulonglong4 a = U32_MAX;
        ulonglong4 b = {0, 0, 0, 1};
        ulonglong4 c = {0, 0, 0, 0};
        add_u32(&a, a, b);
        if (!equ_u32(a, c)) {
            printf("add_u32\n");
            print_u32(a);
            *passed = false;
        }
    }

    // sub_u32: 0 - 1 is expected to wrap to all-ones.
    {
        ulonglong4 a = {0, 0, 0, 0};
        ulonglong4 b = {0, 0, 0, 1};
        ulonglong4 c = U32_MAX;
        sub_u32(&a, a, b);
        if (!equ_u32(a, c)) {
            printf("sub_u32\n");
            print_u32(a);
            *passed = false;
        }
    }

    // mul_lo_u16: low half of (2^128 - 1) * (2^64 - 1).
    {
        ulonglong2 a = U16_MAX;
        ulonglong2 b = {0, U8_MAX};
        ulonglong2 c = {U8_MAX, 1};
        mul_lo_u16(&a, a, b);
        if (!equ_u16(a, c)) {
            printf("mul_lo_u16\n");
            print_u16(a);
            *passed = false;
        }
    }

    // mul_u16: both halves of the product, for two operand pairs. The outputs
    // deliberately overwrite the local inputs, which are passed by value.
    {
        ulonglong2 a = U16_MAX;
        ulonglong2 b = {0, U8_MAX};
        ulonglong2 c_hi = {0, U8_MAX - 1};
        ulonglong2 c_lo = {U8_MAX, 1};
        mul_u16(&a, &b, a, b);
        if (!equ_u16(a, c_hi) || !equ_u16(b, c_lo)) {
            printf("mul_u16\n");
            print_u16(a);
            print_u16(b);
            *passed = false;
        }

        a = U16_MAX;
        b = U16_MAX;
        c_hi = {U8_MAX, U8_MAX - 1};
        c_lo = {0, 1};
        mul_u16(&a, &b, a, b);
        if (!equ_u16(a, c_hi) || !equ_u16(b, c_lo)) {
            printf("mul_u16\n");
            print_u16(a);
            print_u16(b);
            *passed = false;
        }
    }

    // mul_lo_u32: low four words of (2^256 - 1) * (2^128 - 1).
    {
        ulonglong4 a = U32_MAX;
        ulonglong4 b = {0, 0, U8_MAX, U8_MAX};
        ulonglong4 c = {U8_MAX, U8_MAX, 0, 1};
        mul_lo_u32(&a, a, b);
        if (!equ_u32(a, c)) {
            printf("mul_lo_u32\n");
            print_u32(a);
            *passed = false;
        }
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Launches the device-side test on a single thread and converts its verdict
// into the process exit status: 0 when every check passed, 1 when a check
// failed or when any CUDA runtime call reported an error.
int main() {
    // Start from "failed" so that no error path can ever be mistaken for a
    // pass; the flag only becomes true if the kernel ran and the copy back
    // succeeded.
    bool test_passed = false;
    bool *d_test_passed = nullptr;

    if (!cuda_ok(cudaMalloc(&d_test_passed, sizeof(bool)), "cudaMalloc")) {
        return 1;
    }

    test<<<1, 1>>>(d_test_passed);

    // Launch-configuration errors surface through cudaGetLastError(), while
    // faults raised during execution surface at the synchronizing call.
    const bool ran =
        cuda_ok(cudaGetLastError(), "kernel launch") &&
        cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize") &&
        cuda_ok(cudaMemcpy(&test_passed, d_test_passed, sizeof(bool),
                           cudaMemcpyDeviceToHost),
                "cudaMemcpy");

    // Release the buffer on every path; a failure here is reported but does
    // not change the verdict.
    cuda_ok(cudaFree(d_test_passed), "cudaFree");

    if (!ran) {
        return 1;
    }

    if (!test_passed) {
        printf("test not passed\n");
        return 1;
    }

    return 0;
}