ds: secp256k1, 2

2025-12-25 14:31:23 +03:00
parent 1fe3bbbe2e
commit f2c718fc3f
9 changed files with 6139 additions and 1330 deletions
--- a/ds/25-1/1e/main.cu
+++ b/ds/25-1/1e/main.cu
@ -1,152 +1,106 @@
-#include <stdio.h>
-#include <stdint.h>
+#include <cmath>

-extern "C" __device__ void add_u16(
-    ulonglong2 *out_c,
-    ulonglong2 in_a,
-    ulonglong2 in_b
-);
-
-extern "C" __device__ void sub_u16(
-    ulonglong2 *out_c,
-    ulonglong2 in_a,
-    ulonglong2 in_b
-);
-
-extern "C" __device__ void add_u32(
-    ulonglong4 *out_c,
-    ulonglong4 in_a,
-    ulonglong4 in_b
-);
-
-extern "C" __device__ void sub_u32(
-    ulonglong4 *out_c,
-    ulonglong4 in_a,
-    ulonglong4 in_b
-);
-
-__device__ void mul_u16(
-    ulonglong2 *out_c,
-    ulonglong2 in_a,
-    ulonglong2 in_b
-) {
-    uint64_t ax_ay = in_a.x + in_a.y;
-    uint64_t bx_by = in_b.x + in_b.y;
-    uint64_t axbx = in_a.x * in_b.x;
-    uint64_t ayby = in_a.y * in_b.y;
-    out_c->x = ax_ay * bx_by - axbx - ayby;
-    out_c->y = ayby;
-}
-
-__device__ void mul_u32(
-    ulonglong4 *out_c,
-    ulonglong4 in_a,
-    ulonglong4 in_b
-) {
-    auto ax = (ulonglong2 *)&in_a.x;
-    auto ay = (ulonglong2 *)&in_a.z;
-    auto bx = (ulonglong2 *)&in_b.x;
-    auto by = (ulonglong2 *)&in_b.z;
-    ulonglong2 ax_ay, bx_by, paren, axbx, ayby;
-    add_u16(&ax_ay, *ax, *ay);
-    add_u16(&bx_by, *bx, *by);
-    mul_u16(&paren, ax_ay, bx_by);
-    mul_u16(&axbx, *ax, *bx);
-    mul_u16(&ayby, *ay, *by);
-    sub_u16(&paren, paren, axbx);
-    sub_u16(&paren, paren, ayby);
-    out_c->x = paren.x;
-    out_c->y = paren.y;
-    out_c->z = ayby.x;
-    out_c->w = ayby.y;
-}
-
-__device__ bool equ_u16(ulonglong2 a, ulonglong2 b) {
-    return a.x == b.x && a.y == b.y;
-}
-
-__device__ bool equ_u32(ulonglong4 a, ulonglong4 b) {
-    return a.x == b.x &&
-        a.y == b.y &&
-        a.z == b.z &&
-        a.w == b.w;
-}
-
-__device__ void print_u16(ulonglong2 a) {
-    printf("0x%016llx.%016llx\n", a.x, a.y);
-}
-
-__device__ void print_u32(ulonglong4 a) {
-    printf("0x%016llx.%016llx.%016llx.%016llx\n", a.x, a.y, a.z, a.w);
-}
-
-#define U8_MAX 0xFFFFFFFFFFFFFFFF
-#define _U16_MAX {U8_MAX, U8_MAX}
-#define _U32_MAX {U8_MAX, U8_MAX, U8_MAX, U8_MAX}
-
-__global__ void test(bool *passed) {
-    *passed = false;
-    {
-        ulonglong4 a = _U32_MAX;
-        ulonglong4 b = {0, 0, 0, 1};
-        ulonglong4 c = {0, 0, 0, 0};
-        add_u32(&a, a, b);
-        if (!equ_u32(a, c)) {
-            printf("add_u32 ");
-            print_u32(a);
-            return;
+template <typename T, int TILE_SIZE>
+__global__ void mat_mul(T *A, T *B, T *C, int N, int M, int K) {
+    __shared__ T sA[TILE_SIZE][TILE_SIZE];
+    __shared__ T sB[TILE_SIZE][TILE_SIZE];
+    
+    int bx = blockIdx.x, by = blockIdx.y;
+    int tx = threadIdx.x, ty = threadIdx.y;
+    
+    int row = by * TILE_SIZE + ty;
+    int col = bx * TILE_SIZE + tx;
+    
+    T sum = 0;
+    
+    for (int tile = 0; tile < ceil((float)M/TILE_SIZE); tile++) {
+        if (row < N && (tile * TILE_SIZE + tx) < M) {
+            sA[ty][tx] = A[row * M + (tile * TILE_SIZE + tx)];
+        } else {
+            sA[ty][tx] = 0;
        }
-    }
-    {
-        ulonglong4 a = {0, 0, 0, 0};
-        ulonglong4 b = {0, 0, 0, 1};
-        ulonglong4 c = _U32_MAX;
-        sub_u32(&a, a, b);
-        if (!equ_u32(a, c)) {
-            printf("sub_32 ");
-            print_u32(a);
-            return;
+
+        if ((tile * TILE_SIZE + ty) < M && col < K) {
+            sB[ty][tx] = B[(tile * TILE_SIZE + ty) * K + col];
+        } else {
+            sB[ty][tx] = 0;
        }
-    }
-    {
-        ulonglong2 a = _U16_MAX;
-        ulonglong2 b = {0, U8_MAX};
-        ulonglong2 c = {U8_MAX, 1};
-        mul_u16(&a, a, b);
-        if (!equ_u16(a, c)) {
-            printf("mul_16 ");
-            print_u16(a);
-            return;
+        __syncthreads();
+        
+        for (int k = 0; k < TILE_SIZE; k++) {
+            sum += sA[ty][k] * sB[k][tx];
        }
+        __syncthreads();
    }
-    {
-        ulonglong4 a = _U32_MAX;
-        ulonglong4 b = {0, 0, U8_MAX, U8_MAX};
-        ulonglong4 c = {U8_MAX, U8_MAX, 0, 1};
-        mul_u32(&a, a, b);
-        if (!equ_u32(a, c)) {
-            printf("mul_32 ");
-            print_u32(a);
-            return;
+    
+    if (row < N && col < K) {
+        C[row * K + col] = sum;
+    }
+}
+
+#define MAT_TYPE int
+#define MAT_FMT "%d\t"
+#define N 5
+#define M 7
+#define K 3
+#define A_LEN (N * M)
+#define B_LEN (M * K)
+#define C_LEN (N * K)
+#define A_SIZE (sizeof(MAT_TYPE) * N * M)
+#define B_SIZE (sizeof(MAT_TYPE) * M * K)
+#define C_SIZE (sizeof(MAT_TYPE) * N * K)
+
+#include <cstdio>
+#include <random>
+
+template <typename T>
+void mat_print(T *a, const char *fmt, int n, int m) {
+    for (auto row = 0; row < n; row++) {
+        for (auto col = 0; col < m; col++) {
+            printf(fmt, a[row * m + col]);
        }
+        printf("\n");
    }
-    *passed = true;
 }

 int main() {
-    bool test_passed, *d_test_passed;
-    cudaMalloc(&d_test_passed, sizeof(bool));
-	
-    test<<<1, 1>>>(d_test_passed);
-    cudaDeviceSynchronize();
+    std::random_device rd;
+    std::mt19937 engine(rd());
+    std::uniform_int_distribution<MAT_TYPE> dist(1, 10);

-    cudaMemcpy(&test_passed, d_test_passed, sizeof(bool), cudaMemcpyDeviceToHost);
-    cudaFree(d_test_passed);
-
-    if (!test_passed) {
-        printf("test not passed\n");
-        return 1;
+    MAT_TYPE buf[A_LEN + B_LEN + C_LEN];
+    for (auto i = 0; i < A_LEN + B_LEN; i++) {
+        buf[i] = dist(engine);
    }

-	return 0;
+    MAT_TYPE *a = buf;
+    MAT_TYPE *b = a + A_LEN;
+    MAT_TYPE *c = b + B_LEN;
+
+    printf("\na\n");
+    mat_print(a, MAT_FMT, N, M);
+    printf("\nb\n");
+    mat_print(b, MAT_FMT, M, K);
+
+    MAT_TYPE *d_a, *d_b, *d_c;
+    cudaMalloc(&d_a, A_SIZE);
+    cudaMalloc(&d_b, B_SIZE);
+    cudaMalloc(&d_c, C_SIZE);
+
+    cudaMemcpy(d_a, a, A_SIZE, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_b, b, B_SIZE, cudaMemcpyHostToDevice);
+
+    dim3 blockDim(4, 4);
+    dim3 threadDim(4, 4);
+    mat_mul<MAT_TYPE, 4><<<blockDim, threadDim>>>(d_a, d_b, d_c, N, M, K);
+
+    cudaMemcpy(c, d_c, C_SIZE, cudaMemcpyDeviceToHost);
+    cudaDeviceSynchronize();
+
+    cudaFree(a);
+    cudaFree(b);
+    cudaFree(c);
+
+    printf("\nc\n");
+    mat_print(c, MAT_FMT, N, K);
 }
--- a/ds/25-1/1e/op.ptx
+++ b/ds/25-1/1e/op.ptx
@ -36,15 +36,14 @@
    ld.param.v2.u64 {%ra1, %ra0}, [in_a];
    ld.param.v2.u64 {%rb1, %rb0}, [in_b];

-    add.cc.u64 %ra0, %ra0, %rb0;
-    addc.u64 %ra1, %ra1, %rb1;
+    sub.cc.u64 %ra0, %ra0, %rb0;
+    subc.u64 %ra1, %ra1, %rb1;

    st.v2.u64 [%rdc], {%ra1, %ra0};

    ret;
 }

-
 .visible .func add_u32(
    .param .b64 out_c,
    .param .align 16 .b8 in_a[32],
@ -95,4 +94,78 @@
    st.v2.u64 [%rdc + 16], {%ra1, %ra0};

    ret;
+}
+
+.visible .func mul_lo_u16(
+    .param .b64 out_c,
+    .param .align 16 .b8 in_a[16],
+    .param .align 16 .b8 in_b[16]
+) {
+    .reg .u64 %a, %b, %c, %d, %a_b, %c_d;
+    .reg .u64 %ac, %bd_hi, %bd_lo, %p;
+    .reg .b64 %rdc;
+
+    ld.param.b64 %rdc, [out_c];
+
+    ld.param.v2.u64 {%a, %b}, [in_a];
+    ld.param.v2.u64 {%c, %d}, [in_b];
+
+    mul.lo.u64 %ac, %a, %c;
+    mul.lo.u64 %bd_lo, %b, %d;
+    mul.hi.u64 %bd_hi, %b, %d;
+
+    add.u64 %a_b, %a, %b;
+    add.u64 %c_d, %c, %d;
+
+    mul.lo.u64 %p, %a_b, %c_d;
+    
+    sub.u64 %p, %p, %ac;
+    sub.u64 %p, %p, %bd_lo;
+
+    add.u64 %p, %p, %bd_hi;
+
+    st.v2.u64 [%rdc], {%p, %bd_lo};
+    
+    ret;
+}
+
+.visible .func mul_u16(
+    .param .b64 out_c_hi,
+    .param .b64 out_c_lo,
+    .param .align 16 .b8 in_a[16],
+    .param .align 16 .b8 in_b[16]
+) {
+  .reg .u64 %a, %b, %c, %d;
+  .reg .u64 %a_b_hi, %a_b_lo, %c_d_hi, %c_d_lo;
+  .reg .u64 %p_hi, %p_lo, %p_hi2, %p_lo2;
+  .reg .u64 %ac_hi, %ac_lo, %bd_hi, %bd_lo;
+  .reg .b64 %rdc_hi, %rdc_lo;
+
+  ld.param.b64 %rdc_hi, [out_c_hi];
+  ld.param.b64 %rdc_lo, [out_c_lo];
+
+  ld.param.v2.u64 {%a, %b}, [in_a];
+  ld.param.v2.u64 {%c, %d}, [in_b];
+
+  mul.lo.u64 %ac_lo, %a, %c;
+  mul.hi.u64 %ac_hi, %a, %c;
+  mul.lo.u64 %bd_lo, %b, %d;
+  mul.hi.u64 %bd_hi, %b, %d;
+
+  add.cc.u64 %a_b_lo, %a, %b;
+  addc.u64 %a_b_hi, %a, %b;
+  add.cc.u64 %c_d_lo, %c, %d;
+  addc.u64 %c_d_hi, %c, %d;
+
+  mul.lo.u64 %p_lo, %a_b_lo, %c_d_lo;
+  mul.hi.u64 %p_hi, %a_b_lo, %c_d_lo;
+  mul.lo.u64 %p_hi2, %a_b_hi, %c_d_hi;
+  
+
+
+
+  st.v2.u64 [%rdc_lo], {%p_hi, %p_lo};
+  st.v2.u64 [%rdc_hi], {%a_b_lo, %p_hi2};
+
+  ret;
 }
--- a/ds/25-1/1e/secp256k1.cu
+++ b/ds/25-1/1e/secp256k1.cu
@ -0,0 +1,205 @@
+#include <stdio.h>
+#include <stdint.h>
+
+extern "C" __device__ void add_u16(
+    ulonglong2 *out_c,
+    ulonglong2 in_a,
+    ulonglong2 in_b
+);
+
+extern "C" __device__ void sub_u16(
+    ulonglong2 *out_c,
+    ulonglong2 in_a,
+    ulonglong2 in_b
+);
+
+extern "C" __device__ void add_u32(
+    ulonglong4 *out_c,
+    ulonglong4 in_a,
+    ulonglong4 in_b
+);
+
+extern "C" __device__ void sub_u32(
+    ulonglong4 *out_c,
+    ulonglong4 in_a,
+    ulonglong4 in_b
+);
+
+extern "C" __device__ void mul_lo_u16(
+    ulonglong2 *out_c,
+    ulonglong2 in_a,
+    ulonglong2 in_b
+);
+
+extern "C" __device__ void mul_u16(
+    ulonglong2 *out_c_hi,
+    ulonglong2 *out_c_lo,
+    ulonglong2 in_a,
+    ulonglong2 in_b
+);
+
+__device__ bool equ_u16(ulonglong2 a, ulonglong2 b) {
+    return a.x == b.x && a.y == b.y;
+}
+
+__device__ bool equ_u32(ulonglong4 a, ulonglong4 b) {
+    return a.x == b.x &&
+        a.y == b.y &&
+        a.z == b.z &&
+        a.w == b.w;
+}
+
+__device__ int cmp_u32(ulonglong4 a, ulonglong4 b) {
+  if (a.x < b.x)
+    return -1;
+  else if (a.x > b.x)
+    return 1;
+  
+  if (a.y < b.y)
+    return -1;
+  else if (a.y > b.y)
+    return 1;
+
+  if (a.z < b.z)
+    return -1;
+  else if (a.z > b.z)
+    return 1;
+
+  if (a.w < b.w)
+    return -1;
+  else if (a.w > b.w)
+    return 1;
+
+  return 0;
+}
+
+__device__ void mul_lo_u32(
+    ulonglong4 *out_c,
+    ulonglong4 in_a,
+    ulonglong4 in_b
+) {
+    auto a = (ulonglong2 *)&in_a.x;
+    auto b = (ulonglong2 *)&in_a.z;
+    auto c = (ulonglong2 *)&in_b.x;
+    auto d = (ulonglong2 *)&in_b.z;
+    ulonglong2 a_b, c_d, ac, bd_hi, bd_lo, p;
+
+    mul_lo_u16(&ac, *a, *c);
+    mul_u16(&bd_hi, &bd_lo, *b, *d);
+
+    add_u16(&a_b, *a, *b);
+    add_u16(&c_d, *c, *d);
+
+    mul_lo_u16(&p, a_b, c_d);
+
+    sub_u16(&p, p, ac);
+    sub_u16(&p, p, bd_lo);
+    add_u16(&p, p, bd_hi);
+
+    out_c->x = p.x;
+    out_c->y = p.y;
+    out_c->z = bd_lo.x;
+    out_c->w = bd_lo.y;
+}
+
+__device__ void print_u16(ulonglong2 a) {
+    printf("0x%016llx.%016llx\n", a.x, a.y);
+}
+
+__device__ void print_u32(ulonglong4 a) {
+    printf("0x%016llx.%016llx.%016llx.%016llx\n", a.x, a.y, a.z, a.w);
+}
+
+#define U8_MAX 0xFFFFFFFFFFFFFFFF
+#define U16_MAX {U8_MAX, U8_MAX}
+#define U32_MAX {U8_MAX, U8_MAX, U8_MAX, U8_MAX}
+
+__global__ void test(bool *passed) {
+    *passed = true;
+    {
+        ulonglong4 a = U32_MAX;
+        ulonglong4 b = {0, 0, 0, 1};
+        ulonglong4 c = {0, 0, 0, 0};
+        add_u32(&a, a, b);
+        if (!equ_u32(a, c)) {
+            printf("add_u32\n");
+            print_u32(a);
+            *passed = false;
+        }
+    }
+    {
+        ulonglong4 a = {0, 0, 0, 0};
+        ulonglong4 b = {0, 0, 0, 1};
+        ulonglong4 c = U32_MAX;
+        sub_u32(&a, a, b);
+        if (!equ_u32(a, c)) {
+            printf("sub_u32\n");
+            print_u32(a);
+            *passed = false;
+        }
+    }
+    {
+        ulonglong2 a = U16_MAX;
+        ulonglong2 b = {0, U8_MAX};
+        ulonglong2 c = {U8_MAX, 1};
+        mul_lo_u16(&a, a, b);
+        if (!equ_u16(a, c)) {
+            printf("mul_lo_u16\n");
+            print_u16(a);
+            *passed = false;
+        }
+    }
+    {
+        ulonglong2 a = U16_MAX;
+        ulonglong2 b = {0, U8_MAX};
+        ulonglong2 c_hi = {0, U8_MAX - 1};
+        ulonglong2 c_lo = {U8_MAX, 1};
+        mul_u16(&a, &b, a, b);
+        if (!equ_u16(a, c_hi) || !equ_u16(b, c_lo)) {
+            printf("mul_u16\n");
+            print_u16(a);
+            print_u16(b);
+            *passed = false;
+        }
+        a = U16_MAX;
+        b = U16_MAX;
+        c_hi = {U8_MAX, U8_MAX - 1};
+        c_lo = {0, 1};
+        mul_u16(&a, &b, a, b);
+        if (!equ_u16(a, c_hi) || !equ_u16(b, c_lo)) {
+            printf("mul_u16\n");
+            print_u16(a);
+            print_u16(b);
+            *passed = false;
+        }
+    }
+    {
+        ulonglong4 a = U32_MAX;
+        ulonglong4 b = {0, 0, U8_MAX, U8_MAX};
+        ulonglong4 c = {U8_MAX, U8_MAX, 0, 1};
+        mul_lo_u32(&a, a, b);
+        if (!equ_u32(a, c)) {
+            printf("mul_lo_u32\n");
+            print_u32(a);
+            *passed = false;
+        }
+    }
+}
+
+int main() {
+    bool test_passed, *d_test_passed;
+    cudaMalloc(&d_test_passed, sizeof(bool));
+	
+    test<<<1, 1>>>(d_test_passed);
+    cudaDeviceSynchronize();
+
+    cudaMemcpy(&test_passed, d_test_passed, sizeof(bool), cudaMemcpyDeviceToHost);
+    cudaFree(d_test_passed);
+
+    if (!test_passed) {
+        printf("test not passed\n");
+        return 1;
+    }
+
+	return 0;
+}
--- a/ds/25-1/1e/test.py
+++ b/ds/25-1/1e/test.py
@ -2,16 +2,21 @@ U8_MAX = 0xFFFFFFFFFFFFFFFF
 U16_MAX = U8_MAX << 64 | U8_MAX
 U32_MAX = U16_MAX << 128 | U16_MAX

+
 def dothex(num):
    strhex = hex(num)[2:]
    dothex = strhex[-16:]
    strhex = strhex[:-16]
-    
+
    while len(strhex) > 0:
        dothex = strhex[-16:] + '.' + dothex
        strhex = strhex[:-16]

    return '0x' + dothex

-print('mul16', dothex(U16_MAX * 2 % (U16_MAX + 1)))
-print('mul32', dothex(U32_MAX * U16_MAX % (U32_MAX + 1)))
+print('mul_u16', dothex((U16_MAX * U8_MAX >> 128) %
+      (U16_MAX + 1)), dothex(U16_MAX * U8_MAX % (U16_MAX + 1)))
+print('mul_u16', dothex((U16_MAX * U16_MAX >> 128) %
+      (U16_MAX + 1)), dothex(U16_MAX * U16_MAX % (U16_MAX + 1)))
+print('mul_lo_u32', dothex(U32_MAX * U16_MAX % (U32_MAX + 1)))
+print('div_lo_u32', dothex(U32_MAX // U8_MAX), dothex(U32_MAX - U32_MAX // U8_MAX))