From 4f53fe5fd420f22205e34e36dc85842e0e59233f Mon Sep 17 00:00:00 2001
From: SEK1RO <vlad@sek1.ro>
Date: Tue, 30 Dec 2025 13:13:14 +0300
Subject: [PATCH] ds: 1e speed comparsion

---
 ds/25-1/1e/main.cu | 105 ++++++++++++++++++++++++++++++++++-----------
 ds/25-1/1e/main.py |  27 ++++++++++++
 2 files changed, 107 insertions(+), 25 deletions(-)
 create mode 100644 ds/25-1/1e/main.py
diff --git a/ds/25-1/1e/main.cu b/ds/25-1/1e/main.cu
index 8b401cd..21f1c71 100644
--- a/ds/25-1/1e/main.cu
+++ b/ds/25-1/1e/main.cu
@@ -1,4 +1,4 @@
-#include <cmath>
+#include <stdint.h>
 
 template <typename T, int TILE_SIZE>
 __global__ void mat_mul(T *A, T *B, T *C, int N, int M, int K) {
@@ -10,39 +10,57 @@ __global__ void mat_mul(T *A, T *B, T *C, int N, int M, int K) {
     
     int row = by * TILE_SIZE + ty;
     int col = bx * TILE_SIZE + tx;
+
+    if (col >= K || row >= M) return;
     
     T sum = 0;
+
+    int tiles_len = (M + TILE_SIZE - 1) / TILE_SIZE;
     
-    for (int tile = 0; tile < ceil((float)M/TILE_SIZE); tile++) {
-        if (row < N && (tile * TILE_SIZE + tx) < M) {
-            sA[ty][tx] = A[row * M + (tile * TILE_SIZE + tx)];
+    for (int tile = 0; tile < tiles_len; tile++) {
+        int aCol = tile * TILE_SIZE + tx;
+        int bRow = tile * TILE_SIZE + ty;
+
+        if (aCol < M) {
+            sA[ty][tx] = A[row * M + aCol];
         } else {
             sA[ty][tx] = 0;
         }
 
-        if ((tile * TILE_SIZE + ty) < M && col < K) {
-            sB[ty][tx] = B[(tile * TILE_SIZE + ty) * K + col];
-        } else {
-            sB[ty][tx] = 0;
-        }
+        sB[ty][tx] = (T)((uint64_t)B[bRow * K + col] & ((uint64_t)(bRow >= M) - 1));
         __syncthreads();
         
         for (int k = 0; k < TILE_SIZE; k++) {
             sum += sA[ty][k] * sB[k][tx];
         }
-        __syncthreads();
-    }
-    
-    if (row < N && col < K) {
-        C[row * K + col] = sum;
     }
+
+    C[row * K + col] = sum;
 }
 
+template <typename T>
+__global__ void dumb_mat_mul(T *A, T *B, T *C, int N, int M, int K) {
+    int col = blockIdx.x * blockDim.x + threadIdx.x;
+    int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (col >= K || row >= M) return;
+
+    T sum = 0;
+    for (int i = 0; i < M; i++) {
+        sum += A[row * M + i] * B[i * K + col];
+    }
+    C[row * K + col] = sum;
+}
+
+#define N 1024
+#define M 1024
+#define K 1024
+#define NO_PRINT 1
+#define GRID_DIM 1
+#define BLOCK_DIM 32
+
 #define MAT_TYPE int
 #define MAT_FMT "%d\t"
-#define N 5
-#define M 7
-#define K 3
 #define A_LEN (N * M)
 #define B_LEN (M * K)
 #define C_LEN (N * K)
@@ -52,6 +70,8 @@ __global__ void mat_mul(T *A, T *B, T *C, int N, int M, int K) {
 
 #include <cstdio>
 #include <random>
+#include <chrono>
+using namespace std::chrono;
 
 template <typename T>
 void mat_print(T *a, const char *fmt, int n, int m) {
@@ -68,7 +88,7 @@ int main() {
     std::mt19937 engine(rd());
     std::uniform_int_distribution<MAT_TYPE> dist(1, 10);
 
-    MAT_TYPE buf[A_LEN + B_LEN + C_LEN];
+    auto buf = (MAT_TYPE *)malloc(A_SIZE + B_SIZE + C_SIZE);
     for (auto i = 0; i < A_LEN + B_LEN; i++) {
         buf[i] = dist(engine);
     }
@@ -77,10 +97,12 @@ int main() {
     MAT_TYPE *b = a + A_LEN;
     MAT_TYPE *c = b + B_LEN;
 
+#if NO_PRINT==0
     printf("\na\n");
     mat_print(a, MAT_FMT, N, M);
     printf("\nb\n");
     mat_print(b, MAT_FMT, M, K);
+#endif
 
     MAT_TYPE *d_a, *d_b, *d_c;
     cudaMalloc(&d_a, A_SIZE);
@@ -90,17 +112,50 @@ int main() {
     cudaMemcpy(d_a, a, A_SIZE, cudaMemcpyHostToDevice);
     cudaMemcpy(d_b, b, B_SIZE, cudaMemcpyHostToDevice);
 
-    dim3 blockDim(4, 4);
-    dim3 threadDim(4, 4);
-    mat_mul<MAT_TYPE, 4><<<blockDim, threadDim>>>(d_a, d_b, d_c, N, M, K);
+    dim3 gridDim(GRID_DIM, GRID_DIM);
+    dim3 blockDim(BLOCK_DIM, BLOCK_DIM);
 
+    int cycles = 0;
+    microseconds duration(0);
+
+    while (duration.count() < 1e6) {
+        auto start = high_resolution_clock::now();
+        mat_mul<MAT_TYPE, BLOCK_DIM><<<gridDim, blockDim>>>(d_a, d_b, d_c, N, M, K);
+        cudaDeviceSynchronize();
+        auto end = high_resolution_clock::now();
+
+        cycles++;
+        duration += duration_cast<microseconds>(end - start);
+    }
+
+#if NO_PRINT==0
     cudaMemcpy(c, d_c, C_SIZE, cudaMemcpyDeviceToHost);
-    cudaDeviceSynchronize();
+    printf("\nc\n");
+    mat_print(c, MAT_FMT, N, K);
+#endif
+    printf("optimized mul take %f usec avg in %d cycles\n", (float)(duration.count()) / cycles, cycles);
+
+    cycles = 0;
+    duration = microseconds(0);
+    while (duration.count() < 1e6) {
+        auto start = high_resolution_clock::now();
+        dumb_mat_mul<MAT_TYPE><<<gridDim, blockDim>>>(d_a, d_b, d_c, N, M, K);
+        cudaDeviceSynchronize();
+        auto end = high_resolution_clock::now();
+
+        cycles++;
+        duration += duration_cast<microseconds>(end - start);
+    }
+
+#if NO_PRINT==0
+    cudaMemcpy(c, d_c, C_SIZE, cudaMemcpyDeviceToHost);
+    printf("\nc\n");
+    mat_print(c, MAT_FMT, N, K);
+#endif
+    printf("dumb mul take %f usec avg in %d cycles\n", (float)(duration.count()) / cycles, cycles);
 
     cudaFree(a);
     cudaFree(b);
     cudaFree(c);
-
-    printf("\nc\n");
-    mat_print(c, MAT_FMT, N, K);
+    free(buf);
 }
\ No newline at end of file
diff --git a/ds/25-1/1e/main.py b/ds/25-1/1e/main.py
new file mode 100644
index 0000000..23f0f47
--- /dev/null
+++ b/ds/25-1/1e/main.py
@@ -0,0 +1,27 @@
+import sys, time, math
+import numpy as np
+import cupy as cp
+
+def measure(a, b):
+    duration = 0
+    cycles = 0
+    while (duration < 1):
+        start = time.perf_counter()
+        c = a @ b
+        cp.cuda.Stream.null.synchronize()
+        end = time.perf_counter()
+
+        duration += end - start
+        cycles += 1
+        
+    return duration / cycles
+
+n = 1024
+
+a = np.random.rand(n, n).astype(np.float32)
+b = np.random.rand(n, n).astype(np.float32)
+print('numpy take', measure(a, b) * 1e6, 'usec')
+
+a = cp.random.rand(n, n, dtype = cp.float32)
+b = cp.random.rand(n, n, dtype = cp.float32)
+print('cupy take', measure(a, b) * 1e6, 'usec')
\ No newline at end of file