From 4f53fe5fd420f22205e34e36dc85842e0e59233f Mon Sep 17 00:00:00 2001 From: SEK1RO Date: Tue, 30 Dec 2025 13:13:14 +0300 Subject: [PATCH] ds: 1e speed comparsion --- ds/25-1/1e/main.cu | 105 ++++++++++++++++++++++++++++++++++----------- ds/25-1/1e/main.py | 27 ++++++++++++ 2 files changed, 107 insertions(+), 25 deletions(-) create mode 100644 ds/25-1/1e/main.py diff --git a/ds/25-1/1e/main.cu b/ds/25-1/1e/main.cu index 8b401cd..21f1c71 100644 --- a/ds/25-1/1e/main.cu +++ b/ds/25-1/1e/main.cu @@ -1,4 +1,4 @@ -#include +#include template __global__ void mat_mul(T *A, T *B, T *C, int N, int M, int K) { @@ -10,39 +10,57 @@ __global__ void mat_mul(T *A, T *B, T *C, int N, int M, int K) { int row = by * TILE_SIZE + ty; int col = bx * TILE_SIZE + tx; + + if (col >= K || row >= M) return; T sum = 0; + + int tiles_len = (M + TILE_SIZE - 1) / TILE_SIZE; - for (int tile = 0; tile < ceil((float)M/TILE_SIZE); tile++) { - if (row < N && (tile * TILE_SIZE + tx) < M) { - sA[ty][tx] = A[row * M + (tile * TILE_SIZE + tx)]; + for (int tile = 0; tile < tiles_len; tile++) { + int aCol = tile * TILE_SIZE + tx; + int bRow = tile * TILE_SIZE + ty; + + if (aCol < M) { + sA[ty][tx] = A[row * M + aCol]; } else { sA[ty][tx] = 0; } - if ((tile * TILE_SIZE + ty) < M && col < K) { - sB[ty][tx] = B[(tile * TILE_SIZE + ty) * K + col]; - } else { - sB[ty][tx] = 0; - } + sB[ty][tx] = (T)((uint64_t)B[bRow * K + col] & ((uint64_t)(bRow >= M) - 1)); __syncthreads(); for (int k = 0; k < TILE_SIZE; k++) { sum += sA[ty][k] * sB[k][tx]; } - __syncthreads(); - } - - if (row < N && col < K) { - C[row * K + col] = sum; } + + C[row * K + col] = sum; } +template +__global__ void dumb_mat_mul(T *A, T *B, T *C, int N, int M, int K) { + int col = blockIdx.x * blockDim.x + threadIdx.x; + int row = blockIdx.y * blockDim.y + threadIdx.y; + + if (col >= K || row >= M) return; + + T sum = 0; + for (int i = 0; i < M; i++) { + sum += A[row * M + i] * B[i * K + col]; + } + C[row * K + col] = sum; +} + +#define N 1024 +#define M 1024 +#define K 1024 +#define NO_PRINT 1 +#define GRID_DIM 1 +#define BLOCK_DIM 32 + #define MAT_TYPE int #define MAT_FMT "%d\t" -#define N 5 -#define M 7 -#define K 3 #define A_LEN (N * M) #define B_LEN (M * K) #define C_LEN (N * K) @@ -52,6 +70,8 @@ __global__ void mat_mul(T *A, T *B, T *C, int N, int M, int K) { #include #include +#include +using namespace std::chrono; template void mat_print(T *a, const char *fmt, int n, int m) { @@ -68,7 +88,7 @@ int main() { std::mt19937 engine(rd()); std::uniform_int_distribution dist(1, 10); - MAT_TYPE buf[A_LEN + B_LEN + C_LEN]; + auto buf = (MAT_TYPE *)malloc(A_SIZE + B_SIZE + C_SIZE); for (auto i = 0; i < A_LEN + B_LEN; i++) { buf[i] = dist(engine); } @@ -77,10 +97,12 @@ int main() { MAT_TYPE *b = a + A_LEN; MAT_TYPE *c = b + B_LEN; +#if NO_PRINT==0 printf("\na\n"); mat_print(a, MAT_FMT, N, M); printf("\nb\n"); mat_print(b, MAT_FMT, M, K); +#endif MAT_TYPE *d_a, *d_b, *d_c; cudaMalloc(&d_a, A_SIZE); @@ -90,17 +112,50 @@ int main() { cudaMemcpy(d_a, a, A_SIZE, cudaMemcpyHostToDevice); cudaMemcpy(d_b, b, B_SIZE, cudaMemcpyHostToDevice); - dim3 blockDim(4, 4); - dim3 threadDim(4, 4); - mat_mul<<>>(d_a, d_b, d_c, N, M, K); + dim3 gridDim(GRID_DIM, GRID_DIM); + dim3 blockDim(BLOCK_DIM, BLOCK_DIM); + int cycles = 0; + microseconds duration(0); + + while (duration.count() < 1e6) { + auto start = high_resolution_clock::now(); + mat_mul<<>>(d_a, d_b, d_c, N, M, K); + cudaDeviceSynchronize(); + auto end = high_resolution_clock::now(); + + cycles++; + duration += duration_cast(end - start); + } + +#if NO_PRINT==0 cudaMemcpy(c, d_c, C_SIZE, cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); + printf("\nc\n"); + mat_print(c, MAT_FMT, N, K); +#endif + printf("optimized mul take %f usec avg in %d cycles\n", (float)(duration.count()) / cycles, cycles); + + cycles = 0; + duration = microseconds(0); + while (duration.count() < 1e6) { + auto start = high_resolution_clock::now(); + dumb_mat_mul<<>>(d_a, d_b, d_c, N, M, K); + cudaDeviceSynchronize(); + auto end = high_resolution_clock::now(); + + cycles++; + duration += duration_cast(end - start); + } + +#if NO_PRINT==0 + cudaMemcpy(c, d_c, C_SIZE, cudaMemcpyDeviceToHost); + printf("\nc\n"); + mat_print(c, MAT_FMT, N, K); +#endif + printf("dumb mul take %f usec avg in %d cycles\n", (float)(duration.count()) / cycles, cycles); cudaFree(a); cudaFree(b); cudaFree(c); - - printf("\nc\n"); - mat_print(c, MAT_FMT, N, K); + free(buf); } \ No newline at end of file diff --git a/ds/25-1/1e/main.py b/ds/25-1/1e/main.py new file mode 100644 index 0000000..23f0f47 --- /dev/null +++ b/ds/25-1/1e/main.py @@ -0,0 +1,27 @@ +import sys, time, math +import numpy as np +import cupy as cp + +def measure(a, b): + duration = 0 + cycles = 0 + while (duration < 1): + start = time.perf_counter() + c = a @ b + cp.cuda.Stream.null.synchronize() + end = time.perf_counter() + + duration += end - start + cycles += 1 + + return duration / cycles + +n = 1024 + +a = np.random.rand(n, n).astype(np.float32) +b = np.random.rand(n, n).astype(np.float32) +print('numpy take', measure(a, b) * 1e6, 'usec') + +a = cp.random.rand(n, n, dtype = cp.float32) +b = cp.random.rand(n, n, dtype = cp.float32) +print('cupy take', measure(a, b) * 1e6, 'usec') \ No newline at end of file