Files
lab/ds/25-1/1e/main.cu
2025-12-25 14:31:23 +03:00

106 lines
2.5 KiB
Plaintext

#include <cmath>
/**
 * Tiled matrix multiplication kernel: C = A * B.
 *
 * A is N x M, B is M x K and C is N x K, all indexed row-major
 * (A[row * M + k], B[k * K + col], C[row * K + col]).
 *
 * Launch contract:
 *   - Blocks must be exactly TILE_SIZE x TILE_SIZE threads, because
 *     threadIdx.x / threadIdx.y index the shared tiles directly.
 *   - The grid must span at least ceil(K / TILE_SIZE) blocks in x and
 *     ceil(N / TILE_SIZE) blocks in y; threads that fall outside C are
 *     guarded and write nothing.
 *   - Uses 2 * TILE_SIZE * TILE_SIZE * sizeof(T) bytes of static shared memory.
 *
 * @param A  input matrix, N x M
 * @param B  input matrix, M x K
 * @param C  output matrix, N x K
 * @param N  rows of A and C
 * @param M  columns of A / rows of B (the reduction dimension)
 * @param K  columns of B and C
 */
template <typename T, int TILE_SIZE>
__global__ void mat_mul(T *A, T *B, T *C, int N, int M, int K) {
  __shared__ T sA[TILE_SIZE][TILE_SIZE];
  __shared__ T sB[TILE_SIZE][TILE_SIZE];

  int bx = blockIdx.x, by = blockIdx.y;
  int tx = threadIdx.x, ty = threadIdx.y;

  // Element of C this thread is responsible for.
  int row = by * TILE_SIZE + ty;
  int col = bx * TILE_SIZE + tx;

  // Integer ceiling division, computed once. The previous float-based
  // ceil((float)M / TILE_SIZE) loses precision for M > 2^24 (which can drop
  // the final tile) and was re-evaluated on every loop iteration.
  const int num_tiles = (M + TILE_SIZE - 1) / TILE_SIZE;

  T sum = 0;
  for (int tile = 0; tile < num_tiles; tile++) {
    const int a_col = tile * TILE_SIZE + tx;  // column of A loaded by this thread
    const int b_row = tile * TILE_SIZE + ty;  // row of B loaded by this thread

    // Stage one tile of A and one tile of B; out-of-range cells are padded
    // with zero so they contribute nothing to the partial sum.
    // Offsets are computed in 64-bit to avoid int overflow on large matrices.
    if (row < N && a_col < M) {
      sA[ty][tx] = A[(long long)row * M + a_col];
    } else {
      sA[ty][tx] = 0;
    }
    if (b_row < M && col < K) {
      sB[ty][tx] = B[(long long)b_row * K + col];
    } else {
      sB[ty][tx] = 0;
    }
    // Every thread reaches this barrier: both tiles must be fully written
    // before any thread reads them.
    __syncthreads();

    for (int k = 0; k < TILE_SIZE; k++) {
      sum += sA[ty][k] * sB[k][tx];
    }
    // Keep the tiles intact until all threads have finished reading them,
    // since the next iteration overwrites sA / sB.
    __syncthreads();
  }

  if (row < N && col < K) {
    C[(long long)row * K + col] = sum;
  }
}
// Element type of the demo matrices and the printf format used to print one
// element.
#define MAT_TYPE int
#define MAT_FMT "%d\t"

// Demo problem size: A is N x M, B is M x K, C is N x K.
// NOTE: N, M and K are also the parameter names of the mat_mul kernel, so
// these macros must stay defined after the kernel definition.
#define N 5
#define M 7
#define K 3

// Element counts of each matrix.
#define A_LEN (N * M)
#define B_LEN (M * K)
#define C_LEN (N * K)

// Byte sizes of each matrix, derived from the element counts.
#define A_SIZE (sizeof(MAT_TYPE) * A_LEN)
#define B_SIZE (sizeof(MAT_TYPE) * B_LEN)
#define C_SIZE (sizeof(MAT_TYPE) * C_LEN)
#include <cstdio>
#include <random>
/**
 * Prints an n x m row-major matrix to stdout, one matrix row per line.
 *
 * @param a    pointer to the first of n * m elements
 * @param fmt  printf format applied to each element (must consume exactly
 *             one argument of type T)
 * @param n    number of rows
 * @param m    number of columns
 */
template <typename T>
void mat_print(T *a, const char *fmt, int n, int m) {
  for (int r = 0; r < n; ++r) {
    const int base = r * m;  // offset of the first element of row r
    for (int c = 0; c < m; ++c) {
      printf(fmt, a[base + c]);
    }
    printf("\n");
  }
}
// Reports a failed CUDA runtime call on stderr; returns true when `status`
// is cudaSuccess so calls can be chained with &&.
static bool cuda_ok(cudaError_t status, const char *what) {
  if (status == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

/**
 * Demo driver: fills A (N x M) and B (M x K) with random integers in [1, 10],
 * multiplies them on the GPU with mat_mul and prints A, B and C.
 *
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main() {
  // Tile edge; the kernel requires blocks of exactly TILE x TILE threads.
  constexpr int TILE = 4;

  std::random_device rd;
  std::mt19937 engine(rd());
  std::uniform_int_distribution<MAT_TYPE> dist(1, 10);

  // One host buffer holding A, B and C back to back; C starts zeroed.
  MAT_TYPE buf[A_LEN + B_LEN + C_LEN] = {};
  for (auto i = 0; i < A_LEN + B_LEN; i++) {
    buf[i] = dist(engine);
  }
  MAT_TYPE *a = buf;
  MAT_TYPE *b = a + A_LEN;
  MAT_TYPE *c = b + B_LEN;

  printf("\na\n");
  mat_print(a, MAT_FMT, N, M);
  printf("\nb\n");
  mat_print(b, MAT_FMT, M, K);

  // Initialised to nullptr so the unconditional cudaFree calls below are
  // harmless no-ops for buffers that were never allocated.
  MAT_TYPE *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;

  bool ok =
      cuda_ok(cudaMalloc(&d_a, A_SIZE), "cudaMalloc(d_a)") &&
      cuda_ok(cudaMalloc(&d_b, B_SIZE), "cudaMalloc(d_b)") &&
      cuda_ok(cudaMalloc(&d_c, C_SIZE), "cudaMalloc(d_c)") &&
      cuda_ok(cudaMemcpy(d_a, a, A_SIZE, cudaMemcpyHostToDevice),
              "cudaMemcpy(a -> d_a)") &&
      cuda_ok(cudaMemcpy(d_b, b, B_SIZE, cudaMemcpyHostToDevice),
              "cudaMemcpy(b -> d_b)");

  if (ok) {
    // Grid sized from the output shape: x spans the K columns of C,
    // y spans its N rows (ceiling division by the tile edge).
    dim3 threads(TILE, TILE);
    dim3 grid((K + TILE - 1) / TILE, (N + TILE - 1) / TILE);
    mat_mul<MAT_TYPE, TILE><<<grid, threads>>>(d_a, d_b, d_c, N, M, K);

    // Launch-configuration errors surface via cudaGetLastError(); faults
    // raised while the kernel runs surface at the next synchronising call.
    ok = cuda_ok(cudaGetLastError(), "mat_mul launch") &&
         cuda_ok(cudaDeviceSynchronize(), "mat_mul execution") &&
         cuda_ok(cudaMemcpy(c, d_c, C_SIZE, cudaMemcpyDeviceToHost),
                 "cudaMemcpy(d_c -> c)");
  }

  // Release the DEVICE buffers (the host pointers a/b/c live on the stack
  // and must never be passed to cudaFree).
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);

  if (!ok) {
    return 1;
  }

  printf("\nc\n");
  mat_print(c, MAT_FMT, N, K);
  return 0;
}