ds: 1e speed comparsion
This commit is contained in:
@ -1,4 +1,4 @@
|
||||
#include <cmath>
|
||||
#include <stdint.h>
|
||||
|
||||
template <typename T, int TILE_SIZE>
|
||||
__global__ void mat_mul(T *A, T *B, T *C, int N, int M, int K) {
|
||||
@ -11,38 +11,56 @@ __global__ void mat_mul(T *A, T *B, T *C, int N, int M, int K) {
|
||||
int row = by * TILE_SIZE + ty;
|
||||
int col = bx * TILE_SIZE + tx;
|
||||
|
||||
if (col >= K || row >= M) return;
|
||||
|
||||
T sum = 0;
|
||||
|
||||
for (int tile = 0; tile < ceil((float)M/TILE_SIZE); tile++) {
|
||||
if (row < N && (tile * TILE_SIZE + tx) < M) {
|
||||
sA[ty][tx] = A[row * M + (tile * TILE_SIZE + tx)];
|
||||
int tiles_len = (M + TILE_SIZE - 1) / TILE_SIZE;
|
||||
|
||||
for (int tile = 0; tile < tiles_len; tile++) {
|
||||
int aCol = tile * TILE_SIZE + tx;
|
||||
int bRow = tile * TILE_SIZE + ty;
|
||||
|
||||
if (aCol < M) {
|
||||
sA[ty][tx] = A[row * M + aCol];
|
||||
} else {
|
||||
sA[ty][tx] = 0;
|
||||
}
|
||||
|
||||
if ((tile * TILE_SIZE + ty) < M && col < K) {
|
||||
sB[ty][tx] = B[(tile * TILE_SIZE + ty) * K + col];
|
||||
} else {
|
||||
sB[ty][tx] = 0;
|
||||
}
|
||||
sB[ty][tx] = (T)((uint64_t)B[bRow * K + col] & ((uint64_t)(bRow >= M) - 1));
|
||||
__syncthreads();
|
||||
|
||||
for (int k = 0; k < TILE_SIZE; k++) {
|
||||
sum += sA[ty][k] * sB[k][tx];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (row < N && col < K) {
|
||||
C[row * K + col] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void dumb_mat_mul(T *A, T *B, T *C, int N, int M, int K) {
|
||||
int col = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int row = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (col >= K || row >= M) return;
|
||||
|
||||
T sum = 0;
|
||||
for (int i = 0; i < M; i++) {
|
||||
sum += A[row * M + i] * B[i * K + col];
|
||||
}
|
||||
C[row * K + col] = sum;
|
||||
}
|
||||
|
||||
#define N 1024
|
||||
#define M 1024
|
||||
#define K 1024
|
||||
#define NO_PRINT 1
|
||||
#define GRID_DIM 1
|
||||
#define BLOCK_DIM 32
|
||||
|
||||
#define MAT_TYPE int
|
||||
#define MAT_FMT "%d\t"
|
||||
#define N 5
|
||||
#define M 7
|
||||
#define K 3
|
||||
#define A_LEN (N * M)
|
||||
#define B_LEN (M * K)
|
||||
#define C_LEN (N * K)
|
||||
@ -52,6 +70,8 @@ __global__ void mat_mul(T *A, T *B, T *C, int N, int M, int K) {
|
||||
|
||||
#include <cstdio>
|
||||
#include <random>
|
||||
#include <chrono>
|
||||
using namespace std::chrono;
|
||||
|
||||
template <typename T>
|
||||
void mat_print(T *a, const char *fmt, int n, int m) {
|
||||
@ -68,7 +88,7 @@ int main() {
|
||||
std::mt19937 engine(rd());
|
||||
std::uniform_int_distribution<MAT_TYPE> dist(1, 10);
|
||||
|
||||
MAT_TYPE buf[A_LEN + B_LEN + C_LEN];
|
||||
auto buf = (MAT_TYPE *)malloc(A_SIZE + B_SIZE + C_SIZE);
|
||||
for (auto i = 0; i < A_LEN + B_LEN; i++) {
|
||||
buf[i] = dist(engine);
|
||||
}
|
||||
@ -77,10 +97,12 @@ int main() {
|
||||
MAT_TYPE *b = a + A_LEN;
|
||||
MAT_TYPE *c = b + B_LEN;
|
||||
|
||||
#if NO_PRINT==0
|
||||
printf("\na\n");
|
||||
mat_print(a, MAT_FMT, N, M);
|
||||
printf("\nb\n");
|
||||
mat_print(b, MAT_FMT, M, K);
|
||||
#endif
|
||||
|
||||
MAT_TYPE *d_a, *d_b, *d_c;
|
||||
cudaMalloc(&d_a, A_SIZE);
|
||||
@ -90,17 +112,50 @@ int main() {
|
||||
cudaMemcpy(d_a, a, A_SIZE, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_b, b, B_SIZE, cudaMemcpyHostToDevice);
|
||||
|
||||
dim3 blockDim(4, 4);
|
||||
dim3 threadDim(4, 4);
|
||||
mat_mul<MAT_TYPE, 4><<<blockDim, threadDim>>>(d_a, d_b, d_c, N, M, K);
|
||||
dim3 gridDim(GRID_DIM, GRID_DIM);
|
||||
dim3 blockDim(BLOCK_DIM, BLOCK_DIM);
|
||||
|
||||
cudaMemcpy(c, d_c, C_SIZE, cudaMemcpyDeviceToHost);
|
||||
int cycles = 0;
|
||||
microseconds duration(0);
|
||||
|
||||
while (duration.count() < 1e6) {
|
||||
auto start = high_resolution_clock::now();
|
||||
mat_mul<MAT_TYPE, BLOCK_DIM><<<gridDim, blockDim>>>(d_a, d_b, d_c, N, M, K);
|
||||
cudaDeviceSynchronize();
|
||||
auto end = high_resolution_clock::now();
|
||||
|
||||
cycles++;
|
||||
duration += duration_cast<microseconds>(end - start);
|
||||
}
|
||||
|
||||
#if NO_PRINT==0
|
||||
cudaMemcpy(c, d_c, C_SIZE, cudaMemcpyDeviceToHost);
|
||||
printf("\nc\n");
|
||||
mat_print(c, MAT_FMT, N, K);
|
||||
#endif
|
||||
printf("optimized mul take %f usec avg in %d cycles\n", (float)(duration.count()) / cycles, cycles);
|
||||
|
||||
cycles = 0;
|
||||
duration = microseconds(0);
|
||||
while (duration.count() < 1e6) {
|
||||
auto start = high_resolution_clock::now();
|
||||
dumb_mat_mul<MAT_TYPE><<<gridDim, blockDim>>>(d_a, d_b, d_c, N, M, K);
|
||||
cudaDeviceSynchronize();
|
||||
auto end = high_resolution_clock::now();
|
||||
|
||||
cycles++;
|
||||
duration += duration_cast<microseconds>(end - start);
|
||||
}
|
||||
|
||||
#if NO_PRINT==0
|
||||
cudaMemcpy(c, d_c, C_SIZE, cudaMemcpyDeviceToHost);
|
||||
printf("\nc\n");
|
||||
mat_print(c, MAT_FMT, N, K);
|
||||
#endif
|
||||
printf("dumb mul take %f usec avg in %d cycles\n", (float)(duration.count()) / cycles, cycles);
|
||||
|
||||
cudaFree(a);
|
||||
cudaFree(b);
|
||||
cudaFree(c);
|
||||
|
||||
printf("\nc\n");
|
||||
mat_print(c, MAT_FMT, N, K);
|
||||
free(buf);
|
||||
}
|
||||
27
ds/25-1/1e/main.py
Normal file
27
ds/25-1/1e/main.py
Normal file
@ -0,0 +1,27 @@
|
||||
import sys, time, math
|
||||
import numpy as np
|
||||
import cupy as cp
|
||||
|
||||
def measure(a, b):
|
||||
duration = 0
|
||||
cycles = 0
|
||||
while (duration < 1):
|
||||
start = time.perf_counter()
|
||||
c = a @ b
|
||||
cp.cuda.Stream.null.synchronize()
|
||||
end = time.perf_counter()
|
||||
|
||||
duration += end - start
|
||||
cycles += 1
|
||||
|
||||
return duration / cycles
|
||||
|
||||
n = 1024
|
||||
|
||||
a = np.random.rand(n, n).astype(np.float32)
|
||||
b = np.random.rand(n, n).astype(np.float32)
|
||||
print('numpy take', measure(a, b) * 1e6, 'usec')
|
||||
|
||||
a = cp.random.rand(n, n, dtype = cp.float32)
|
||||
b = cp.random.rand(n, n, dtype = cp.float32)
|
||||
print('cupy take', measure(a, b) * 1e6, 'usec')
|
||||
Reference in New Issue
Block a user