-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.cu
103 lines (90 loc) · 2.47 KB
/
utils.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#include <time.h>
#include <stdint.h>
#include <bits/stdc++.h>
#include <cuda_fp16.h>
using namespace std;
// Integer ceiling division: for a >= 0 and b > 0, yields the smallest q such
// that q * b >= a (e.g. the number of blocks of size b needed to cover a items).
// Every use of each argument is parenthesized so that expression arguments
// such as `1 << k` or `cond ? x : y` expand with the intended precedence.
// NOTE: `b` is evaluated more than once; do not pass expressions with side
// effects.
#define CEIL_DIV(a, b) (((a) + ((b) - 1)) / (b))
// Fills a[0 .. numel) with pseudo-random values in [0, 1] drawn from rand().
//
// a:     destination buffer with room for at least numel floats.
// numel: number of elements to write; nothing is written when numel <= 0.
//
// rand() returns a value in [0, RAND_MAX], so RAND_MAX (not INT_MAX) is the
// divisor that maps its full range onto [0, 1] regardless of platform. The
// generated sequence depends on the seed previously set with srand().
void matrix_random(float *a, int numel)
{
    for (int i = 0; i < numel; ++i)
    {
        // Divide in double precision, then narrow to float on store.
        a[i] = static_cast<float>(static_cast<double>(rand()) / RAND_MAX);
    }
}
// Fills a[0 .. numel) with pseudo-random values in [0, 1] that are stored as
// float but have first been rounded through half precision
// (__float2half followed by __half2float), so each stored value is one that
// survives a float -> half -> float round trip unchanged.
//
// a:     destination buffer with room for at least numel floats.
// numel: number of elements to write; nothing is written when numel <= 0.
//
// rand() returns a value in [0, RAND_MAX], so RAND_MAX (not INT_MAX) is the
// divisor that maps its full range onto [0, 1] regardless of platform.
void matrix_random_fp16valued(float *a, int numel)
{
    for (int i = 0; i < numel; ++i)
    {
        const float unit = static_cast<float>(static_cast<double>(rand()) / RAND_MAX);
        a[i] = __half2float(__float2half(unit));
    }
}
// Writes 0 to each of the first numel floats of a.
// Nothing is written when numel is zero or negative.
void matrix_zeros(float *a, int numel)
{
    float *cursor = a;
    for (int remaining = numel; remaining > 0; --remaining)
    {
        *cursor++ = 0.0f;
    }
}
// Host-side reference product of two square N x N matrices: c = a * b.
// All three buffers are indexed row-major (element (r, col) at r * N + col).
// Each output element is a float accumulation over k = 0 .. N-1, in that order.
void matmul_c(float *a, float *b, float *c, int N)
{
    for (int row = 0; row < N; ++row)
    {
        const float *lhs_row = a + row * N;
        float *out_row = c + row * N;
        for (int col = 0; col < N; ++col)
        {
            float dot = 0.0f;
            for (int k = 0; k < N; ++k)
            {
                dot += lhs_row[k] * b[k * N + col];
            }
            out_row[col] = dot;
        }
    }
}
// Verifies that two square N x N matrices (row-major) agree element-wise.
//
// a: expected (reference) values.
// b: values under test.
// N: matrix dimension; both buffers are read at indices 0 .. N*N-1.
//
// On the first element whose absolute difference is not within 2e-2, the
// mismatch is printed to stdout and the process exits with status 1.
// Otherwise the function returns normally.
void matrix_eq(float *a, float *b, int N)
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            const float expected = a[i * N + j];
            const float actual = b[i * N + j];
            // Written as !(diff <= tol) rather than (diff > tol): every
            // ordered comparison with NaN is false, so the latter form would
            // let a NaN result pass verification silently.
            if (!(fabs(actual - expected) <= 2e-2))
            {
                printf("ERROR at i=%d j=%d (should be %f, is %f)\n", i, j, expected, actual);
                exit(1);
            }
        }
    }
}
// from https://github.com/siboehm/SGEMM_CUDA/blob/master/src/runner.cu#L24
//
// Prints properties of the currently selected CUDA device to stdout: id, name,
// compute capability, memory bus width, thread and register limits, memory
// sizes, multiprocessor count and warp size.
//
// If either runtime query fails, the CUDA error string is written to stderr
// and the function returns without printing the property table, rather than
// printing zero-initialized properties as if they were real.
void CudaDeviceInfo()
{
    int deviceId = 0;
    cudaError_t status = cudaGetDevice(&deviceId);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDevice failed: %s\n", cudaGetErrorString(status));
        return;
    }

    cudaDeviceProp props{};
    status = cudaGetDeviceProperties(&props, deviceId);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(status));
        return;
    }

    // Adjacent string literals are concatenated by the compiler into a single
    // format string. Memory sizes are reported in MB / KB via integer division.
    printf("Device ID: %d\n"
           "Name: %s\n"
           "Compute Capability: %d.%d\n"
           "memoryBusWidth: %d\n"
           "maxThreadsPerBlock: %d\n"
           "maxThreadsPerMultiProcessor: %d\n"
           "maxRegsPerBlock: %d\n"
           "maxRegsPerMultiProcessor: %d\n"
           "totalGlobalMem: %zuMB\n"
           "sharedMemPerBlock: %zuKB\n"
           "sharedMemPerMultiprocessor: %zuKB\n"
           "totalConstMem: %zuKB\n"
           "multiProcessorCount: %d\n"
           "Warp Size: %d\n",
           deviceId, props.name, props.major, props.minor, props.memoryBusWidth,
           props.maxThreadsPerBlock, props.maxThreadsPerMultiProcessor,
           props.regsPerBlock, props.regsPerMultiprocessor,
           props.totalGlobalMem / 1024 / 1024, props.sharedMemPerBlock / 1024,
           props.sharedMemPerMultiprocessor / 1024, props.totalConstMem / 1024,
           props.multiProcessorCount, props.warpSize);
}
// Reads CLOCK_MONOTONIC_RAW and returns the reading as a single count of
// nanoseconds (seconds * 1e9 + nanoseconds), for measuring elapsed intervals
// by subtracting two readings.
uint64_t nanos()
{
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC_RAW, &now);
    const uint64_t whole_seconds = (uint64_t)now.tv_sec;
    const uint64_t fraction_ns = (uint64_t)now.tv_nsec;
    return whole_seconds * 1000000000 + fraction_ns;
}