14
14
## 测试
15
15
16
16
``` bash
17
- # 只测试Ada架构 不指定默认编译所有架构 耗时较长
17
+ # 只测试 Ada 架构;若不指定,则默认编译所有架构(Volta, Ampere, Ada, Hopper, ...),耗时较长
18
18
export TORCH_CUDA_ARCH_LIST=Ada
19
19
python3 dot_product.py
20
20
```
@@ -23,13 +23,102 @@ python3 dot_product.py
23
23
24
24
``` bash
25
25
--------------------------------------------------------------------------------
26
- out_f32f32: -1534.59301758 , time:0.17350578ms
27
- out_f32x4f32: -1534.61364746 , time:0.18058038ms
28
- out_f32f32_th: -1534.61157227 , time:0.18307972ms
26
+ S=1024, K=1024
27
+ out_f32f32: -670.21264648 , time:0.08947158ms
28
+ out_f32x4f32: -670.21435547 , time:0.02821302ms
29
+ out_f32f32_th: -670.21374512 , time:0.09709382ms
29
30
--------------------------------------------------------------------------------
30
- out_f16f32: -1538.26318359 , time:0.10106802ms
31
- out_f16x2f32: -1537.58288574 , time:0.05217433ms
32
- out_f16x8packf32: -1536.44006348 , time:0.02096844ms
33
- out_f16f16_th: -1536.00000000 , time:0.02491832ms
31
+ out_f16f32: -670.32208252 , time:0.04000235ms
32
+ out_f16x2f32: -670.15814209 , time:0.05491829ms
33
+ out_f16x8packf32: -669.90997314 , time:0.01669478ms
34
+ out_f16f16_th: -670.50000000 , time:0.02021313ms
35
+ --------------------------------------------------------------------------------
36
+ --------------------------------------------------------------------------------
37
+ S=1024, K=2048
38
+ out_f32f32: 1040.51086426 , time:0.04557490ms
39
+ out_f32x4f32: 1040.50720215 , time:0.06275582ms
40
+ out_f32f32_th: 1040.50842285 , time:0.04762864ms
41
+ --------------------------------------------------------------------------------
42
+ out_f16f32: 1041.44299316 , time:0.03214121ms
43
+ out_f16x2f32: 1041.79589844 , time:0.03448486ms
44
+ out_f16x8packf32: 1042.22717285 , time:0.02689457ms
45
+ out_f16f16_th: 1041.00000000 , time:0.02859521ms
46
+ --------------------------------------------------------------------------------
47
+ --------------------------------------------------------------------------------
48
+ S=1024, K=4096
49
+ out_f32f32: -1859.81457520 , time:0.08664179ms
50
+ out_f32x4f32: -1859.81628418 , time:0.08621526ms
51
+ out_f32f32_th: -1859.81933594 , time:0.08647323ms
52
+ --------------------------------------------------------------------------------
53
+ out_f16f32: -1860.23291016 , time:0.05826116ms
54
+ out_f16x2f32: -1860.91186523 , time:0.04677963ms
55
+ out_f16x8packf32: -1860.25988770 , time:0.04591107ms
56
+ out_f16f16_th: -1861.00000000 , time:0.04904127ms
57
+ --------------------------------------------------------------------------------
58
+ --------------------------------------------------------------------------------
59
+ S=2048, K=1024
60
+ out_f32f32: 858.98229980 , time:0.04499865ms
61
+ out_f32x4f32: 858.98461914 , time:0.04623890ms
62
+ out_f32f32_th: 858.98376465 , time:0.06848693ms
63
+ --------------------------------------------------------------------------------
64
+ out_f16f32: 858.85339355 , time:0.03274632ms
65
+ out_f16x2f32: 858.94274902 , time:0.02831578ms
66
+ out_f16x8packf32: 859.46844482 , time:0.02884459ms
67
+ out_f16f16_th: 859.00000000 , time:0.03692698ms
68
+ --------------------------------------------------------------------------------
69
+ --------------------------------------------------------------------------------
70
+ S=2048, K=2048
71
+ out_f32f32: -1205.77990723 , time:0.08356524ms
72
+ out_f32x4f32: -1205.77624512 , time:0.08583307ms
73
+ out_f32f32_th: -1205.77807617 , time:0.08613133ms
74
+ --------------------------------------------------------------------------------
75
+ out_f16f32: -1205.40588379 , time:0.06001544ms
76
+ out_f16x2f32: -1205.29028320 , time:0.04738235ms
77
+ out_f16x8packf32: -1205.72924805 , time:0.04624581ms
78
+ out_f16f16_th: -1205.00000000 , time:0.04907203ms
79
+ --------------------------------------------------------------------------------
80
+ --------------------------------------------------------------------------------
81
+ S=2048, K=4096
82
+ out_f32f32: -893.49169922 , time:0.16136765ms
83
+ out_f32x4f32: -893.48596191 , time:0.16174912ms
84
+ out_f32f32_th: -893.48901367 , time:0.16518927ms
85
+ --------------------------------------------------------------------------------
86
+ out_f16f32: -894.42169189 , time:0.11468077ms
87
+ out_f16x2f32: -894.61779785 , time:0.08950567ms
88
+ out_f16x8packf32: -895.26538086 , time:0.08448958ms
89
+ out_f16f16_th: -894.00000000 , time:0.09156108ms
90
+ --------------------------------------------------------------------------------
91
+ --------------------------------------------------------------------------------
92
+ S=4096, K=1024
93
+ out_f32f32: 141.78890991 , time:0.08385873ms
94
+ out_f32x4f32: 141.78639221 , time:0.08500123ms
95
+ out_f32f32_th: 141.78683472 , time:0.08647728ms
96
+ --------------------------------------------------------------------------------
97
+ out_f16f32: 141.80113220 , time:0.05876780ms
98
+ out_f16x2f32: 141.62113953 , time:0.04708385ms
99
+ out_f16x8packf32: 141.15240479 , time:0.04586506ms
100
+ out_f16f16_th: 141.50000000 , time:0.04933500ms
101
+ --------------------------------------------------------------------------------
102
+ --------------------------------------------------------------------------------
103
+ S=4096, K=2048
104
+ out_f32f32: -1238.80456543 , time:0.16236329ms
105
+ out_f32x4f32: -1238.80737305 , time:0.16246724ms
106
+ out_f32f32_th: -1238.80859375 , time:0.16496468ms
107
+ --------------------------------------------------------------------------------
108
+ out_f16f32: -1238.78466797 , time:0.11416745ms
109
+ out_f16x2f32: -1239.28540039 , time:0.08488607ms
110
+ out_f16x8packf32: -1238.85302734 , time:0.08867455ms
111
+ out_f16f16_th: -1239.00000000 , time:0.09029007ms
112
+ --------------------------------------------------------------------------------
113
+ --------------------------------------------------------------------------------
114
+ S=4096, K=4096
115
+ out_f32f32: 556.32690430 , time:0.31692672ms
116
+ out_f32x4f32: 556.33087158 , time:0.31752276ms
117
+ out_f32f32_th: 556.32879639 , time:0.32040811ms
118
+ --------------------------------------------------------------------------------
119
+ out_f16f32: 554.45031738 , time:0.23417449ms
120
+ out_f16x2f32: 553.61444092 , time:0.16469955ms
121
+ out_f16x8packf32: 554.04040527 , time:0.16465998ms
122
+ out_f16f16_th: 554.50000000 , time:0.17046404ms
34
123
--------------------------------------------------------------------------------
35
124
```
0 commit comments