|
26 | 26 | #include "../../../common.h" |
27 | 27 | #if defined(MLD_ARITH_BACKEND_AARCH64) |
28 | 28 |
|
// vins: insert the 64-bit GPR \gpr_in into 64-bit lane \lane (0 or 1)
// of NEON register \vec_out, via INS (element from general register).
| 29 | +.macro vins vec_out, gpr_in, lane |
| 30 | + ins \vec_out\().d[\lane], \gpr_in |
| 31 | +.endm |
| 32 | + |
// ldr_vo: load 16 bytes at [\base + \offset] into vector \vec using two
// 64-bit GPR loads followed by lane inserts. Clobbers xtmp0 and xtmp1.
// NOTE(review): functional replacement for `ldr qN, [base, #offset]`;
// the motivation for avoiding the Q-form load is not visible in this
// chunk — confirm against the commit description.
| 33 | +.macro ldr_vo vec, base, offset |
| 34 | + ldr xtmp0, [\base, #\offset] |
| 35 | + ldr xtmp1, [\base, #(\offset+8)] |
| 36 | + vins \vec, xtmp0, 0 |
| 37 | + vins \vec, xtmp1, 1 |
| 38 | +.endm |
| 39 | + |
// ldr_vi: load 16 bytes at [\base] into vector \vec and post-increment
// \base by \inc. The first ldr already advances \base, so the second
// load uses offset (-\inc + 8) to address original_base + 8.
// Clobbers xtmp0 and xtmp1. Replacement for `ldr qN, [base], #inc`.
| 40 | +.macro ldr_vi vec, base, inc |
| 41 | + ldr xtmp0, [\base], #\inc |
| 42 | + ldr xtmp1, [\base, #(-\inc+8)] |
| 43 | + vins \vec, xtmp0, 0 |
| 44 | + vins \vec, xtmp1, 1 |
| 45 | +.endm |
| 46 | + |
29 | 47 | .macro mulmodq dst, src, const, idx0, idx1 |
30 | 48 | sqrdmulh t2.4s, \src\().4s, \const\().s[\idx1\()] |
31 | 49 | mul \dst\().4s, \src\().4s, \const\().s[\idx0\()] |
|
51 | 69 | .endm |
52 | 70 |
|
// load_roots_123: load four consecutive 16-byte root vectors
// (root0..root3) from r012345_ptr and advance the pointer by 64 bytes.
// The ldr_vi on root0 performs the pointer advance; the remaining
// ldr_vo loads compensate with negative offsets relative to the
// already-advanced pointer.
53 | 71 | .macro load_roots_123 |
54 | | - ldr q_root0, [r012345_ptr], #64 |
55 | | - ldr q_root1, [r012345_ptr, #(-64 + 16)] |
56 | | - ldr q_root2, [r012345_ptr, #(-64 + 32)] |
57 | | - ldr q_root3, [r012345_ptr, #(-64 + 48)] |
| 72 | + ldr_vi root0, r012345_ptr, 64 |
| 73 | + ldr_vo root1, r012345_ptr, (-64 + 16) |
| 74 | + ldr_vo root2, r012345_ptr, (-64 + 32) |
| 75 | + ldr_vo root3, r012345_ptr, (-64 + 48) |
58 | 76 | .endm |
59 | 77 |
|
// load_roots_456: identical load pattern to load_roots_123 — fetch the
// next four 16-byte root vectors from r012345_ptr, advancing it by 64.
// Kept as a separate macro, presumably so layers 1-3 and 4-6 read as
// distinct phases at the call sites.
60 | 78 | .macro load_roots_456 |
61 | | - ldr q_root0, [r012345_ptr], #64 |
62 | | - ldr q_root1, [r012345_ptr, #(-64 + 16)] |
63 | | - ldr q_root2, [r012345_ptr, #(-64 + 32)] |
64 | | - ldr q_root3, [r012345_ptr, #(-64 + 48)] |
| 79 | + ldr_vi root0, r012345_ptr, 64 |
| 80 | + ldr_vo root1, r012345_ptr, (-64 + 16) |
| 81 | + ldr_vo root2, r012345_ptr, (-64 + 32) |
| 82 | + ldr_vo root3, r012345_ptr, (-64 + 48) |
65 | 83 | .endm |
66 | 84 |
|
// load_roots_78_part1: load the first half (entries 0..5) of a
// 12-vector root table from r67_ptr: three roots plus their twisted
// ("_tw") companions, interleaved. The ldr_vi on root0 advances
// r67_ptr by the full table size (12*16 = 192 bytes); subsequent loads
// use negative offsets to reach entries 1..5.
67 | 85 | .macro load_roots_78_part1 |
68 | | - ldr q_root0, [r67_ptr], #(12*16) |
69 | | - ldr q_root0_tw, [r67_ptr, #(-12*16 + 1*16)] |
70 | | - ldr q_root1, [r67_ptr, #(-12*16 + 2*16)] |
71 | | - ldr q_root1_tw, [r67_ptr, #(-12*16 + 3*16)] |
72 | | - ldr q_root2, [r67_ptr, #(-12*16 + 4*16)] |
73 | | - ldr q_root2_tw, [r67_ptr, #(-12*16 + 5*16)] |
| 86 | + ldr_vi root0, r67_ptr, (12*16) |
| 87 | + ldr_vo root0_tw, r67_ptr, (-12*16 + 1*16) |
| 88 | + ldr_vo root1, r67_ptr, (-12*16 + 2*16) |
| 89 | + ldr_vo root1_tw, r67_ptr, (-12*16 + 3*16) |
| 90 | + ldr_vo root2, r67_ptr, (-12*16 + 4*16) |
| 91 | + ldr_vo root2_tw, r67_ptr, (-12*16 + 5*16) |
74 | 92 | .endm |
75 | 93 |
|
// load_roots_78_part2: load the second half (entries 6..11) of the
// 12-vector root table into the same root0..root2 / _tw registers.
// r67_ptr was already advanced past the table by part1's ldr_vi, so
// all six loads here are plain offset loads (no further pointer
// update). Must only be used after load_roots_78_part1.
76 | 94 | .macro load_roots_78_part2 |
77 | | - ldr q_root0, [r67_ptr, #(-12*16 + 6*16)] |
78 | | - ldr q_root0_tw, [r67_ptr, #(-12*16 + 7*16)] |
79 | | - ldr q_root1, [r67_ptr, #(-12*16 + 8*16)] |
80 | | - ldr q_root1_tw, [r67_ptr, #(-12*16 + 9*16)] |
81 | | - ldr q_root2, [r67_ptr, #(-12*16 + 10*16)] |
82 | | - ldr q_root2_tw, [r67_ptr, #(-12*16 + 11*16)] |
| 95 | + ldr_vo root0, r67_ptr, (-12*16 + 6*16) |
| 96 | + ldr_vo root0_tw, r67_ptr, (-12*16 + 7*16) |
| 97 | + ldr_vo root1, r67_ptr, (-12*16 + 8*16) |
| 98 | + ldr_vo root1_tw, r67_ptr, (-12*16 + 9*16) |
| 99 | + ldr_vo root2, r67_ptr, (-12*16 + 10*16) |
| 100 | + ldr_vo root2_tw, r67_ptr, (-12*16 + 11*16) |
83 | 101 | .endm |
84 | 102 |
|
85 | 103 | .macro transpose4 data0, data1, data2, data3 |
|
129 | 147 | xtmp .req x6 |
130 | 148 | wtmp .req w6 |
131 | 149 |
|
// Scratch GPRs used by the ldr_vo / ldr_vi / vins macros.
// NOTE(review): xtmp0 (x6) aliases the pre-existing xtmp (x6) —
// presumably deliberate scratch-register sharing; confirm xtmp is
// never live across a ldr_vo/ldr_vi expansion.
| 150 | + xtmp0 .req x6 |
| 151 | + xtmp1 .req x7 |
| 152 | + |
132 | 153 | data0 .req v9 |
133 | 154 | data1 .req v10 |
134 | 155 | data2 .req v11 |
@@ -193,14 +214,14 @@ MLD_ASM_FN_SYMBOL(ntt_asm) |
193 | 214 |
|
194 | 215 | .p2align 2 |
195 | 216 | layer123_start: |
// Load data0..data7 from `in` at a stride of 1024/8 = 128 bytes,
// now via GPR-pair loads (ldr_vo) instead of Q-register ldr.
// NOTE(review): the butterfly body of this loop continues beyond the
// visible chunk.
196 | | - ldr q_data0, [in, #(0*(1024/8))] |
197 | | - ldr q_data1, [in, #(1*(1024/8))] |
198 | | - ldr q_data2, [in, #(2*(1024/8))] |
199 | | - ldr q_data3, [in, #(3*(1024/8))] |
200 | | - ldr q_data4, [in, #(4*(1024/8))] |
201 | | - ldr q_data5, [in, #(5*(1024/8))] |
202 | | - ldr q_data6, [in, #(6*(1024/8))] |
203 | | - ldr q_data7, [in, #(7*(1024/8))] |
| 217 | + ldr_vo data0, in, (0*(1024/8)) |
| 218 | + ldr_vo data1, in, (1*(1024/8)) |
| 219 | + ldr_vo data2, in, (2*(1024/8)) |
| 220 | + ldr_vo data3, in, (3*(1024/8)) |
| 221 | + ldr_vo data4, in, (4*(1024/8)) |
| 222 | + ldr_vo data5, in, (5*(1024/8)) |
| 223 | + ldr_vo data6, in, (6*(1024/8)) |
| 224 | + ldr_vo data7, in, (7*(1024/8)) |
204 | 225 |
|
205 | 226 | ct_butterfly data0, data4, root0, 0, 1 |
206 | 227 | ct_butterfly data1, data5, root0, 0, 1 |
@@ -245,14 +266,14 @@ layer123_start: |
245 | 266 |
|
246 | 267 | .p2align 2 |
247 | 268 | layer45678_start: |
// Load four contiguous 16-byte vectors each from in+64 and inpp+64.
// The subsequent `add in/inpp, #64` (just past this hunk) performs the
// pointer advance that the fixed +64 offsets here anticipate.
// NOTE(review): this loop body also continues beyond the visible chunk.
248 | | - ldr q_data0, [in, #(64 + 16*0)] |
249 | | - ldr q_data1, [in, #(64 + 16*1)] |
250 | | - ldr q_data2, [in, #(64 + 16*2)] |
251 | | - ldr q_data3, [in, #(64 + 16*3)] |
252 | | - ldr q_data4, [inpp, #(64 + 16*0)] |
253 | | - ldr q_data5, [inpp, #(64 + 16*1)] |
254 | | - ldr q_data6, [inpp, #(64 + 16*2)] |
255 | | - ldr q_data7, [inpp, #(64 + 16*3)] |
| 269 | + ldr_vo data0, in, (64 + 16*0) |
| 270 | + ldr_vo data1, in, (64 + 16*1) |
| 271 | + ldr_vo data2, in, (64 + 16*2) |
| 272 | + ldr_vo data3, in, (64 + 16*3) |
| 273 | + ldr_vo data4, inpp, (64 + 16*0) |
| 274 | + ldr_vo data5, inpp, (64 + 16*1) |
| 275 | + ldr_vo data6, inpp, (64 + 16*2) |
| 276 | + ldr_vo data7, inpp, (64 + 16*3) |
256 | 277 |
|
257 | 278 | add in, in, #64 |
258 | 279 | add inpp, inpp, #64 |
|
0 commit comments