/*
 * memcpy - copy memory area
 *
- * Copyright (c) 2012-2020, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#define B_l x8
#define B_lw w8
#define B_h x9
- #define C_l x10
#define C_lw w10
- #define C_h x11
- #define D_l x12
- #define D_h x13
- #define E_l x14
- #define E_h x15
- #define F_l x16
- #define F_h x17
- #define G_l count
- #define G_h dst
- #define H_l src
- #define H_h srcend
#define tmp1 x14

+ #define A_q q0
+ #define B_q q1
+ #define C_q q2
+ #define D_q q3
+ #define E_q q4
+ #define F_q q5
+ #define G_q q6
+ #define H_q q7
+
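
The new A_q..H_q names are the 128-bit SIMD registers q0-q7, so a single ldr/str of one of them moves 16 bytes and an ldp/stp pair moves 32 -- twice what the deleted x-register pairs provide per instruction. As a rough C analogue (illustrative only, not part of the routine), one such 16-byte move corresponds to a NEON load/store intrinsic pair:

#include <arm_neon.h>
#include <stdint.h>

/* 16-byte copy via one 128-bit vector load and store: the C-intrinsics
   analogue of "ldr A_q, [src]" followed by "str A_q, [dstin]". */
static inline void copy16(uint8_t *dst, const uint8_t *src)
{
    uint8x16_t v = vld1q_u8(src);  /* unaligned 16-byte load */
    vst1q_u8(dst, v);              /* unaligned 16-byte store */
}
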
/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
- The destination pointer is 16-byte aligned to minimize unaligned accesses.
+ The source pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

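The overlap check referred to above (and implemented in L(copy_long) further down as "sub tmp1, dstin, src" followed by "cmp tmp1, count; b.lo L(copy_long_backwards)") boils down to one subtraction and one unsigned comparison. A minimal C sketch of that test; the helper name is invented for illustration:

#include <stddef.h>
#include <stdint.h>

/* A forward copy is unsafe only when dst points into the source range,
   i.e. dst - src < count.  Doing the subtraction as an unsigned value makes
   the dst < src case wrap around to a huge number, so a single compare picks
   the copy direction. */
static int use_backward_copy(const void *dst, const void *src, size_t count)
{
    return (uintptr_t)dst - (uintptr_t)src < count;
}
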
@@ -68,10 +65,10 @@ ENTRY (memcpy)
/* Small copies: 0..32 bytes. */
cmp count, 16
b.lo L(copy16)
- ldp A_l, A_h, [src]
- ldp D_l, D_h, [srcend, -16]
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
ret

/* Copy 8-15 bytes. */
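
The rewritten small-copy path above handles any length from 16 to 32 with two loads followed by two stores: the first 16 and the last 16 bytes of the buffer, which simply overlap when the length is below 32. Sketched in C under the assumption 16 <= count <= 32, with memcpy standing in for the single ldr/str q instructions:

#include <stddef.h>
#include <string.h>

/* Copy count bytes, 16 <= count <= 32: read the first and last 16 bytes,
   then write them to the start and end of the destination.  When count < 32
   the two blocks overlap, which is harmless; because both loads happen
   before either store, the sequence is also safe for overlapping buffers,
   which is what lets memmove share this entry point. */
static void copy16_32(unsigned char *dst, const unsigned char *src, size_t count)
{
    unsigned char head[16], tail[16];

    memcpy(head, src, 16);
    memcpy(tail, src + count - 16, 16);
    memcpy(dst, head, 16);
    memcpy(dst + count - 16, tail, 16);
}
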
@@ -109,134 +106,100 @@ L(copy0):
.p2align 4
/* Medium copies: 33..128 bytes. */
L(copy32_128):
- ldp A_l, A_h, [src]
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- ldp D_l, D_h, [srcend, -16]
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
cmp count, 64
b.hi L(copy128)
- stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
- stp D_l, D_h, [dstend, -16]
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
ret

.p2align 4
/* Copy 65..128 bytes. */
L(copy128):
- ldp E_l, E_h, [src, 32]
- ldp F_l, F_h, [src, 48]
+ ldp E_q, F_q, [src, 32]
cmp count, 96
b.ls L(copy96)
- ldp G_l, G_h, [srcend, -64]
- ldp H_l, H_h, [srcend, -48]
- stp G_l, G_h, [dstend, -64]
- stp H_l, H_h, [dstend, -48]
+ ldp G_q, H_q, [srcend, -64]
+ stp G_q, H_q, [dstend, -64]
L(copy96):
- stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstin, 16]
- stp E_l, E_h, [dstin, 32]
- stp F_l, F_h, [dstin, 48]
- stp C_l, C_h, [dstend, -32]
- stp D_l, D_h, [dstend, -16]
+ stp A_q, B_q, [dstin]
+ stp E_q, F_q, [dstin, 32]
+ stp C_q, D_q, [dstend, -32]
ret

- .p2align 4
/* Copy more than 128 bytes. */
L(copy_long):
/* Use backwards copy if there is an overlap. */
sub tmp1, dstin, src
- cbz tmp1, L(copy0)
cmp tmp1, count
b.lo L(copy_long_backwards)

- /* Copy 16 bytes and then align dst to 16-byte alignment. */
-
- ldp D_l, D_h, [src]
- and tmp1, dstin, 15
- bic dst, dstin, 15
- sub src, src, tmp1
+ /* Copy 16 bytes and then align src to 16-byte alignment. */
+ ldr D_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
- ldp A_l, A_h, [src, 16]
- stp D_l, D_h, [dstin]
- ldp B_l, B_h, [src, 32]
- ldp C_l, C_h, [src, 48]
- ldp D_l, D_h, [src, 64]!
+ ldp A_q, B_q, [src, 16]
+ str D_q, [dstin]
+ ldp C_q, D_q, [src, 48]
subs count, count, 128 + 16 /* Test and readjust count. */
b.ls L(copy64_from_end)
-
L(loop64):
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [src, 16]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [src, 32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [src, 48]
- stp D_l, D_h, [dst, 64]!
- ldp D_l, D_h, [src, 64]!
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [src, 80]
+ stp C_q, D_q, [dst, 48]
+ ldp C_q, D_q, [src, 112]
+ add src, src, 64
+ add dst, dst, 64
subs count, count, 64
b.hi L(loop64)

/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
- ldp E_l, E_h, [srcend, -64]
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [srcend, -48]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [srcend, -16]
- stp D_l, D_h, [dst, 64]
- stp E_l, E_h, [dstend, -64]
- stp A_l, A_h, [dstend, -48]
- stp B_l, B_h, [dstend, -32]
- stp C_l, C_h, [dstend, -16]
+ ldp E_q, F_q, [srcend, -64]
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dst, 48]
+ stp E_q, F_q, [dstend, -64]
+ stp A_q, B_q, [dstend, -32]
ret

- .p2align 4
-
/* Large backwards copy for overlapping copies.
- Copy 16 bytes and then align dst to 16-byte alignment. */
+ Copy 16 bytes and then align srcend to 16-byte alignment. */
L(copy_long_backwards):
- ldp D_l, D_h, [srcend, -16]
- and tmp1, dstend, 15
- sub srcend, srcend, tmp1
+ cbz tmp1, L(copy0)
+ ldr D_q, [srcend, -16]
+ and tmp1, srcend, 15
+ bic srcend, srcend, 15
sub count, count, tmp1
- ldp A_l, A_h, [srcend, -16]
- stp D_l, D_h, [dstend, -16]
- ldp B_l, B_h, [srcend, -32]
- ldp C_l, C_h, [srcend, -48]
- ldp D_l, D_h, [srcend, -64]!
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]
sub dstend, dstend, tmp1
subs count, count, 128
b.ls L(copy64_from_start)

L(loop64_backwards):
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [srcend, -16]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [srcend, -48]
- stp D_l, D_h, [dstend, -64]!
- ldp D_l, D_h, [srcend, -64]!
+ str B_q, [dstend, -16]
+ str A_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -96]
+ str D_q, [dstend, -48]
+ str C_q, [dstend, -64]!
+ ldp C_q, D_q, [srcend, -128]
+ sub srcend, srcend, 64
subs count, count, 64
b.hi L(loop64_backwards)

/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
- ldp G_l, G_h, [src, 48]
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [src, 32]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [src, 16]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [src]
- stp D_l, D_h, [dstend, -64]
- stp G_l, G_h, [dstin, 48]
- stp A_l, A_h, [dstin, 32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin]
+ ldp E_q, F_q, [src, 32]
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [src]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp A_q, B_q, [dstin]
ret

END (memcpy)
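
Taken together, the new forward path for large copies reads: copy the first 16 bytes, round src down to a 16-byte boundary (the old code aligned dst instead), stream 64 bytes per iteration with loads issued a step ahead of the stores, and finish by unconditionally copying 64 bytes from the end. The backwards path is the mirror image for overlapping buffers. A rough, non-pipelined C rendering of the forward case, assuming the buffers do not overlap; all names here are mine, not the library's:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Forward large copy (count > 128).  Keeps the shape of the assembly above:
   copy the first 16 bytes, align src down to 16, run a 64-byte-per-iteration
   loop, then cover whatever is left with one 64-byte block from the current
   position plus the last 64 bytes from the end. */
static void copy_large_forward(unsigned char *dstin, const unsigned char *src,
                               size_t count)
{
    unsigned char *const dstend = dstin + count;
    const unsigned char *const srcend = src + count;
    size_t skew = (uintptr_t)src & 15;

    memcpy(dstin, src, 16);             /* head: first 16 bytes, unaligned */
    src -= skew;                        /* src is now 16-byte aligned */
    unsigned char *dst = dstin - skew;  /* keep dst in step with src */
    count += skew;                      /* count is now 16 too large */

    size_t offset = 16;                 /* the first 16 bytes are done */
    size_t remaining = count - 16;
    while (remaining > 128) {           /* main loop: 64 bytes per step */
        memcpy(dst + offset, src + offset, 64);
        offset += 64;
        remaining -= 64;
    }
    /* At most 128 bytes left: one block from the current position plus the
       last 64 bytes from the end always cover them; any overlap in the
       middle just rewrites the same data. */
    memcpy(dst + offset, src + offset, 64);
    memcpy(dstend - 64, srcend - 64, 64);
}

The real loop keeps four Q registers in flight so each iteration's stores use data loaded on the previous pass; the plain memcpy calls above flatten that pipelining detail.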