/*
 * memcpy - copy memory area
 *
- * Copyright (c) 2012-2020, Arm Limited.
+ * Copyright (c) 2019-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#define B_l x8
#define B_lw w8
#define B_h x9
- #define C_l x10
#define C_lw w10
- #define C_h x11
- #define D_l x12
- #define D_h x13
- #define E_l x14
- #define E_h x15
- #define F_l x16
- #define F_h x17
- #define G_l count
- #define G_h dst
- #define H_l src
- #define H_h srcend
#define tmp1 x14

+ #define A_q q0
+ #define B_q q1
+ #define C_q q2
+ #define D_q q3
+ #define E_q q4
+ #define F_q q5
+ #define G_q q6
+ #define H_q q7
+
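
The new A_q..H_q names are the 128-bit SIMD registers q0-q7, so a single ldr/str of one of them moves 16 bytes and an ldp/stp pair moves 32 -- twice what the deleted x-register pairs provide per instruction. As a rough C analogue (illustrative only, not part of the routine), one such 16-byte move corresponds to a NEON load/store intrinsic pair:

#include <arm_neon.h>
#include <stdint.h>

/* 16-byte copy via one 128-bit vector load and store: the C-intrinsics
   analogue of "ldr A_q, [src]" followed by "str A_q, [dstin]". */
static inline void copy16(uint8_t *dst, const uint8_t *src)
{
    uint8x16_t v = vld1q_u8(src);  /* unaligned 16-byte load */
    vst1q_u8(dst, v);              /* unaligned 16-byte store */
}
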
/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
- The destination pointer is 16-byte aligned to minimize unaligned accesses.
+ The source pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

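The overlap check referred to above (and implemented in L(copy_long) further down as "sub tmp1, dstin, src" followed by "cmp tmp1, count; b.lo L(copy_long_backwards)") boils down to one subtraction and one unsigned comparison. A minimal C sketch of that test; the helper name is invented for illustration:

#include <stddef.h>
#include <stdint.h>

/* A forward copy is unsafe only when dst points into the source range,
   i.e. dst - src < count.  Doing the subtraction as an unsigned value makes
   the dst < src case wrap around to a huge number, so a single compare picks
   the copy direction. */
static int use_backward_copy(const void *dst, const void *src, size_t count)
{
    return (uintptr_t)dst - (uintptr_t)src < count;
}
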
@@ -68,10 +65,10 @@ ENTRY (memcpy)
/* Small copies: 0..32 bytes. */
cmp count, 16
b.lo L(copy16)
- ldp A_l, A_h, [src]
- ldp D_l, D_h, [srcend, -16]
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
ret

/* Copy 8-15 bytes. */
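
The rewritten small-copy path above handles any length from 16 to 32 with two loads followed by two stores: the first 16 and the last 16 bytes of the buffer, which simply overlap when the length is below 32. Sketched in C under the assumption 16 <= count <= 32, with memcpy standing in for the single ldr/str q instructions:

#include <stddef.h>
#include <string.h>

/* Copy count bytes, 16 <= count <= 32: read the first and last 16 bytes,
   then write them to the start and end of the destination.  When count < 32
   the two blocks overlap, which is harmless; because both loads happen
   before either store, the sequence is also safe for overlapping buffers,
   which is what lets memmove share this entry point. */
static void copy16_32(unsigned char *dst, const unsigned char *src, size_t count)
{
    unsigned char head[16], tail[16];

    memcpy(head, src, 16);
    memcpy(tail, src + count - 16, 16);
    memcpy(dst, head, 16);
    memcpy(dst + count - 16, tail, 16);
}
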
@@ -109,134 +106,100 @@ L(copy0):
.p2align 4
/* Medium copies: 33..128 bytes. */
L(copy32_128):
- ldp A_l, A_h, [src]
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- ldp D_l, D_h, [srcend, -16]
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
cmp count, 64
b.hi L(copy128)
- stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
- stp D_l, D_h, [dstend, -16]
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
ret

.p2align 4
/* Copy 65..128 bytes. */
L(copy128):
- ldp E_l, E_h, [src, 32]
- ldp F_l, F_h, [src, 48]
+ ldp E_q, F_q, [src, 32]
cmp count, 96
b.ls L(copy96)
- ldp G_l, G_h, [srcend, -64]
- ldp H_l, H_h, [srcend, -48]
- stp G_l, G_h, [dstend, -64]
- stp H_l, H_h, [dstend, -48]
+ ldp G_q, H_q, [srcend, -64]
+ stp G_q, H_q, [dstend, -64]
L(copy96):
- stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstin, 16]
- stp E_l, E_h, [dstin, 32]
- stp F_l, F_h, [dstin, 48]
- stp C_l, C_h, [dstend, -32]
- stp D_l, D_h, [dstend, -16]
+ stp A_q, B_q, [dstin]
+ stp E_q, F_q, [dstin, 32]
+ stp C_q, D_q, [dstend, -32]
ret

- .p2align 4
/* Copy more than 128 bytes. */
L(copy_long):
/* Use backwards copy if there is an overlap. */
sub tmp1, dstin, src
- cbz tmp1, L(copy0)
cmp tmp1, count
b.lo L(copy_long_backwards)

- /* Copy 16 bytes and then align dst to 16-byte alignment. */
-
- ldp D_l, D_h, [src]
- and tmp1, dstin, 15
- bic dst, dstin, 15
- sub src, src, tmp1
+ /* Copy 16 bytes and then align src to 16-byte alignment. */
+ ldr D_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
- ldp A_l, A_h, [src, 16]
- stp D_l, D_h, [dstin]
- ldp B_l, B_h, [src, 32]
- ldp C_l, C_h, [src, 48]
- ldp D_l, D_h, [src, 64]!
+ ldp A_q, B_q, [src, 16]
+ str D_q, [dstin]
+ ldp C_q, D_q, [src, 48]
subs count, count, 128 + 16 /* Test and readjust count. */
b.ls L(copy64_from_end)
-
L(loop64):
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [src, 16]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [src, 32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [src, 48]
- stp D_l, D_h, [dst, 64]!
- ldp D_l, D_h, [src, 64]!
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [src, 80]
+ stp C_q, D_q, [dst, 48]
+ ldp C_q, D_q, [src, 112]
+ add src, src, 64
+ add dst, dst, 64
subs count, count, 64
b.hi L(loop64)

/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
- ldp E_l, E_h, [srcend, -64]
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [srcend, -48]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [srcend, -16]
- stp D_l, D_h, [dst, 64]
- stp E_l, E_h, [dstend, -64]
- stp A_l, A_h, [dstend, -48]
- stp B_l, B_h, [dstend, -32]
- stp C_l, C_h, [dstend, -16]
+ ldp E_q, F_q, [srcend, -64]
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dst, 48]
+ stp E_q, F_q, [dstend, -64]
+ stp A_q, B_q, [dstend, -32]
ret

- .p2align 4
-
/* Large backwards copy for overlapping copies.
- Copy 16 bytes and then align dst to 16-byte alignment. */
+ Copy 16 bytes and then align srcend to 16-byte alignment. */
L(copy_long_backwards):
- ldp D_l, D_h, [srcend, -16]
- and tmp1, dstend, 15
- sub srcend, srcend, tmp1
+ cbz tmp1, L(copy0)
+ ldr D_q, [srcend, -16]
+ and tmp1, srcend, 15
+ bic srcend, srcend, 15
sub count, count, tmp1
- ldp A_l, A_h, [srcend, -16]
- stp D_l, D_h, [dstend, -16]
- ldp B_l, B_h, [srcend, -32]
- ldp C_l, C_h, [srcend, -48]
- ldp D_l, D_h, [srcend, -64]!
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]
sub dstend, dstend, tmp1
subs count, count, 128
b.ls L(copy64_from_start)

L(loop64_backwards):
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [srcend, -16]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [srcend, -48]
- stp D_l, D_h, [dstend, -64]!
- ldp D_l, D_h, [srcend, -64]!
+ str B_q, [dstend, -16]
+ str A_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -96]
+ str D_q, [dstend, -48]
+ str C_q, [dstend, -64]!
+ ldp C_q, D_q, [srcend, -128]
+ sub srcend, srcend, 64
subs count, count, 64
b.hi L(loop64_backwards)

/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
- ldp G_l, G_h, [src, 48]
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [src, 32]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [src, 16]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [src]
- stp D_l, D_h, [dstend, -64]
- stp G_l, G_h, [dstin, 48]
- stp A_l, A_h, [dstin, 32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin]
+ ldp E_q, F_q, [src, 32]
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [src]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp A_q, B_q, [dstin]
ret

END (memcpy)
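
Taken together, the new forward path for large copies reads: copy the first 16 bytes, round src down to a 16-byte boundary (the old code aligned dst instead), stream 64 bytes per iteration with loads issued a step ahead of the stores, and finish by unconditionally copying 64 bytes from the end. The backwards path is the mirror image for overlapping buffers. A rough, non-pipelined C rendering of the forward case, assuming the buffers do not overlap; all names here are mine, not the library's:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Forward large copy (count > 128).  Keeps the shape of the assembly above:
   copy the first 16 bytes, align src down to 16, run a 64-byte-per-iteration
   loop, then cover whatever is left with one 64-byte block from the current
   position plus the last 64 bytes from the end. */
static void copy_large_forward(unsigned char *dstin, const unsigned char *src,
                               size_t count)
{
    unsigned char *const dstend = dstin + count;
    const unsigned char *const srcend = src + count;
    size_t skew = (uintptr_t)src & 15;

    memcpy(dstin, src, 16);             /* head: first 16 bytes, unaligned */
    src -= skew;                        /* src is now 16-byte aligned */
    unsigned char *dst = dstin - skew;  /* keep dst in step with src */
    count += skew;                      /* count is now 16 too large */

    size_t offset = 16;                 /* the first 16 bytes are done */
    size_t remaining = count - 16;
    while (remaining > 128) {           /* main loop: 64 bytes per step */
        memcpy(dst + offset, src + offset, 64);
        offset += 64;
        remaining -= 64;
    }
    /* At most 128 bytes left: one block from the current position plus the
       last 64 bytes from the end always cover them; any overlap in the
       middle just rewrites the same data. */
    memcpy(dst + offset, src + offset, 64);
    memcpy(dstend - 64, srcend - 64, 64);
}

The real loop keeps four Q registers in flight so each iteration's stores use data loaded on the previous pass; the plain memcpy calls above flatten that pipelining detail.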