From d6de6080de363fe5aa24f17478066e2717a2c65e Mon Sep 17 00:00:00 2001
From: matthewkeil
Date: Thu, 25 Jan 2024 15:35:54 +0700
Subject: [PATCH] fix: add blst/build

---
 .gitignore | 1 +
 temp-deps/blst-ts/deps/blst/.gitignore | 2 +
 temp-deps/blst-ts/deps/blst/build/assembly.S | 123 +
 .../blst-ts/deps/blst/build/bindings_trim.pl | 37 +
 .../deps/blst/build/coff/add_mod_256-armv8.S | 397 ++
 .../deps/blst/build/coff/add_mod_256-x86_64.s | 911 ++++
 .../deps/blst/build/coff/add_mod_384-armv8.S | 1056 ++++
 .../deps/blst/build/coff/add_mod_384-x86_64.s | 2481 ++++
 .../blst/build/coff/add_mod_384x384-x86_64.s | 326 ++
 .../build/coff/ct_inverse_mod_256-armv8.S | 798 ++++
 .../build/coff/ct_inverse_mod_256-x86_64.s | 1209 +++++
 .../build/coff/ct_inverse_mod_384-armv8.S | 729 +++
 .../build/coff/ct_is_square_mod_384-armv8.S | 334 ++
 .../build/coff/ct_is_square_mod_384-x86_64.s | 505 ++
 .../build/coff/ctq_inverse_mod_384-x86_64.s | 1221 +++++
 .../build/coff/ctx_inverse_mod_384-x86_64.s | 1596 +++++++
 .../deps/blst/build/coff/div3w-armv8.S | 94 +
 .../deps/blst/build/coff/div3w-x86_64.s | 140 +
 .../deps/blst/build/coff/mul_mont_256-armv8.S | 474 ++
 .../deps/blst/build/coff/mul_mont_384-armv8.S | 2424 ++++
 .../blst/build/coff/mulq_mont_256-x86_64.s | 872 ++++
 .../blst/build/coff/mulq_mont_384-x86_64.s | 4206 ++++
 .../blst/build/coff/mulx_mont_256-x86_64.s | 784 +++
 .../blst/build/coff/mulx_mont_384-x86_64.s | 3559 ++++
 .../deps/blst/build/coff/sha256-armv8.S | 1087 +++++
 .../blst/build/coff/sha256-portable-x86_64.s | 1784 +++++++
 .../deps/blst/build/coff/sha256-x86_64.s | 1560 ++++++
 .../deps/blst/build/elf/add_mod_256-armv8.S | 379 ++
 .../deps/blst/build/elf/add_mod_256-x86_64.s | 572 +++
 .../deps/blst/build/elf/add_mod_384-armv8.S | 1000 ++++
 .../deps/blst/build/elf/add_mod_384-x86_64.s | 1907 ++++
 .../blst/build/elf/add_mod_384x384-x86_64.s | 252 +
 .../blst/build/elf/ct_inverse_mod_256-armv8.S | 784 +++
 .../build/elf/ct_inverse_mod_256-x86_64.s | 1185 +++++
 .../blst/build/elf/ct_inverse_mod_384-armv8.S | 717 +++
 .../build/elf/ct_is_square_mod_384-armv8.S | 324 ++
 .../build/elf/ct_is_square_mod_384-x86_64.s | 479 ++
 .../build/elf/ctq_inverse_mod_384-x86_64.s | 1195 +++++
 .../build/elf/ctx_inverse_mod_384-x86_64.s | 1574 ++++++
 .../blst-ts/deps/blst/build/elf/div3w-armv8.S | 88 +
 .../deps/blst/build/elf/div3w-x86_64.s | 123 +
 .../deps/blst/build/elf/mul_mont_256-armv8.S | 464 ++
 .../deps/blst/build/elf/mul_mont_384-armv8.S | 2372 +++++
 .../blst/build/elf/mulq_mont_256-x86_64.s | 714 +++
 .../blst/build/elf/mulq_mont_384-x86_64.s | 3620 ++++
 .../blst/build/elf/mulx_mont_256-x86_64.s | 627 +++
 .../blst/build/elf/mulx_mont_384-x86_64.s | 2968 ++++
 .../deps/blst/build/elf/sha256-armv8.S | 1077 +++++
 .../blst/build/elf/sha256-portable-x86_64.s | 1754 +++++++
 .../deps/blst/build/elf/sha256-x86_64.s | 1446 ++++++
 .../blst/build/mach-o/add_mod_256-armv8.S | 379 ++
 .../blst/build/mach-o/add_mod_256-x86_64.s | 564 +++
 .../blst/build/mach-o/add_mod_384-armv8.S | 1000 ++++
 .../blst/build/mach-o/add_mod_384-x86_64.s | 1899 ++++
 .../build/mach-o/add_mod_384x384-x86_64.s | 244 +
 .../build/mach-o/ct_inverse_mod_256-armv8.S | 784 +++
 .../build/mach-o/ct_inverse_mod_256-x86_64.s | 1177 +++++
 .../build/mach-o/ct_inverse_mod_384-armv8.S | 717 +++
 .../build/mach-o/ct_is_square_mod_384-armv8.S | 324 ++
 .../mach-o/ct_is_square_mod_384-x86_64.s | 471 ++
 .../build/mach-o/ctq_inverse_mod_384-x86_64.s | 1187 +++++
 .../build/mach-o/ctx_inverse_mod_384-x86_64.s | 1566 ++++++
 .../deps/blst/build/mach-o/div3w-armv8.S | 88 +
 .../deps/blst/build/mach-o/div3w-x86_64.s | 115 +
 .../blst/build/mach-o/mul_mont_256-armv8.S | 464 ++
 .../blst/build/mach-o/mul_mont_384-armv8.S | 2372 +++++
 .../blst/build/mach-o/mulq_mont_256-x86_64.s | 706 +++
 .../blst/build/mach-o/mulq_mont_384-x86_64.s | 3612 ++++++++++++++
 .../blst/build/mach-o/mulx_mont_256-x86_64.s | 619 +++
 .../blst/build/mach-o/mulx_mont_384-x86_64.s | 2960 ++++++++++++
 .../deps/blst/build/mach-o/sha256-armv8.S | 1077 +++++
 .../build/mach-o/sha256-portable-x86_64.s | 1746 +++++++
 .../deps/blst/build/mach-o/sha256-x86_64.s | 1438 ++++++
 temp-deps/blst-ts/deps/blst/build/refresh.sh | 49 +
 .../blst/build/win64/add_mod_256-armv8.asm | 380 ++
 .../blst/build/win64/add_mod_256-x86_64.asm | 934 ++++
 .../blst/build/win64/add_mod_384-armv8.asm | 1001 ++++
 .../blst/build/win64/add_mod_384-x86_64.asm | 2504 ++++++++++
 .../build/win64/add_mod_384x384-x86_64.asm | 334 ++
 .../blst-ts/deps/blst/build/win64/blst.def | 217 +
 .../build/win64/ct_inverse_mod_256-armv8.asm | 785 +++
 .../build/win64/ct_inverse_mod_256-x86_64.asm | 1211 +++++
 .../build/win64/ct_inverse_mod_384-armv8.asm | 718 +++
 .../win64/ct_is_square_mod_384-armv8.asm | 325 ++
 .../win64/ct_is_square_mod_384-x86_64.asm | 509 ++
 .../win64/ctq_inverse_mod_384-x86_64.asm | 1224 +++++
 .../win64/ctx_inverse_mod_384-x86_64.asm | 1597 +++++++
 .../deps/blst/build/win64/div3w-armv8.asm | 89 +
 .../deps/blst/build/win64/div3w-x86_64.asm | 152 +
 temp-deps/blst-ts/deps/blst/build/win64/dll.c | 32 +
 .../blst/build/win64/mul_mont_256-armv8.asm | 465 ++
 .../blst/build/win64/mul_mont_384-armv8.asm | 2373 +++++++++
 .../blst/build/win64/mulq_mont_256-x86_64.asm | 884 ++++
 .../blst/build/win64/mulq_mont_384-x86_64.asm | 4233 +++++++++++++++++
 .../blst/build/win64/mulx_mont_256-x86_64.asm | 796 ++++
 .../blst/build/win64/mulx_mont_384-x86_64.asm | 3586 ++++++++++++++
 .../deps/blst/build/win64/sha256-armv8.asm | 1078 +++++
 .../deps/blst/build/win64/sha256-x86_64.asm | 1570 ++++++
 98 files changed, 106886 insertions(+)
 create mode 100644 temp-deps/blst-ts/deps/blst/build/assembly.S
 create mode 100755 temp-deps/blst-ts/deps/blst/build/bindings_trim.pl
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/add_mod_256-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/add_mod_256-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/add_mod_384-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/add_mod_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/add_mod_384x384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/ct_inverse_mod_256-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/ct_inverse_mod_256-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/ct_inverse_mod_384-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/ct_is_square_mod_384-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/ct_is_square_mod_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/ctq_inverse_mod_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/ctx_inverse_mod_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/div3w-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/div3w-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/mul_mont_256-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/mul_mont_384-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/mulq_mont_256-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/mulq_mont_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/mulx_mont_256-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/mulx_mont_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/sha256-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/sha256-portable-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/coff/sha256-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/add_mod_256-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/add_mod_256-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/add_mod_384-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/add_mod_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/add_mod_384x384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/ct_inverse_mod_256-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/ct_inverse_mod_256-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/ct_inverse_mod_384-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/ct_is_square_mod_384-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/ct_is_square_mod_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/ctq_inverse_mod_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/ctx_inverse_mod_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/div3w-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/div3w-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/mul_mont_256-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/mul_mont_384-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/mulq_mont_256-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/mulq_mont_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/mulx_mont_256-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/mulx_mont_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/sha256-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/sha256-portable-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/elf/sha256-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_256-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_256-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_384-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_384x384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/ct_inverse_mod_256-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/ct_inverse_mod_256-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/ct_inverse_mod_384-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/ct_is_square_mod_384-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/ct_is_square_mod_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/ctq_inverse_mod_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/ctx_inverse_mod_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/div3w-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/div3w-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/mul_mont_256-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/mul_mont_384-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/mulq_mont_256-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/mulq_mont_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/mulx_mont_256-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/mulx_mont_384-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/sha256-armv8.S
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/sha256-portable-x86_64.s
 create mode 100644 temp-deps/blst-ts/deps/blst/build/mach-o/sha256-x86_64.s
 create mode 100755 temp-deps/blst-ts/deps/blst/build/refresh.sh
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/add_mod_256-armv8.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/add_mod_256-x86_64.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/add_mod_384-armv8.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/add_mod_384-x86_64.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/add_mod_384x384-x86_64.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/blst.def
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/ct_inverse_mod_256-armv8.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/ct_inverse_mod_256-x86_64.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/ct_inverse_mod_384-armv8.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/ct_is_square_mod_384-armv8.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/ct_is_square_mod_384-x86_64.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/ctq_inverse_mod_384-x86_64.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/ctx_inverse_mod_384-x86_64.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/div3w-armv8.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/div3w-x86_64.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/dll.c
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/mul_mont_256-armv8.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/mul_mont_384-armv8.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/mulq_mont_256-x86_64.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/mulq_mont_384-x86_64.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/mulx_mont_256-x86_64.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/mulx_mont_384-x86_64.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/sha256-armv8.asm
 create mode 100644 temp-deps/blst-ts/deps/blst/build/win64/sha256-x86_64.asm

diff --git a/.gitignore b/.gitignore
index 5c0c3b9..d2a99eb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,6 +65,7 @@ typings/
 dist/
 lib/
 !temp-deps/lib/
+!temp-deps/blst-ts/deps/blst/build
 benchmark-reports
 .vscode/
diff --git a/temp-deps/blst-ts/deps/blst/.gitignore b/temp-deps/blst-ts/deps/blst/.gitignore
index c76bcb8..925822d 100644
--- a/temp-deps/blst-ts/deps/blst/.gitignore
+++ b/temp-deps/blst-ts/deps/blst/.gitignore
@@ -70,3 +70,5 @@ bindings/node.js
 bindings/emscripten
 bin/
 obj/
+
+!build/
diff --git a/temp-deps/blst-ts/deps/blst/build/assembly.S b/temp-deps/blst-ts/deps/blst/build/assembly.S
new file mode 100644
index 0000000..a1a7c54
--- /dev/null
+++ b/temp-deps/blst-ts/deps/blst/build/assembly.S
@@ -0,0 +1,123 @@
+#if defined(__x86_64) || defined(__x86_64__)
+# if defined(__ELF__)
+# if defined(__BLST_PORTABLE__)
+# include "elf/sha256-portable-x86_64.s"
+# else
+# include "elf/sha256-x86_64.s"
+# endif
+# if defined(__ADX__) && !defined(__BLST_PORTABLE__)
+# include "elf/ctx_inverse_mod_384-x86_64.s"
+# else
+# include "elf/ctq_inverse_mod_384-x86_64.s"
+# endif
+# include "elf/add_mod_384-x86_64.s"
+# include "elf/add_mod_384x384-x86_64.s"
+# define __add_mod_384 __add_mont_384
+# define __sub_mod_384 __sub_mont_384
+# define __sub_mod_384x384 __sub_mont_384x384
+# if defined(__ADX__) && !defined(__BLST_PORTABLE__)
+# include "elf/mulx_mont_384-x86_64.s"
+# include "elf/mulx_mont_256-x86_64.s"
+# else
+# include "elf/mulq_mont_384-x86_64.s"
+# include "elf/mulq_mont_256-x86_64.s"
+# endif
+# include "elf/add_mod_256-x86_64.s"
+# include "elf/ct_inverse_mod_256-x86_64.s"
+# include "elf/div3w-x86_64.s"
+# include "elf/ct_is_square_mod_384-x86_64.s"
+# elif defined(_WIN64) || defined(__CYGWIN__)
+# if defined(__BLST_PORTABLE__)
+# include "coff/sha256-portable-x86_64.s"
+# else
+# include "coff/sha256-x86_64.s"
+# endif
+# if defined(__ADX__) && !defined(__BLST_PORTABLE__)
+# include "coff/ctx_inverse_mod_384-x86_64.s"
+# else
+# include "coff/ctq_inverse_mod_384-x86_64.s"
+# endif
+# include "coff/add_mod_384-x86_64.s"
+# include "coff/add_mod_384x384-x86_64.s"
+# define __add_mod_384 __add_mont_384
+# define __sub_mod_384 __sub_mont_384
+# define __sub_mod_384x384 __sub_mont_384x384
+# if defined(__ADX__) && !defined(__BLST_PORTABLE__)
+# include "coff/mulx_mont_384-x86_64.s"
+# include "coff/mulx_mont_256-x86_64.s"
+# else
+# include "coff/mulq_mont_384-x86_64.s"
+# include "coff/mulq_mont_256-x86_64.s"
+# endif
+# include "coff/add_mod_256-x86_64.s"
+# include "coff/ct_inverse_mod_256-x86_64.s"
+# include "coff/div3w-x86_64.s"
+# include "coff/ct_is_square_mod_384-x86_64.s"
+# elif defined(__APPLE__)
+# include "mach-o/sha256-x86_64.s"
+# if defined(__ADX__) && !defined(__BLST_PORTABLE__)
+# include "mach-o/ctx_inverse_mod_384-x86_64.s"
+# else
+# include "mach-o/ctq_inverse_mod_384-x86_64.s"
+# endif
+# include "mach-o/add_mod_384-x86_64.s"
+# include "mach-o/add_mod_384x384-x86_64.s"
+# define __add_mod_384 __add_mont_384
+# define __sub_mod_384 __sub_mont_384
+# define __sub_mod_384x384 __sub_mont_384x384
+# if defined(__ADX__) && !defined(__BLST_PORTABLE__)
+# include "mach-o/mulx_mont_384-x86_64.s"
+# include "mach-o/mulx_mont_256-x86_64.s"
+# else
+# include "mach-o/mulq_mont_384-x86_64.s"
+# include "mach-o/mulq_mont_256-x86_64.s"
+# endif
+# include "mach-o/add_mod_256-x86_64.s"
+# include "mach-o/ct_inverse_mod_256-x86_64.s"
+# include "mach-o/div3w-x86_64.s"
+# include "mach-o/ct_is_square_mod_384-x86_64.s"
+# endif
+#elif defined(__aarch64__)
+# if defined(__ELF__)
+# include "elf/sha256-armv8.S"
+# include "elf/ct_inverse_mod_384-armv8.S"
+# include "elf/add_mod_384-armv8.S"
+# define __add_mod_384 __add_mont_384
+# define __sub_mod_384 __sub_mont_384
+# include "elf/mul_mont_384-armv8.S"
+# include "elf/mul_mont_256-armv8.S"
+# include "elf/add_mod_256-armv8.S"
+# include "elf/ct_inverse_mod_256-armv8.S"
+# include "elf/div3w-armv8.S"
+# include "elf/ct_is_square_mod_384-armv8.S"
+# elif defined(_WIN64)
+# include "coff/sha256-armv8.S"
+# include "coff/ct_inverse_mod_384-armv8.S"
+# include "coff/add_mod_384-armv8.S"
+# define __add_mod_384 __add_mont_384
+# define __sub_mod_384 __sub_mont_384
+# include "coff/mul_mont_384-armv8.S"
+# include "coff/mul_mont_256-armv8.S"
+# include "coff/add_mod_256-armv8.S"
+# include "coff/ct_inverse_mod_256-armv8.S"
+# include "coff/div3w-armv8.S"
+# include "coff/ct_is_square_mod_384-armv8.S"
+# elif defined(__APPLE__)
+# include "mach-o/sha256-armv8.S"
+# include "mach-o/ct_inverse_mod_384-armv8.S"
+# include "mach-o/add_mod_384-armv8.S"
+# define __add_mod_384 __add_mont_384
+# define __sub_mod_384 __sub_mont_384
+# include "mach-o/mul_mont_384-armv8.S"
+# include "mach-o/mul_mont_256-armv8.S"
+# include "mach-o/add_mod_256-armv8.S"
+# include "mach-o/ct_inverse_mod_256-armv8.S"
+# include "mach-o/div3w-armv8.S"
+# include "mach-o/ct_is_square_mod_384-armv8.S"
+# endif
+#elif defined(__BLST_NO_ASM__) || \
+ (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__==4)
+/* inaccurate way to detect a 32-bit processor, but it's close enough */
+#else
+# error "unsupported platform"
+#endif
diff --git a/temp-deps/blst-ts/deps/blst/build/bindings_trim.pl b/temp-deps/blst-ts/deps/blst/build/bindings_trim.pl
new file mode 100755
index 0000000..9350ffa
--- /dev/null
+++ b/temp-deps/blst-ts/deps/blst/build/bindings_trim.pl
@@ -0,0 +1,37 @@
+#!/usr/bin/env perl
+
+# read whole file
+while(<>) { push @file, $_; }
+
+# traverse and remove auto-generated PartialEq for chosen types
+for (my $i = 0; $i <= $#file; $i++) {
+ if (@file[$i] =~ m/struct\s+blst_p[12]/) {
+ @file[$i-1] =~ s/,\s*PartialEq//;
+ } elsif (@file[$i] =~ m/struct\s+blst_fp12/) {
+ @file[$i-1] =~ s/,\s*(?:Default|PartialEq)//g;
+ } elsif (@file[$i] =~ m/struct\s+(blst_pairing|blst_uniq)/) {
+ @file[$i-1] =~ s/,\s*(?:Copy|Clone|Eq|PartialEq)//g;
+ } elsif (@file[$i] =~ m/struct\s+blst_scalar/) {
+ @file[$i-1] =~ s/,\s*Copy//;
+ @file[$i-1] =~ s/\)/, Zeroize\)/;
+ splice @file, $i, 0, "#[zeroize(drop)]\n"; $i++;
+ } elsif (@file[$i] =~ m/assert_eq!\($/) {
+ @file[++$i] =~ s/unsafe\s*\{\s*&\(\*\(::std::ptr::null::<(\w+)>\(\)\)\)\.(\w+).*\}/offsetof!($1, $2)/;
+ }
+}
+
+print << '___';
+#[cfg(test)]
+macro_rules!
offsetof { + ($type:ty, $field:tt) => { + unsafe { + let v = std::mem::MaybeUninit::<$type>::uninit().assume_init(); + (&v.$field as *const _ as usize) - (&v as *const _ as usize) + } + }; +} +___ +# print the file +print @file; + +close STDOUT; diff --git a/temp-deps/blst-ts/deps/blst/build/coff/add_mod_256-armv8.S b/temp-deps/blst-ts/deps/blst/build/coff/add_mod_256-armv8.S new file mode 100644 index 0000000..27b64ef --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/add_mod_256-armv8.S @@ -0,0 +1,397 @@ +.text + +.globl add_mod_256 + +.def add_mod_256; +.type 32; +.endef +.p2align 5 +add_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret + + +.globl mul_by_3_mod_256 + +.def mul_by_3_mod_256; +.type 32; +.endef +.p2align 5 +mul_by_3_mod_256: + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] + + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret + + +.globl lshift_mod_256 + +.def lshift_mod_256; +.type 32; +.endef +.p2align 5 +lshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_lshift_mod_256: + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs x15,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x12,lo + csel x9,x9,x13,lo + csel x10,x10,x14,lo + csel x11,x11,x15,lo + + cbnz x2,.Loop_lshift_mod_256 + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + + +.globl rshift_mod_256 + +.def rshift_mod_256; +.type 32; +.endef +.p2align 5 +rshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_rshift: + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + csel x12,x12,x8,ne + csel x13,x13,x9,ne + csel x14,x14,x10,ne + csel x15,x15,x11,ne + csel x3,x3,xzr,ne + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,.Loop_rshift + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + + +.globl cneg_mod_256 + +.def cneg_mod_256; +.type 32; +.endef +.p2align 5 +cneg_mod_256: + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x8,x8,x12,eq + csel x9,x9,x13,eq + csel x10,x10,x14,eq + stp x8,x9,[x0] + csel x11,x11,x15,eq + stp x10,x11,[x0,#16] + + ret + + +.globl sub_mod_256 + +.def sub_mod_256; +.type 32; +.endef +.p2align 5 +sub_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp 
x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc x11,x11,x7 + stp x10,x11,[x0,#16] + + ret + + +.globl check_mod_256 + +.def check_mod_256; +.type 32; +.endef +.p2align 5 +check_mod_256: + ldp x8,x9,[x0] + ldp x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + csel x0,x0,xzr,ne + and x0,x0,x1 + + ret + + +.globl add_n_check_mod_256 + +.def add_n_check_mod_256; +.type 32; +.endef +.p2align 5 +add_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret + + +.globl sub_n_check_mod_256 + +.def sub_n_check_mod_256; +.type 32; +.endef +.p2align 5 +sub_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/add_mod_256-x86_64.s b/temp-deps/blst-ts/deps/blst/build/coff/add_mod_256-x86_64.s new file mode 100644 index 0000000..f88e618 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/add_mod_256-x86_64.s @@ -0,0 +1,911 @@ +.text + +.globl add_mod_256 + +.def add_mod_256; .scl 2; .type 32; .endef +.p2align 5 +add_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_add_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loaded_a_add_mod_256: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq 
%rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_add_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_mod_256: + + +.globl mul_by_3_mod_256 + +.def mul_by_3_mod_256; .scl 2; .type 32; .endef +.p2align 5 +mul_by_3_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_3_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + +.LSEH_body_mul_by_3_mod_256: + + + movq %rdx,%rcx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rsi,%rdx + movq 24(%rsi),%r11 + + call __lshift_mod_256 + movq 0(%rsp),%r12 + + jmp .Loaded_a_add_mod_256 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_mul_by_3_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_3_mod_256: + +.def __lshift_mod_256; .scl 3; .type 32; .endef +.p2align 5 +__lshift_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + movq %r8,%rax + adcq %r10,%r10 + movq %r9,%rsi + adcq %r11,%r11 + sbbq %r12,%r12 + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + cmovcq %rbx,%r10 + cmovcq %rbp,%r11 + + .byte 0xf3,0xc3 + + + +.globl lshift_mod_256 + +.def lshift_mod_256; .scl 2; .type 32; .endef +.p2align 5 +lshift_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_lshift_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + +.LSEH_body_lshift_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_lshift_mod_256: + call __lshift_mod_256 + decl %edx + jnz .Loop_lshift_mod_256 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r12 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_lshift_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_lshift_mod_256: + + +.globl rshift_mod_256 + +.def rshift_mod_256; .scl 2; .type 32; .endef +.p2align 5 +rshift_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_rshift_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_rshift_mod_256: + + + movq 0(%rsi),%rbp + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_rshift_mod_256: + movq %rbp,%r8 + andq $1,%rbp + movq 0(%rcx),%rax + negq %rbp + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + + andq %rbp,%rax + andq %rbp,%rsi + andq %rbp,%rbx + andq 24(%rcx),%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + sbbq %rax,%rax + + shrq $1,%r8 + movq %r9,%rbp + shrq $1,%r9 + movq %r10,%rbx + shrq $1,%r10 + movq %r11,%rsi + shrq $1,%r11 + + shlq $63,%rbp + shlq $63,%rbx + orq %r8,%rbp + shlq $63,%rsi + orq %rbx,%r9 + shlq $63,%rax + orq %rsi,%r10 + orq %rax,%r11 + + decl %edx + jnz .Loop_rshift_mod_256 + + movq %rbp,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + 
+.LSEH_epilogue_rshift_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_rshift_mod_256: + + +.globl cneg_mod_256 + +.def cneg_mod_256; .scl 2; .type 32; .endef +.p2align 5 +cneg_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_cneg_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + +.LSEH_body_cneg_mod_256: + + + movq 0(%rsi),%r12 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r12,%r8 + movq 24(%rsi),%r11 + orq %r9,%r12 + orq %r10,%r12 + orq %r11,%r12 + movq $-1,%rbp + + movq 0(%rcx),%rax + cmovnzq %rbp,%r12 + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + andq %r12,%rax + movq 24(%rcx),%rbp + andq %r12,%rsi + andq %r12,%rbx + andq %r12,%rbp + + subq %r8,%rax + sbbq %r9,%rsi + sbbq %r10,%rbx + sbbq %r11,%rbp + + orq %rdx,%rdx + + cmovzq %r8,%rax + cmovzq %r9,%rsi + movq %rax,0(%rdi) + cmovzq %r10,%rbx + movq %rsi,8(%rdi) + cmovzq %r11,%rbp + movq %rbx,16(%rdi) + movq %rbp,24(%rdi) + + movq 0(%rsp),%r12 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_cneg_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_cneg_mod_256: + + +.globl sub_mod_256 + +.def sub_mod_256; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_sub_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_sub_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_256: + + +.globl check_mod_256 + +.def check_mod_256; .scl 2; .type 32; .endef +.p2align 5 +check_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_check_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + + + movq 0(%rdi),%rax + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + movq %rax,%r8 + orq %r9,%rax + orq %r10,%rax + orq %r11,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq %rsi,%rsi + + movq $1,%rdx + cmpq $0,%rax + cmovneq %rdx,%rax + andq %rsi,%rax +.LSEH_epilogue_check_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_check_mod_256: + + +.globl add_n_check_mod_256 + +.def add_n_check_mod_256; .scl 2; .type 32; .endef +.p2align 5 +add_n_check_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_n_check_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_add_n_check_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + 
sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_add_n_check_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_n_check_mod_256: + + +.globl sub_n_check_mod_256 + +.def sub_n_check_mod_256; .scl 2; .type 32; .endef +.p2align 5 +sub_n_check_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_n_check_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_sub_n_check_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_sub_n_check_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_n_check_mod_256: +.section .pdata +.p2align 2 +.rva .LSEH_begin_add_mod_256 +.rva .LSEH_body_add_mod_256 +.rva .LSEH_info_add_mod_256_prologue + +.rva .LSEH_body_add_mod_256 +.rva .LSEH_epilogue_add_mod_256 +.rva .LSEH_info_add_mod_256_body + +.rva .LSEH_epilogue_add_mod_256 +.rva .LSEH_end_add_mod_256 +.rva .LSEH_info_add_mod_256_epilogue + +.rva .LSEH_begin_mul_by_3_mod_256 +.rva .LSEH_body_mul_by_3_mod_256 +.rva .LSEH_info_mul_by_3_mod_256_prologue + +.rva .LSEH_body_mul_by_3_mod_256 +.rva .LSEH_epilogue_mul_by_3_mod_256 +.rva .LSEH_info_mul_by_3_mod_256_body + +.rva .LSEH_epilogue_mul_by_3_mod_256 +.rva .LSEH_end_mul_by_3_mod_256 +.rva .LSEH_info_mul_by_3_mod_256_epilogue + +.rva .LSEH_begin_lshift_mod_256 +.rva .LSEH_body_lshift_mod_256 +.rva .LSEH_info_lshift_mod_256_prologue + +.rva .LSEH_body_lshift_mod_256 +.rva .LSEH_epilogue_lshift_mod_256 +.rva .LSEH_info_lshift_mod_256_body + +.rva .LSEH_epilogue_lshift_mod_256 +.rva .LSEH_end_lshift_mod_256 +.rva .LSEH_info_lshift_mod_256_epilogue + +.rva .LSEH_begin_rshift_mod_256 +.rva .LSEH_body_rshift_mod_256 +.rva .LSEH_info_rshift_mod_256_prologue + +.rva .LSEH_body_rshift_mod_256 +.rva .LSEH_epilogue_rshift_mod_256 +.rva .LSEH_info_rshift_mod_256_body + +.rva .LSEH_epilogue_rshift_mod_256 +.rva .LSEH_end_rshift_mod_256 +.rva .LSEH_info_rshift_mod_256_epilogue + +.rva .LSEH_begin_cneg_mod_256 +.rva .LSEH_body_cneg_mod_256 +.rva .LSEH_info_cneg_mod_256_prologue + +.rva .LSEH_body_cneg_mod_256 +.rva .LSEH_epilogue_cneg_mod_256 +.rva .LSEH_info_cneg_mod_256_body + +.rva .LSEH_epilogue_cneg_mod_256 +.rva .LSEH_end_cneg_mod_256 +.rva .LSEH_info_cneg_mod_256_epilogue + +.rva .LSEH_begin_sub_mod_256 +.rva .LSEH_body_sub_mod_256 +.rva .LSEH_info_sub_mod_256_prologue + +.rva .LSEH_body_sub_mod_256 +.rva .LSEH_epilogue_sub_mod_256 +.rva 
.LSEH_info_sub_mod_256_body + +.rva .LSEH_epilogue_sub_mod_256 +.rva .LSEH_end_sub_mod_256 +.rva .LSEH_info_sub_mod_256_epilogue + +.rva .LSEH_epilogue_check_mod_256 +.rva .LSEH_end_check_mod_256 +.rva .LSEH_info_check_mod_256_epilogue + +.rva .LSEH_begin_add_n_check_mod_256 +.rva .LSEH_body_add_n_check_mod_256 +.rva .LSEH_info_add_n_check_mod_256_prologue + +.rva .LSEH_body_add_n_check_mod_256 +.rva .LSEH_epilogue_add_n_check_mod_256 +.rva .LSEH_info_add_n_check_mod_256_body + +.rva .LSEH_epilogue_add_n_check_mod_256 +.rva .LSEH_end_add_n_check_mod_256 +.rva .LSEH_info_add_n_check_mod_256_epilogue + +.rva .LSEH_begin_sub_n_check_mod_256 +.rva .LSEH_body_sub_n_check_mod_256 +.rva .LSEH_info_sub_n_check_mod_256_prologue + +.rva .LSEH_body_sub_n_check_mod_256 +.rva .LSEH_epilogue_sub_n_check_mod_256 +.rva .LSEH_info_sub_n_check_mod_256_body + +.rva .LSEH_epilogue_sub_n_check_mod_256 +.rva .LSEH_end_sub_n_check_mod_256 +.rva .LSEH_info_sub_n_check_mod_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_add_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_add_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_3_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_3_mod_256_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_mul_by_3_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_lshift_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_lshift_mod_256_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_lshift_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_rshift_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_rshift_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_rshift_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_cneg_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_cneg_mod_256_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_cneg_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 
0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_sub_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_check_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_add_n_check_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_n_check_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_add_n_check_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_n_check_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_n_check_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_sub_n_check_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/add_mod_384-armv8.S b/temp-deps/blst-ts/deps/blst/build/coff/add_mod_384-armv8.S new file mode 100644 index 0000000..2eff067 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/add_mod_384-armv8.S @@ -0,0 +1,1056 @@ +.text + +.globl add_mod_384 + +.def add_mod_384; +.type 32; +.endef +.p2align 5 +add_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.def __add_mod_384; +.type 32; +.endef +.p2align 5 +__add_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +__add_mod_384_ab_are_loaded: + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret + + +.globl add_mod_384x + +.def add_mod_384x; +.type 32; +.endef +.p2align 5 +add_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl rshift_mod_384 + +.def rshift_mod_384; +.type 32; +.endef +.p2align 5 +rshift_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_rshift_mod_384: + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,.Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.def __rshift_mod_384; +.type 32; +.endef +.p2align 5 +__rshift_mod_384: + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret + + +.globl div_by_2_mod_384 + +.def div_by_2_mod_384; +.type 32; +.endef +.p2align 5 +div_by_2_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl lshift_mod_384 + +.def lshift_mod_384; +.type 32; +.endef +.p2align 5 +lshift_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_lshift_mod_384: + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,.Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.def __lshift_mod_384; +.type 32; +.endef +.p2align 5 +__lshift_mod_384: + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret + + +.globl mul_by_3_mod_384 + +.def mul_by_3_mod_384; +.type 32; +.endef +.p2align 5 +mul_by_3_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl mul_by_8_mod_384 + +.def mul_by_8_mod_384; +.type 32; +.endef +.p2align 5 +mul_by_8_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl mul_by_3_mod_384x + +.def mul_by_3_mod_384x; +.type 32; +.endef +.p2align 5 +mul_by_3_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl mul_by_8_mod_384x + +.def mul_by_8_mod_384x; +.type 32; +.endef +.p2align 5 +mul_by_8_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl cneg_mod_384 + +.def cneg_mod_384; +.type 32; +.endef +.p2align 5 +cneg_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x10,x10,x16,eq + csel x11,x11,x17,eq + csel x12,x12,x19,eq + csel x13,x13,x20,eq + stp x10,x11,[x0] + csel x14,x14,x21,eq + stp x12,x13,[x0,#16] + csel x15,x15,x22,eq + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl sub_mod_384 + +.def sub_mod_384; +.type 32; +.endef +.p2align 5 +sub_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.def __sub_mod_384; +.type 32; +.endef +.p2align 5 +__sub_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret + + +.globl sub_mod_384x + +.def sub_mod_384x; +.type 32; +.endef +.p2align 5 +sub_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl mul_by_1_plus_i_mod_384x + +.def mul_by_1_plus_i_mod_384x; +.type 32; +.endef +.p2align 5 +mul_by_1_plus_i_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl sgn0_pty_mod_384 + +.def sgn0_pty_mod_384; +.type 32; +.endef +.p2align 5 +sgn0_pty_mod_384: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret + + +.globl sgn0_pty_mod_384x + +.def sgn0_pty_mod_384x; +.type 32; +.endef +.p2align 5 +sgn0_pty_mod_384x: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds 
x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret + +.globl vec_select_32 + +.def vec_select_32; +.type 32; +.endef +.p2align 5 +vec_select_32: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl vec_select_48 + +.def vec_select_48; +.type 32; +.endef +.p2align 5 +vec_select_48: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl vec_select_96 + +.def vec_select_96; +.type 32; +.endef +.p2align 5 +vec_select_96: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl vec_select_192 + +.def vec_select_192; +.type 32; +.endef +.p2align 5 +vec_select_192: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl vec_select_144 + +.def vec_select_144; +.type 32; +.endef +.p2align 5 +vec_select_144: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + 
+.globl vec_select_288 + +.def vec_select_288; +.type 32; +.endef +.p2align 5 +vec_select_288: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl vec_prefetch + +.def vec_prefetch; +.type 32; +.endef +.p2align 5 +vec_prefetch: + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + prfm pldl1keep, [x0] + ret + +.globl vec_is_zero_16x + +.def vec_is_zero_16x; +.type 32; +.endef +.p2align 5 +vec_is_zero_16x: + ld1 {v0.2d}, [x0], #16 + lsr x1, x1, #4 + sub x1, x1, #1 + cbz x1, .Loop_is_zero_done + +.Loop_is_zero: + ld1 {v1.2d}, [x0], #16 + orr v0.16b, v0.16b, v1.16b + sub x1, x1, #1 + cbnz x1, .Loop_is_zero + +.Loop_is_zero_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret + +.globl vec_is_equal_16x + +.def vec_is_equal_16x; +.type 32; +.endef +.p2align 5 +vec_is_equal_16x: + ld1 {v0.2d}, [x0], #16 + ld1 {v1.2d}, [x1], #16 + lsr x2, x2, #4 + eor v0.16b, v0.16b, v1.16b + +.Loop_is_equal: + sub x2, x2, #1 + cbz x2, .Loop_is_equal_done + ld1 {v1.2d}, [x0], #16 + ld1 {v2.2d}, [x1], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b .Loop_is_equal + nop + +.Loop_is_equal_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/add_mod_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/coff/add_mod_384-x86_64.s new file mode 100644 index 0000000..d1c7ad6 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/add_mod_384-x86_64.s @@ -0,0 +1,2481 @@ +.text + +.globl add_mod_384 + +.def add_mod_384; .scl 2; .type 32; .endef +.p2align 5 +add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq 
%rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_add_mod_384: + + + call __add_mod_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_add_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_mod_384: + +.def __add_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__add_mod_384_a_is_loaded: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl add_mod_384x + +.def add_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +add_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $24,%rsp + +.LSEH_body_add_mod_384x: + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + movq 24+0(%rsp),%r15 + + movq 24+8(%rsp),%r14 + + movq 24+16(%rsp),%r13 + + movq 24+24(%rsp),%r12 + + movq 24+32(%rsp),%rbx + + movq 24+40(%rsp),%rbp + + leaq 24+48(%rsp),%rsp + +.LSEH_epilogue_add_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_mod_384x: + + +.globl rshift_mod_384 + +.def rshift_mod_384; .scl 2; .type 32; .endef +.p2align 5 +rshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_rshift_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_rshift_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_rshift_mod_384: + call __rshift_mod_384 + decl %edx + jnz .Loop_rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_rshift_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_rshift_mod_384: + +.def __rshift_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__rshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rsi + 
movq 0(%rcx),%r14 + andq %r8,%rsi + movq 8(%rcx),%r15 + negq %rsi + movq 16(%rcx),%rax + andq %rsi,%r14 + movq 24(%rcx),%rbx + andq %rsi,%r15 + movq 32(%rcx),%rbp + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rsi + sbbq %r13,%r13 + + shrq $1,%r14 + movq %r15,%r8 + shrq $1,%r15 + movq %rax,%r9 + shrq $1,%rax + movq %rbx,%r10 + shrq $1,%rbx + movq %rbp,%r11 + shrq $1,%rbp + movq %rsi,%r12 + shrq $1,%rsi + shlq $63,%r8 + shlq $63,%r9 + orq %r14,%r8 + shlq $63,%r10 + orq %r15,%r9 + shlq $63,%r11 + orq %rax,%r10 + shlq $63,%r12 + orq %rbx,%r11 + shlq $63,%r13 + orq %rbp,%r12 + orq %rsi,%r13 + + .byte 0xf3,0xc3 + + +.globl div_by_2_mod_384 + +.def div_by_2_mod_384; .scl 2; .type 32; .endef +.p2align 5 +div_by_2_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_div_by_2_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_div_by_2_mod_384: + + + movq 0(%rsi),%r8 + movq %rdx,%rcx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + call __rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_div_by_2_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_div_by_2_mod_384: + + +.globl lshift_mod_384 + +.def lshift_mod_384; .scl 2; .type 32; .endef +.p2align 5 +lshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_lshift_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_lshift_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_lshift_mod_384: + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdi,%rdi + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdi + + movq (%rsp),%rdi + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + decl %edx + jnz .Loop_lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_lshift_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_lshift_mod_384: + +.def __lshift_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__lshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 
+ sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + .byte 0xf3,0xc3 + + + +.globl mul_by_3_mod_384 + +.def mul_by_3_mod_384; .scl 2; .type 32; .endef +.p2align 5 +mul_by_3_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_3_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_mul_by_3_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_3_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_3_mod_384: + +.globl mul_by_8_mod_384 + +.def mul_by_8_mod_384; .scl 2; .type 32; .endef +.p2align 5 +mul_by_8_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_8_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_mul_by_8_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_8_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_8_mod_384: + + +.globl mul_by_3_mod_384x + +.def mul_by_3_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_by_3_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_3_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_mul_by_3_mod_384x: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq (%rsp),%rsi + leaq 48(%rdi),%rdi + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + + call __lshift_mod_384 + + movq $48,%rdx + addq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_3_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_3_mod_384x: + +.globl mul_by_8_mod_384x + +.def mul_by_8_mod_384x; .scl 2; .type 32; .endef +.p2align 5 
+mul_by_8_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_8_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_mul_by_8_mod_384x: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq (%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,48+0(%rdi) + movq %r9,48+8(%rdi) + movq %r10,48+16(%rdi) + movq %r11,48+24(%rdi) + movq %r12,48+32(%rdi) + movq %r13,48+40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_8_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_8_mod_384x: + + +.globl cneg_mod_384 + +.def cneg_mod_384; .scl 2; .type 32; .endef +.p2align 5 +cneg_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_cneg_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdx + +.LSEH_body_cneg_mod_384: + + + movq 0(%rsi),%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rdx,%r8 + movq 24(%rsi),%r11 + orq %r9,%rdx + movq 32(%rsi),%r12 + orq %r10,%rdx + movq 40(%rsi),%r13 + orq %r11,%rdx + movq $-1,%rsi + orq %r12,%rdx + orq %r13,%rdx + + movq 0(%rcx),%r14 + cmovnzq %rsi,%rdx + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + andq %rdx,%r14 + movq 24(%rcx),%rbx + andq %rdx,%r15 + movq 32(%rcx),%rbp + andq %rdx,%rax + movq 40(%rcx),%rsi + andq %rdx,%rbx + movq 0(%rsp),%rcx + andq %rdx,%rbp + andq %rdx,%rsi + + subq %r8,%r14 + sbbq %r9,%r15 + sbbq %r10,%rax + sbbq %r11,%rbx + sbbq %r12,%rbp + sbbq %r13,%rsi + + orq %rcx,%rcx + + cmovzq %r8,%r14 + cmovzq %r9,%r15 + cmovzq %r10,%rax + movq %r14,0(%rdi) + cmovzq %r11,%rbx + movq %r15,8(%rdi) + cmovzq %r12,%rbp + movq %rax,16(%rdi) + cmovzq %r13,%rsi + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rsi,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_cneg_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_cneg_mod_384: + + +.globl sub_mod_384 + +.def sub_mod_384; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sub_mod_384: + + + call __sub_mod_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sub_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi 
+ + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_384: + +.def __sub_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl sub_mod_384x + +.def sub_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $24,%rsp + +.LSEH_body_sub_mod_384x: + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __sub_mod_384 + + movq 24+0(%rsp),%r15 + + movq 24+8(%rsp),%r14 + + movq 24+16(%rsp),%r13 + + movq 24+24(%rsp),%r12 + + movq 24+32(%rsp),%rbx + + movq 24+40(%rsp),%rbp + + leaq 24+48(%rsp),%rsp + +.LSEH_epilogue_sub_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_384x: +.globl mul_by_1_plus_i_mod_384x + +.def mul_by_1_plus_i_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_by_1_plus_i_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_1_plus_i_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $56,%rsp + +.LSEH_body_mul_by_1_plus_i_mod_384x: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rbx + adcq 72(%rsi),%r11 + movq %r12,%rcx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + movq %rdi,48(%rsp) + sbbq %rdi,%rdi + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rbx + sbbq 80(%rsi),%rcx + sbbq 88(%rsi),%rbp + sbbq %rsi,%rsi + + movq %r8,0(%rsp) + movq 0(%rdx),%r8 + movq %r9,8(%rsp) + movq 8(%rdx),%r9 + movq %r10,16(%rsp) + movq 16(%rdx),%r10 + movq %r11,24(%rsp) + movq 24(%rdx),%r11 + movq %r12,32(%rsp) + andq %rsi,%r8 + movq 32(%rdx),%r12 + movq %r13,40(%rsp) + andq %rsi,%r9 + movq 40(%rdx),%r13 + andq %rsi,%r10 + andq %rsi,%r11 + andq %rsi,%r12 + andq %rsi,%r13 + movq 48(%rsp),%rsi + + addq %r8,%r14 + movq 0(%rsp),%r8 + adcq %r9,%r15 + movq 8(%rsp),%r9 + adcq %r10,%rax + movq 16(%rsp),%r10 + adcq %r11,%rbx + movq 24(%rsp),%r11 + adcq %r12,%rcx + movq 32(%rsp),%r12 + adcq %r13,%rbp + movq 40(%rsp),%r13 + + movq %r14,0(%rsi) + movq %r8,%r14 + movq %r15,8(%rsi) + movq %rax,16(%rsi) + movq %r9,%r15 + movq %rbx,24(%rsi) + movq %rcx,32(%rsi) + movq %r10,%rax + 
movq %rbp,40(%rsi) + + subq 0(%rdx),%r8 + movq %r11,%rbx + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + movq %r12,%rcx + sbbq 24(%rdx),%r11 + sbbq 32(%rdx),%r12 + movq %r13,%rbp + sbbq 40(%rdx),%r13 + sbbq $0,%rdi + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,48(%rsi) + cmovcq %rbx,%r11 + movq %r9,56(%rsi) + cmovcq %rcx,%r12 + movq %r10,64(%rsi) + cmovcq %rbp,%r13 + movq %r11,72(%rsi) + movq %r12,80(%rsi) + movq %r13,88(%rsi) + + movq 56+0(%rsp),%r15 + + movq 56+8(%rsp),%r14 + + movq 56+16(%rsp),%r13 + + movq 56+24(%rsp),%r12 + + movq 56+32(%rsp),%rbx + + movq 56+40(%rsp),%rbp + + leaq 56+48(%rsp),%rsp + +.LSEH_epilogue_mul_by_1_plus_i_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_1_plus_i_mod_384x: +.globl sgn0_pty_mod_384 + +.def sgn0_pty_mod_384; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + + +.LSEH_body_sgn0_pty_mod_384: + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + xorq %rax,%rax + movq %r8,%rdi + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + notq %rax + andq $1,%rdi + andq $2,%rax + orq %rdi,%rax + +.LSEH_epilogue_sgn0_pty_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mod_384: + +.globl sgn0_pty_mod_384x + +.def sgn0_pty_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_sgn0_pty_mod_384x: + + + movq 48(%rdi),%r8 + movq 56(%rdi),%r9 + movq 64(%rdi),%r10 + movq 72(%rdi),%r11 + movq 80(%rdi),%rcx + movq 88(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + leaq 0(%rdi),%rax + xorq %rdi,%rdi + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rdi + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rdi + + movq %r8,0(%rsp) + notq %rdi + andq $1,%rbp + andq $2,%rdi + orq %rbp,%rdi + + movq 0(%rax),%r8 + movq 8(%rax),%r9 + movq 16(%rax),%r10 + movq 24(%rax),%r11 + movq 32(%rax),%rcx + movq 40(%rax),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rax,%rax + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + movq 0(%rsp),%rbx + + notq %rax + + testq %r8,%r8 + cmovzq %rdi,%rbp + + testq %rbx,%rbx + cmovnzq %rdi,%rax + + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_sgn0_pty_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mod_384x: +.globl vec_select_32 + +.def vec_select_32; .scl 2; .type 32; .endef 
+.p2align 5 +vec_select_32: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 16(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 16(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 16(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-16(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-16(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-16(%rcx) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,16-16(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_48 + +.def vec_select_48; .scl 2; .type 32; .endef +.p2align 5 +vec_select_48: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 24(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 24(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 24(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-24(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-24(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-24(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-24(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-24(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-24(%rcx) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,32-24(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_96 + +.def vec_select_96; .scl 2; .type 32; .endef +.p2align 5 +vec_select_96: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 48(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 48(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 48(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-48(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-48(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-48(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-48(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-48(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-48(%rcx) + pand %xmm4,%xmm0 + movdqu 32+16-48(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-48(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-48(%rcx) + pand %xmm4,%xmm2 + movdqu 48+16-48(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-48(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-48(%rcx) + pand %xmm4,%xmm0 + movdqu 64+16-48(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-48(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-48(%rcx) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,80-48(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_192 + +.def vec_select_192; .scl 2; .type 32; .endef +.p2align 5 +vec_select_192: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 96(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 96(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 96(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-96(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-96(%rcx) + pand %xmm4,%xmm0 + movdqu 32+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-96(%rcx) + pand %xmm4,%xmm2 + movdqu 48+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-96(%rcx) + pand %xmm4,%xmm0 + movdqu 64+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-96(%rcx) + pand %xmm4,%xmm2 + movdqu 80+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu 
%xmm2,80-96(%rcx) + pand %xmm4,%xmm0 + movdqu 96+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-96(%rcx) + pand %xmm4,%xmm2 + movdqu 112+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-96(%rcx) + pand %xmm4,%xmm0 + movdqu 128+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-96(%rcx) + pand %xmm4,%xmm2 + movdqu 144+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-96(%rcx) + pand %xmm4,%xmm0 + movdqu 160+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-96(%rcx) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,176-96(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_144 + +.def vec_select_144; .scl 2; .type 32; .endef +.p2align 5 +vec_select_144: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 72(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 72(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 72(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-72(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-72(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-72(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-72(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-72(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-72(%rcx) + pand %xmm4,%xmm0 + movdqu 32+16-72(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-72(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-72(%rcx) + pand %xmm4,%xmm2 + movdqu 48+16-72(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-72(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-72(%rcx) + pand %xmm4,%xmm0 + movdqu 64+16-72(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-72(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-72(%rcx) + pand %xmm4,%xmm2 + movdqu 80+16-72(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-72(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-72(%rcx) + pand %xmm4,%xmm0 + movdqu 96+16-72(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-72(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-72(%rcx) + pand %xmm4,%xmm2 + movdqu 112+16-72(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-72(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-72(%rcx) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,128-72(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_288 + +.def vec_select_288; .scl 2; .type 32; .endef +.p2align 5 +vec_select_288: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 144(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 144(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 144(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-144(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-144(%rcx) + pand %xmm4,%xmm0 + movdqu 32+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-144(%rcx) + pand %xmm4,%xmm2 + movdqu 48+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-144(%rcx) + pand %xmm4,%xmm0 + movdqu 64+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-144(%rcx) + pand %xmm4,%xmm2 + movdqu 80+16-144(%rdx),%xmm0 + pand 
%xmm5,%xmm3 + movdqu 80+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-144(%rcx) + pand %xmm4,%xmm0 + movdqu 96+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-144(%rcx) + pand %xmm4,%xmm2 + movdqu 112+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-144(%rcx) + pand %xmm4,%xmm0 + movdqu 128+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-144(%rcx) + pand %xmm4,%xmm2 + movdqu 144+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-144(%rcx) + pand %xmm4,%xmm0 + movdqu 160+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-144(%rcx) + pand %xmm4,%xmm2 + movdqu 176+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 176+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,176-144(%rcx) + pand %xmm4,%xmm0 + movdqu 192+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 192+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,192-144(%rcx) + pand %xmm4,%xmm2 + movdqu 208+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 208+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,208-144(%rcx) + pand %xmm4,%xmm0 + movdqu 224+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 224+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,224-144(%rcx) + pand %xmm4,%xmm2 + movdqu 240+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 240+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,240-144(%rcx) + pand %xmm4,%xmm0 + movdqu 256+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 256+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,256-144(%rcx) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,272-144(%rcx) + .byte 0xf3,0xc3 + +.globl vec_prefetch + +.def vec_prefetch; .scl 2; .type 32; .endef +.p2align 5 +vec_prefetch: + .byte 0xf3,0x0f,0x1e,0xfa + + leaq -1(%rcx,%rdx,1),%rdx + movq $64,%rax + xorq %r8,%r8 + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + prefetchnta (%rcx) + .byte 0xf3,0xc3 + +.globl vec_is_zero_16x + +.def vec_is_zero_16x; .scl 2; .type 32; .endef +.p2align 5 +vec_is_zero_16x: + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%edx + movdqu (%rcx),%xmm0 + leaq 16(%rcx),%rcx + +.Loop_is_zero: + decl %edx + jz .Loop_is_zero_done + movdqu (%rcx),%xmm1 + leaq 16(%rcx),%rcx + por %xmm1,%xmm0 + jmp .Loop_is_zero + +.Loop_is_zero_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %edx + testq %rax,%rax + cmovnzl %edx,%eax + xorl $1,%eax + .byte 0xf3,0xc3 + +.globl vec_is_equal_16x + +.def vec_is_equal_16x; .scl 2; .type 32; .endef +.p2align 5 +vec_is_equal_16x: + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%r8d + movdqu (%rcx),%xmm0 + movdqu (%rdx),%xmm1 + subq %rcx,%rdx + leaq 16(%rcx),%rcx + pxor %xmm1,%xmm0 + +.Loop_is_equal: + decl %r8d + jz .Loop_is_equal_done + movdqu (%rcx),%xmm1 + movdqu (%rcx,%rdx,1),%xmm2 + leaq 16(%rcx),%rcx + pxor %xmm2,%xmm1 + por %xmm1,%xmm0 
+ jmp .Loop_is_equal + +.Loop_is_equal_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %r8d + testq %rax,%rax + cmovnzl %r8d,%eax + xorl $1,%eax + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_add_mod_384 +.rva .LSEH_body_add_mod_384 +.rva .LSEH_info_add_mod_384_prologue + +.rva .LSEH_body_add_mod_384 +.rva .LSEH_epilogue_add_mod_384 +.rva .LSEH_info_add_mod_384_body + +.rva .LSEH_epilogue_add_mod_384 +.rva .LSEH_end_add_mod_384 +.rva .LSEH_info_add_mod_384_epilogue + +.rva .LSEH_begin_add_mod_384x +.rva .LSEH_body_add_mod_384x +.rva .LSEH_info_add_mod_384x_prologue + +.rva .LSEH_body_add_mod_384x +.rva .LSEH_epilogue_add_mod_384x +.rva .LSEH_info_add_mod_384x_body + +.rva .LSEH_epilogue_add_mod_384x +.rva .LSEH_end_add_mod_384x +.rva .LSEH_info_add_mod_384x_epilogue + +.rva .LSEH_begin_rshift_mod_384 +.rva .LSEH_body_rshift_mod_384 +.rva .LSEH_info_rshift_mod_384_prologue + +.rva .LSEH_body_rshift_mod_384 +.rva .LSEH_epilogue_rshift_mod_384 +.rva .LSEH_info_rshift_mod_384_body + +.rva .LSEH_epilogue_rshift_mod_384 +.rva .LSEH_end_rshift_mod_384 +.rva .LSEH_info_rshift_mod_384_epilogue + +.rva .LSEH_begin_div_by_2_mod_384 +.rva .LSEH_body_div_by_2_mod_384 +.rva .LSEH_info_div_by_2_mod_384_prologue + +.rva .LSEH_body_div_by_2_mod_384 +.rva .LSEH_epilogue_div_by_2_mod_384 +.rva .LSEH_info_div_by_2_mod_384_body + +.rva .LSEH_epilogue_div_by_2_mod_384 +.rva .LSEH_end_div_by_2_mod_384 +.rva .LSEH_info_div_by_2_mod_384_epilogue + +.rva .LSEH_begin_lshift_mod_384 +.rva .LSEH_body_lshift_mod_384 +.rva .LSEH_info_lshift_mod_384_prologue + +.rva .LSEH_body_lshift_mod_384 +.rva .LSEH_epilogue_lshift_mod_384 +.rva .LSEH_info_lshift_mod_384_body + +.rva .LSEH_epilogue_lshift_mod_384 +.rva .LSEH_end_lshift_mod_384 +.rva .LSEH_info_lshift_mod_384_epilogue + +.rva .LSEH_begin_mul_by_3_mod_384 +.rva .LSEH_body_mul_by_3_mod_384 +.rva .LSEH_info_mul_by_3_mod_384_prologue + +.rva .LSEH_body_mul_by_3_mod_384 +.rva .LSEH_epilogue_mul_by_3_mod_384 +.rva .LSEH_info_mul_by_3_mod_384_body + +.rva .LSEH_epilogue_mul_by_3_mod_384 +.rva .LSEH_end_mul_by_3_mod_384 +.rva .LSEH_info_mul_by_3_mod_384_epilogue + +.rva .LSEH_begin_mul_by_8_mod_384 +.rva .LSEH_body_mul_by_8_mod_384 +.rva .LSEH_info_mul_by_8_mod_384_prologue + +.rva .LSEH_body_mul_by_8_mod_384 +.rva .LSEH_epilogue_mul_by_8_mod_384 +.rva .LSEH_info_mul_by_8_mod_384_body + +.rva .LSEH_epilogue_mul_by_8_mod_384 +.rva .LSEH_end_mul_by_8_mod_384 +.rva .LSEH_info_mul_by_8_mod_384_epilogue + +.rva .LSEH_begin_mul_by_3_mod_384x +.rva .LSEH_body_mul_by_3_mod_384x +.rva .LSEH_info_mul_by_3_mod_384x_prologue + +.rva .LSEH_body_mul_by_3_mod_384x +.rva .LSEH_epilogue_mul_by_3_mod_384x +.rva .LSEH_info_mul_by_3_mod_384x_body + +.rva .LSEH_epilogue_mul_by_3_mod_384x +.rva .LSEH_end_mul_by_3_mod_384x +.rva .LSEH_info_mul_by_3_mod_384x_epilogue + +.rva .LSEH_begin_mul_by_8_mod_384x +.rva .LSEH_body_mul_by_8_mod_384x +.rva .LSEH_info_mul_by_8_mod_384x_prologue + +.rva .LSEH_body_mul_by_8_mod_384x +.rva .LSEH_epilogue_mul_by_8_mod_384x +.rva .LSEH_info_mul_by_8_mod_384x_body + +.rva .LSEH_epilogue_mul_by_8_mod_384x +.rva .LSEH_end_mul_by_8_mod_384x +.rva .LSEH_info_mul_by_8_mod_384x_epilogue + +.rva .LSEH_begin_cneg_mod_384 +.rva .LSEH_body_cneg_mod_384 +.rva .LSEH_info_cneg_mod_384_prologue + +.rva .LSEH_body_cneg_mod_384 +.rva .LSEH_epilogue_cneg_mod_384 +.rva .LSEH_info_cneg_mod_384_body + +.rva .LSEH_epilogue_cneg_mod_384 +.rva .LSEH_end_cneg_mod_384 +.rva .LSEH_info_cneg_mod_384_epilogue + +.rva .LSEH_begin_sub_mod_384 
+.rva .LSEH_body_sub_mod_384 +.rva .LSEH_info_sub_mod_384_prologue + +.rva .LSEH_body_sub_mod_384 +.rva .LSEH_epilogue_sub_mod_384 +.rva .LSEH_info_sub_mod_384_body + +.rva .LSEH_epilogue_sub_mod_384 +.rva .LSEH_end_sub_mod_384 +.rva .LSEH_info_sub_mod_384_epilogue + +.rva .LSEH_begin_sub_mod_384x +.rva .LSEH_body_sub_mod_384x +.rva .LSEH_info_sub_mod_384x_prologue + +.rva .LSEH_body_sub_mod_384x +.rva .LSEH_epilogue_sub_mod_384x +.rva .LSEH_info_sub_mod_384x_body + +.rva .LSEH_epilogue_sub_mod_384x +.rva .LSEH_end_sub_mod_384x +.rva .LSEH_info_sub_mod_384x_epilogue + +.rva .LSEH_begin_mul_by_1_plus_i_mod_384x +.rva .LSEH_body_mul_by_1_plus_i_mod_384x +.rva .LSEH_info_mul_by_1_plus_i_mod_384x_prologue + +.rva .LSEH_body_mul_by_1_plus_i_mod_384x +.rva .LSEH_epilogue_mul_by_1_plus_i_mod_384x +.rva .LSEH_info_mul_by_1_plus_i_mod_384x_body + +.rva .LSEH_epilogue_mul_by_1_plus_i_mod_384x +.rva .LSEH_end_mul_by_1_plus_i_mod_384x +.rva .LSEH_info_mul_by_1_plus_i_mod_384x_epilogue + +.rva .LSEH_begin_sgn0_pty_mod_384 +.rva .LSEH_body_sgn0_pty_mod_384 +.rva .LSEH_info_sgn0_pty_mod_384_prologue + +.rva .LSEH_body_sgn0_pty_mod_384 +.rva .LSEH_epilogue_sgn0_pty_mod_384 +.rva .LSEH_info_sgn0_pty_mod_384_body + +.rva .LSEH_epilogue_sgn0_pty_mod_384 +.rva .LSEH_end_sgn0_pty_mod_384 +.rva .LSEH_info_sgn0_pty_mod_384_epilogue + +.rva .LSEH_begin_sgn0_pty_mod_384x +.rva .LSEH_body_sgn0_pty_mod_384x +.rva .LSEH_info_sgn0_pty_mod_384x_prologue + +.rva .LSEH_body_sgn0_pty_mod_384x +.rva .LSEH_epilogue_sgn0_pty_mod_384x +.rva .LSEH_info_sgn0_pty_mod_384x_body + +.rva .LSEH_epilogue_sgn0_pty_mod_384x +.rva .LSEH_end_sgn0_pty_mod_384x +.rva .LSEH_info_sgn0_pty_mod_384x_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_add_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_add_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_add_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_add_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_rshift_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_rshift_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_rshift_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_div_by_2_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_div_by_2_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 
0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_div_by_2_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_lshift_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_lshift_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_lshift_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_3_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_3_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_3_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_8_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_8_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_8_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_3_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_3_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_3_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_8_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_8_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_8_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_cneg_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_cneg_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 
+.LSEH_info_cneg_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sub_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_sub_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_1_plus_i_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_1_plus_i_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x07,0x00 +.byte 0x00,0xe4,0x08,0x00 +.byte 0x00,0xd4,0x09,0x00 +.byte 0x00,0xc4,0x0a,0x00 +.byte 0x00,0x34,0x0b,0x00 +.byte 0x00,0x54,0x0c,0x00 +.byte 0x00,0x74,0x0e,0x00 +.byte 0x00,0x64,0x0f,0x00 +.byte 0x00,0xc2 +.byte 0x00,0x00 +.LSEH_info_mul_by_1_plus_i_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0_pty_mod_384_body: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sgn0_pty_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0_pty_mod_384x_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_sgn0_pty_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/add_mod_384x384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/coff/add_mod_384x384-x86_64.s new file mode 100644 index 0000000..79976cc --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/add_mod_384x384-x86_64.s @@ -0,0 +1,326 @@ +.text + +.def __add_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__add_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + addq 0(%rdx),%r8 + movq 56(%rsi),%r15 + adcq 8(%rdx),%r9 + movq 64(%rsi),%rax + adcq 16(%rdx),%r10 + movq 72(%rsi),%rbx + adcq 24(%rdx),%r11 + movq 80(%rsi),%rbp + adcq 32(%rdx),%r12 + movq 88(%rsi),%rsi + adcq 40(%rdx),%r13 + movq %r8,0(%rdi) + adcq 48(%rdx),%r14 + movq %r9,8(%rdi) + adcq 56(%rdx),%r15 + movq %r10,16(%rdi) + adcq 64(%rdx),%rax + movq 
%r12,32(%rdi) + movq %r14,%r8 + adcq 72(%rdx),%rbx + movq %r11,24(%rdi) + movq %r15,%r9 + adcq 80(%rdx),%rbp + movq %r13,40(%rdi) + movq %rax,%r10 + adcq 88(%rdx),%rsi + movq %rbx,%r11 + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %rbp,%r12 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%rbx + sbbq 32(%rcx),%rbp + movq %rsi,%r13 + sbbq 40(%rcx),%rsi + sbbq $0,%rdx + + cmovcq %r8,%r14 + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %r14,48(%rdi) + cmovcq %r11,%rbx + movq %r15,56(%rdi) + cmovcq %r12,%rbp + movq %rax,64(%rdi) + cmovcq %r13,%rsi + movq %rbx,72(%rdi) + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.def __sub_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.globl add_mod_384x384 + +.def add_mod_384x384; .scl 2; .type 32; .endef +.p2align 5 +add_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_384x384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_add_mod_384x384: + + + call __add_mod_384x384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_add_mod_384x384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_mod_384x384: + +.globl sub_mod_384x384 + +.def sub_mod_384x384; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_384x384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sub_mod_384x384: + + + call __sub_mod_384x384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sub_mod_384x384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_384x384: +.section .pdata +.p2align 2 +.rva .LSEH_begin_add_mod_384x384 +.rva .LSEH_body_add_mod_384x384 +.rva .LSEH_info_add_mod_384x384_prologue + +.rva .LSEH_body_add_mod_384x384 +.rva 
.LSEH_epilogue_add_mod_384x384 +.rva .LSEH_info_add_mod_384x384_body + +.rva .LSEH_epilogue_add_mod_384x384 +.rva .LSEH_end_add_mod_384x384 +.rva .LSEH_info_add_mod_384x384_epilogue + +.rva .LSEH_begin_sub_mod_384x384 +.rva .LSEH_body_sub_mod_384x384 +.rva .LSEH_info_sub_mod_384x384_prologue + +.rva .LSEH_body_sub_mod_384x384 +.rva .LSEH_epilogue_sub_mod_384x384 +.rva .LSEH_info_sub_mod_384x384_body + +.rva .LSEH_epilogue_sub_mod_384x384 +.rva .LSEH_end_sub_mod_384x384 +.rva .LSEH_info_sub_mod_384x384_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_add_mod_384x384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_mod_384x384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_add_mod_384x384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_384x384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_mod_384x384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sub_mod_384x384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/ct_inverse_mod_256-armv8.S b/temp-deps/blst-ts/deps/blst/build/coff/ct_inverse_mod_256-armv8.S new file mode 100644 index 0000000..17c3d25 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/ct_inverse_mod_256-armv8.S @@ -0,0 +1,798 @@ +.text + +.globl ct_inverse_mod_256 +.def ct_inverse_mod_256; +.type 32; +.endef +.p2align 5 +ct_inverse_mod_256: +.long 3573752639 + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl .Lab_approximation_31_256_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst 
|a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc 
x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] 
last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 +.long 3573752767 + ret + + +//////////////////////////////////////////////////////////////////////// +.def __smul_256x63; +.type 32; +.endef +.p2align 5 +__smul_256x63: + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + csel x22, x22, xzr, ne + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in __smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + csel x23, x23, xzr, ne + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret + + +.def __smul_512x63_tail; +.type 32; +.endef +.p2align 5 +__smul_512x63_tail: + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, [x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume 
|v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr x25, x24, #63 + + eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret + + +.def __smul_256_n_shift_by_31; +.type 32; +.endef +.p2align 5 +__smul_256_n_shift_by_31: + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret + +.def __ab_approximation_31_256; +.type 32; +.endef +.p2align 4 +__ab_approximation_31_256: + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +.Lab_approximation_31_256_loaded: + orr x19, x7, x11 // check top-most limbs, ... + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x5, ne + orr x19, x7, x11 // and ones before top-most, ... + csel x10, x10, x9, ne + + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x4, ne + orr x19, x7, x11 // and one more, ... 
+ csel x10, x10, x8, ne + + clz x19, x19 + cmp x19, #64 + csel x19, x19, xzr, ne + csel x7, x7, x6, ne + csel x11, x11, x10, ne + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret + + +.def __inner_loop_31_256; +.type 32; +.endef +.p2align 4 +__inner_loop_31_256: + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +.Loop_31_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x15, x15, x13, hs // exchange |fg0| and |fg1| + csel x13, x13, x19, hs + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, .Loop_31_256 + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret + + +.def __inner_loop_62_256; +.type 32; +.endef +.p2align 4 +__inner_loop_62_256: + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +.Loop_62_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov x20, x13 + csel x12, x12, x14, hs // exchange |f0| and |f1| + csel x14, x14, x19, hs + csel x13, x13, x15, hs // exchange |g0| and |g1| + csel x15, x15, x20, hs + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, .Loop_62_256 + + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/ct_inverse_mod_256-x86_64.s b/temp-deps/blst-ts/deps/blst/build/coff/ct_inverse_mod_256-x86_64.s new file mode 100644 index 0000000..e7d4a63 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/ct_inverse_mod_256-x86_64.s @@ -0,0 +1,1209 @@ +.text + +.globl ct_inverse_mod_256 +.def ct_inverse_mod_256; .scl 2; .type 32; .endef +.p2align 5 +ct_inverse_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_ct_inverse_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $1072,%rsp + +.LSEH_body_ct_inverse_mod_256: + + + leaq 48+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + movq 0(%rdx),%r12 + movq 8(%rdx),%r13 + movq 16(%rdx),%r14 + movq 24(%rdx),%r15 + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + + movq %r12,32(%rax) + movq %r13,40(%rax) + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rax,%rsi + + + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,64(%rdi) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,72(%rdi) + + + xorq $256,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + + movq 64(%rsi),%r8 + movq 104(%rsi),%r12 + movq %r8,%r9 + imulq 0(%rsp),%r8 + movq %r12,%r13 + imulq 8(%rsp),%r12 + addq %r12,%r8 + movq %r8,32(%rdi) + sarq $63,%r8 + movq %r8,40(%rdi) + movq %r8,48(%rdi) + movq %r8,56(%rdi) + movq %r8,64(%rdi) + leaq 64(%rsi),%rsi + + imulq %rdx,%r9 + imulq %rcx,%r13 + addq %r13,%r9 + movq %r9,72(%rdi) + sarq $63,%r9 + movq %r9,80(%rdi) + movq %r9,88(%rdi) + movq %r9,96(%rdi) + movq %r9,104(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + 
xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + sarq $63,%rbp + movq %rbp,40(%rdi) + movq %rbp,48(%rdi) + movq %rbp,56(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + 
leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + + xorq $256+64,%rsi + movl $47,%edx + + movq 0(%rsi),%r8 + + movq 32(%rsi),%r10 + + call __inner_loop_62_256 + + + + + + + + leaq 64(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_512x63 + adcq %rbp,%rdx + + movq 40(%rsp),%rsi + movq %rdx,%rax + sarq $63,%rdx + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + adcq $0,%rax + + movq %rax,%rdx + negq %rax + orq %rax,%rdx + sarq $63,%rax + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + xorq %rax,%r8 + xorq %rcx,%rcx + xorq %rax,%r9 + subq %rax,%rcx + xorq %rax,%r10 + xorq %rax,%rdx + addq %rcx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 
1072(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_ct_inverse_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_ct_inverse_mod_256: +.def __smulq_512x63; .scl 3; .type 32; .endef +.p2align 5 +__smulq_512x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %r9,8(%rdi) + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %r10,16(%rdi) + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %r11,24(%rdi) + + movq 40(%rsi),%r8 + movq 48(%rsi),%r9 + movq 56(%rsi),%r10 + movq 64(%rsi),%r11 + movq 72(%rsi),%r12 + movq 80(%rsi),%r13 + movq 88(%rsi),%r14 + movq 96(%rsi),%r15 + + movq %rcx,%rdx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rcx + addq %rax,%rcx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rcx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rcx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rcx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rcx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rcx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rcx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rcx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + imulq %rcx + addq %rax,%r15 + adcq $0,%rdx + + movq %rbp,%rbx + sarq $63,%rbp + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %rbp,%r14 + adcq %rbp,%r15 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 + + +.def __smulq_256x63; .scl 3; .type 32; .endef +.p2align 5 +__smulq_256x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %rcx,%rdx + movq 40+0(%rsi),%r12 + movq 40+8(%rsi),%r13 + movq 40+16(%rsi),%r14 + movq 40+24(%rsi),%r15 + movq 40+32(%rsi),%rcx + + movq %rdx,%rbx + sarq $63,%rdx + xorq 
%rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rcx + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rcx + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + andq %rbx,%rcx + negq %rcx + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rbp,32(%rdi) + + .byte 0xf3,0xc3 + +.def __smulq_256_n_shift_by_31; .scl 3; .type 32; .endef +.p2align 5 +__smulq_256_n_shift_by_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,0(%rdi) + movq %rcx,8(%rdi) + movq %rdx,%rbp + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + + movq %rbp,%rbx + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rbx + addq %rax,%rbx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + andq %rbx,%rbp + negq %rbp + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r14 + movq 32+24(%rsi),%r15 + + movq %rcx,%rbx + sarq $63,%rcx + xorq %rax,%rax + subq %rcx,%rax + + xorq %rcx,%rbx + addq %rax,%rbx + + xorq %rcx,%r12 + xorq %rcx,%r13 + xorq %rcx,%r14 + xorq %rcx,%r15 + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + andq %rbx,%rcx + negq %rcx + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq 0(%rdi),%rdx + movq 8(%rdi),%rcx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%rbp,%r11 + + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + xorq %rbp,%rdx + xorq %rbp,%rcx + addq %rax,%rdx + addq %rax,%rcx + + .byte 0xf3,0xc3 + +.def __ab_approximation_31_256; .scl 3; .type 32; .endef +.p2align 5 +__ab_approximation_31_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 24(%rsi),%r9 + movq 56(%rsi),%r11 + movq 16(%rsi),%rbx + movq 48(%rsi),%rbp + movq 8(%rsi),%r8 + movq 40(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 32(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + notq %rax + andq %rax,%r9 + andq %rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + 
jmp __inner_loop_31_256 + + .byte 0xf3,0xc3 + +.def __inner_loop_31_256; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_31_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31_256: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edx + jnz .Loop_31_256 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 + + +.def __inner_loop_62_256; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_62_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movl %edx,%r15d + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq %rdx,%r13 + movq %rdx,%r14 + +.Loop_62_256: + xorq %rax,%rax + testq %r14,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq %r14,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%r15d + jnz .Loop_62_256 + + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_ct_inverse_mod_256 +.rva .LSEH_body_ct_inverse_mod_256 +.rva .LSEH_info_ct_inverse_mod_256_prologue + +.rva .LSEH_body_ct_inverse_mod_256 +.rva .LSEH_epilogue_ct_inverse_mod_256 +.rva .LSEH_info_ct_inverse_mod_256_body + +.rva .LSEH_epilogue_ct_inverse_mod_256 +.rva .LSEH_end_ct_inverse_mod_256 +.rva .LSEH_info_ct_inverse_mod_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_ct_inverse_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_ct_inverse_mod_256_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x86,0x00 +.byte 0x00,0xe4,0x87,0x00 +.byte 0x00,0xd4,0x88,0x00 +.byte 0x00,0xc4,0x89,0x00 +.byte 0x00,0x34,0x8a,0x00 +.byte 0x00,0x54,0x8b,0x00 +.byte 0x00,0x74,0x8d,0x00 +.byte 0x00,0x64,0x8e,0x00 +.byte 0x00,0x01,0x8c,0x00 +.LSEH_info_ct_inverse_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/ct_inverse_mod_384-armv8.S b/temp-deps/blst-ts/deps/blst/build/coff/ct_inverse_mod_384-armv8.S new file mode 100644 index 0000000..65193f1 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/ct_inverse_mod_384-armv8.S @@ -0,0 +1,729 @@ +.text + +.globl ct_inverse_mod_383 +.def ct_inverse_mod_383; +.type 32; +.endef +.p2align 5 +ct_inverse_mod_383: +.long 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl .Lab_approximation_62_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 
// corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src 
|a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 +.long 3573752767 + ret + + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
+.def __smul_383x63; +.type 32; +.endef +.p2align 5 +__smul_383x63: + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret + + +.def __smul_767x63_tail; +.type 32; +.endef +.p2align 5 +__smul_767x63_tail: + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret + + +.def __smul_383_n_shift_by_62; +.type 32; +.endef +.p2align 5 +__smul_383_n_shift_by_62: + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, 
x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + adcs x6, x6, xzr + umulh x23, x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret + +.def __ab_approximation_62; +.type 32; +.endef +.p2align 4 +__ab_approximation_62: + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +.Lab_approximation_62_loaded: + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x22, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x22, x8, x14 // ... and ones before that ... 
+ csel x13, x13, x11, ne + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x22, x8, x14 + csel x13, x13, x10, ne + + clz x22, x22 + cmp x22, #64 + csel x22, x22, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret + +.def __inner_loop_62; +.type 32; +.endef +.p2align 4 +__inner_loop_62: + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +.Loop_62: + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + csel x9, x9, x3, hs // |b_| = |a_| + csel x14, x14, x8, hs + csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x8, x27, x25, hs + csel x15, x15, x17, hs // exchange |f0| and |f1| + csel x17, x17, x22, hs + csel x16, x16, x19, hs // exchange |g0| and |g1| + csel x19, x19, x23, hs + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, .Loop_62 + + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/ct_is_square_mod_384-armv8.S b/temp-deps/blst-ts/deps/blst/build/coff/ct_is_square_mod_384-armv8.S new file mode 100644 index 0000000..34336ff --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/ct_is_square_mod_384-armv8.S @@ -0,0 +1,334 @@ +.text + +.globl ct_is_square_mod_384 +.def ct_is_square_mod_384; +.type 32; +.endef +.p2align 5 +ct_is_square_mod_384: +.long 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the .Legendre symbol + mov x15, #24 // 24 is 768/30-1 + b .Loop_is_square + +.p2align 4 +.Loop_is_square: + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, .Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // and loaded + //ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 +.long 3573752767 + ret + + +.def __smul_384_n_shift_by_30; +.type 32; +.endef +.p2align 5 +__smul_384_n_shift_by_30: + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr x7, x8, x7, #30 + extr x8, x9, x8, #30 + + eor x3, 
x3, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret + +.def __ab_approximation_30; +.type 32; +.endef +.p2align 4 +__ab_approximation_30: + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x21, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x21, x8, x14 // ... and ones before that ... + csel x13, x13, x11, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x21, x8, x14 // and one more, ... + csel x13, x13, x10, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x3, ne + orr x21, x8, x14 + csel x13, x13, x9, ne + + clz x21, x21 + cmp x21, #64 + csel x21, x21, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret + + +.def __inner_loop_30; +.type 32; +.endef +.p2align 4 +__inner_loop_30: + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +.Loop_30: + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + csel x14, x14, x8, hs // |b_| = |a_| + csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x20, x20, x17, hs // exchange |fg0| and |fg1| + csel x17, x17, x21, hs + csel x2, x2, x25, hs + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, .Loop_30 + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret + +.def __inner_loop_48; +.type 32; +.endef +.p2align 4 +__inner_loop_48: +.Loop_48: + sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x3, x9 + sub x15, x15, #1 + and x21, x9, x24 + sub x22, x9, x3 // |b_|-|a_| + subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + csel x9, x9, x3, hs // |b_| = |a_| + csel x3, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x2, x2, x25, hs + add x23, x9, #2 + lsr x3, x3, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, .Loop_48 + + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/ct_is_square_mod_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/coff/ct_is_square_mod_384-x86_64.s new file mode 100644 index 
0000000..ee47903 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/ct_is_square_mod_384-x86_64.s @@ -0,0 +1,505 @@ +.text + +.globl ct_is_square_mod_384 +.def ct_is_square_mod_384; .scl 2; .type 32; .endef +.p2align 5 +ct_is_square_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_ct_is_square_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $536,%rsp + +.LSEH_body_ct_is_square_mod_384: + + + leaq 24+255(%rsp),%rax + andq $-256,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbx + movq 24(%rsi),%rcx + movq 32(%rsi),%rdx + movq 40(%rsi),%rdi + movq %rax,%rsi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rcx,72(%rax) + movq %rdx,80(%rax) + movq %rdi,88(%rax) + + xorq %rbp,%rbp + movl $24,%ecx + jmp .Loop_is_square + +.p2align 5 +.Loop_is_square: + movl %ecx,16(%rsp) + + call __ab_approximation_30 + movq %rax,0(%rsp) + movq %rbx,8(%rsp) + + movq $128+48,%rdi + xorq %rsi,%rdi + call __smulq_384_n_shift_by_30 + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq -48(%rdi),%rdi + call __smulq_384_n_shift_by_30 + + movl 16(%rsp),%ecx + xorq $128,%rsi + + andq 48(%rdi),%r14 + shrq $1,%r14 + addq %r14,%rbp + + subl $1,%ecx + jnz .Loop_is_square + + + + + movq 48(%rsi),%r9 + call __inner_loop_48 + + movq $1,%rax + andq %rbp,%rax + xorq $1,%rax + + leaq 536(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_ct_is_square_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_ct_is_square_mod_384: + +.def __smulq_384_n_shift_by_30; .scl 3; .type 32; .endef +.p2align 5 +__smulq_384_n_shift_by_30: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r14 + andq %rbx,%r14 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r14 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r14 + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq 
$0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r15 + andq %rbx,%r15 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r15 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r15 + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %r15,%r14 + + shrdq $30,%r9,%r8 + shrdq $30,%r10,%r9 + shrdq $30,%r11,%r10 + shrdq $30,%r12,%r11 + shrdq $30,%r13,%r12 + shrdq $30,%r14,%r13 + + sarq $63,%r14 + xorq %rbx,%rbx + subq %r14,%rbx + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.def __ab_approximation_30; .scl 3; .type 32; .endef +.p2align 5 +__ab_approximation_30: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 88(%rsi),%rbx + movq 80(%rsi),%r15 + movq 72(%rsi),%r14 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r11,%r12 + movq 64(%rsi),%r11 + cmovzq %r14,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r10,%r12 + movq 56(%rsi),%r10 + cmovzq %r11,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r9,%r12 + movq 48(%rsi),%r9 + cmovzq %r10,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r8,%r12 + cmovzq %r9,%r15 + + movq %r13,%rax + orq %rbx,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r13 + cmovzq %r9,%rbx + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%r12,%r13 + shldq %cl,%r15,%rbx + + movq $0xFFFFFFFF00000000,%rax + movl %r8d,%r8d + movl %r9d,%r9d + andq %rax,%r13 + andq %rax,%rbx + orq %r13,%r8 + orq %rbx,%r9 + + jmp __inner_loop_30 + + .byte 0xf3,0xc3 + +.def __inner_loop_30; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_30: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rbx + movq $0x800000007FFFFFFF,%rcx + leaq -1(%rbx),%r15 + movl $30,%edi + +.Loop_30: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbx,%r12 + movq %rcx,%r13 + movq %rbp,%r14 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rcx,%rbx + cmovbq %r12,%rcx + cmovbq %rax,%rbp + + subq %r9,%r8 + subq %rcx,%rbx + addq %r15,%rbx + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbx + cmovzq %r13,%rcx + cmovzq %r14,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rcx,%rcx + leaq (%rax,%rbp,1),%rbp + subq %r15,%rcx + + subl $1,%edi + jnz .Loop_30 + + shrq $32,%r15 + movl %ebx,%eax + shrq $32,%rbx + movl %ecx,%edx + shrq $32,%rcx + subq %r15,%rax + subq %r15,%rbx + subq %r15,%rdx + subq %r15,%rcx + + .byte 0xf3,0xc3 + + +.def __inner_loop_48; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_48: + .byte 0xf3,0x0f,0x1e,0xfa + + movl $48,%edi + +.Loop_48: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbp,%r12 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rax,%rbp + + 
subq %r9,%r8 + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rax,%rbp + + subl $1,%edi + jnz .Loop_48 + + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_ct_is_square_mod_384 +.rva .LSEH_body_ct_is_square_mod_384 +.rva .LSEH_info_ct_is_square_mod_384_prologue + +.rva .LSEH_body_ct_is_square_mod_384 +.rva .LSEH_epilogue_ct_is_square_mod_384 +.rva .LSEH_info_ct_is_square_mod_384_body + +.rva .LSEH_epilogue_ct_is_square_mod_384 +.rva .LSEH_end_ct_is_square_mod_384 +.rva .LSEH_info_ct_is_square_mod_384_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_ct_is_square_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_ct_is_square_mod_384_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x43,0x00 +.byte 0x00,0xe4,0x44,0x00 +.byte 0x00,0xd4,0x45,0x00 +.byte 0x00,0xc4,0x46,0x00 +.byte 0x00,0x34,0x47,0x00 +.byte 0x00,0x54,0x48,0x00 +.byte 0x00,0x74,0x4a,0x00 +.byte 0x00,0x64,0x4b,0x00 +.byte 0x00,0x01,0x49,0x00 +.LSEH_info_ct_is_square_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/ctq_inverse_mod_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/coff/ctq_inverse_mod_384-x86_64.s new file mode 100644 index 0000000..42f058a --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/ctq_inverse_mod_384-x86_64.s @@ -0,0 +1,1221 @@ +.text + +.globl ct_inverse_mod_383 +.def ct_inverse_mod_383; .scl 2; .type 32; .endef +.p2align 5 +ct_inverse_mod_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_ct_inverse_mod_383: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $1112,%rsp + +.LSEH_body_ct_inverse_mod_383: + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + 
+ movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + 
+ movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + xorq $256+96,%rsi + movl $62,%edi + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 48(%rsi),%r10 + movq 56(%rsi),%r11 + call __inner_loop_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + movq %r8,0(%rdi) + movq %r10,48(%rdi) + + + + leaq 96(%rsi),%rsi + leaq 96(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + + xorq $256+96,%rsi + movl $22,%edi + + movq 0(%rsi),%r8 + xorq %r9,%r9 + movq 48(%rsi),%r10 + xorq %r11,%r11 + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_ct_inverse_mod_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_ct_inverse_mod_383: +.def __smulq_767x63; .scl 3; .type 32; .endef +.p2align 5 +__smulq_767x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + 
movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + movq %r9,8(%rdi) + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + movq %r10,16(%rdi) + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %r11,24(%rdi) + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + movq %r12,32(%rdi) + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + movq %r13,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + movq %rdx,%rsi + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rsi + addq %rax,%rsi + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rbx + xorq %rdx,%rbp + xorq %rdx,%rcx + xorq %rdx,%rdi + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulq %rsi + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rsi + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rsi + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rsi + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rsi + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rsi + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %rdx,%rbx + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + mulq %rsi + addq %rax,%rbp + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rcx + mulq %rsi + addq %rax,%rcx + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%rdi + movq 8(%rsp),%rdx + imulq %rsi,%rax + movq 16(%rsp),%rsi + addq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 + +.def __smulq_383x63; .scl 3; .type 32; .endef +.p2align 5 +__smulq_383x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq 
%rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.def __smulq_383_n_shift_by_62; .scl 3; .type 32; .endef +.p2align 5 +__smulq_383_n_shift_by_62: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq 48(%rsi),%rsi + movq %rdx,%r14 + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq 
%rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $62,%r9,%r8 + shrdq $62,%r10,%r9 + shrdq $62,%r11,%r10 + shrdq $62,%r12,%r11 + shrdq $62,%r13,%r12 + shrdq $62,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 + +.def __ab_approximation_62; .scl 3; .type 32; .endef +.p2align 5 +__ab_approximation_62: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 16(%rsi),%r8 + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 8(%rsi),%r8 + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 0(%rsi),%r8 + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + jmp __inner_loop_62 + + .byte 0xf3,0xc3 + +.def __inner_loop_62; .scl 3; .type 32; .endef +.p2align 3 +.long 0 +__inner_loop_62: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + movq %rsi,8(%rsp) + +.Loop_62: + xorq %rax,%rax + xorq %rbx,%rbx + testq $1,%r8 + movq %r10,%rbp + movq %r11,%r14 + cmovnzq %r10,%rax + cmovnzq %r11,%rbx + subq %r8,%rbp + sbbq %r9,%r14 + movq %r8,%r15 + movq %r9,%rsi + subq %rax,%r8 + sbbq %rbx,%r9 + cmovcq %rbp,%r8 + cmovcq %r14,%r9 + cmovcq %r15,%r10 + cmovcq %rsi,%r11 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrdq $1,%r9,%r8 + shrq $1,%r9 + testq $1,%r15 + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + movq 8(%rsp),%rsi + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_ct_inverse_mod_383 +.rva .LSEH_body_ct_inverse_mod_383 +.rva .LSEH_info_ct_inverse_mod_383_prologue + +.rva .LSEH_body_ct_inverse_mod_383 +.rva .LSEH_epilogue_ct_inverse_mod_383 +.rva .LSEH_info_ct_inverse_mod_383_body + +.rva .LSEH_epilogue_ct_inverse_mod_383 +.rva .LSEH_end_ct_inverse_mod_383 +.rva .LSEH_info_ct_inverse_mod_383_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_ct_inverse_mod_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_ct_inverse_mod_383_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x8b,0x00 +.byte 0x00,0xe4,0x8c,0x00 +.byte 0x00,0xd4,0x8d,0x00 +.byte 
0x00,0xc4,0x8e,0x00 +.byte 0x00,0x34,0x8f,0x00 +.byte 0x00,0x54,0x90,0x00 +.byte 0x00,0x74,0x92,0x00 +.byte 0x00,0x64,0x93,0x00 +.byte 0x00,0x01,0x91,0x00 +.LSEH_info_ct_inverse_mod_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/ctx_inverse_mod_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/coff/ctx_inverse_mod_384-x86_64.s new file mode 100644 index 0000000..7c13e56 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/ctx_inverse_mod_384-x86_64.s @@ -0,0 +1,1596 @@ +.text + +.globl ctx_inverse_mod_383 +.def ctx_inverse_mod_383; .scl 2; .type 32; .endef +.p2align 5 +ctx_inverse_mod_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_ctx_inverse_mod_383: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $1112,%rsp + +.LSEH_body_ctx_inverse_mod_383: + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq 
%r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + 
call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq 
%rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + 
xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + + xorq $256+96,%rsi + movl $53,%edi + + movq 0(%rsi),%r8 + + movq 48(%rsi),%r10 + + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulx_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_ctx_inverse_mod_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_ctx_inverse_mod_383: +.def __smulx_767x63; .scl 3; .type 32; .endef +.p2align 5 +__smulx_767x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + movq %rcx,%rax + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + sarq $63,%rax + xorq %rsi,%rsi + subq %rax,%rsi + + xorq %rax,%rdx + addq %rsi,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %rax,%r13 + xorq %rax,%r14 + xorq %rax,%r15 + xorq %rax,%rbx + xorq %rax,%rbp + xorq %rax,%rcx + xorq %rax,%rdi + addq %rsi,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulxq %r8,%r8,%rax + mulxq %r9,%r9,%rsi + addq %rax,%r9 + mulxq %r10,%r10,%rax + adcq %rsi,%r10 + mulxq %r11,%r11,%rsi + adcq %rax,%r11 + mulxq %r12,%r12,%rax + adcq %rsi,%r12 + mulxq %r13,%r13,%rsi + 
adcq %rax,%r13 + mulxq %r14,%r14,%rax + adcq %rsi,%r14 + mulxq %r15,%r15,%rsi + adcq %rax,%r15 + mulxq %rbx,%rbx,%rax + adcq %rsi,%rbx + mulxq %rbp,%rbp,%rsi + adcq %rax,%rbp + mulxq %rcx,%rcx,%rax + adcq %rsi,%rcx + mulxq %rdi,%rdi,%rsi + movq 8(%rsp),%rdx + movq 16(%rsp),%rsi + adcq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 + +.def __smulx_383x63; .scl 3; .type 32; .endef +.p2align 5 +__smulx_383x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + movq %rcx,%rdx + adcq %rbp,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + adcq %rbp,%r13 + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.def __smulx_383_n_shift_by_31; .scl 3; .type 32; .endef +.p2align 5 +__smulx_383_n_shift_by_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + xorq %r14,%r14 + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq %rdx,%r14 + + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq 
%r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%rax + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%r12,%r11 + shrdq $31,%rax,%r12 + shrdq $31,%r14,%rax + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 + +.def __smulx_191_n_shift_by_31; .scl 3; .type 32; .endef +.p2align 5 +__smulx_191_n_shift_by_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %r10,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r10 + addq %rbp,%r9 + adcq $0,%r10 + imulq %rdx + addq %rax,%r10 + adcq $0,%rdx + movq %rdx,%r14 + movq %rcx,%rdx + movq 48+0(%rsi),%r11 + movq 48+8(%rsi),%r12 + movq 48+16(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r11,%r11,%rbp + mulxq %r12,%r12,%r13 + addq %rbp,%r12 + adcq $0,%r13 + imulq %rdx + addq %rax,%r13 + adcq $0,%rdx + addq %r8,%r11 + adcq %r9,%r12 + adcq %r10,%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r12,%r11 + shrdq $31,%r13,%r12 + shrdq $31,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 + +.def __ab_approximation_31; .scl 3; .type 32; .endef +.p2align 5 +__ab_approximation_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 16(%rsi),%r8 + cmovzq %r10,%rbp + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 8(%rsi),%r8 + cmovzq %r10,%rbp + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq 
%rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + andnq %r9,%rax,%r9 + andnq %r11,%rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31 + + .byte 0xf3,0xc3 + +.def __inner_loop_31; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edi + jnz .Loop_31 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 + + +.def __inner_loop_62; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_62: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + +.Loop_62: + xorq %rax,%rax + testq $1,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq $1,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_ctx_inverse_mod_383 +.rva .LSEH_body_ctx_inverse_mod_383 +.rva .LSEH_info_ctx_inverse_mod_383_prologue + +.rva .LSEH_body_ctx_inverse_mod_383 +.rva .LSEH_epilogue_ctx_inverse_mod_383 +.rva .LSEH_info_ctx_inverse_mod_383_body + +.rva .LSEH_epilogue_ctx_inverse_mod_383 +.rva .LSEH_end_ctx_inverse_mod_383 +.rva .LSEH_info_ctx_inverse_mod_383_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_ctx_inverse_mod_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_ctx_inverse_mod_383_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x8b,0x00 +.byte 0x00,0xe4,0x8c,0x00 +.byte 0x00,0xd4,0x8d,0x00 +.byte 0x00,0xc4,0x8e,0x00 +.byte 0x00,0x34,0x8f,0x00 +.byte 0x00,0x54,0x90,0x00 +.byte 0x00,0x74,0x92,0x00 +.byte 0x00,0x64,0x93,0x00 +.byte 0x00,0x01,0x91,0x00 +.LSEH_info_ctx_inverse_mod_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/div3w-armv8.S b/temp-deps/blst-ts/deps/blst/build/coff/div3w-armv8.S new file mode 100644 index 0000000..c17b9e3 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/div3w-armv8.S @@ -0,0 +1,94 @@ +.text + +.globl div_3_limbs +.def div_3_limbs; +.type 32; +.endef +.p2align 5 +div_3_limbs: + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +.Loop: + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csel 
x4,x4,x6,lo // select between R and R - D + extr x1,x2,x1,#1 // D >>= 1 + csel x5,x5,x7,lo + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,.Loop + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + specilative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret + +.globl quot_rem_128 +.def quot_rem_128; +.type 32; +.endef +.p2align 5 +quot_rem_128: + ldp x3,x4,[x1] + + mul x5,x3,x2 // divisor[0:1} * quotient + umulh x6,x3,x2 + mul x11, x4,x2 + umulh x7,x4,x2 + + ldp x8,x9,[x0] // load 3 limbs of the dividend + ldr x10,[x0,#16] + + adds x6,x6,x11 + adc x7,x7,xzr + + subs x8,x8,x5 // dividend - divisor * quotient + sbcs x9,x9,x6 + sbcs x10,x10,x7 + sbc x5,xzr,xzr // borrow -> mask + + add x2,x2,x5 // if borrowed, adjust the quotient ... + and x3,x3,x5 + and x4,x4,x5 + adds x8,x8,x3 // ... and add divisor + adc x9,x9,x4 + + stp x8,x9,[x0] // save 2 limbs of the remainder + str x2,[x0,#16] // and one limb of the quotient + + mov x0,x2 // return adjusted quotient + + ret + + +.globl quot_rem_64 +.def quot_rem_64; +.type 32; +.endef +.p2align 5 +quot_rem_64: + ldr x3,[x1] + ldr x8,[x0] // load 1 limb of the dividend + + mul x5,x3,x2 // divisor * quotient + + sub x8,x8,x5 // dividend - divisor * quotient + + stp x8,x2,[x0] // save remainder and quotient + + mov x0,x2 // return quotient + + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/div3w-x86_64.s b/temp-deps/blst-ts/deps/blst/build/coff/div3w-x86_64.s new file mode 100644 index 0000000..fcfe544 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/div3w-x86_64.s @@ -0,0 +1,140 @@ +.text + +.globl div_3_limbs + +.def div_3_limbs; .scl 2; .type 32; .endef +.p2align 5 +div_3_limbs: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_div_3_limbs: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + movq (%rdi),%r8 + movq 8(%rdi),%r9 + xorq %rax,%rax + movl $64,%ecx + +.Loop: + movq %r8,%r10 + subq %rsi,%r8 + movq %r9,%r11 + sbbq %rdx,%r9 + leaq 1(%rax,%rax,1),%rax + movq %rdx,%rdi + cmovcq %r10,%r8 + cmovcq %r11,%r9 + sbbq $0,%rax + shlq $63,%rdi + shrq $1,%rsi + shrq $1,%rdx + orq %rdi,%rsi + subl $1,%ecx + jnz .Loop + + leaq 1(%rax,%rax,1),%rcx + sarq $63,%rax + + subq %rsi,%r8 + sbbq %rdx,%r9 + sbbq $0,%rcx + + orq %rcx,%rax + + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_div_3_limbs: +.globl quot_rem_128 + +.def quot_rem_128; .scl 2; .type 32; .endef +.p2align 5 +quot_rem_128: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_quot_rem_128: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + movq %rdx,%rax + movq %rdx,%rcx + + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + adcq $0,%rdx + + movq 0(%rdi),%r10 + movq 8(%rdi),%r11 + movq 16(%rdi),%rax + + subq %r8,%r10 + sbbq %r9,%r11 + sbbq %rdx,%rax + sbbq %r8,%r8 + + addq %r8,%rcx + movq %r8,%r9 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + addq %r8,%r10 + adcq %r9,%r11 + + movq %r10,0(%rdi) + movq %r11,8(%rdi) + movq %rcx,16(%rdi) + + movq %rcx,%rax + + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_quot_rem_128: + + + + + +.globl quot_rem_64 + +.def quot_rem_64; .scl 2; .type 32; .endef +.p2align 5 +quot_rem_64: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_quot_rem_64: + movq %rcx,%rdi + 
movq %rdx,%rsi + movq %r8,%rdx + + movq %rdx,%rax + imulq 0(%rsi),%rdx + + movq 0(%rdi),%r10 + + subq %rdx,%r10 + + movq %r10,0(%rdi) + movq %rax,8(%rdi) + + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_quot_rem_64: +.section .pdata +.p2align 2 +.section .xdata +.p2align 3 diff --git a/temp-deps/blst-ts/deps/blst/build/coff/mul_mont_256-armv8.S b/temp-deps/blst-ts/deps/blst/build/coff/mul_mont_256-armv8.S new file mode 100644 index 0000000..8cadbb8 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/mul_mont_256-armv8.S @@ -0,0 +1,474 @@ +.text + +.globl mul_mont_sparse_256 + +.def mul_mont_sparse_256; +.type 32; +.endef +.p2align 5 +mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs 
x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csel x19,x19,x14,lo + csel x20,x20,x15,lo + csel x21,x21,x16,lo + csel x22,x22,x17,lo + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret + +.globl sqr_mont_sparse_256 + +.def sqr_mont_sparse_256; +.type 32; +.endef +.p2align 5 +sqr_mont_sparse_256: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + +.globl from_mont_256 + +.def from_mont_256; +.type 32; +.endef +.p2align 5 +from_mont_256: +.long 3573752639 + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 +.long 3573752767 + ret + + +.globl redc_mont_256 + +.def redc_mont_256; +.type 32; +.endef +.p2align 5 +redc_mont_256: +.long 3573752639 + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 +.long 3573752767 + ret + + +.def __mul_by_1_mont_256; +.type 32; +.endef +.p2align 5 +__mul_by_1_mont_256: + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/mul_mont_384-armv8.S b/temp-deps/blst-ts/deps/blst/build/coff/mul_mont_384-armv8.S new file mode 100644 index 0000000..074f38c --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/mul_mont_384-armv8.S @@ -0,0 +1,2424 @@ +.text + +.globl add_mod_384x384 +.def add_mod_384x384; +.type 32; +.endef +.p2align 5 +add_mod_384x384: +.long 3573752639 + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 +.long 3573752767 + ret + + +.def __add_mod_384x384; +.type 32; +.endef +.p2align 5 +__add_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + stp x11,x12,[x0,#48] + csel x15,x15,x23,lo + stp x13,x14,[x0,#64] + csel x16,x16,x24,lo + stp x15,x16,[x0,#80] + + ret + + +.globl sub_mod_384x384 +.def sub_mod_384x384; +.type 32; +.endef +.p2align 5 +sub_mod_384x384: +.long 3573752639 + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 +.long 3573752767 + ret + + +.def __sub_mod_384x384; +.type 32; +.endef +.p2align 5 +__sub_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret + + +.def __add_mod_384; +.type 32; +.endef +.p2align 5 +__add_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + stp x11,x12,[x0] + csel x16,x16,x24,lo + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.def __sub_mod_384; +.type 32; +.endef +.p2align 5 +__sub_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp 
x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.globl mul_mont_384x + +.def mul_mont_384x; +.type 32; +.endef +.p2align 5 +mul_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sqr_mont_384x + +.def sqr_mont_384x; +.type 32; +.endef +.p2align 5 +sqr_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x19,x11,x19,lo + csel x20,x12,x20,lo + csel x21,x13,x21,lo + ldp x11,x12,[sp] + csel x22,x14,x22,lo + ldr x17, [sp,#48] + csel x23,x15,x23,lo + ldp x13,x14,[sp,#16] + csel x24,x16,x24,lo + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl mul_mont_384 + +.def mul_mont_384; +.type 32; +.endef +.p2align 5 +mul_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.def __mul_mont_384; +.type 32; +.endef +.p2align 5 +__mul_mont_384: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh 
x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs 
x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csel x11,x19,x26,lo + csel x12,x20,x27,lo + csel x13,x21,x28,lo + csel x14,x22,x0,lo + csel x15,x23,x1,lo + csel x16,x24,x3,lo + ret + + +.globl sqr_mont_384 + +.def sqr_mont_384; +.type 32; +.endef +.p2align 5 +sqr_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sqr_n_mul_mont_383 + +.def sqr_n_mul_mont_383; +.type 32; +.endef +.p2align 5 +sqr_n_mul_mont_383: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +.Loop_sqr_383: + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,.Loop_sqr_383 + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + +.def __sqr_384; +.type 32; +.endef +.p2align 5 +__sqr_384: + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret + +.globl sqr_384 + +.def sqr_384; +.type 32; +.endef +.p2align 5 +sqr_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl redc_mont_384 + +.def redc_mont_384; +.type 32; +.endef +.p2align 5 +redc_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl from_mont_384 + +.def from_mont_384; +.type 32; +.endef +.p2align 5 +from_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.def __mul_by_1_mont_384; +.type 32; +.endef +.p2align 5 +__mul_by_1_mont_384: + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + 
adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret + + +.def __redc_tail_mont_384; +.type 32; +.endef +.p2align 5 +__redc_tail_mont_384: + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.globl mul_384 + +.def mul_384; +.type 32; +.endef +.p2align 5 +mul_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.def __mul_384; +.type 32; +.endef +.p2align 5 +__mul_384: + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret + + +.globl mul_382x + +.def mul_382x; +.type 32; +.endef +.p2align 5 +mul_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sqr_382x + +.def sqr_382x; +.type 32; +.endef +.p2align 5 +sqr_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sqr_mont_382x + +.def sqr_mont_382x; +.type 32; +.endef +.p2align 5 +sqr_mont_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.def __mul_mont_383_nonred; +.type 32; +.endef +.p2align 5 +__mul_mont_383_nonred: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc 
x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs 
x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret + + +.globl sgn0_pty_mont_384 + +.def sgn0_pty_mont_384; +.type 32; +.endef +.p2align 5 +sgn0_pty_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sgn0_pty_mont_384x + +.def sgn0_pty_mont_384x; +.type 32; +.endef +.p2align 5 +sgn0_pty_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/mulq_mont_256-x86_64.s b/temp-deps/blst-ts/deps/blst/build/coff/mulq_mont_256-x86_64.s new file mode 100644 index 0000000..dd1e00f --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/mulq_mont_256-x86_64.s @@ -0,0 +1,872 @@ +.text + +.globl mul_mont_sparse_256 + +.def mul_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +mul_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_mont_sparse_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_mul_mont_sparse_256: + + + movq 0(%rdx),%rax + movq 0(%rsi),%r13 + movq 8(%rsi),%r14 + movq 16(%rsi),%r12 + movq 24(%rsi),%rbp + movq %rdx,%rbx + + movq %rax,%r15 + mulq %r13 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_mont_sparse_256: + +.globl sqr_mont_sparse_256 + +.def sqr_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_sparse_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_sqr_mont_sparse_256: + + + movq 0(%rsi),%rax + movq %rcx,%r8 + movq 8(%rsi),%r14 + movq %rdx,%rcx + movq 16(%rsi),%r12 + leaq (%rsi),%rbx + movq 24(%rsi),%rbp + + movq %rax,%r15 + mulq %rax + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqr_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_sparse_256: +.def __mulq_mont_sparse_256; .scl 3; .type 32; .endef +.p2align 5 +__mulq_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + + mulq %r14 + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq 8(%rbx),%rax + adcq $0,%rdx + xorq %r14,%r14 + movq %rdx,%r13 + + movq %r9,%rdi + imulq %r8,%r9 + + + movq %rax,%r15 + mulq 0(%rsi) + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + xorq %r15,%r15 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r9,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rdi,%r10 + 
adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + addq %rdx,%r13 + adcq $0,%r14 + adcq $0,%r15 + movq %r10,%rdi + imulq %r8,%r10 + + + movq %rax,%r9 + mulq 0(%rsi) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + xorq %r9,%r9 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r10,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rdi,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r13 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + addq %rdx,%r14 + adcq $0,%r15 + adcq $0,%r9 + movq %r11,%rdi + imulq %r8,%r11 + + + movq %rax,%r10 + mulq 0(%rsi) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r9 + xorq %r10,%r10 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r11,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rdi,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + addq %rdx,%r15 + adcq $0,%r9 + adcq $0,%r10 + imulq %r8,%rax + movq 8(%rsp),%rsi + + + movq %rax,%r11 + mulq 0(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r12,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + movq %r14,%rbx + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rdx,%r9 + adcq $0,%r10 + + + + + movq %r15,%r12 + subq 0(%rcx),%r13 + sbbq 8(%rcx),%r14 + sbbq 16(%rcx),%r15 + movq %r9,%rbp + sbbq 24(%rcx),%r9 + sbbq $0,%r10 + + cmovcq %rax,%r13 + cmovcq %rbx,%r14 + cmovcq %r12,%r15 + movq %r13,0(%rsi) + cmovcq %rbp,%r9 + movq %r14,8(%rsi) + movq %r15,16(%rsi) + movq %r9,24(%rsi) + + .byte 0xf3,0xc3 + + +.globl from_mont_256 + +.def from_mont_256; .scl 2; .type 32; .endef +.p2align 5 +from_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_from_mont_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_from_mont_256: + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + + + + + movq %r14,%r10 + movq %r15,%r11 + movq %r9,%r12 + + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 
16(%rbx),%r15 + sbbq 24(%rbx),%r9 + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_from_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_from_mont_256: + +.globl redc_mont_256 + +.def redc_mont_256; .scl 2; .type 32; .endef +.p2align 5 +redc_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redc_mont_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redc_mont_256: + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + addq 32(%rsi),%r13 + adcq 40(%rsi),%r14 + movq %r13,%rax + adcq 48(%rsi),%r15 + movq %r14,%r10 + adcq 56(%rsi),%r9 + sbbq %rsi,%rsi + + + + + movq %r15,%r11 + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + movq %r9,%r12 + sbbq 24(%rbx),%r9 + sbbq $0,%rsi + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redc_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redc_mont_256: +.def __mulq_by_1_mont_256; .scl 3; .type 32; .endef +.p2align 5 +__mulq_by_1_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + movq %rax,%r13 + imulq %rcx,%rax + movq %rax,%r9 + + mulq 0(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq %rdx,%r13 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r10 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 16(%rbx) + movq %r10,%r14 + imulq %rcx,%r10 + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r11 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r13,%r12 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r9 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + 
+ mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_mul_mont_sparse_256 +.rva .LSEH_body_mul_mont_sparse_256 +.rva .LSEH_info_mul_mont_sparse_256_prologue + +.rva .LSEH_body_mul_mont_sparse_256 +.rva .LSEH_epilogue_mul_mont_sparse_256 +.rva .LSEH_info_mul_mont_sparse_256_body + +.rva .LSEH_epilogue_mul_mont_sparse_256 +.rva .LSEH_end_mul_mont_sparse_256 +.rva .LSEH_info_mul_mont_sparse_256_epilogue + +.rva .LSEH_begin_sqr_mont_sparse_256 +.rva .LSEH_body_sqr_mont_sparse_256 +.rva .LSEH_info_sqr_mont_sparse_256_prologue + +.rva .LSEH_body_sqr_mont_sparse_256 +.rva .LSEH_epilogue_sqr_mont_sparse_256 +.rva .LSEH_info_sqr_mont_sparse_256_body + +.rva .LSEH_epilogue_sqr_mont_sparse_256 +.rva .LSEH_end_sqr_mont_sparse_256 +.rva .LSEH_info_sqr_mont_sparse_256_epilogue + +.rva .LSEH_begin_from_mont_256 +.rva .LSEH_body_from_mont_256 +.rva .LSEH_info_from_mont_256_prologue + +.rva .LSEH_body_from_mont_256 +.rva .LSEH_epilogue_from_mont_256 +.rva .LSEH_info_from_mont_256_body + +.rva .LSEH_epilogue_from_mont_256 +.rva .LSEH_end_from_mont_256 +.rva .LSEH_info_from_mont_256_epilogue + +.rva .LSEH_begin_redc_mont_256 +.rva .LSEH_body_redc_mont_256 +.rva .LSEH_info_redc_mont_256_prologue + +.rva .LSEH_body_redc_mont_256 +.rva .LSEH_epilogue_redc_mont_256 +.rva .LSEH_info_redc_mont_256_body + +.rva .LSEH_epilogue_redc_mont_256 +.rva .LSEH_end_redc_mont_256 +.rva .LSEH_info_redc_mont_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mul_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqr_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_from_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_from_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_from_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redc_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_redc_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 
0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_redc_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/mulq_mont_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/coff/mulq_mont_384-x86_64.s new file mode 100644 index 0000000..5663463 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/mulq_mont_384-x86_64.s @@ -0,0 +1,4206 @@ +.text + + + + + + + +.def __sub_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.def __add_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.def __sub_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.globl mul_mont_384x + +.def mul_mont_384x; 
.scl 2; .type 32; .endef +.p2align 5 +mul_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $328,%rsp + +.LSEH_body_mul_mont_384x: + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulq_384 + + + leaq 48(%rbx),%rbx + leaq 48(%rsi),%rsi + leaq 40+96(%rsp),%rdi + call __mulq_384 + + + movq 8(%rsp),%rcx + leaq -48(%rsi),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulq_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + movq %rcx,%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mul_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_mont_384x: +.globl sqr_mont_384x + +.def sqr_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_mont_384x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + call __mulq_mont_384 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + movq %r14,%r12 + adcq %r9,%r9 + movq %r15,%r13 + adcq %r10,%r10 + movq %r8,%rax + adcq %r11,%r11 + movq %r9,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r10,%rbp + sbbq 16(%rcx),%r8 + sbbq 24(%rcx),%r9 + sbbq 32(%rcx),%r10 + movq %r11,%rsi + sbbq 40(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r14 + cmovcq %r13,%r15 + cmovcq %rax,%r8 + movq %r14,48(%rdi) + cmovcq %rbx,%r9 + movq %r15,56(%rdi) + cmovcq %rbp,%r10 + movq %r8,64(%rdi) + cmovcq %rsi,%r11 + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_mont_384x: + mov 8(%rsp),%rdi + mov 
16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_384x: + +.globl mul_382x + +.def mul_382x; .scl 2; .type 32; .endef +.p2align 5 +mul_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_mul_382x: + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulq_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulq_384 + + + leaq 48(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulq_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mul_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_382x: +.globl sqr_382x + +.def sqr_382x; .scl 2; .type 32; .endef +.p2align 5 +sqr_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_sqr_382x: + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulq_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulq_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + 
movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqr_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_382x: +.globl mul_384 + +.def mul_384; .scl 2; .type 32; .endef +.p2align 5 +mul_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + +.LSEH_body_mul_384: + + + movq %rdx,%rbx + call __mulq_384 + + movq 0(%rsp),%r12 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_mul_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_384: + +.def __mulq_384; .scl 3; .type 32; .endef +.p2align 5 +__mulq_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rax + + movq %rax,%rbp + mulq 0(%rsi) + movq %rax,0(%rdi) + movq %rbp,%rax + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r11 + movq 8(%rbx),%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,8(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,16(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,24(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq 
%r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,32(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,40(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq %rax,%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rcx,48(%rdi) + movq %r8,56(%rdi) + movq %r9,64(%rdi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + + .byte 0xf3,0xc3 + +.globl sqr_384 + +.def sqr_384; .scl 2; .type 32; .endef +.p2align 5 +sqr_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_384: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sqr_384: + + + call __sqrq_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqr_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_384: + +.def __sqrq_384; .scl 3; .type 32; .endef +.p2align 5 +__sqrq_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r15 + movq 16(%rsi),%rcx + movq 24(%rsi),%rbx + + + movq %rax,%r14 + mulq %r15 + movq %rax,%r9 + movq %r14,%rax + movq 32(%rsi),%rbp + movq %rdx,%r10 + + mulq %rcx + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + movq 40(%rsi),%rsi + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r11 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq %rax + xorq %r8,%r8 + movq %rax,0(%rdi) + movq %r15,%rax + addq %r9,%r9 + adcq $0,%r8 + addq %rdx,%r9 + adcq $0,%r8 + movq %r9,8(%rdi) + + mulq %rcx + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbp + addq %rax,%r13 
+ movq %r15,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rax + xorq %r9,%r9 + addq %rax,%r8 + movq %rcx,%rax + addq %r10,%r10 + adcq %r11,%r11 + adcq $0,%r9 + addq %r8,%r10 + adcq %rdx,%r11 + adcq $0,%r9 + movq %r10,16(%rdi) + + mulq %rbx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%r8 + + mulq %rbp + addq %rax,%r14 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq %rsi + addq %rax,%r15 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + xorq %r11,%r11 + addq %rax,%r9 + movq %rbx,%rax + addq %r12,%r12 + adcq %r13,%r13 + adcq $0,%r11 + addq %r9,%r12 + adcq %rdx,%r13 + adcq $0,%r11 + movq %r12,32(%rdi) + + + mulq %rbp + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %r13,40(%rdi) + movq %rdx,%r8 + + mulq %rsi + addq %rax,%rcx + movq %rbx,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%rbx + + mulq %rax + xorq %r12,%r12 + addq %rax,%r11 + movq %rbp,%rax + addq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r12 + addq %r11,%r14 + adcq %rdx,%r15 + movq %r14,48(%rdi) + adcq $0,%r12 + movq %r15,56(%rdi) + + + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + xorq %r13,%r13 + addq %rax,%r12 + movq %rsi,%rax + addq %rcx,%rcx + adcq %rbx,%rbx + adcq $0,%r13 + addq %r12,%rcx + adcq %rdx,%rbx + movq %rcx,64(%rdi) + adcq $0,%r13 + movq %rbx,72(%rdi) + + + mulq %rax + addq %r13,%rax + addq %rbp,%rbp + adcq $0,%rdx + addq %rbp,%rax + adcq $0,%rdx + movq %rax,80(%rdi) + movq %rdx,88(%rdi) + + .byte 0xf3,0xc3 + + +.globl sqr_mont_384 + +.def sqr_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $120,%rsp + +.LSEH_body_sqr_mont_384: + + + movq %rcx,96(%rsp) + movq %rdx,104(%rsp) + movq %rdi,112(%rsp) + + movq %rsp,%rdi + call __sqrq_384 + + leaq 0(%rsp),%rsi + movq 96(%rsp),%rcx + movq 104(%rsp),%rbx + movq 112(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 120(%rsp),%r8 + movq 120(%rsp),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_384: + + + +.globl redc_mont_384 + +.def redc_mont_384; .scl 2; .type 32; .endef +.p2align 5 +redc_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redc_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redc_mont_384: + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redc_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redc_mont_384: + + + + +.globl from_mont_384 + +.def from_mont_384; .scl 2; .type 32; .endef 
+.p2align 5 +from_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_from_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_from_mont_384: + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + + + + + + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_from_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_from_mont_384: +.def __mulq_by_1_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulq_by_1_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r8 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r9 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r10 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %r9,%r15 + imulq %rcx,%r9 + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 32(%rbx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 40(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r9,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %r10,%r8 + imulq %rcx,%r10 + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r8 + movq %r10,%rax + adcq %rdx,%r8 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %r11,%r9 + imulq %rcx,%r11 + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r11,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r12 + movq 
%r11,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %r12,%r10 + imulq %rcx,%r12 + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %r13,%r11 + imulq %rcx,%r13 + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 + + +.def __redc_tail_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__redc_tail_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl sgn0_pty_mont_384 + +.def sgn0_pty_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0_pty_mont_384: + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 
16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0_pty_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mont_384: + +.globl sgn0_pty_mont_384x + +.def sgn0_pty_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0_pty_mont_384x: + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0_pty_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mont_384x: +.globl mul_mont_384 + +.def mul_mont_384; .scl 2; .type 32; .endef +.p2align 5 +mul_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $24,%rsp + +.LSEH_body_mul_mont_384: + + + movq 0(%rdx),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq %rdx,%rbx + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + + call __mulq_mont_384 + + movq 24(%rsp),%r15 + + movq 32(%rsp),%r14 + + movq 40(%rsp),%r13 + + movq 48(%rsp),%r12 + + movq 56(%rsp),%rbx + + movq 64(%rsp),%rbp + + leaq 72(%rsp),%rsp + +.LSEH_epilogue_mul_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_mont_384: +.def __mulq_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulq_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rdi + mulq %r14 + movq %rax,%r8 + movq %rdi,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%rbp + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + 
movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + xorq %r15,%r15 + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r8,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r9,%rbp + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r14 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r14 + movq %r9,%rax + adcq %rdx,%r15 + adcq $0,%r8 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r9,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + adcq $0,%r8 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r10,%rbp + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r15 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r15 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r10,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r11 + movq 
%rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r11,%rbp + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r11,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq %rdx,%r9 + adcq $0,%r10 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r12,%rbp + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r9 + adcq $0,%rdx + xorq %r11,%r11 + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r10 + adcq $0,%r11 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r12,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + movq %r13,%rbp + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r8 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rsi) + addq %r12,%r10 + adcq $0,%rdx + xorq %r12,%r12 + addq %rax,%r10 + movq %r13,%rax + adcq %rdx,%r11 + adcq $0,%r12 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r13,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + 
adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq %rdx,%r11 + adcq $0,%r12 + + + + + movq 16(%rsp),%rdi + subq 0(%rcx),%r14 + movq %r15,%rdx + sbbq 8(%rcx),%r15 + movq %r8,%rbx + sbbq 16(%rcx),%r8 + movq %r9,%rsi + sbbq 24(%rcx),%r9 + movq %r10,%rbp + sbbq 32(%rcx),%r10 + movq %r11,%r13 + sbbq 40(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rdx,%r15 + cmovcq %rbx,%r8 + movq %r14,0(%rdi) + cmovcq %rsi,%r9 + movq %r15,8(%rdi) + cmovcq %rbp,%r10 + movq %r8,16(%rdi) + cmovcq %r13,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 + +.globl sqr_n_mul_mont_384 + +.def sqr_n_mul_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqr_n_mul_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_n_mul_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_n_mul_mont_384: + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_384: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1,%edx + leaq 0(%rdi),%rsi + decl %edx + jnz .Loop_sqr_384 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_n_mul_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_n_mul_mont_384: + +.globl sqr_n_mul_mont_383 + +.def sqr_n_mul_mont_383; .scl 2; .type 32; .endef +.p2align 5 +sqr_n_mul_mont_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_n_mul_mont_383: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_n_mul_mont_383: + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_383: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + + movd %xmm1,%edx + addq 48(%rsi),%r14 + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + leaq 0(%rdi),%rsi + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + decl %edx + jnz .Loop_sqr_383 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 
136(%rsp),%r8 + movq 136(%rsp),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_n_mul_mont_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_n_mul_mont_383: +.def __mulq_mont_383_nonred; .scl 3; .type 32; .endef +.p2align 5 +__mulq_mont_383_nonred: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rbp + mulq %r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%r15 + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%r15 + movq %r8,%rax + adcq %rdx,%r15 + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r9 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rcx) + addq %r15,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %r15,%r13 + adcq %rdx,%r14 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + movq %r9,%r8 + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rsi) + addq %r15,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rcx) + addq %rax,%r8 + movq %r9,%rax + adcq %rdx,%r8 + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rcx) + addq %r8,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r8,%r14 + adcq %rdx,%r15 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r10,%r9 + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r15 + adcq $0,%rdx + addq 
%rax,%r15 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rcx) + addq %rax,%r9 + movq %r10,%rax + adcq %rdx,%r9 + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rcx) + addq %r9,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r9,%r15 + adcq %rdx,%r8 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r11,%r10 + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rcx) + addq %rax,%r10 + movq %r11,%rax + adcq %rdx,%r10 + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rcx) + addq %r10,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r10,%r8 + adcq %rdx,%r9 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r12,%r11 + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rcx) + addq %rax,%r11 + movq %r12,%rax + adcq %rdx,%r11 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rcx) + addq %r11,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r11,%r9 + adcq %rdx,%r10 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r14 + 
adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r13,%r12 + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 0(%rcx) + addq %rax,%r12 + movq %r13,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 24(%rcx) + addq %r12,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r12,%r10 + adcq %rdx,%r11 + .byte 0xf3,0xc3 + +.globl sqr_mont_382x + +.def sqr_mont_382x; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_mont_382x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq 24(%rsp),%rdi + call __mulq_mont_383_nonred + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_383_nonred + movq 32+96(%rsp),%rsi + movq 32+0(%rsp),%r12 + movq 32+8(%rsp),%r13 + andq %rsi,%r12 + movq 32+16(%rsp),%rax + andq %rsi,%r13 + movq 32+24(%rsp),%rbx + andq %rsi,%rax + movq 32+32(%rsp),%rbp + andq %rsi,%rbx + andq %rsi,%rbp + andq 32+40(%rsp),%rsi + + subq %r12,%r14 + movq 0(%rcx),%r12 + sbbq %r13,%r15 + movq 8(%rcx),%r13 + sbbq %rax,%r8 + movq 16(%rcx),%rax + sbbq %rbx,%r9 + movq 24(%rcx),%rbx + sbbq %rbp,%r10 + movq 32(%rcx),%rbp + sbbq %rsi,%r11 + 
sbbq %rsi,%rsi + + andq %rsi,%r12 + andq %rsi,%r13 + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r12,%r14 + adcq %r13,%r15 + adcq %rax,%r8 + adcq %rbx,%r9 + adcq %rbp,%r10 + adcq %rsi,%r11 + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_mont_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_382x: +.section .pdata +.p2align 2 +.rva .LSEH_begin_mul_mont_384x +.rva .LSEH_body_mul_mont_384x +.rva .LSEH_info_mul_mont_384x_prologue + +.rva .LSEH_body_mul_mont_384x +.rva .LSEH_epilogue_mul_mont_384x +.rva .LSEH_info_mul_mont_384x_body + +.rva .LSEH_epilogue_mul_mont_384x +.rva .LSEH_end_mul_mont_384x +.rva .LSEH_info_mul_mont_384x_epilogue + +.rva .LSEH_begin_sqr_mont_384x +.rva .LSEH_body_sqr_mont_384x +.rva .LSEH_info_sqr_mont_384x_prologue + +.rva .LSEH_body_sqr_mont_384x +.rva .LSEH_epilogue_sqr_mont_384x +.rva .LSEH_info_sqr_mont_384x_body + +.rva .LSEH_epilogue_sqr_mont_384x +.rva .LSEH_end_sqr_mont_384x +.rva .LSEH_info_sqr_mont_384x_epilogue + +.rva .LSEH_begin_mul_382x +.rva .LSEH_body_mul_382x +.rva .LSEH_info_mul_382x_prologue + +.rva .LSEH_body_mul_382x +.rva .LSEH_epilogue_mul_382x +.rva .LSEH_info_mul_382x_body + +.rva .LSEH_epilogue_mul_382x +.rva .LSEH_end_mul_382x +.rva .LSEH_info_mul_382x_epilogue + +.rva .LSEH_begin_sqr_382x +.rva .LSEH_body_sqr_382x +.rva .LSEH_info_sqr_382x_prologue + +.rva .LSEH_body_sqr_382x +.rva .LSEH_epilogue_sqr_382x +.rva .LSEH_info_sqr_382x_body + +.rva .LSEH_epilogue_sqr_382x +.rva .LSEH_end_sqr_382x +.rva .LSEH_info_sqr_382x_epilogue + +.rva .LSEH_begin_mul_384 +.rva .LSEH_body_mul_384 +.rva .LSEH_info_mul_384_prologue + +.rva .LSEH_body_mul_384 +.rva .LSEH_epilogue_mul_384 +.rva .LSEH_info_mul_384_body + +.rva .LSEH_epilogue_mul_384 +.rva .LSEH_end_mul_384 +.rva .LSEH_info_mul_384_epilogue + +.rva .LSEH_begin_sqr_384 +.rva .LSEH_body_sqr_384 +.rva .LSEH_info_sqr_384_prologue + +.rva .LSEH_body_sqr_384 +.rva .LSEH_epilogue_sqr_384 +.rva .LSEH_info_sqr_384_body + +.rva .LSEH_epilogue_sqr_384 +.rva .LSEH_end_sqr_384 +.rva .LSEH_info_sqr_384_epilogue + +.rva .LSEH_begin_sqr_mont_384 +.rva .LSEH_body_sqr_mont_384 +.rva .LSEH_info_sqr_mont_384_prologue + +.rva .LSEH_body_sqr_mont_384 +.rva .LSEH_epilogue_sqr_mont_384 +.rva .LSEH_info_sqr_mont_384_body + +.rva .LSEH_epilogue_sqr_mont_384 +.rva .LSEH_end_sqr_mont_384 +.rva .LSEH_info_sqr_mont_384_epilogue + +.rva .LSEH_begin_redc_mont_384 +.rva .LSEH_body_redc_mont_384 +.rva .LSEH_info_redc_mont_384_prologue + +.rva .LSEH_body_redc_mont_384 +.rva .LSEH_epilogue_redc_mont_384 +.rva .LSEH_info_redc_mont_384_body + +.rva .LSEH_epilogue_redc_mont_384 +.rva .LSEH_end_redc_mont_384 +.rva .LSEH_info_redc_mont_384_epilogue + +.rva .LSEH_begin_from_mont_384 +.rva .LSEH_body_from_mont_384 +.rva .LSEH_info_from_mont_384_prologue + +.rva .LSEH_body_from_mont_384 +.rva .LSEH_epilogue_from_mont_384 +.rva .LSEH_info_from_mont_384_body + +.rva .LSEH_epilogue_from_mont_384 +.rva .LSEH_end_from_mont_384 +.rva .LSEH_info_from_mont_384_epilogue + +.rva .LSEH_begin_sgn0_pty_mont_384 +.rva .LSEH_body_sgn0_pty_mont_384 +.rva .LSEH_info_sgn0_pty_mont_384_prologue + +.rva .LSEH_body_sgn0_pty_mont_384 +.rva .LSEH_epilogue_sgn0_pty_mont_384 +.rva .LSEH_info_sgn0_pty_mont_384_body + +.rva 
.LSEH_epilogue_sgn0_pty_mont_384 +.rva .LSEH_end_sgn0_pty_mont_384 +.rva .LSEH_info_sgn0_pty_mont_384_epilogue + +.rva .LSEH_begin_sgn0_pty_mont_384x +.rva .LSEH_body_sgn0_pty_mont_384x +.rva .LSEH_info_sgn0_pty_mont_384x_prologue + +.rva .LSEH_body_sgn0_pty_mont_384x +.rva .LSEH_epilogue_sgn0_pty_mont_384x +.rva .LSEH_info_sgn0_pty_mont_384x_body + +.rva .LSEH_epilogue_sgn0_pty_mont_384x +.rva .LSEH_end_sgn0_pty_mont_384x +.rva .LSEH_info_sgn0_pty_mont_384x_epilogue + +.rva .LSEH_begin_mul_mont_384 +.rva .LSEH_body_mul_mont_384 +.rva .LSEH_info_mul_mont_384_prologue + +.rva .LSEH_body_mul_mont_384 +.rva .LSEH_epilogue_mul_mont_384 +.rva .LSEH_info_mul_mont_384_body + +.rva .LSEH_epilogue_mul_mont_384 +.rva .LSEH_end_mul_mont_384 +.rva .LSEH_info_mul_mont_384_epilogue + +.rva .LSEH_begin_sqr_n_mul_mont_384 +.rva .LSEH_body_sqr_n_mul_mont_384 +.rva .LSEH_info_sqr_n_mul_mont_384_prologue + +.rva .LSEH_body_sqr_n_mul_mont_384 +.rva .LSEH_epilogue_sqr_n_mul_mont_384 +.rva .LSEH_info_sqr_n_mul_mont_384_body + +.rva .LSEH_epilogue_sqr_n_mul_mont_384 +.rva .LSEH_end_sqr_n_mul_mont_384 +.rva .LSEH_info_sqr_n_mul_mont_384_epilogue + +.rva .LSEH_begin_sqr_n_mul_mont_383 +.rva .LSEH_body_sqr_n_mul_mont_383 +.rva .LSEH_info_sqr_n_mul_mont_383_prologue + +.rva .LSEH_body_sqr_n_mul_mont_383 +.rva .LSEH_epilogue_sqr_n_mul_mont_383 +.rva .LSEH_info_sqr_n_mul_mont_383_body + +.rva .LSEH_epilogue_sqr_n_mul_mont_383 +.rva .LSEH_end_sqr_n_mul_mont_383 +.rva .LSEH_info_sqr_n_mul_mont_383_epilogue + +.rva .LSEH_begin_sqr_mont_382x +.rva .LSEH_body_sqr_mont_382x +.rva .LSEH_info_sqr_mont_382x_prologue + +.rva .LSEH_body_sqr_mont_382x +.rva .LSEH_epilogue_sqr_mont_382x +.rva .LSEH_info_sqr_mont_382x_body + +.rva .LSEH_epilogue_sqr_mont_382x +.rva .LSEH_end_sqr_mont_382x +.rva .LSEH_info_sqr_mont_382x_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mul_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x29,0x00 +.byte 0x00,0xe4,0x2a,0x00 +.byte 0x00,0xd4,0x2b,0x00 +.byte 0x00,0xc4,0x2c,0x00 +.byte 0x00,0x34,0x2d,0x00 +.byte 0x00,0x54,0x2e,0x00 +.byte 0x00,0x74,0x30,0x00 +.byte 0x00,0x64,0x31,0x00 +.byte 0x00,0x01,0x2f,0x00 +.LSEH_info_mul_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqr_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_mul_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 
+.byte 0,0 +.LSEH_info_sqr_382x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqr_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_384_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_mul_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqr_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_mont_384_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x0f,0x00 +.byte 0x00,0xe4,0x10,0x00 +.byte 0x00,0xd4,0x11,0x00 +.byte 0x00,0xc4,0x12,0x00 +.byte 0x00,0x34,0x13,0x00 +.byte 0x00,0x54,0x14,0x00 +.byte 0x00,0x74,0x16,0x00 +.byte 0x00,0x64,0x17,0x00 +.byte 0x00,0x01,0x15,0x00 +.LSEH_info_sqr_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redc_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_redc_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_redc_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_from_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_from_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_from_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0_pty_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sgn0_pty_mont_384_epilogue: +.byte 1,0,4,0 +.byte 
0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0_pty_mont_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sgn0_pty_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_mul_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_n_mul_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_n_mul_mont_384_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqr_n_mul_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_n_mul_mont_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_n_mul_mont_383_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqr_n_mul_mont_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_mont_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqr_mont_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/mulx_mont_256-x86_64.s b/temp-deps/blst-ts/deps/blst/build/coff/mulx_mont_256-x86_64.s new file mode 100644 index 0000000..75c7e82 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/mulx_mont_256-x86_64.s @@ -0,0 +1,784 @@ +.text + +.globl mulx_mont_sparse_256 + +.def mulx_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +mulx_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_mont_sparse_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq 
$8,%rsp + +.LSEH_body_mulx_mont_sparse_256: + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mulx_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_mont_sparse_256: + +.globl sqrx_mont_sparse_256 + +.def sqrx_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_sparse_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sqrx_mont_sparse_256: + + + movq %rsi,%rbx + movq %rcx,%r8 + movq %rdx,%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqrx_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_sparse_256: +.def __mulx_mont_sparse_256; .scl 3; .type 32; .endef +.p2align 5 +__mulx_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + + mulxq %r15,%r15,%r12 + mulxq %rbp,%rbp,%r13 + addq %r15,%r11 + mulxq %r9,%r9,%r14 + movq 8(%rbx),%rdx + adcq %rbp,%r12 + adcq %r9,%r13 + adcq $0,%r14 + + movq %rax,%r10 + imulq %r8,%rax + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r11 + adcxq %r9,%r12 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r14 + adcxq %r15,%r9 + adoxq %r9,%r15 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r10 + adoxq %r11,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r12 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r12 + adoxq %r9,%r13 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 16(%rbx),%rdx + adcxq %rbp,%r13 + adoxq %r9,%r14 + adcxq %r10,%r14 + adoxq %r10,%r15 + adcxq %r10,%r15 + adoxq %r10,%r10 + adcq $0,%r10 + movq %rax,%r11 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r15 + adcxq %r10,%r9 + adoxq %r9,%r10 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r11 + adoxq %r12,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r13 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r13 + adoxq %r9,%r14 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 24(%rbx),%rdx + adcxq %rbp,%r14 + adoxq %r9,%r15 + adcxq %r11,%r15 + adoxq %r11,%r10 + adcxq %r11,%r10 + adoxq %r11,%r11 + adcq $0,%r11 + movq %rax,%r12 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r15 + adcxq %r9,%r10 + + mulxq 
24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r10 + adcxq %r11,%r9 + adoxq %r9,%r11 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r12 + adoxq %r13,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r14 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %rax,%rdx + adcxq %rbp,%r15 + adoxq %r9,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + adoxq %r12,%r12 + adcq $0,%r12 + imulq %r8,%rdx + + + xorq %rbp,%rbp + mulxq 0+128(%rcx),%r13,%r9 + adcxq %rax,%r13 + adoxq %r9,%r14 + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r15 + adoxq %r9,%r10 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %r14,%rdx + leaq 128(%rcx),%rcx + adcxq %rbp,%r10 + adoxq %r9,%r11 + movq %r15,%rax + adcxq %r13,%r11 + adoxq %r13,%r12 + adcq $0,%r12 + + + + + movq %r10,%rbp + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + sbbq 16(%rcx),%r10 + movq %r11,%r9 + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rdx,%r14 + cmovcq %rax,%r15 + cmovcq %rbp,%r10 + movq %r14,0(%rdi) + cmovcq %r9,%r11 + movq %r15,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + .byte 0xf3,0xc3 + +.globl fromx_mont_256 + +.def fromx_mont_256; .scl 2; .type 32; .endef +.p2align 5 +fromx_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_fromx_mont_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_fromx_mont_256: + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + + + + + movq %r15,%rdx + movq %r10,%r12 + movq %r11,%r13 + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_fromx_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_fromx_mont_256: + +.globl redcx_mont_256 + +.def redcx_mont_256; .scl 2; .type 32; .endef +.p2align 5 +redcx_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redcx_mont_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redcx_mont_256: + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + addq 32(%rsi),%r14 + adcq 40(%rsi),%r15 + movq %r14,%rax + adcq 48(%rsi),%r10 + movq %r15,%rdx + adcq 56(%rsi),%r11 + sbbq %rsi,%rsi + + + + + movq %r10,%r12 + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + movq %r11,%r13 + sbbq 24(%rbx),%r11 + sbbq $0,%rsi + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redcx_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redcx_mont_256: +.def __mulx_by_1_mont_256; .scl 3; .type 32; .endef +.p2align 5 +__mulx_by_1_mont_256: + .byte 
0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r10 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r10 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + movq %r13,%r11 + imulq %rcx,%r13 + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_mulx_mont_sparse_256 +.rva .LSEH_body_mulx_mont_sparse_256 +.rva .LSEH_info_mulx_mont_sparse_256_prologue + +.rva .LSEH_body_mulx_mont_sparse_256 +.rva .LSEH_epilogue_mulx_mont_sparse_256 +.rva .LSEH_info_mulx_mont_sparse_256_body + +.rva .LSEH_epilogue_mulx_mont_sparse_256 +.rva .LSEH_end_mulx_mont_sparse_256 +.rva .LSEH_info_mulx_mont_sparse_256_epilogue + +.rva .LSEH_begin_sqrx_mont_sparse_256 +.rva .LSEH_body_sqrx_mont_sparse_256 +.rva .LSEH_info_sqrx_mont_sparse_256_prologue + +.rva .LSEH_body_sqrx_mont_sparse_256 +.rva .LSEH_epilogue_sqrx_mont_sparse_256 +.rva .LSEH_info_sqrx_mont_sparse_256_body + +.rva .LSEH_epilogue_sqrx_mont_sparse_256 +.rva .LSEH_end_sqrx_mont_sparse_256 +.rva .LSEH_info_sqrx_mont_sparse_256_epilogue + +.rva .LSEH_begin_fromx_mont_256 +.rva .LSEH_body_fromx_mont_256 +.rva .LSEH_info_fromx_mont_256_prologue + +.rva .LSEH_body_fromx_mont_256 +.rva .LSEH_epilogue_fromx_mont_256 +.rva .LSEH_info_fromx_mont_256_body + +.rva .LSEH_epilogue_fromx_mont_256 +.rva .LSEH_end_fromx_mont_256 +.rva .LSEH_info_fromx_mont_256_epilogue + +.rva .LSEH_begin_redcx_mont_256 +.rva .LSEH_body_redcx_mont_256 +.rva .LSEH_info_redcx_mont_256_prologue + +.rva .LSEH_body_redcx_mont_256 +.rva .LSEH_epilogue_redcx_mont_256 +.rva .LSEH_info_redcx_mont_256_body + +.rva .LSEH_epilogue_redcx_mont_256 +.rva .LSEH_end_redcx_mont_256 +.rva .LSEH_info_redcx_mont_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mulx_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 
+.LSEH_info_mulx_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mulx_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqrx_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_fromx_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_fromx_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_fromx_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redcx_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_redcx_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_redcx_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/mulx_mont_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/coff/mulx_mont_384-x86_64.s new file mode 100644 index 0000000..12306a7 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/mulx_mont_384-x86_64.s @@ -0,0 +1,3559 @@ +.text + + + + + + + +.def __sub_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq 
%rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.def __add_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.def __sub_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.globl mulx_mont_384x + +.def mulx_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +mulx_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $328,%rsp + +.LSEH_body_mulx_mont_384x: + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulx_384 + + + leaq 48(%rbx),%rbx + leaq 128+48(%rsi),%rsi + leaq 96(%rdi),%rdi + call __mulx_384 + + + movq 8(%rsp),%rcx + leaq (%rbx),%rsi + leaq -48(%rbx),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulx_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + leaq (%rcx),%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mulx_mont_384x: + mov 8(%rsp),%rdi + mov 
16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_mont_384x: +.globl sqrx_mont_384x + +.def sqrx_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqrx_mont_384x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + movq %rdx,%r8 + adcq %r12,%r12 + movq %r15,%r9 + adcq %rdi,%rdi + movq %rax,%r10 + adcq %rbp,%rbp + movq %r12,%r11 + sbbq %rsi,%rsi + + subq 0(%rcx),%rdx + sbbq 8(%rcx),%r15 + movq %rdi,%r13 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r12 + sbbq 32(%rcx),%rdi + movq %rbp,%r14 + sbbq 40(%rcx),%rbp + sbbq $0,%rsi + + cmovcq %r8,%rdx + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %rdx,48(%rbx) + cmovcq %r11,%r12 + movq %r15,56(%rbx) + cmovcq %r13,%rdi + movq %rax,64(%rbx) + cmovcq %r14,%rbp + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqrx_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_384x: + +.globl mulx_382x + +.def mulx_382x; .scl 2; .type 32; .endef +.p2align 5 +mulx_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_mulx_382x: + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 
32+48(%rsp),%rbx + call __mulx_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulx_384 + + + leaq 48+128(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulx_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mulx_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_382x: +.globl sqrx_382x + +.def sqrx_382x; .scl 2; .type 32; .endef +.p2align 5 +sqrx_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_sqrx_382x: + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulx_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulx_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqrx_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_382x: +.globl mulx_384 + +.def mulx_384; .scl 2; .type 32; .endef +.p2align 5 +mulx_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +.LSEH_body_mulx_384: + + + movq %rdx,%rbx + call __mulx_384 + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +.LSEH_epilogue_mulx_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_384: + +.def __mulx_384; 
.scl 3; .type 32; .endef +.p2align 5 +__mulx_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq -128(%rsi),%rsi + + mulxq %r14,%r9,%rcx + xorq %rbp,%rbp + + mulxq %r15,%r8,%rax + adcxq %rcx,%r8 + movq %r9,0(%rdi) + + mulxq %r10,%r9,%rcx + adcxq %rax,%r9 + + mulxq %r11,%r10,%rax + adcxq %rcx,%r10 + + mulxq %r12,%r11,%rcx + adcxq %rax,%r11 + + mulxq %r13,%r12,%r13 + movq 8(%rbx),%rdx + adcxq %rcx,%r12 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,8(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 16(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,16(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 24(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,24(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 32(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,32(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 40(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,40(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq %rax,%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + movq %r10,64(%rdi) + movq %r11,72(%rdi) + movq %r12,80(%rdi) + movq %r13,88(%rdi) + + .byte 0xf3,0xc3 + +.globl sqrx_384 + +.def sqrx_384; .scl 2; .type 32; .endef +.p2align 5 +sqrx_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_384: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_sqrx_384: + + + call __sqrx_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqrx_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + 
.byte 0xf3,0xc3 + +.LSEH_end_sqrx_384: +.def __sqrx_384; .scl 3; .type 32; .endef +.p2align 5 +__sqrx_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%rcx + movq 32(%rsi),%rbx + + + mulxq %r14,%r8,%rdi + movq 40(%rsi),%rbp + mulxq %r15,%r9,%rax + addq %rdi,%r9 + mulxq %rcx,%r10,%rdi + adcq %rax,%r10 + mulxq %rbx,%r11,%rax + adcq %rdi,%r11 + mulxq %rbp,%r12,%r13 + movq %r14,%rdx + adcq %rax,%r12 + adcq $0,%r13 + + + xorq %r14,%r14 + mulxq %r15,%rdi,%rax + adcxq %rdi,%r10 + adoxq %rax,%r11 + + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r11 + adoxq %rax,%r12 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbp,%rdi,%rax + movq %r15,%rdx + adcxq %rdi,%r13 + adoxq %r14,%rax + adcxq %rax,%r14 + + + xorq %r15,%r15 + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r13 + adoxq %rax,%r14 + + mulxq %rbp,%rdi,%rax + movq %rcx,%rdx + adcxq %rdi,%r14 + adoxq %r15,%rax + adcxq %rax,%r15 + + + xorq %rcx,%rcx + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r14 + adoxq %rax,%r15 + + mulxq %rbp,%rdi,%rax + movq %rbx,%rdx + adcxq %rdi,%r15 + adoxq %rcx,%rax + adcxq %rax,%rcx + + + mulxq %rbp,%rdi,%rbx + movq 0(%rsi),%rdx + addq %rdi,%rcx + movq 8(%rsp),%rdi + adcq $0,%rbx + + + xorq %rbp,%rbp + adcxq %r8,%r8 + adcxq %r9,%r9 + adcxq %r10,%r10 + adcxq %r11,%r11 + adcxq %r12,%r12 + + + mulxq %rdx,%rdx,%rax + movq %rdx,0(%rdi) + movq 8(%rsi),%rdx + adoxq %rax,%r8 + movq %r8,8(%rdi) + + mulxq %rdx,%r8,%rax + movq 16(%rsi),%rdx + adoxq %r8,%r9 + adoxq %rax,%r10 + movq %r9,16(%rdi) + movq %r10,24(%rdi) + + mulxq %rdx,%r8,%r9 + movq 24(%rsi),%rdx + adoxq %r8,%r11 + adoxq %r9,%r12 + adcxq %r13,%r13 + adcxq %r14,%r14 + movq %r11,32(%rdi) + movq %r12,40(%rdi) + + mulxq %rdx,%r8,%r9 + movq 32(%rsi),%rdx + adoxq %r8,%r13 + adoxq %r9,%r14 + adcxq %r15,%r15 + adcxq %rcx,%rcx + movq %r13,48(%rdi) + movq %r14,56(%rdi) + + mulxq %rdx,%r8,%r9 + movq 40(%rsi),%rdx + adoxq %r8,%r15 + adoxq %r9,%rcx + adcxq %rbx,%rbx + adcxq %rbp,%rbp + movq %r15,64(%rdi) + movq %rcx,72(%rdi) + + mulxq %rdx,%r8,%r9 + adoxq %r8,%rbx + adoxq %r9,%rbp + + movq %rbx,80(%rdi) + movq %rbp,88(%rdi) + + .byte 0xf3,0xc3 + + + + +.globl redcx_mont_384 + +.def redcx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +redcx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redcx_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redcx_mont_384: + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redcx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redcx_mont_384: + + + + +.globl fromx_mont_384 + +.def fromx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +fromx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_fromx_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_fromx_mont_384: + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + + + + + movq %r14,%rax + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 
0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_fromx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_fromx_mont_384: +.def __mulx_by_1_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulx_by_1_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq %rcx,%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + imulq %r8,%rdx + + + xorq %r14,%r14 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r13 + adoxq %r14,%rbp + adcxq %rbp,%r14 + imulq %r9,%rdx + + + xorq %r15,%r15 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r14 + adoxq %r15,%rbp + adcxq %rbp,%r15 + imulq %r10,%rdx + + + xorq %r8,%r8 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r15 + adoxq %r8,%rbp + adcxq %rbp,%r8 + imulq %r11,%rdx + + + xorq %r9,%r9 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r8 + adoxq %r9,%rbp + adcxq %rbp,%r9 + imulq %r12,%rdx + + + xorq %r10,%r10 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r9 + adoxq %r10,%rbp + adcxq %rbp,%r10 + imulq %r13,%rdx + + + xorq %r11,%r11 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq 
%rbp,%r10 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r10 + adoxq %r11,%rbp + adcxq %rbp,%r11 + .byte 0xf3,0xc3 + + +.def __redc_tail_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__redc_tail_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl sgn0x_pty_mont_384 + +.def sgn0x_pty_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sgn0x_pty_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0x_pty_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0x_pty_mont_384: + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0x_pty_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0x_pty_mont_384: + +.globl sgn0x_pty_mont_384x + +.def sgn0x_pty_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sgn0x_pty_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0x_pty_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0x_pty_mont_384x: + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq 
%r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0x_pty_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0x_pty_mont_384x: +.globl mulx_mont_384 + +.def mulx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +mulx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -24(%rsp),%rsp + +.LSEH_body_mulx_mont_384: + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + movq %r8,(%rsp) + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 + + movq 32(%rsp),%r14 + + movq 40(%rsp),%r13 + + movq 48(%rsp),%r12 + + movq 56(%rsp),%rbx + + movq 64(%rsp),%rbp + + leaq 72(%rsp),%rsp + +.LSEH_epilogue_mulx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_mont_384: +.def __mulx_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + xorq %r15,%r15 + + movq %r8,16(%rsp) + imulq 8(%rsp),%r8 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %rbp,%r15 + adoxq %rax,%r15 + adoxq %rax,%rax + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %r8,%r14 + adoxq %r8,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r9,16(%rsp) + imulq 8(%rsp),%r9 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rbp,%rax + adoxq %r8,%rax + adoxq %r8,%r8 + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq 
%rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r9,%r15 + adoxq %r9,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r10,16(%rsp) + imulq 8(%rsp),%r10 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %rbp,%r8 + adoxq %r9,%r8 + adoxq %r9,%r9 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r10,%rax + adoxq %r10,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r11,16(%rsp) + imulq 8(%rsp),%r11 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %r10,%r9 + adoxq %r10,%r10 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r11,%r8 + adoxq %r11,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + movq %r12,16(%rsp) + imulq 8(%rsp),%r12 + + + xorq %r11,%r11 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %r11,%r10 + adoxq %r11,%r11 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r12,%r9 + adoxq %r12,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + 
xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + movq %r15,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + movq %rax,%rsi + + mulxq 40+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + movq %r14,%rdx + adcxq %r12,%r10 + adoxq %r12,%r11 + leaq 128(%rcx),%rcx + movq %r8,%r12 + adcq $0,%r11 + + + + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r9,%rdi + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r8 + sbbq 32(%rcx),%r9 + movq %r10,%rbp + sbbq 40(%rcx),%r10 + sbbq $0,%r11 + + cmovncq %r14,%rdx + cmovcq %r13,%r15 + cmovcq %rsi,%rax + cmovncq %r8,%r12 + movq %rdx,0(%rbx) + cmovncq %r9,%rdi + movq %r15,8(%rbx) + cmovncq %r10,%rbp + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + + .byte 0xf3,0xc3 + + +.globl sqrx_mont_384 + +.def sqrx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -24(%rsp),%rsp + +.LSEH_body_sqrx_mont_384: + + + movq %rcx,%r8 + leaq -128(%rdx),%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + leaq (%rsi),%rbx + movq %r8,(%rsp) + leaq -128(%rsi),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 + + movq 32(%rsp),%r14 + + movq 40(%rsp),%r13 + + movq 48(%rsp),%r12 + + movq 56(%rsp),%rbx + + movq 64(%rsp),%rbp + + leaq 72(%rsp),%rsp + +.LSEH_epilogue_sqrx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_384: + +.globl sqrx_n_mul_mont_384 + +.def sqrx_n_mul_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqrx_n_mul_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_n_mul_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -40(%rsp),%rsp + +.LSEH_body_sqrx_n_mul_mont_384: + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + +.Loop_sqrx_384: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_384 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 + + movq 48(%rsp),%r14 + + movq 56(%rsp),%r13 + + movq 64(%rsp),%r12 + + movq 72(%rsp),%rbx + + movq 80(%rsp),%rbp + + leaq 88(%rsp),%rsp + +.LSEH_epilogue_sqrx_n_mul_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_n_mul_mont_384: + +.globl sqrx_n_mul_mont_383 + +.def sqrx_n_mul_mont_383; .scl 2; .type 32; .endef +.p2align 5 +sqrx_n_mul_mont_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq 
%rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_n_mul_mont_383: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -40(%rsp),%rsp + +.LSEH_body_sqrx_n_mul_mont_383: + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + leaq -128(%rcx),%rcx + +.Loop_sqrx_383: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_383_nonred + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_383 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 + + movq 48(%rsp),%r14 + + movq 56(%rsp),%r13 + + movq 64(%rsp),%r12 + + movq 72(%rsp),%rbx + + movq 80(%rsp),%rbp + + leaq 88(%rsp),%rsp + +.LSEH_epilogue_sqrx_n_mul_mont_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_n_mul_mont_383: +.def __mulx_mont_383_nonred; .scl 3; .type 32; .endef +.p2align 5 +__mulx_mont_383_nonred: + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + movq %r8,%rax + imulq 8(%rsp),%r8 + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %r15,%rbp + adoxq %rbp,%r15 + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %rax,%r14 + adoxq %rax,%r15 + adcxq %rax,%r15 + movq %r9,%r8 + imulq 8(%rsp),%r9 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rax,%rbp + adoxq %rbp,%rax + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq 
%r10,%r9 + imulq 8(%rsp),%r10 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %r8,%rbp + adoxq %rbp,%r8 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r11,%r10 + imulq 8(%rsp),%r11 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %r9,%rbp + adoxq %rbp,%r9 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r12,%r11 + imulq 8(%rsp),%r12 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %r10,%rbp + adoxq %rbp,%r10 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r14,%rdx + adcxq %rdi,%r9 + adoxq %rbp,%r10 + adcq $0,%r10 + movq %r8,%r12 + + movq %r14,0(%rbx) + movq %r15,8(%rbx) + 
movq %rax,16(%rbx) + movq %r9,%rdi + movq %r8,24(%rbx) + movq %r9,32(%rbx) + movq %r10,40(%rbx) + movq %r10,%rbp + + .byte 0xf3,0xc3 + + +.globl sqrx_mont_382x + +.def sqrx_mont_382x; .scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqrx_mont_382x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + adcq %r12,%r12 + adcq %rdi,%rdi + adcq %rbp,%rbp + + movq %rdx,48(%rbx) + movq %r15,56(%rbx) + movq %rax,64(%rbx) + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32-128(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + + + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + movq 32+96(%rsp),%r14 + leaq 128(%rcx),%rcx + movq 32+0(%rsp),%r8 + andq %r14,%r8 + movq 32+8(%rsp),%r9 + andq %r14,%r9 + movq 32+16(%rsp),%r10 + andq %r14,%r10 + movq 32+24(%rsp),%r11 + andq %r14,%r11 + movq 32+32(%rsp),%r13 + andq %r14,%r13 + andq 32+40(%rsp),%r14 + + subq %r8,%rdx + movq 0(%rcx),%r8 + sbbq %r9,%r15 + movq 8(%rcx),%r9 + sbbq %r10,%rax + movq 16(%rcx),%r10 + sbbq %r11,%r12 + movq 24(%rcx),%r11 + sbbq %r13,%rdi + movq 32(%rcx),%r13 + sbbq %r14,%rbp + sbbq %r14,%r14 + + andq %r14,%r8 + andq %r14,%r9 + andq %r14,%r10 + andq %r14,%r11 + andq %r14,%r13 + andq 40(%rcx),%r14 + + addq %r8,%rdx + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%r12 + adcq %r13,%rdi + adcq %r14,%rbp + + movq %rdx,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqrx_mont_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_382x: +.section .pdata +.p2align 2 +.rva .LSEH_begin_mulx_mont_384x +.rva .LSEH_body_mulx_mont_384x +.rva .LSEH_info_mulx_mont_384x_prologue + +.rva .LSEH_body_mulx_mont_384x +.rva .LSEH_epilogue_mulx_mont_384x 
+.rva .LSEH_info_mulx_mont_384x_body + +.rva .LSEH_epilogue_mulx_mont_384x +.rva .LSEH_end_mulx_mont_384x +.rva .LSEH_info_mulx_mont_384x_epilogue + +.rva .LSEH_begin_sqrx_mont_384x +.rva .LSEH_body_sqrx_mont_384x +.rva .LSEH_info_sqrx_mont_384x_prologue + +.rva .LSEH_body_sqrx_mont_384x +.rva .LSEH_epilogue_sqrx_mont_384x +.rva .LSEH_info_sqrx_mont_384x_body + +.rva .LSEH_epilogue_sqrx_mont_384x +.rva .LSEH_end_sqrx_mont_384x +.rva .LSEH_info_sqrx_mont_384x_epilogue + +.rva .LSEH_begin_mulx_382x +.rva .LSEH_body_mulx_382x +.rva .LSEH_info_mulx_382x_prologue + +.rva .LSEH_body_mulx_382x +.rva .LSEH_epilogue_mulx_382x +.rva .LSEH_info_mulx_382x_body + +.rva .LSEH_epilogue_mulx_382x +.rva .LSEH_end_mulx_382x +.rva .LSEH_info_mulx_382x_epilogue + +.rva .LSEH_begin_sqrx_382x +.rva .LSEH_body_sqrx_382x +.rva .LSEH_info_sqrx_382x_prologue + +.rva .LSEH_body_sqrx_382x +.rva .LSEH_epilogue_sqrx_382x +.rva .LSEH_info_sqrx_382x_body + +.rva .LSEH_epilogue_sqrx_382x +.rva .LSEH_end_sqrx_382x +.rva .LSEH_info_sqrx_382x_epilogue + +.rva .LSEH_begin_mulx_384 +.rva .LSEH_body_mulx_384 +.rva .LSEH_info_mulx_384_prologue + +.rva .LSEH_body_mulx_384 +.rva .LSEH_epilogue_mulx_384 +.rva .LSEH_info_mulx_384_body + +.rva .LSEH_epilogue_mulx_384 +.rva .LSEH_end_mulx_384 +.rva .LSEH_info_mulx_384_epilogue + +.rva .LSEH_begin_sqrx_384 +.rva .LSEH_body_sqrx_384 +.rva .LSEH_info_sqrx_384_prologue + +.rva .LSEH_body_sqrx_384 +.rva .LSEH_epilogue_sqrx_384 +.rva .LSEH_info_sqrx_384_body + +.rva .LSEH_epilogue_sqrx_384 +.rva .LSEH_end_sqrx_384 +.rva .LSEH_info_sqrx_384_epilogue + +.rva .LSEH_begin_redcx_mont_384 +.rva .LSEH_body_redcx_mont_384 +.rva .LSEH_info_redcx_mont_384_prologue + +.rva .LSEH_body_redcx_mont_384 +.rva .LSEH_epilogue_redcx_mont_384 +.rva .LSEH_info_redcx_mont_384_body + +.rva .LSEH_epilogue_redcx_mont_384 +.rva .LSEH_end_redcx_mont_384 +.rva .LSEH_info_redcx_mont_384_epilogue + +.rva .LSEH_begin_fromx_mont_384 +.rva .LSEH_body_fromx_mont_384 +.rva .LSEH_info_fromx_mont_384_prologue + +.rva .LSEH_body_fromx_mont_384 +.rva .LSEH_epilogue_fromx_mont_384 +.rva .LSEH_info_fromx_mont_384_body + +.rva .LSEH_epilogue_fromx_mont_384 +.rva .LSEH_end_fromx_mont_384 +.rva .LSEH_info_fromx_mont_384_epilogue + +.rva .LSEH_begin_sgn0x_pty_mont_384 +.rva .LSEH_body_sgn0x_pty_mont_384 +.rva .LSEH_info_sgn0x_pty_mont_384_prologue + +.rva .LSEH_body_sgn0x_pty_mont_384 +.rva .LSEH_epilogue_sgn0x_pty_mont_384 +.rva .LSEH_info_sgn0x_pty_mont_384_body + +.rva .LSEH_epilogue_sgn0x_pty_mont_384 +.rva .LSEH_end_sgn0x_pty_mont_384 +.rva .LSEH_info_sgn0x_pty_mont_384_epilogue + +.rva .LSEH_begin_sgn0x_pty_mont_384x +.rva .LSEH_body_sgn0x_pty_mont_384x +.rva .LSEH_info_sgn0x_pty_mont_384x_prologue + +.rva .LSEH_body_sgn0x_pty_mont_384x +.rva .LSEH_epilogue_sgn0x_pty_mont_384x +.rva .LSEH_info_sgn0x_pty_mont_384x_body + +.rva .LSEH_epilogue_sgn0x_pty_mont_384x +.rva .LSEH_end_sgn0x_pty_mont_384x +.rva .LSEH_info_sgn0x_pty_mont_384x_epilogue + +.rva .LSEH_begin_mulx_mont_384 +.rva .LSEH_body_mulx_mont_384 +.rva .LSEH_info_mulx_mont_384_prologue + +.rva .LSEH_body_mulx_mont_384 +.rva .LSEH_epilogue_mulx_mont_384 +.rva .LSEH_info_mulx_mont_384_body + +.rva .LSEH_epilogue_mulx_mont_384 +.rva .LSEH_end_mulx_mont_384 +.rva .LSEH_info_mulx_mont_384_epilogue + +.rva .LSEH_begin_sqrx_mont_384 +.rva .LSEH_body_sqrx_mont_384 +.rva .LSEH_info_sqrx_mont_384_prologue + +.rva .LSEH_body_sqrx_mont_384 +.rva .LSEH_epilogue_sqrx_mont_384 +.rva .LSEH_info_sqrx_mont_384_body + +.rva .LSEH_epilogue_sqrx_mont_384 +.rva .LSEH_end_sqrx_mont_384 +.rva 
.LSEH_info_sqrx_mont_384_epilogue + +.rva .LSEH_begin_sqrx_n_mul_mont_384 +.rva .LSEH_body_sqrx_n_mul_mont_384 +.rva .LSEH_info_sqrx_n_mul_mont_384_prologue + +.rva .LSEH_body_sqrx_n_mul_mont_384 +.rva .LSEH_epilogue_sqrx_n_mul_mont_384 +.rva .LSEH_info_sqrx_n_mul_mont_384_body + +.rva .LSEH_epilogue_sqrx_n_mul_mont_384 +.rva .LSEH_end_sqrx_n_mul_mont_384 +.rva .LSEH_info_sqrx_n_mul_mont_384_epilogue + +.rva .LSEH_begin_sqrx_n_mul_mont_383 +.rva .LSEH_body_sqrx_n_mul_mont_383 +.rva .LSEH_info_sqrx_n_mul_mont_383_prologue + +.rva .LSEH_body_sqrx_n_mul_mont_383 +.rva .LSEH_epilogue_sqrx_n_mul_mont_383 +.rva .LSEH_info_sqrx_n_mul_mont_383_body + +.rva .LSEH_epilogue_sqrx_n_mul_mont_383 +.rva .LSEH_end_sqrx_n_mul_mont_383 +.rva .LSEH_info_sqrx_n_mul_mont_383_epilogue + +.rva .LSEH_begin_sqrx_mont_382x +.rva .LSEH_body_sqrx_mont_382x +.rva .LSEH_info_sqrx_mont_382x_prologue + +.rva .LSEH_body_sqrx_mont_382x +.rva .LSEH_epilogue_sqrx_mont_382x +.rva .LSEH_info_sqrx_mont_382x_body + +.rva .LSEH_epilogue_sqrx_mont_382x +.rva .LSEH_end_sqrx_mont_382x +.rva .LSEH_info_sqrx_mont_382x_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mulx_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x29,0x00 +.byte 0x00,0xe4,0x2a,0x00 +.byte 0x00,0xd4,0x2b,0x00 +.byte 0x00,0xc4,0x2c,0x00 +.byte 0x00,0x34,0x2d,0x00 +.byte 0x00,0x54,0x2e,0x00 +.byte 0x00,0x74,0x30,0x00 +.byte 0x00,0x64,0x31,0x00 +.byte 0x00,0x01,0x2f,0x00 +.LSEH_info_mulx_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqrx_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mulx_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_mulx_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_382x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqrx_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mulx_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x00,0x00 +.byte 0x00,0xe4,0x01,0x00 +.byte 0x00,0xd4,0x02,0x00 +.byte 0x00,0xc4,0x03,0x00 +.byte 
0x00,0x34,0x04,0x00 +.byte 0x00,0x54,0x05,0x00 +.byte 0x00,0x74,0x07,0x00 +.byte 0x00,0x64,0x08,0x00 +.byte 0x00,0x52 +.byte 0x00,0x00 +.LSEH_info_mulx_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqrx_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redcx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_redcx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_redcx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_fromx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_fromx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_fromx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0x_pty_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0x_pty_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sgn0x_pty_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0x_pty_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0x_pty_mont_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sgn0x_pty_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mulx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_mulx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 
0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_sqrx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_n_mul_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_n_mul_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x05,0x00 +.byte 0x00,0xe4,0x06,0x00 +.byte 0x00,0xd4,0x07,0x00 +.byte 0x00,0xc4,0x08,0x00 +.byte 0x00,0x34,0x09,0x00 +.byte 0x00,0x54,0x0a,0x00 +.byte 0x00,0x74,0x0c,0x00 +.byte 0x00,0x64,0x0d,0x00 +.byte 0x00,0xa2 +.byte 0x00,0x00 +.LSEH_info_sqrx_n_mul_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_n_mul_mont_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_n_mul_mont_383_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x05,0x00 +.byte 0x00,0xe4,0x06,0x00 +.byte 0x00,0xd4,0x07,0x00 +.byte 0x00,0xc4,0x08,0x00 +.byte 0x00,0x34,0x09,0x00 +.byte 0x00,0x54,0x0a,0x00 +.byte 0x00,0x74,0x0c,0x00 +.byte 0x00,0x64,0x0d,0x00 +.byte 0x00,0xa2 +.byte 0x00,0x00 +.LSEH_info_sqrx_n_mul_mont_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_mont_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqrx_mont_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/sha256-armv8.S b/temp-deps/blst-ts/deps/blst/build/coff/sha256-armv8.S new file mode 100644 index 0000000..a8bcbd3 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/sha256-armv8.S @@ -0,0 +1,1087 @@ +// +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +// project. +// ==================================================================== +// +// sha256_block procedure for ARMv8. +// +// This module is stripped of scalar code paths, with rationale that all +// known processors are NEON-capable. +// +// See original module at CRYPTOGAMS for further details.
+ +.text + +.p2align 6 + +.LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.align 2 +.p2align 2 +.globl blst_sha256_block_armv8 +.def blst_sha256_block_armv8; +.type 32; +.endef +.p2align 6 +blst_sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,.LK256 + +.Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b 
+ orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +.globl blst_sha256_block_data_order +.def blst_sha256_block_data_order; +.type 32; +.endef +.p2align 4 +blst_sha256_block_data_order: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + sub sp,sp,#16*4 + + adr x16,.LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b .L_00_48 + +.p2align 4 +.L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli 
v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + 
add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne .L_00_48 + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add 
w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor 
w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret + +.globl blst_sha256_emit + +.def blst_sha256_emit; +.type 32; +.endef +.p2align 4 +blst_sha256_emit: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] + str w6,[x0,#16] + str w7,[x0,#24] + ret + + +.globl blst_sha256_bcopy + +.def blst_sha256_bcopy; +.type 32; +.endef +.p2align 4 +blst_sha256_bcopy: +.Loop_bcopy: + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,.Loop_bcopy + ret + + +.globl blst_sha256_hcopy + +.def blst_sha256_hcopy; +.type 32; +.endef +.p2align 4 +blst_sha256_hcopy: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/sha256-portable-x86_64.s b/temp-deps/blst-ts/deps/blst/build/coff/sha256-portable-x86_64.s new file mode 100644 index 0000000..e499d10 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/sha256-portable-x86_64.s @@ -0,0 +1,1784 @@ +.text + +.globl blst_sha256_block_data_order +.def blst_sha256_block_data_order; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_block_data_order: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 
+.LSEH_begin_blst_sha256_block_data_order: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $64+24,%rsp + + leaq (%rsi,%rdx,4),%rdx + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) +.LSEH_body_blst_sha256_block_data_order: + + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp .Lloop + +.p2align 4 +.Lloop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 0(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 4(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 8(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 12(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 16(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + 
bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 20(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 24(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 28(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 32(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 36(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 40(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi 
+ + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 44(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 48(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 52(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 56(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 60(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + jmp .Lrounds_16_xx +.p2align 4 +.Lrounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 64(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl 
%edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 68(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 72(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 76(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 80(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + 
rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 84(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 88(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 92(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 96(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + 
rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 100(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 104(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 108(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 112(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi 
+ addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 116(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 120(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 124(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 64(%rbp),%rbp + cmpb $0x19,3(%rbp) + jnz .Lrounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop + + leaq 64+24+48(%rsp),%r11 + + movq 64+24(%rsp),%r15 + + movq -40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbp + + movq -8(%r11),%rbx + +.LSEH_epilogue_blst_sha256_block_data_order: + mov 8(%r11),%rdi + mov 16(%r11),%rsi + + leaq (%r11),%rsp + .byte 0xf3,0xc3 + +.LSEH_end_blst_sha256_block_data_order: + +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 
0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_emit + +.def blst_sha256_emit; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_emit: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + bswapq %r8 + movq 24(%rdx),%r11 + bswapq %r9 + movl %r8d,4(%rcx) + bswapq %r10 + movl %r9d,12(%rcx) + bswapq %r11 + movl %r10d,20(%rcx) + shrq $32,%r8 + movl %r11d,28(%rcx) + shrq $32,%r9 + movl %r8d,0(%rcx) + shrq $32,%r10 + movl %r9d,8(%rcx) + shrq $32,%r11 + movl %r10d,16(%rcx) + movl %r11d,24(%rcx) + .byte 0xf3,0xc3 + + +.globl blst_sha256_bcopy + +.def blst_sha256_bcopy; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_bcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rdx,%rcx +.Loop_bcopy: + movzbl (%rdx),%eax + leaq 1(%rdx),%rdx + movb %al,-1(%rcx,%rdx,1) + decq %r8 + jnz .Loop_bcopy + .byte 0xf3,0xc3 + + +.globl blst_sha256_hcopy + +.def blst_sha256_hcopy; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_hcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq %r8,0(%rcx) + movq %r9,8(%rcx) + movq %r10,16(%rcx) + movq %r11,24(%rcx) + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_blst_sha256_block_data_order +.rva .LSEH_body_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_prologue + +.rva .LSEH_body_blst_sha256_block_data_order +.rva .LSEH_epilogue_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_body + +.rva .LSEH_epilogue_blst_sha256_block_data_order +.rva .LSEH_end_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_blst_sha256_block_data_order_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_blst_sha256_block_data_order_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x0b,0x00 +.byte 0x00,0xe4,0x0c,0x00 +.byte 0x00,0xd4,0x0d,0x00 +.byte 0x00,0xc4,0x0e,0x00 +.byte 0x00,0x54,0x0f,0x00 +.byte 0x00,0x34,0x10,0x00 +.byte 0x00,0x74,0x12,0x00 +.byte 0x00,0x64,0x13,0x00 +.byte 0x00,0x01,0x11,0x00 +.LSEH_info_blst_sha256_block_data_order_epilogue: +.byte 1,0,5,11 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x03 +.byte 0x00,0x00 + diff --git a/temp-deps/blst-ts/deps/blst/build/coff/sha256-x86_64.s b/temp-deps/blst-ts/deps/blst/build/coff/sha256-x86_64.s new file mode 100644 index 0000000..ed28b78 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/coff/sha256-x86_64.s @@ -0,0 +1,1560 @@ +.text + +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 
0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_block_data_order_shaext + +.def blst_sha256_block_data_order_shaext; .scl 2; .type 32; .endef +.p2align 6 +blst_sha256_block_data_order_shaext: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_blst_sha256_block_data_order_shaext: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + subq $0x58,%rsp + + movaps %xmm6,-88(%r11) + + movaps %xmm7,-72(%r11) + + movaps %xmm8,-56(%r11) + + movaps %xmm9,-40(%r11) + + movaps %xmm10,-24(%r11) + +.LSEH_body_blst_sha256_block_data_order_shaext: + + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 256-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.p2align 4 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 16-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 48-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 96-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 112-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 
15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 144-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 176-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 208-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 224-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 240-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + movaps -88(%r11),%xmm6 + movaps -72(%r11),%xmm7 + movaps -56(%r11),%xmm8 + movaps -40(%r11),%xmm9 + movaps -24(%r11),%xmm10 + movq %r11,%rsp + +.LSEH_epilogue_blst_sha256_block_data_order_shaext: + mov 8(%r11),%rdi + mov 16(%r11),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_blst_sha256_block_data_order_shaext: +.globl blst_sha256_block_data_order + +.def blst_sha256_block_data_order; .scl 2; .type 32; .endef +.p2align 6 +blst_sha256_block_data_order: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_blst_sha256_block_data_order: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $104,%rsp + + leaq (%rsi,%rdx,4),%rdx + movq %rdi,0(%rsp) + + movq %rdx,16(%rsp) + movaps %xmm6,32(%rsp) + + movaps %xmm7,48(%rsp) + + movaps %xmm8,64(%rsp) + + movaps %xmm9,80(%rsp) + + movq %rsp,%rbp + +.LSEH_body_blst_sha256_block_data_order: + + + leaq -64(%rsp),%rsp + movl 0(%rdi),%eax + andq $-64,%rsp + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.p2align 4 +.Lloop_ssse3: + movdqa K256+256(%rip),%xmm7 + movq %rsi,8(%rbp) + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rsi +.byte 102,15,56,0,207 + movdqa 0(%rsi),%xmm4 + movdqa 16(%rsi),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 32(%rsi),%xmm6 +.byte 102,15,56,0,223 + movdqa 
48(%rsi),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.p2align 4 +.Lssse3_00_47: + subq $-64,%rsi + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + 
psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 16(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d 
+ xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 32(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl 
%r12d,%eax + movdqa 48(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,67(%rsi) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + 
addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl 
$6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 0(%rbp),%rdi + movl %r14d,%eax + movq 8(%rbp),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + leaq 64(%rsi),%rsi + cmpq 16(%rbp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + xorps %xmm0,%xmm0 + leaq 104+48(%rbp),%r11 + + movaps %xmm0,0(%rsp) + movaps %xmm0,16(%rsp) + movaps %xmm0,32(%rsp) + movaps %xmm0,48(%rsp) + movaps 32(%rbp),%xmm6 + movaps 48(%rbp),%xmm7 + movaps 64(%rbp),%xmm8 + movaps 80(%rbp),%xmm9 + movq 104(%rbp),%r15 + + movq -40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbx + + movq -8(%r11),%rbp + +.LSEH_epilogue_blst_sha256_block_data_order: + mov 8(%r11),%rdi + mov 16(%r11),%rsi + + leaq (%r11),%rsp + .byte 0xf3,0xc3 + +.LSEH_end_blst_sha256_block_data_order: +.globl blst_sha256_emit + +.def blst_sha256_emit; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_emit: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + bswapq %r8 + movq 24(%rdx),%r11 + bswapq %r9 + movl %r8d,4(%rcx) + bswapq %r10 + movl %r9d,12(%rcx) + bswapq %r11 + movl %r10d,20(%rcx) + shrq $32,%r8 + movl %r11d,28(%rcx) + shrq $32,%r9 + movl %r8d,0(%rcx) + shrq $32,%r10 + movl %r9d,8(%rcx) + shrq $32,%r11 + movl %r10d,16(%rcx) + movl %r11d,24(%rcx) + .byte 0xf3,0xc3 + + +.globl blst_sha256_bcopy + +.def blst_sha256_bcopy; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_bcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rdx,%rcx +.Loop_bcopy: + movzbl (%rdx),%eax + leaq 1(%rdx),%rdx + movb %al,-1(%rcx,%rdx,1) + decq %r8 + jnz .Loop_bcopy + .byte 0xf3,0xc3 + + +.globl blst_sha256_hcopy + +.def blst_sha256_hcopy; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_hcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq %r8,0(%rcx) + movq %r9,8(%rcx) + movq %r10,16(%rcx) + movq %r11,24(%rcx) + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_blst_sha256_block_data_order_shaext +.rva .LSEH_body_blst_sha256_block_data_order_shaext +.rva .LSEH_info_blst_sha256_block_data_order_shaext_prologue + +.rva .LSEH_body_blst_sha256_block_data_order_shaext +.rva .LSEH_epilogue_blst_sha256_block_data_order_shaext +.rva .LSEH_info_blst_sha256_block_data_order_shaext_body + +.rva .LSEH_epilogue_blst_sha256_block_data_order_shaext +.rva .LSEH_end_blst_sha256_block_data_order_shaext +.rva .LSEH_info_blst_sha256_block_data_order_shaext_epilogue + +.rva .LSEH_begin_blst_sha256_block_data_order +.rva .LSEH_body_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_prologue + +.rva .LSEH_body_blst_sha256_block_data_order +.rva 
.LSEH_epilogue_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_body + +.rva .LSEH_epilogue_blst_sha256_block_data_order +.rva .LSEH_end_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_blst_sha256_block_data_order_shaext_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_blst_sha256_block_data_order_shaext_body: +.byte 1,0,15,0 +.byte 0x00,0x68,0x00,0x00 +.byte 0x00,0x78,0x01,0x00 +.byte 0x00,0x88,0x02,0x00 +.byte 0x00,0x98,0x03,0x00 +.byte 0x00,0xa8,0x04,0x00 +.byte 0x00,0x74,0x0c,0x00 +.byte 0x00,0x64,0x0d,0x00 +.byte 0x00,0xa2 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_blst_sha256_block_data_order_shaext_epilogue: +.byte 1,0,5,11 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x03 +.byte 0x00,0x00 + +.LSEH_info_blst_sha256_block_data_order_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_blst_sha256_block_data_order_body: +.byte 1,0,26,5 +.byte 0x00,0x68,0x02,0x00 +.byte 0x00,0x78,0x03,0x00 +.byte 0x00,0x88,0x04,0x00 +.byte 0x00,0x98,0x05,0x00 +.byte 0x00,0xf4,0x0d,0x00 +.byte 0x00,0xe4,0x0e,0x00 +.byte 0x00,0xd4,0x0f,0x00 +.byte 0x00,0xc4,0x10,0x00 +.byte 0x00,0x34,0x11,0x00 +.byte 0x00,0x74,0x14,0x00 +.byte 0x00,0x64,0x15,0x00 +.byte 0x00,0x03 +.byte 0x00,0x01,0x12,0x00 +.byte 0x00,0x50 +.LSEH_info_blst_sha256_block_data_order_epilogue: +.byte 1,0,5,11 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x03 +.byte 0x00,0x00 + diff --git a/temp-deps/blst-ts/deps/blst/build/elf/add_mod_256-armv8.S b/temp-deps/blst-ts/deps/blst/build/elf/add_mod_256-armv8.S new file mode 100644 index 0000000..57476aa --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/add_mod_256-armv8.S @@ -0,0 +1,379 @@ +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,%function +.align 5 +add_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret +.size add_mod_256,.-add_mod_256 + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,%function +.align 5 +mul_by_3_mod_256: + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] + + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,%function +.align 5 +lshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_lshift_mod_256: + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + 
adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs x15,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x12,lo + csel x9,x9,x13,lo + csel x10,x10,x14,lo + csel x11,x11,x15,lo + + cbnz x2,.Loop_lshift_mod_256 + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret +.size lshift_mod_256,.-lshift_mod_256 + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,%function +.align 5 +rshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_rshift: + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + csel x12,x12,x8,ne + csel x13,x13,x9,ne + csel x14,x14,x10,ne + csel x15,x15,x11,ne + csel x3,x3,xzr,ne + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,.Loop_rshift + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret +.size rshift_mod_256,.-rshift_mod_256 + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,%function +.align 5 +cneg_mod_256: + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x8,x8,x12,eq + csel x9,x9,x13,eq + csel x10,x10,x14,eq + stp x8,x9,[x0] + csel x11,x11,x15,eq + stp x10,x11,[x0,#16] + + ret +.size cneg_mod_256,.-cneg_mod_256 + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,%function +.align 5 +sub_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc x11,x11,x7 + stp x10,x11,[x0,#16] + + ret +.size sub_mod_256,.-sub_mod_256 + +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,%function +.align 5 +check_mod_256: + ldp x8,x9,[x0] + ldp x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + csel x0,x0,xzr,ne + and x0,x0,x1 + + ret +.size check_mod_256,.-check_mod_256 + +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,%function +.align 5 +add_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret +.size add_n_check_mod_256,.-add_n_check_mod_256 + +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,%function +.align 5 +sub_n_check_mod_256: + 
ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret +.size sub_n_check_mod_256,.-sub_n_check_mod_256 diff --git a/temp-deps/blst-ts/deps/blst/build/elf/add_mod_256-x86_64.s b/temp-deps/blst-ts/deps/blst/build/elf/add_mod_256-x86_64.s new file mode 100644 index 0000000..2f41781 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/add_mod_256-x86_64.s @@ -0,0 +1,572 @@ +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,@function +.align 32 +add_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loaded_a_add_mod_256: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_256,.-add_mod_256 + + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,@function +.align 32 +mul_by_3_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rcx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rsi,%rdx + movq 24(%rsi),%r11 + + call __lshift_mod_256 + movq 0(%rsp),%r12 +.cfi_restore %r12 + jmp .Loaded_a_add_mod_256 + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.type __lshift_mod_256,@function +.align 32 +__lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + movq %r8,%rax + adcq %r10,%r10 + movq %r9,%rsi + adcq %r11,%r11 + sbbq %r12,%r12 + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + cmovcq %rbx,%r10 + cmovcq %rbp,%r11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __lshift_mod_256,.-__lshift_mod_256 + + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,@function +.align 32 +lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_lshift_mod_256: + call __lshift_mod_256 + decl %edx + jnz .Loop_lshift_mod_256 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size lshift_mod_256,.-lshift_mod_256 + + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,@function +.align 32 +rshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rbp + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_rshift_mod_256: + movq %rbp,%r8 + andq $1,%rbp + movq 0(%rcx),%rax + negq %rbp + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + + andq %rbp,%rax + andq %rbp,%rsi + andq %rbp,%rbx + andq 24(%rcx),%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + sbbq %rax,%rax + + shrq $1,%r8 + movq %r9,%rbp + shrq $1,%r9 + movq %r10,%rbx + shrq $1,%r10 + movq %r11,%rsi + shrq $1,%r11 + + shlq $63,%rbp + shlq $63,%rbx + orq %r8,%rbp + shlq $63,%rsi + orq %rbx,%r9 + shlq $63,%rax + orq %rsi,%r10 + orq %rax,%r11 + + decl %edx + jnz .Loop_rshift_mod_256 + + movq %rbp,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size rshift_mod_256,.-rshift_mod_256 + + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,@function +.align 32 +cneg_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r12 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r12,%r8 + movq 24(%rsi),%r11 + orq %r9,%r12 + orq %r10,%r12 + orq %r11,%r12 + movq $-1,%rbp + + movq 0(%rcx),%rax + cmovnzq %rbp,%r12 + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + andq %r12,%rax + movq 24(%rcx),%rbp + andq %r12,%rsi + andq %r12,%rbx + andq %r12,%rbp + + subq %r8,%rax + sbbq %r9,%rsi + sbbq %r10,%rbx + sbbq %r11,%rbp + + orq %rdx,%rdx + + cmovzq %r8,%rax + cmovzq %r9,%rsi + movq %rax,0(%rdi) + cmovzq %r10,%rbx + movq %rsi,8(%rdi) + cmovzq %r11,%rbp + movq %rbx,16(%rdi) + movq %rbp,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size cneg_mod_256,.-cneg_mod_256 + + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,@function +.align 32 +sub_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + 
sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_256,.-sub_mod_256 + + +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,@function +.align 32 +check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + movq 0(%rdi),%rax + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + movq %rax,%r8 + orq %r9,%rax + orq %r10,%rax + orq %r11,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq %rsi,%rsi + + movq $1,%rdx + cmpq $0,%rax + cmovneq %rdx,%rax + andq %rsi,%rax + + .byte 0xf3,0xc3 +.cfi_endproc +.size check_mod_256,.-check_mod_256 + + +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,@function +.align 32 +add_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_n_check_mod_256,.-add_n_check_mod_256 + + +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,@function +.align 32 +sub_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_n_check_mod_256,.-sub_n_check_mod_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/temp-deps/blst-ts/deps/blst/build/elf/add_mod_384-armv8.S 
b/temp-deps/blst-ts/deps/blst/build/elf/add_mod_384-armv8.S new file mode 100644 index 0000000..5c18d7f --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/add_mod_384-armv8.S @@ -0,0 +1,1000 @@ +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,%function +.align 5 +add_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +__add_mod_384_ab_are_loaded: + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,%function +.align 5 +add_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size add_mod_384x,.-add_mod_384x + +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,%function +.align 5 +rshift_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_rshift_mod_384: + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,.Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,%function +.align 5 +__rshift_mod_384: + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,%function +.align 5 +div_by_2_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size div_by_2_mod_384,.-div_by_2_mod_384 + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,%function +.align 5 +lshift_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_lshift_mod_384: + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,.Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,%function +.align 5 +__lshift_mod_384: + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret +.size __lshift_mod_384,.-__lshift_mod_384 + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,%function +.align 5 +mul_by_3_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,%function +.align 5 +mul_by_8_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,%function +.align 5 +mul_by_3_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,%function +.align 5 +mul_by_8_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,%function +.align 5 +cneg_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x10,x10,x16,eq + csel x11,x11,x17,eq + csel x12,x12,x19,eq + csel x13,x13,x20,eq + stp x10,x11,[x0] + csel x14,x14,x21,eq + stp x12,x13,[x0,#16] + csel x15,x15,x22,eq + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size cneg_mod_384,.-cneg_mod_384 + +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,%function +.align 5 +sub_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,%function +.align 5 +sub_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sub_mod_384x,.-sub_mod_384x + +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,%function +.align 5 +mul_by_1_plus_i_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x + +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,%function +.align 5 +sgn0_pty_mod_384: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,%function +.align 5 +sgn0_pty_mod_384x: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + 
sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +.globl vec_select_32 +.hidden vec_select_32 +.type vec_select_32,%function +.align 5 +vec_select_32: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret +.size vec_select_32,.-vec_select_32 +.globl vec_select_48 +.hidden vec_select_48 +.type vec_select_48,%function +.align 5 +vec_select_48: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret +.size vec_select_48,.-vec_select_48 +.globl vec_select_96 +.hidden vec_select_96 +.type vec_select_96,%function +.align 5 +vec_select_96: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_96,.-vec_select_96 +.globl vec_select_192 +.hidden vec_select_192 +.type vec_select_192,%function +.align 5 +vec_select_192: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_192,.-vec_select_192 +.globl vec_select_144 +.hidden vec_select_144 +.type vec_select_144,%function +.align 5 +vec_select_144: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, 
[x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret +.size vec_select_144,.-vec_select_144 +.globl vec_select_288 +.hidden vec_select_288 +.type vec_select_288,%function +.align 5 +vec_select_288: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_288,.-vec_select_288 +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,%function +.align 5 +vec_prefetch: + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + prfm pldl1keep, [x0] + ret +.size vec_prefetch,.-vec_prefetch +.globl vec_is_zero_16x +.hidden vec_is_zero_16x +.type vec_is_zero_16x,%function +.align 5 +vec_is_zero_16x: + ld1 {v0.2d}, [x0], #16 + lsr x1, x1, #4 + sub x1, x1, #1 + cbz x1, .Loop_is_zero_done + +.Loop_is_zero: + ld1 {v1.2d}, [x0], #16 + orr v0.16b, v0.16b, v1.16b + sub x1, x1, #1 + cbnz x1, .Loop_is_zero + +.Loop_is_zero_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret +.size vec_is_zero_16x,.-vec_is_zero_16x +.globl vec_is_equal_16x +.hidden vec_is_equal_16x +.type vec_is_equal_16x,%function +.align 5 +vec_is_equal_16x: + ld1 {v0.2d}, [x0], #16 + ld1 {v1.2d}, [x1], #16 + lsr x2, x2, #4 + eor v0.16b, v0.16b, v1.16b + +.Loop_is_equal: + sub x2, x2, #1 + cbz x2, .Loop_is_equal_done + ld1 {v1.2d}, [x0], #16 + ld1 {v2.2d}, 
[x1], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b .Loop_is_equal + nop + +.Loop_is_equal_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret +.size vec_is_equal_16x,.-vec_is_equal_16x diff --git a/temp-deps/blst-ts/deps/blst/build/elf/add_mod_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/elf/add_mod_384-x86_64.s new file mode 100644 index 0000000..39eee6d --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/add_mod_384-x86_64.s @@ -0,0 +1,1907 @@ +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,@function +.align 32 +add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__add_mod_384_a_is_loaded: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,@function +.align 32 +add_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 
+.cfi_endproc +.size add_mod_384x,.-add_mod_384x + + +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,@function +.align 32 +rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_rshift_mod_384: + call __rshift_mod_384 + decl %edx + jnz .Loop_rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,@function +.align 32 +__rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rsi + movq 0(%rcx),%r14 + andq %r8,%rsi + movq 8(%rcx),%r15 + negq %rsi + movq 16(%rcx),%rax + andq %rsi,%r14 + movq 24(%rcx),%rbx + andq %rsi,%r15 + movq 32(%rcx),%rbp + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rsi + sbbq %r13,%r13 + + shrq $1,%r14 + movq %r15,%r8 + shrq $1,%r15 + movq %rax,%r9 + shrq $1,%rax + movq %rbx,%r10 + shrq $1,%rbx + movq %rbp,%r11 + shrq $1,%rbp + movq %rsi,%r12 + shrq $1,%rsi + shlq $63,%r8 + shlq $63,%r9 + orq %r14,%r8 + shlq $63,%r10 + orq %r15,%r9 + shlq $63,%r11 + orq %rax,%r10 + shlq $63,%r12 + orq %rbx,%r11 + shlq $63,%r13 + orq %rbp,%r12 + orq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,@function +.align 32 +div_by_2_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq %rdx,%rcx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + call __rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size div_by_2_mod_384,.-div_by_2_mod_384 + + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,@function 
+.align 32 +lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_lshift_mod_384: + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdi,%rdi + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdi + + movq (%rsp),%rdi + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + decl %edx + jnz .Loop_lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,@function +.align 32 +__lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __lshift_mod_384,.-__lshift_mod_384 + + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,@function +.align 32 +mul_by_3_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,@function 
+.align 32 +mul_by_8_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,@function +.align 32 +mul_by_3_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq (%rsp),%rsi + leaq 48(%rdi),%rdi + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + + call __lshift_mod_384 + + movq $48,%rdx + addq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,@function +.align 32 +mul_by_8_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq (%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq 
%r13,40(%rdi) + + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,48+0(%rdi) + movq %r9,48+8(%rdi) + movq %r10,48+16(%rdi) + movq %r11,48+24(%rdi) + movq %r12,48+32(%rdi) + movq %r13,48+40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + + +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,@function +.align 32 +cneg_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdx +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rdx,%r8 + movq 24(%rsi),%r11 + orq %r9,%rdx + movq 32(%rsi),%r12 + orq %r10,%rdx + movq 40(%rsi),%r13 + orq %r11,%rdx + movq $-1,%rsi + orq %r12,%rdx + orq %r13,%rdx + + movq 0(%rcx),%r14 + cmovnzq %rsi,%rdx + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + andq %rdx,%r14 + movq 24(%rcx),%rbx + andq %rdx,%r15 + movq 32(%rcx),%rbp + andq %rdx,%rax + movq 40(%rcx),%rsi + andq %rdx,%rbx + movq 0(%rsp),%rcx + andq %rdx,%rbp + andq %rdx,%rsi + + subq %r8,%r14 + sbbq %r9,%r15 + sbbq %r10,%rax + sbbq %r11,%rbx + sbbq %r12,%rbp + sbbq %r13,%rsi + + orq %rcx,%rcx + + cmovzq %r8,%r14 + cmovzq %r9,%r15 + cmovzq %r10,%rax + movq %r14,0(%rdi) + cmovzq %r11,%rbx + movq %r15,8(%rdi) + cmovzq %r12,%rbp + movq %rax,16(%rdi) + cmovzq %r13,%rsi + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rsi,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size cneg_mod_384,.-cneg_mod_384 + + +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,@function +.align 32 +sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,@function +.align 32 +sub_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __sub_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384x,.-sub_mod_384x +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,@function +.align 32 +mul_by_1_plus_i_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $56,%rsp +.cfi_adjust_cfa_offset 56 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rbx + adcq 72(%rsi),%r11 + movq %r12,%rcx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + movq %rdi,48(%rsp) + sbbq %rdi,%rdi + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rbx + sbbq 80(%rsi),%rcx + sbbq 88(%rsi),%rbp + sbbq %rsi,%rsi + + movq %r8,0(%rsp) + movq 0(%rdx),%r8 + movq %r9,8(%rsp) + movq 8(%rdx),%r9 + movq %r10,16(%rsp) + movq 16(%rdx),%r10 + movq %r11,24(%rsp) + movq 24(%rdx),%r11 + movq %r12,32(%rsp) + andq %rsi,%r8 + movq 32(%rdx),%r12 + movq %r13,40(%rsp) + andq %rsi,%r9 + movq 40(%rdx),%r13 + andq %rsi,%r10 + andq %rsi,%r11 + andq %rsi,%r12 + andq %rsi,%r13 + movq 48(%rsp),%rsi + + addq %r8,%r14 + movq 0(%rsp),%r8 + adcq %r9,%r15 + movq 8(%rsp),%r9 + adcq 
%r10,%rax + movq 16(%rsp),%r10 + adcq %r11,%rbx + movq 24(%rsp),%r11 + adcq %r12,%rcx + movq 32(%rsp),%r12 + adcq %r13,%rbp + movq 40(%rsp),%r13 + + movq %r14,0(%rsi) + movq %r8,%r14 + movq %r15,8(%rsi) + movq %rax,16(%rsi) + movq %r9,%r15 + movq %rbx,24(%rsi) + movq %rcx,32(%rsi) + movq %r10,%rax + movq %rbp,40(%rsi) + + subq 0(%rdx),%r8 + movq %r11,%rbx + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + movq %r12,%rcx + sbbq 24(%rdx),%r11 + sbbq 32(%rdx),%r12 + movq %r13,%rbp + sbbq 40(%rdx),%r13 + sbbq $0,%rdi + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,48(%rsi) + cmovcq %rbx,%r11 + movq %r9,56(%rsi) + cmovcq %rcx,%r12 + movq %r10,64(%rsi) + cmovcq %rbp,%r13 + movq %r11,72(%rsi) + movq %r12,80(%rsi) + movq %r13,88(%rsi) + + movq 56+0(%rsp),%r15 +.cfi_restore %r15 + movq 56+8(%rsp),%r14 +.cfi_restore %r14 + movq 56+16(%rsp),%r13 +.cfi_restore %r13 + movq 56+24(%rsp),%r12 +.cfi_restore %r12 + movq 56+32(%rsp),%rbx +.cfi_restore %rbx + movq 56+40(%rsp),%rbp +.cfi_restore %rbp + leaq 56+48(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,@function +.align 32 +sgn0_pty_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + xorq %rax,%rax + movq %r8,%rdi + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + notq %rax + andq $1,%rdi + andq $2,%rax + orq %rdi,%rax + + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,@function +.align 32 +sgn0_pty_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 48(%rdi),%r8 + movq 56(%rdi),%r9 + movq 64(%rdi),%r10 + movq 72(%rdi),%r11 + movq 80(%rdi),%rcx + movq 88(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + leaq 0(%rdi),%rax + xorq %rdi,%rdi + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rdi + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rdi + + movq %r8,0(%rsp) + notq %rdi + andq $1,%rbp + andq $2,%rdi + orq %rbp,%rdi + + movq 0(%rax),%r8 + movq 8(%rax),%r9 + movq 16(%rax),%r10 + movq 24(%rax),%r11 + movq 32(%rax),%rcx + movq 40(%rax),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rax,%rax + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + movq 0(%rsp),%rbx + + notq %rax + + testq %r8,%r8 + cmovzq %rdi,%rbp + + testq %rbx,%rbx + cmovnzq %rdi,%rax + + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore 
%rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +.globl vec_select_32 +.hidden vec_select_32 +.type vec_select_32,@function +.align 32 +vec_select_32: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 16(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 16(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 16(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-16(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-16(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-16(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,16-16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_32,.-vec_select_32 +.globl vec_select_48 +.hidden vec_select_48 +.type vec_select_48,@function +.align 32 +vec_select_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 24(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 24(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 24(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-24(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-24(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-24(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-24(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-24(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-24(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,32-24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_48,.-vec_select_48 +.globl vec_select_96 +.hidden vec_select_96 +.type vec_select_96,@function +.align 32 +vec_select_96: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 48(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 48(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 48(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-48(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-48(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-48(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-48(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-48(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,80-48(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_96,.-vec_select_96 +.globl vec_select_192 +.hidden vec_select_192 +.type vec_select_192,@function +.align 32 +vec_select_192: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 96(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 96(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 96(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-96(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-96(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-96(%rdx),%xmm3 + 
por %xmm1,%xmm0 + movdqu %xmm0,32-96(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-96(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-96(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-96(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-96(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-96(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-96(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-96(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-96(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,176-96(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_192,.-vec_select_192 +.globl vec_select_144 +.hidden vec_select_144 +.type vec_select_144,@function +.align 32 +vec_select_144: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 72(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 72(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 72(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-72(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-72(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-72(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-72(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-72(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-72(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-72(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-72(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,128-72(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_144,.-vec_select_144 +.globl vec_select_288 +.hidden vec_select_288 +.type vec_select_288,@function +.align 32 +vec_select_288: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 144(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 144(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 144(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-144(%rdi) + pand %xmm4,%xmm2 + 
movdqu 16+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-144(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-144(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-144(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-144(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-144(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-144(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-144(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-144(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-144(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-144(%rdi) + pand %xmm4,%xmm2 + movdqu 176+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 176+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,176-144(%rdi) + pand %xmm4,%xmm0 + movdqu 192+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 192+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,192-144(%rdi) + pand %xmm4,%xmm2 + movdqu 208+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 208+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,208-144(%rdi) + pand %xmm4,%xmm0 + movdqu 224+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 224+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,224-144(%rdi) + pand %xmm4,%xmm2 + movdqu 240+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 240+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,240-144(%rdi) + pand %xmm4,%xmm0 + movdqu 256+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 256+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,256-144(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,272-144(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_288,.-vec_select_288 +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,@function +.align 32 +vec_prefetch: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + leaq -1(%rdi,%rsi,1),%rsi + movq $64,%rax + xorq %r8,%r8 + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + prefetchnta (%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_prefetch,.-vec_prefetch +.globl vec_is_zero_16x +.hidden vec_is_zero_16x +.type vec_is_zero_16x,@function +.align 32 +vec_is_zero_16x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%esi + 
movdqu (%rdi),%xmm0 + leaq 16(%rdi),%rdi + +.Loop_is_zero: + decl %esi + jz .Loop_is_zero_done + movdqu (%rdi),%xmm1 + leaq 16(%rdi),%rdi + por %xmm1,%xmm0 + jmp .Loop_is_zero + +.Loop_is_zero_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %esi + testq %rax,%rax + cmovnzl %esi,%eax + xorl $1,%eax + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_is_zero_16x,.-vec_is_zero_16x +.globl vec_is_equal_16x +.hidden vec_is_equal_16x +.type vec_is_equal_16x,@function +.align 32 +vec_is_equal_16x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%edx + movdqu (%rdi),%xmm0 + movdqu (%rsi),%xmm1 + subq %rdi,%rsi + leaq 16(%rdi),%rdi + pxor %xmm1,%xmm0 + +.Loop_is_equal: + decl %edx + jz .Loop_is_equal_done + movdqu (%rdi),%xmm1 + movdqu (%rdi,%rsi,1),%xmm2 + leaq 16(%rdi),%rdi + pxor %xmm2,%xmm1 + por %xmm1,%xmm0 + jmp .Loop_is_equal + +.Loop_is_equal_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %edx + testq %rax,%rax + cmovnzl %edx,%eax + xorl $1,%eax + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_is_equal_16x,.-vec_is_equal_16x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/temp-deps/blst-ts/deps/blst/build/elf/add_mod_384x384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/elf/add_mod_384x384-x86_64.s new file mode 100644 index 0000000..084f3d8 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/add_mod_384x384-x86_64.s @@ -0,0 +1,252 @@ +.text + +.type __add_mod_384x384,@function +.align 32 +__add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + addq 0(%rdx),%r8 + movq 56(%rsi),%r15 + adcq 8(%rdx),%r9 + movq 64(%rsi),%rax + adcq 16(%rdx),%r10 + movq 72(%rsi),%rbx + adcq 24(%rdx),%r11 + movq 80(%rsi),%rbp + adcq 32(%rdx),%r12 + movq 88(%rsi),%rsi + adcq 40(%rdx),%r13 + movq %r8,0(%rdi) + adcq 48(%rdx),%r14 + movq %r9,8(%rdi) + adcq 56(%rdx),%r15 + movq %r10,16(%rdi) + adcq 64(%rdx),%rax + movq %r12,32(%rdi) + movq %r14,%r8 + adcq 72(%rdx),%rbx + movq %r11,24(%rdi) + movq %r15,%r9 + adcq 80(%rdx),%rbp + movq %r13,40(%rdi) + movq %rax,%r10 + adcq 88(%rdx),%rsi + movq %rbx,%r11 + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %rbp,%r12 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%rbx + sbbq 32(%rcx),%rbp + movq %rsi,%r13 + sbbq 40(%rcx),%rsi + sbbq $0,%rdx + + cmovcq %r8,%r14 + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %r14,48(%rdi) + cmovcq %r11,%rbx + movq %r15,56(%rdi) + cmovcq %r12,%rbp + movq %rax,64(%rdi) + cmovcq %r13,%rsi + movq %rbx,72(%rdi) + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384x384,.-__add_mod_384x384 + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq 
%r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.globl add_mod_384x384 +.hidden add_mod_384x384 +.type add_mod_384x384,@function +.align 32 +add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384x384,.-add_mod_384x384 + +.globl sub_mod_384x384 +.hidden sub_mod_384x384 +.type sub_mod_384x384,@function +.align 32 +sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384x384,.-sub_mod_384x384 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/temp-deps/blst-ts/deps/blst/build/elf/ct_inverse_mod_256-armv8.S b/temp-deps/blst-ts/deps/blst/build/elf/ct_inverse_mod_256-armv8.S new file mode 100644 index 0000000..347eb31 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/ct_inverse_mod_256-armv8.S @@ -0,0 +1,784 @@ +.text + +.globl ct_inverse_mod_256 +.type ct_inverse_mod_256, %function +.align 5 +ct_inverse_mod_256: + .inst 0xd503233f + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl .Lab_approximation_31_256_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst 
|a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc 
x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] 
last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 + .inst 0xd50323bf + ret +.size ct_inverse_mod_256,.-ct_inverse_mod_256 + +//////////////////////////////////////////////////////////////////////// +.type __smul_256x63, %function +.align 5 +__smul_256x63: + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + csel x22, x22, xzr, ne + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in __smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + csel x23, x23, xzr, ne + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret +.size __smul_256x63,.-__smul_256x63 + +.type __smul_512x63_tail, %function +.align 5 +__smul_512x63_tail: + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, 
[x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume |v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr x25, x24, #63 + + eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret +.size __smul_512x63_tail,.-__smul_512x63_tail + +.type __smul_256_n_shift_by_31, %function +.align 5 +__smul_256_n_shift_by_31: + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret +.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31 +.type __ab_approximation_31_256, %function +.align 4 +__ab_approximation_31_256: + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +.Lab_approximation_31_256_loaded: + orr x19, x7, x11 // check top-most limbs, ... + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x5, ne + orr x19, x7, x11 // and ones before top-most, ... + csel x10, x10, x9, ne + + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x4, ne + orr x19, x7, x11 // and one more, ... 
+ csel x10, x10, x8, ne + + clz x19, x19 + cmp x19, #64 + csel x19, x19, xzr, ne + csel x7, x7, x6, ne + csel x11, x11, x10, ne + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret +.size __ab_approximation_31_256,.-__ab_approximation_31_256 + +.type __inner_loop_31_256, %function +.align 4 +__inner_loop_31_256: + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +.Loop_31_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x15, x15, x13, hs // exchange |fg0| and |fg1| + csel x13, x13, x19, hs + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, .Loop_31_256 + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256, %function +.align 4 +__inner_loop_62_256: + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +.Loop_62_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov x20, x13 + csel x12, x12, x14, hs // exchange |f0| and |f1| + csel x14, x14, x19, hs + csel x13, x13, x15, hs // exchange |g0| and |g1| + csel x15, x15, x20, hs + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, .Loop_62_256 + + ret +.size __inner_loop_62_256,.-__inner_loop_62_256 diff --git a/temp-deps/blst-ts/deps/blst/build/elf/ct_inverse_mod_256-x86_64.s b/temp-deps/blst-ts/deps/blst/build/elf/ct_inverse_mod_256-x86_64.s new file mode 100644 index 0000000..c4d8d6d --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/ct_inverse_mod_256-x86_64.s @@ -0,0 +1,1185 @@ +.text + +.globl ct_inverse_mod_256 +.type ct_inverse_mod_256,@function +.align 32 +ct_inverse_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1072,%rsp +.cfi_adjust_cfa_offset 1072 + + + leaq 48+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + movq 0(%rdx),%r12 + movq 8(%rdx),%r13 + movq 16(%rdx),%r14 + movq 24(%rdx),%r15 + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + + movq %r12,32(%rax) + movq %r13,40(%rax) + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rax,%rsi + + + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,64(%rdi) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,72(%rdi) + + + xorq $256,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + + movq 64(%rsi),%r8 + movq 104(%rsi),%r12 + movq %r8,%r9 + imulq 0(%rsp),%r8 + movq %r12,%r13 + imulq 8(%rsp),%r12 + addq %r12,%r8 + movq %r8,32(%rdi) + sarq $63,%r8 + movq %r8,40(%rdi) + movq %r8,48(%rdi) + movq %r8,56(%rdi) + movq %r8,64(%rdi) + leaq 64(%rsi),%rsi + + imulq %rdx,%r9 + imulq %rcx,%r13 + addq %r13,%r9 + movq %r9,72(%rdi) + sarq $63,%r9 + movq %r9,80(%rdi) + movq %r9,88(%rdi) + movq %r9,96(%rdi) + movq %r9,104(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi 
+ call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + sarq $63,%rbp + movq %rbp,40(%rdi) + movq %rbp,48(%rdi) + movq %rbp,56(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq 
%r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + + xorq $256+64,%rsi + movl $47,%edx + + movq 0(%rsi),%r8 + + movq 32(%rsi),%r10 + + call __inner_loop_62_256 + + + + + + + + leaq 64(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_512x63 + adcq %rbp,%rdx + + movq 40(%rsp),%rsi + movq %rdx,%rax + sarq $63,%rdx + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + adcq $0,%rax + + movq %rax,%rdx + negq %rax + orq %rax,%rdx + sarq $63,%rax + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + xorq %rax,%r8 + xorq %rcx,%rcx + xorq %rax,%r9 + subq %rax,%rcx + xorq %rax,%r10 + xorq %rax,%rdx + addq %rcx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq 
$0,%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 1072(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1072-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_inverse_mod_256,.-ct_inverse_mod_256 +.type __smulq_512x63,@function +.align 32 +__smulq_512x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %r9,8(%rdi) + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %r10,16(%rdi) + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %r11,24(%rdi) + + movq 40(%rsi),%r8 + movq 48(%rsi),%r9 + movq 56(%rsi),%r10 + movq 64(%rsi),%r11 + movq 72(%rsi),%r12 + movq 80(%rsi),%r13 + movq 88(%rsi),%r14 + movq 96(%rsi),%r15 + + movq %rcx,%rdx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rcx + addq %rax,%rcx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rcx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rcx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rcx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rcx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rcx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rcx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rcx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + imulq %rcx + addq %rax,%r15 + adcq $0,%rdx + + movq %rbp,%rbx + sarq $63,%rbp + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %rbp,%r14 + adcq %rbp,%r15 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_512x63,.-__smulq_512x63 + +.type __smulq_256x63,@function +.align 32 +__smulq_256x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq 
%rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %rcx,%rdx + movq 40+0(%rsi),%r12 + movq 40+8(%rsi),%r13 + movq 40+16(%rsi),%r14 + movq 40+24(%rsi),%r15 + movq 40+32(%rsi),%rcx + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rcx + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rcx + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + andq %rbx,%rcx + negq %rcx + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rbp,32(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_256x63,.-__smulq_256x63 +.type __smulq_256_n_shift_by_31,@function +.align 32 +__smulq_256_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,0(%rdi) + movq %rcx,8(%rdi) + movq %rdx,%rbp + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + + movq %rbp,%rbx + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rbx + addq %rax,%rbx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + andq %rbx,%rbp + negq %rbp + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r14 + movq 32+24(%rsi),%r15 + + movq %rcx,%rbx + sarq $63,%rcx + xorq %rax,%rax + subq %rcx,%rax + + xorq %rcx,%rbx + addq %rax,%rbx + + xorq %rcx,%r12 + xorq %rcx,%r13 + xorq %rcx,%r14 + xorq %rcx,%r15 + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + andq %rbx,%rcx + negq %rcx + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq 0(%rdi),%rdx + movq 8(%rdi),%rcx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%rbp,%r11 + + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + xorq %rbp,%rdx + xorq %rbp,%rcx + addq %rax,%rdx + addq %rax,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31 +.type __ab_approximation_31_256,@function +.align 32 +__ab_approximation_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 24(%rsi),%r9 + movq 56(%rsi),%r11 + movq 16(%rsi),%rbx + movq 48(%rsi),%rbp + movq 8(%rsi),%r8 + movq 40(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 
32(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + notq %rax + andq %rax,%r9 + andq %rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31_256 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_31_256,.-__ab_approximation_31_256 +.type __inner_loop_31_256,@function +.align 32 +__inner_loop_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31_256: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edx + jnz .Loop_31_256 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256,@function +.align 32 +__inner_loop_62_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl %edx,%r15d + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq %rdx,%r13 + movq %rdx,%r14 + +.Loop_62_256: + xorq %rax,%rax + testq %r14,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq %r14,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%r15d + jnz .Loop_62_256 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_62_256,.-__inner_loop_62_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/temp-deps/blst-ts/deps/blst/build/elf/ct_inverse_mod_384-armv8.S b/temp-deps/blst-ts/deps/blst/build/elf/ct_inverse_mod_384-armv8.S new file mode 100644 index 0000000..d7eca17 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/ct_inverse_mod_384-armv8.S @@ -0,0 +1,717 @@ +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383, %function +.align 5 +ct_inverse_mod_383: + .inst 0xd503233f + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl .Lab_approximation_62_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 
// corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src 
|a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + .inst 0xd50323bf + ret +.size ct_inverse_mod_383,.-ct_inverse_mod_383 + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
+.type __smul_383x63, %function +.align 5 +__smul_383x63: + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret +.size __smul_383x63,.-__smul_383x63 + +.type __smul_767x63_tail, %function +.align 5 +__smul_767x63_tail: + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret +.size __smul_767x63_tail,.-__smul_767x63_tail + +.type __smul_383_n_shift_by_62, %function +.align 5 +__smul_383_n_shift_by_62: + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + 
eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + adcs x6, x6, xzr + umulh x23, x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret +.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62 +.type __ab_approximation_62, %function +.align 4 +__ab_approximation_62: + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +.Lab_approximation_62_loaded: + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x22, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x22, x8, x14 // ... and ones before that ... 
+ csel x13, x13, x11, ne + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x22, x8, x14 + csel x13, x13, x10, ne + + clz x22, x22 + cmp x22, #64 + csel x22, x22, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret +.size __ab_approximation_62,.-__ab_approximation_62 +.type __inner_loop_62, %function +.align 4 +__inner_loop_62: + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +.Loop_62: + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + csel x9, x9, x3, hs // |b_| = |a_| + csel x14, x14, x8, hs + csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x8, x27, x25, hs + csel x15, x15, x17, hs // exchange |f0| and |f1| + csel x17, x17, x22, hs + csel x16, x16, x19, hs // exchange |g0| and |g1| + csel x19, x19, x23, hs + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, .Loop_62 + + ret +.size __inner_loop_62,.-__inner_loop_62 diff --git a/temp-deps/blst-ts/deps/blst/build/elf/ct_is_square_mod_384-armv8.S b/temp-deps/blst-ts/deps/blst/build/elf/ct_is_square_mod_384-armv8.S new file mode 100644 index 0000000..3f1390e --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/ct_is_square_mod_384-armv8.S @@ -0,0 +1,324 @@ +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384, %function +.align 5 +ct_is_square_mod_384: + .inst 0xd503233f + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the .Legendre symbol + mov x15, #24 // 24 is 768/30-1 + b .Loop_is_square + +.align 4 +.Loop_is_square: + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, .Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // and loaded + //ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + .inst 0xd50323bf + ret +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smul_384_n_shift_by_30, %function +.align 5 +__smul_384_n_shift_by_30: + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr x7, x8, x7, 
#30 + extr x8, x9, x8, #30 + + eor x3, x3, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret +.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30 +.type __ab_approximation_30, %function +.align 4 +__ab_approximation_30: + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x21, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x21, x8, x14 // ... and ones before that ... + csel x13, x13, x11, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x21, x8, x14 // and one more, ... + csel x13, x13, x10, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x3, ne + orr x21, x8, x14 + csel x13, x13, x9, ne + + clz x21, x21 + cmp x21, #64 + csel x21, x21, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret +.size __ab_approximation_30,.-__ab_approximation_30 + +.type __inner_loop_30, %function +.align 4 +__inner_loop_30: + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +.Loop_30: + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + csel x14, x14, x8, hs // |b_| = |a_| + csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x20, x20, x17, hs // exchange |fg0| and |fg1| + csel x17, x17, x21, hs + csel x2, x2, x25, hs + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, .Loop_30 + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret +.size __inner_loop_30,.-__inner_loop_30 +.type __inner_loop_48, %function +.align 4 +__inner_loop_48: +.Loop_48: + sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x3, x9 + sub x15, x15, #1 + and x21, x9, x24 + sub x22, x9, x3 // |b_|-|a_| + subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + csel x9, x9, x3, hs // |b_| = |a_| + csel x3, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x2, x2, x25, hs + add x23, x9, #2 + lsr x3, x3, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, .Loop_48 + + ret +.size 
__inner_loop_48,.-__inner_loop_48 diff --git a/temp-deps/blst-ts/deps/blst/build/elf/ct_is_square_mod_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/elf/ct_is_square_mod_384-x86_64.s new file mode 100644 index 0000000..fec1493 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/ct_is_square_mod_384-x86_64.s @@ -0,0 +1,479 @@ +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384,@function +.align 32 +ct_is_square_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $536,%rsp +.cfi_adjust_cfa_offset 536 + + + leaq 24+255(%rsp),%rax + andq $-256,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbx + movq 24(%rsi),%rcx + movq 32(%rsi),%rdx + movq 40(%rsi),%rdi + movq %rax,%rsi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rcx,72(%rax) + movq %rdx,80(%rax) + movq %rdi,88(%rax) + + xorq %rbp,%rbp + movl $24,%ecx + jmp .Loop_is_square + +.align 32 +.Loop_is_square: + movl %ecx,16(%rsp) + + call __ab_approximation_30 + movq %rax,0(%rsp) + movq %rbx,8(%rsp) + + movq $128+48,%rdi + xorq %rsi,%rdi + call __smulq_384_n_shift_by_30 + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq -48(%rdi),%rdi + call __smulq_384_n_shift_by_30 + + movl 16(%rsp),%ecx + xorq $128,%rsi + + andq 48(%rdi),%r14 + shrq $1,%r14 + addq %r14,%rbp + + subl $1,%ecx + jnz .Loop_is_square + + + + + movq 48(%rsi),%r9 + call __inner_loop_48 + + movq $1,%rax + andq %rbp,%rax + xorq $1,%rax + + leaq 536(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -536-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smulq_384_n_shift_by_30,@function +.align 32 +__smulq_384_n_shift_by_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r14 + andq %rbx,%r14 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r14 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r14 + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq 
%r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r15 + andq %rbx,%r15 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r15 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r15 + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %r15,%r14 + + shrdq $30,%r9,%r8 + shrdq $30,%r10,%r9 + shrdq $30,%r11,%r10 + shrdq $30,%r12,%r11 + shrdq $30,%r13,%r12 + shrdq $30,%r14,%r13 + + sarq $63,%r14 + xorq %rbx,%rbx + subq %r14,%rbx + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30 +.type __ab_approximation_30,@function +.align 32 +__ab_approximation_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 88(%rsi),%rbx + movq 80(%rsi),%r15 + movq 72(%rsi),%r14 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r11,%r12 + movq 64(%rsi),%r11 + cmovzq %r14,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r10,%r12 + movq 56(%rsi),%r10 + cmovzq %r11,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r9,%r12 + movq 48(%rsi),%r9 + cmovzq %r10,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r8,%r12 + cmovzq %r9,%r15 + + movq %r13,%rax + orq %rbx,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r13 + cmovzq %r9,%rbx + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%r12,%r13 + shldq %cl,%r15,%rbx + + movq $0xFFFFFFFF00000000,%rax + movl %r8d,%r8d + movl %r9d,%r9d + andq %rax,%r13 + andq %rax,%rbx + orq %r13,%r8 + orq %rbx,%r9 + + jmp __inner_loop_30 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_30,.-__ab_approximation_30 +.type __inner_loop_30,@function +.align 32 +__inner_loop_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rbx + movq $0x800000007FFFFFFF,%rcx + leaq -1(%rbx),%r15 + movl $30,%edi + +.Loop_30: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbx,%r12 + movq %rcx,%r13 + movq %rbp,%r14 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rcx,%rbx + cmovbq %r12,%rcx + cmovbq %rax,%rbp + + subq %r9,%r8 + subq %rcx,%rbx + addq %r15,%rbx + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbx + cmovzq %r13,%rcx + cmovzq %r14,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rcx,%rcx + 
leaq (%rax,%rbp,1),%rbp + subq %r15,%rcx + + subl $1,%edi + jnz .Loop_30 + + shrq $32,%r15 + movl %ebx,%eax + shrq $32,%rbx + movl %ecx,%edx + shrq $32,%rcx + subq %r15,%rax + subq %r15,%rbx + subq %r15,%rdx + subq %r15,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_30,.-__inner_loop_30 + +.type __inner_loop_48,@function +.align 32 +__inner_loop_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl $48,%edi + +.Loop_48: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbp,%r12 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rax,%rbp + + subq %r9,%r8 + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rax,%rbp + + subl $1,%edi + jnz .Loop_48 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_48,.-__inner_loop_48 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/temp-deps/blst-ts/deps/blst/build/elf/ctq_inverse_mod_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/elf/ctq_inverse_mod_384-x86_64.s new file mode 100644 index 0000000..b702262 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/ctq_inverse_mod_384-x86_64.s @@ -0,0 +1,1195 @@ +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383,@function +.align 32 +ct_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + 
movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 
64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + xorq $256+96,%rsi + movl $62,%edi + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 48(%rsi),%r10 + movq 56(%rsi),%r11 + call __inner_loop_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + movq %r8,0(%rdi) + movq %r10,48(%rdi) + + + + leaq 96(%rsi),%rsi + leaq 96(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + + xorq $256+96,%rsi + movl $22,%edi + + movq 0(%rsi),%r8 + xorq %r9,%r9 + movq 48(%rsi),%r10 + xorq %r11,%r11 + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_inverse_mod_383,.-ct_inverse_mod_383 +.type __smulq_767x63,@function +.align 32 +__smulq_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 
16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + movq %r9,8(%rdi) + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + movq %r10,16(%rdi) + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %r11,24(%rdi) + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + movq %r12,32(%rdi) + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + movq %r13,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + movq %rdx,%rsi + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rsi + addq %rax,%rsi + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rbx + xorq %rdx,%rbp + xorq %rdx,%rcx + xorq %rdx,%rdi + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulq %rsi + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rsi + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rsi + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rsi + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rsi + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rsi + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %rdx,%rbx + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + mulq %rsi + addq %rax,%rbp + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rcx + mulq %rsi + addq %rax,%rcx + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%rdi + movq 8(%rsp),%rdx + imulq %rsi,%rax + movq 16(%rsp),%rsi + addq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_767x63,.-__smulq_767x63 +.type __smulq_383x63,@function +.align 32 +__smulq_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + 
xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_383x63,.-__smulq_383x63 +.type __smulq_383_n_shift_by_62,@function +.align 32 +__smulq_383_n_shift_by_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq 48(%rsi),%rsi + movq %rdx,%r14 + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq 
$0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $62,%r9,%r8 + shrdq $62,%r10,%r9 + shrdq $62,%r11,%r10 + shrdq $62,%r12,%r11 + shrdq $62,%r13,%r12 + shrdq $62,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62 +.type __ab_approximation_62,@function +.align 32 +__ab_approximation_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 16(%rsi),%r8 + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 8(%rsi),%r8 + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 0(%rsi),%r8 + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + jmp __inner_loop_62 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_62,.-__ab_approximation_62 +.type __inner_loop_62,@function +.align 8 +.long 0 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + movq %rsi,8(%rsp) + +.Loop_62: + xorq %rax,%rax + xorq %rbx,%rbx + testq $1,%r8 + movq %r10,%rbp + movq %r11,%r14 + cmovnzq %r10,%rax + cmovnzq %r11,%rbx + subq %r8,%rbp + sbbq %r9,%r14 + movq %r8,%r15 + movq %r9,%rsi + subq %rax,%r8 + sbbq %rbx,%r9 + cmovcq %rbp,%r8 + cmovcq %r14,%r9 + cmovcq %r15,%r10 + cmovcq %rsi,%r11 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrdq $1,%r9,%r8 + shrq $1,%r9 + testq $1,%r15 + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + movq 8(%rsp),%rsi + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_62,.-__inner_loop_62 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/temp-deps/blst-ts/deps/blst/build/elf/ctx_inverse_mod_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/elf/ctx_inverse_mod_384-x86_64.s new file mode 100644 
index 0000000..25a5fa5 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/ctx_inverse_mod_384-x86_64.s @@ -0,0 +1,1574 @@ +.text + +.globl ctx_inverse_mod_383 +.type ctx_inverse_mod_383,@function +.align 32 +ctx_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 
48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) 
+ movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call 
__smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + 
movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + + xorq $256+96,%rsi + movl $53,%edi + + movq 0(%rsi),%r8 + + movq 48(%rsi),%r10 + + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulx_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ctx_inverse_mod_383,.-ctx_inverse_mod_383 +.type __smulx_767x63,@function +.align 32 +__smulx_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + movq %rcx,%rax + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + sarq $63,%rax + xorq %rsi,%rsi + subq %rax,%rsi + + xorq %rax,%rdx + addq %rsi,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %rax,%r13 + xorq %rax,%r14 + xorq %rax,%r15 + xorq %rax,%rbx + xorq %rax,%rbp + xorq %rax,%rcx + xorq %rax,%rdi + addq %rsi,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulxq %r8,%r8,%rax + mulxq %r9,%r9,%rsi + addq %rax,%r9 + mulxq %r10,%r10,%rax + adcq %rsi,%r10 + mulxq %r11,%r11,%rsi + adcq %rax,%r11 + mulxq %r12,%r12,%rax + adcq %rsi,%r12 + mulxq %r13,%r13,%rsi + adcq %rax,%r13 + mulxq %r14,%r14,%rax + adcq %rsi,%r14 + mulxq %r15,%r15,%rsi + adcq %rax,%r15 + mulxq %rbx,%rbx,%rax + adcq %rsi,%rbx + mulxq %rbp,%rbp,%rsi + adcq %rax,%rbp + mulxq %rcx,%rcx,%rax + adcq %rsi,%rcx + mulxq %rdi,%rdi,%rsi + movq 8(%rsp),%rdx + movq 
16(%rsp),%rsi + adcq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_767x63,.-__smulx_767x63 +.type __smulx_383x63,@function +.align 32 +__smulx_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + movq %rcx,%rdx + adcq %rbp,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + adcq %rbp,%r13 + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_383x63,.-__smulx_383x63 +.type __smulx_383_n_shift_by_31,@function +.align 32 +__smulx_383_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + xorq %r14,%r14 + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq %rdx,%r14 + + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 
48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%rax + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%r12,%r11 + shrdq $31,%rax,%r12 + shrdq $31,%r14,%rax + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31 +.type __smulx_191_n_shift_by_31,@function +.align 32 +__smulx_191_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %r10,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r10 + addq %rbp,%r9 + adcq $0,%r10 + imulq %rdx + addq %rax,%r10 + adcq $0,%rdx + movq %rdx,%r14 + movq %rcx,%rdx + movq 48+0(%rsi),%r11 + movq 48+8(%rsi),%r12 + movq 48+16(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r11,%r11,%rbp + mulxq %r12,%r12,%r13 + addq %rbp,%r12 + adcq $0,%r13 + imulq %rdx + addq %rax,%r13 + adcq $0,%rdx + addq %r8,%r11 + adcq %r9,%r12 + adcq %r10,%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r12,%r11 + shrdq $31,%r13,%r12 + shrdq $31,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31 +.type __ab_approximation_31,@function +.align 32 +__ab_approximation_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 16(%rsi),%r8 + cmovzq %r10,%rbp + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 8(%rsi),%r8 + cmovzq %r10,%rbp + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + 
movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + andnq %r9,%rax,%r9 + andnq %r11,%rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_31,.-__ab_approximation_31 +.type __inner_loop_31,@function +.align 32 +__inner_loop_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edi + jnz .Loop_31 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_31,.-__inner_loop_31 + +.type __inner_loop_62,@function +.align 32 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + +.Loop_62: + xorq %rax,%rax + testq $1,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq $1,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_62,.-__inner_loop_62 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/temp-deps/blst-ts/deps/blst/build/elf/div3w-armv8.S b/temp-deps/blst-ts/deps/blst/build/elf/div3w-armv8.S new file mode 100644 index 0000000..a2b1d67 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/div3w-armv8.S @@ -0,0 +1,88 @@ +.text + +.globl div_3_limbs +.type div_3_limbs,%function +.align 5 +div_3_limbs: + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +.Loop: + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csel x4,x4,x6,lo // select between R and R - D + extr x1,x2,x1,#1 // D >>= 1 + csel x5,x5,x7,lo + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,.Loop + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + specilative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret +.size div_3_limbs,.-div_3_limbs +.globl quot_rem_128 +.type quot_rem_128,%function +.align 5 +quot_rem_128: + ldp x3,x4,[x1] + + mul x5,x3,x2 // divisor[0:1} * quotient + umulh x6,x3,x2 + mul x11, x4,x2 + umulh x7,x4,x2 + + ldp x8,x9,[x0] // load 3 limbs of the dividend + ldr 
x10,[x0,#16] + + adds x6,x6,x11 + adc x7,x7,xzr + + subs x8,x8,x5 // dividend - divisor * quotient + sbcs x9,x9,x6 + sbcs x10,x10,x7 + sbc x5,xzr,xzr // borrow -> mask + + add x2,x2,x5 // if borrowed, adjust the quotient ... + and x3,x3,x5 + and x4,x4,x5 + adds x8,x8,x3 // ... and add divisor + adc x9,x9,x4 + + stp x8,x9,[x0] // save 2 limbs of the remainder + str x2,[x0,#16] // and one limb of the quotient + + mov x0,x2 // return adjusted quotient + + ret +.size quot_rem_128,.-quot_rem_128 + +.globl quot_rem_64 +.type quot_rem_64,%function +.align 5 +quot_rem_64: + ldr x3,[x1] + ldr x8,[x0] // load 1 limb of the dividend + + mul x5,x3,x2 // divisor * quotient + + sub x8,x8,x5 // dividend - divisor * quotient + + stp x8,x2,[x0] // save remainder and quotient + + mov x0,x2 // return quotient + + ret +.size quot_rem_64,.-quot_rem_64 diff --git a/temp-deps/blst-ts/deps/blst/build/elf/div3w-x86_64.s b/temp-deps/blst-ts/deps/blst/build/elf/div3w-x86_64.s new file mode 100644 index 0000000..00ae569 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/div3w-x86_64.s @@ -0,0 +1,123 @@ +.text + +.globl div_3_limbs +.hidden div_3_limbs +.type div_3_limbs,@function +.align 32 +div_3_limbs: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq (%rdi),%r8 + movq 8(%rdi),%r9 + xorq %rax,%rax + movl $64,%ecx + +.Loop: + movq %r8,%r10 + subq %rsi,%r8 + movq %r9,%r11 + sbbq %rdx,%r9 + leaq 1(%rax,%rax,1),%rax + movq %rdx,%rdi + cmovcq %r10,%r8 + cmovcq %r11,%r9 + sbbq $0,%rax + shlq $63,%rdi + shrq $1,%rsi + shrq $1,%rdx + orq %rdi,%rsi + subl $1,%ecx + jnz .Loop + + leaq 1(%rax,%rax,1),%rcx + sarq $63,%rax + + subq %rsi,%r8 + sbbq %rdx,%r9 + sbbq $0,%rcx + + orq %rcx,%rax + + .byte 0xf3,0xc3 +.cfi_endproc +.size div_3_limbs,.-div_3_limbs +.globl quot_rem_128 +.hidden quot_rem_128 +.type quot_rem_128,@function +.align 32 +quot_rem_128: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax + movq %rdx,%rcx + + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + adcq $0,%rdx + + movq 0(%rdi),%r10 + movq 8(%rdi),%r11 + movq 16(%rdi),%rax + + subq %r8,%r10 + sbbq %r9,%r11 + sbbq %rdx,%rax + sbbq %r8,%r8 + + addq %r8,%rcx + movq %r8,%r9 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + addq %r8,%r10 + adcq %r9,%r11 + + movq %r10,0(%rdi) + movq %r11,8(%rdi) + movq %rcx,16(%rdi) + + movq %rcx,%rax + + .byte 0xf3,0xc3 +.cfi_endproc +.size quot_rem_128,.-quot_rem_128 + + + + + +.globl quot_rem_64 +.hidden quot_rem_64 +.type quot_rem_64,@function +.align 32 +quot_rem_64: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax + imulq 0(%rsi),%rdx + + movq 0(%rdi),%r10 + + subq %rdx,%r10 + + movq %r10,0(%rdi) + movq %rax,8(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size quot_rem_64,.-quot_rem_64 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/temp-deps/blst-ts/deps/blst/build/elf/mul_mont_256-armv8.S b/temp-deps/blst-ts/deps/blst/build/elf/mul_mont_256-armv8.S new file mode 100644 index 0000000..8bb1197 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/mul_mont_256-armv8.S @@ -0,0 +1,464 @@ +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,%function +.align 5 +mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csel x19,x19,x14,lo + csel x20,x20,x15,lo + csel x21,x21,x16,lo + csel x22,x22,x17,lo + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret +.size mul_mont_sparse_256,.-mul_mont_sparse_256 +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,%function +.align 5 +sqr_mont_sparse_256: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,%function +.align 5 +from_mont_256: + .inst 0xd503233f + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + .inst 0xd50323bf + ret +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,%function +.align 5 +redc_mont_256: + .inst 0xd503233f + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + .inst 0xd50323bf + ret +.size redc_mont_256,.-redc_mont_256 + +.type __mul_by_1_mont_256,%function +.align 5 +__mul_by_1_mont_256: + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret +.size __mul_by_1_mont_256,.-__mul_by_1_mont_256 diff --git a/temp-deps/blst-ts/deps/blst/build/elf/mul_mont_384-armv8.S b/temp-deps/blst-ts/deps/blst/build/elf/mul_mont_384-armv8.S new file mode 100644 index 0000000..c048e81 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/mul_mont_384-armv8.S @@ -0,0 +1,2372 @@ +.text + +.globl add_mod_384x384 +.type add_mod_384x384,%function +.align 5 +add_mod_384x384: + .inst 0xd503233f + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + .inst 0xd50323bf + ret +.size add_mod_384x384,.-add_mod_384x384 + +.type __add_mod_384x384,%function +.align 5 +__add_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + stp x11,x12,[x0,#48] + csel x15,x15,x23,lo + stp x13,x14,[x0,#64] + csel x16,x16,x24,lo + stp x15,x16,[x0,#80] + + ret +.size __add_mod_384x384,.-__add_mod_384x384 + +.globl sub_mod_384x384 +.type sub_mod_384x384,%function +.align 5 +sub_mod_384x384: + .inst 0xd503233f + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + .inst 0xd50323bf + ret +.size sub_mod_384x384,.-sub_mod_384x384 + +.type __sub_mod_384x384,%function +.align 5 +__sub_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + stp x11,x12,[x0] + csel x16,x16,x24,lo + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + 
ret +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,%function +.align 5 +mul_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_mont_384x,.-mul_mont_384x + +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,%function +.align 5 +sqr_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x19,x11,x19,lo + csel x20,x12,x20,lo + csel x21,x13,x21,lo + ldp x11,x12,[sp] + csel x22,x14,x22,lo + ldr x17, [sp,#48] + csel x23,x15,x23,lo + ldp x13,x14,[sp,#16] + csel x24,x16,x24,lo + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,%function +.align 5 +mul_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_mont_384,.-mul_mont_384 + +.type __mul_mont_384,%function +.align 5 +__mul_mont_384: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh 
x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc 
x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csel x11,x19,x26,lo + csel x12,x20,x27,lo + csel x13,x21,x28,lo + csel x14,x22,x0,lo + csel x15,x23,x1,lo + csel x16,x24,x3,lo + ret +.size __mul_mont_384,.-__mul_mont_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,%function +.align 5 +sqr_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_384,.-sqr_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,%function +.align 5 +sqr_n_mul_mont_383: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +.Loop_sqr_383: + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,.Loop_sqr_383 + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +.type __sqr_384,%function +.align 5 +__sqr_384: + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret +.size __sqr_384,.-__sqr_384 +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,%function +.align 5 +sqr_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_384,.-sqr_384 + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,%function +.align 5 +redc_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size redc_mont_384,.-redc_mont_384 + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,%function +.align 5 +from_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size from_mont_384,.-from_mont_384 + +.type __mul_by_1_mont_384,%function +.align 5 +__mul_by_1_mont_384: + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc 
x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret +.size __mul_by_1_mont_384,.-__mul_by_1_mont_384 + +.type __redc_tail_mont_384,%function +.align 5 +__redc_tail_mont_384: + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl mul_384 +.hidden mul_384 +.type mul_384,%function +.align 5 +mul_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_384,.-mul_384 + +.type __mul_384,%function +.align 5 +__mul_384: + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret +.size __mul_384,.-__mul_384 + +.globl mul_382x +.hidden mul_382x +.type mul_382x,%function +.align 5 +mul_382x: + .inst 0xd503233f + stp 
x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_382x,.-mul_382x + +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,%function +.align 5 +sqr_382x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_382x,.-sqr_382x + +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,%function +.align 5 +sqr_mont_382x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_382x,.-sqr_mont_382x + +.type __mul_mont_383_nonred,%function +.align 5 +__mul_mont_383_nonred: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 
+ umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul 
x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret +.size __mul_mont_383_nonred,.-__mul_mont_383_nonred + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,%function +.align 5 +sgn0_pty_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,%function +.align 5 +sgn0_pty_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x diff --git a/temp-deps/blst-ts/deps/blst/build/elf/mulq_mont_256-x86_64.s b/temp-deps/blst-ts/deps/blst/build/elf/mulq_mont_256-x86_64.s new file mode 100644 index 0000000..37abd43 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/mulq_mont_256-x86_64.s @@ -0,0 +1,714 @@ +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,@function +.align 32 +mul_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r13 + movq 8(%rsi),%r14 + movq 16(%rsi),%r12 + movq 24(%rsi),%rbp + movq %rdx,%rbx + + movq %rax,%r15 + mulq %r13 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_sparse_256,.-mul_mont_sparse_256 + +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,@function +.align 32 +sqr_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rax + movq %rcx,%r8 + movq 8(%rsi),%r14 + movq %rdx,%rcx + movq 16(%rsi),%r12 + leaq (%rsi),%rbx + movq 24(%rsi),%rbp + + movq %rax,%r15 + mulq %rax + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +.type __mulq_mont_sparse_256,@function +.align 32 +__mulq_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulq %r14 + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq 8(%rbx),%rax + adcq $0,%rdx + xorq %r14,%r14 + movq %rdx,%r13 + + movq %r9,%rdi + imulq %r8,%r9 + + + movq %rax,%r15 + mulq 0(%rsi) + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r11 + movq 
%r15,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + xorq %r15,%r15 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r9,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rdi,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + addq %rdx,%r13 + adcq $0,%r14 + adcq $0,%r15 + movq %r10,%rdi + imulq %r8,%r10 + + + movq %rax,%r9 + mulq 0(%rsi) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + xorq %r9,%r9 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r10,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rdi,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r13 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + addq %rdx,%r14 + adcq $0,%r15 + adcq $0,%r9 + movq %r11,%rdi + imulq %r8,%r11 + + + movq %rax,%r10 + mulq 0(%rsi) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r9 + xorq %r10,%r10 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r11,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rdi,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + addq %rdx,%r15 + adcq $0,%r9 + adcq $0,%r10 + imulq %r8,%rax + movq 8(%rsp),%rsi + + + movq %rax,%r11 + mulq 0(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r12,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + movq %r14,%rbx + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rdx,%r9 + adcq $0,%r10 + + + + + movq %r15,%r12 + subq 0(%rcx),%r13 + sbbq 8(%rcx),%r14 + sbbq 16(%rcx),%r15 + movq %r9,%rbp + sbbq 24(%rcx),%r9 + sbbq $0,%r10 + + cmovcq %rax,%r13 + cmovcq %rbx,%r14 + cmovcq %r12,%r15 + movq %r13,0(%rsi) + cmovcq %rbp,%r9 + movq %r14,8(%rsi) + movq %r15,16(%rsi) + movq %r9,24(%rsi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256 +.globl from_mont_256 +.hidden from_mont_256 +.type 
from_mont_256,@function +.align 32 +from_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + + + + + movq %r14,%r10 + movq %r15,%r11 + movq %r9,%r12 + + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + sbbq 24(%rbx),%r9 + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,@function +.align 32 +redc_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + addq 32(%rsi),%r13 + adcq 40(%rsi),%r14 + movq %r13,%rax + adcq 48(%rsi),%r15 + movq %r14,%r10 + adcq 56(%rsi),%r9 + sbbq %rsi,%rsi + + + + + movq %r15,%r11 + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + movq %r9,%r12 + sbbq 24(%rbx),%r9 + sbbq $0,%rsi + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redc_mont_256,.-redc_mont_256 +.type __mulq_by_1_mont_256,@function +.align 32 +__mulq_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + movq %rax,%r13 + imulq %rcx,%rax + movq %rax,%r9 + + mulq 0(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq %rdx,%r13 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r10 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 16(%rbx) + movq %r10,%r14 + imulq %rcx,%r10 + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r11 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r13,%r12 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq 
%rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r9 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/temp-deps/blst-ts/deps/blst/build/elf/mulq_mont_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/elf/mulq_mont_384-x86_64.s new file mode 100644 index 0000000..fa9dd35 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/mulq_mont_384-x86_64.s @@ -0,0 +1,3620 @@ +.text + + + + + + + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + 
cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384,.-__sub_mod_384 +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,@function +.align 32 +mul_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulq_384 + + + leaq 48(%rbx),%rbx + leaq 48(%rsi),%rsi + leaq 40+96(%rsp),%rdi + call __mulq_384 + + + movq 8(%rsp),%rcx + leaq -48(%rsi),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulq_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + movq %rcx,%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_384x,.-mul_mont_384x +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,@function +.align 32 +sqr_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + call __mulq_mont_384 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + movq %r14,%r12 + adcq %r9,%r9 + movq %r15,%r13 + adcq %r10,%r10 + movq %r8,%rax + adcq %r11,%r11 + movq %r9,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r10,%rbp + sbbq 16(%rcx),%r8 + sbbq 24(%rcx),%r9 + sbbq 32(%rcx),%r10 + movq %r11,%rsi + sbbq 40(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r14 + cmovcq %r13,%r15 + cmovcq %rax,%r8 + movq %r14,48(%rdi) + cmovcq %rbx,%r9 + movq %r15,56(%rdi) + cmovcq %rbp,%r10 + movq %r8,64(%rdi) + cmovcq %rsi,%r11 + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_382x +.hidden mul_382x +.type mul_382x,@function +.align 32 +mul_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulq_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulq_384 + + + leaq 48(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulq_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq 
-96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_382x,.-mul_382x +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,@function +.align 32 +sqr_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulq_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulq_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_382x,.-sqr_382x +.globl mul_384 +.hidden mul_384 +.type mul_384,@function +.align 32 +mul_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rbx + call __mulq_384 + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_384,.-mul_384 + +.type __mulq_384,@function +.align 32 +__mulq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rax + + movq %rax,%rbp 
+ mulq 0(%rsi) + movq %rax,0(%rdi) + movq %rbp,%rax + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r11 + movq 8(%rbx),%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,8(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,16(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,24(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,32(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,40(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq 
$0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq %rax,%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rcx,48(%rdi) + movq %r8,56(%rdi) + movq %r9,64(%rdi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_384,.-__mulq_384 +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,@function +.align 32 +sqr_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sqrq_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_384,.-sqr_384 + +.type __sqrq_384,@function +.align 32 +__sqrq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r15 + movq 16(%rsi),%rcx + movq 24(%rsi),%rbx + + + movq %rax,%r14 + mulq %r15 + movq %rax,%r9 + movq %r14,%rax + movq 32(%rsi),%rbp + movq %rdx,%r10 + + mulq %rcx + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + movq 40(%rsi),%rsi + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r11 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq %rax + xorq %r8,%r8 + movq %rax,0(%rdi) + movq %r15,%rax + addq %r9,%r9 + adcq $0,%r8 + addq %rdx,%r9 + adcq $0,%r8 + movq %r9,8(%rdi) + + mulq %rcx + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbp + addq %rax,%r13 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rax + xorq %r9,%r9 + addq %rax,%r8 + movq %rcx,%rax + addq %r10,%r10 + adcq %r11,%r11 + adcq $0,%r9 + addq %r8,%r10 + adcq %rdx,%r11 + adcq $0,%r9 + movq %r10,16(%rdi) + + mulq %rbx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%r8 + + mulq %rbp + addq %rax,%r14 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq %rsi + addq %rax,%r15 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + xorq %r11,%r11 + addq %rax,%r9 + movq %rbx,%rax + addq %r12,%r12 + adcq %r13,%r13 + adcq $0,%r11 + addq %r9,%r12 + adcq %rdx,%r13 + adcq $0,%r11 + movq %r12,32(%rdi) + + + mulq %rbp + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %r13,40(%rdi) + movq %rdx,%r8 + + mulq %rsi + addq %rax,%rcx + movq %rbx,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + 
movq %rdx,%rbx + + mulq %rax + xorq %r12,%r12 + addq %rax,%r11 + movq %rbp,%rax + addq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r12 + addq %r11,%r14 + adcq %rdx,%r15 + movq %r14,48(%rdi) + adcq $0,%r12 + movq %r15,56(%rdi) + + + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + xorq %r13,%r13 + addq %rax,%r12 + movq %rsi,%rax + addq %rcx,%rcx + adcq %rbx,%rbx + adcq $0,%r13 + addq %r12,%rcx + adcq %rdx,%rbx + movq %rcx,64(%rdi) + adcq $0,%r13 + movq %rbx,72(%rdi) + + + mulq %rax + addq %r13,%rax + addq %rbp,%rbp + adcq $0,%rdx + addq %rbp,%rax + adcq $0,%rdx + movq %rax,80(%rdi) + movq %rdx,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sqrq_384,.-__sqrq_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,@function +.align 32 +sqr_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $120,%rsp +.cfi_adjust_cfa_offset 8*15 + + + movq %rcx,96(%rsp) + movq %rdx,104(%rsp) + movq %rdi,112(%rsp) + + movq %rsp,%rdi + call __sqrq_384 + + leaq 0(%rsp),%rsi + movq 96(%rsp),%rcx + movq 104(%rsp),%rbx + movq 112(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 120(%rsp),%r8 + movq 120(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*21 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_384,.-sqr_mont_384 + + + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,@function +.align 32 +redc_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redc_mont_384,.-redc_mont_384 + + + + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,@function +.align 32 +from_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + + + + + + movq %r15,%rcx + movq 
%r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size from_mont_384,.-from_mont_384 +.type __mulq_by_1_mont_384,@function +.align 32 +__mulq_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r8 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r9 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r10 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %r9,%r15 + imulq %rcx,%r9 + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 32(%rbx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 40(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r9,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %r10,%r8 + imulq %rcx,%r10 + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r8 + movq %r10,%rax + adcq %rdx,%r8 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %r11,%r9 + imulq %rcx,%r11 + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r11,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %r12,%r10 + imulq %rcx,%r12 + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rbx) + 
addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %r13,%r11 + imulq %rcx,%r13 + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 + +.type __redc_tail_mont_384,@function +.align 32 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,@function +.align 32 +sgn0_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 
+.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,@function +.align 32 +sgn0_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,@function +.align 32 +mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq %rdx,%rbx + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + + call __mulq_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -72 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_384,.-mul_mont_384 +.type __mulq_mont_384,@function +.align 32 +__mulq_mont_384: +.cfi_startproc + 
.byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rdi + mulq %r14 + movq %rax,%r8 + movq %rdi,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%rbp + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + xorq %r15,%r15 + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r8,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r9,%rbp + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r14 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r14 + movq %r9,%rax + adcq %rdx,%r15 + adcq $0,%r8 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r9,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + adcq $0,%r8 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r10,%rbp + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r15 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r15 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r10,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq 
%rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r11,%rbp + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r11,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq %rdx,%r9 + adcq $0,%r10 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r12,%rbp + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r9 + adcq $0,%rdx + xorq %r11,%r11 + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r10 + adcq $0,%r11 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r12,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + movq %r13,%rbp + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq 
%rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r8 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rsi) + addq %r12,%r10 + adcq $0,%rdx + xorq %r12,%r12 + addq %rax,%r10 + movq %r13,%rax + adcq %rdx,%r11 + adcq $0,%r12 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r13,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq %rdx,%r11 + adcq $0,%r12 + + + + + movq 16(%rsp),%rdi + subq 0(%rcx),%r14 + movq %r15,%rdx + sbbq 8(%rcx),%r15 + movq %r8,%rbx + sbbq 16(%rcx),%r8 + movq %r9,%rsi + sbbq 24(%rcx),%r9 + movq %r10,%rbp + sbbq 32(%rcx),%r10 + movq %r11,%r13 + sbbq 40(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rdx,%r15 + cmovcq %rbx,%r8 + movq %r14,0(%rdi) + cmovcq %rsi,%r9 + movq %r15,8(%rdi) + cmovcq %rbp,%r10 + movq %r8,16(%rdi) + cmovcq %r13,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_384,.-__mulq_mont_384 +.globl sqr_n_mul_mont_384 +.hidden sqr_n_mul_mont_384 +.type sqr_n_mul_mont_384,@function +.align 32 +sqr_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_384: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1,%edx + leaq 0(%rdi),%rsi + decl %edx + jnz .Loop_sqr_384 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_n_mul_mont_384,.-sqr_n_mul_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,@function +.align 32 +sqr_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 
+ subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_383: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + + movd %xmm1,%edx + addq 48(%rsi),%r14 + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + leaq 0(%rdi),%rsi + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + decl %edx + jnz .Loop_sqr_383 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +.type __mulq_mont_383_nonred,@function +.align 32 +__mulq_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rbp + mulq %r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%r15 + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%r15 + movq %r8,%rax + adcq %rdx,%r15 + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r9 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rcx) + addq %r15,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %r15,%r13 + adcq %rdx,%r14 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + movq %r9,%r8 + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rsi) + addq %r15,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rcx) + addq %rax,%r8 + movq %r9,%rax + adcq %rdx,%r8 + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rcx) + addq %r8,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r8 + + 
mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r8,%r14 + adcq %rdx,%r15 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r10,%r9 + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rcx) + addq %rax,%r9 + movq %r10,%rax + adcq %rdx,%r9 + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rcx) + addq %r9,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r9,%r15 + adcq %rdx,%r8 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r11,%r10 + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rcx) + addq %rax,%r10 + movq %r11,%rax + adcq %rdx,%r10 + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rcx) + addq %r10,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r10,%r8 + adcq %rdx,%r9 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r12,%r11 + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r9 + adcq $0,%rdx + addq %rax,%r9 + 
movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rcx) + addq %rax,%r11 + movq %r12,%rax + adcq %rdx,%r11 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rcx) + addq %r11,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r11,%r9 + adcq %rdx,%r10 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r13,%r12 + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 0(%rcx) + addq %rax,%r12 + movq %r13,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 24(%rcx) + addq %r12,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r12,%r10 + adcq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_383_nonred,.-__mulq_mont_383_nonred +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,@function +.align 32 +sqr_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq 
%rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq 24(%rsp),%rdi + call __mulq_mont_383_nonred + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_383_nonred + movq 32+96(%rsp),%rsi + movq 32+0(%rsp),%r12 + movq 32+8(%rsp),%r13 + andq %rsi,%r12 + movq 32+16(%rsp),%rax + andq %rsi,%r13 + movq 32+24(%rsp),%rbx + andq %rsi,%rax + movq 32+32(%rsp),%rbp + andq %rsi,%rbx + andq %rsi,%rbp + andq 32+40(%rsp),%rsi + + subq %r12,%r14 + movq 0(%rcx),%r12 + sbbq %r13,%r15 + movq 8(%rcx),%r13 + sbbq %rax,%r8 + movq 16(%rcx),%rax + sbbq %rbx,%r9 + movq 24(%rcx),%rbx + sbbq %rbp,%r10 + movq 32(%rcx),%rbp + sbbq %rsi,%r11 + sbbq %rsi,%rsi + + andq %rsi,%r12 + andq %rsi,%r13 + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r12,%r14 + adcq %r13,%r15 + adcq %rax,%r8 + adcq %rbx,%r9 + adcq %rbp,%r10 + adcq %rsi,%r11 + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_382x,.-sqr_mont_382x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/temp-deps/blst-ts/deps/blst/build/elf/mulx_mont_256-x86_64.s b/temp-deps/blst-ts/deps/blst/build/elf/mulx_mont_256-x86_64.s new file mode 100644 index 0000000..20a0207 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/mulx_mont_256-x86_64.s @@ -0,0 +1,627 @@ +.text + +.globl mulx_mont_sparse_256 +.hidden mulx_mont_sparse_256 +.type mulx_mont_sparse_256,@function +.align 32 +mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_sparse_256,.-mulx_mont_sparse_256 + +.globl sqrx_mont_sparse_256 +.hidden sqrx_mont_sparse_256 +.type sqrx_mont_sparse_256,@function +.align 32 
+sqrx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + movq %rcx,%r8 + movq %rdx,%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_sparse_256,.-sqrx_mont_sparse_256 +.type __mulx_mont_sparse_256,@function +.align 32 +__mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulxq %r15,%r15,%r12 + mulxq %rbp,%rbp,%r13 + addq %r15,%r11 + mulxq %r9,%r9,%r14 + movq 8(%rbx),%rdx + adcq %rbp,%r12 + adcq %r9,%r13 + adcq $0,%r14 + + movq %rax,%r10 + imulq %r8,%rax + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r11 + adcxq %r9,%r12 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r14 + adcxq %r15,%r9 + adoxq %r9,%r15 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r10 + adoxq %r11,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r12 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r12 + adoxq %r9,%r13 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 16(%rbx),%rdx + adcxq %rbp,%r13 + adoxq %r9,%r14 + adcxq %r10,%r14 + adoxq %r10,%r15 + adcxq %r10,%r15 + adoxq %r10,%r10 + adcq $0,%r10 + movq %rax,%r11 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r15 + adcxq %r10,%r9 + adoxq %r9,%r10 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r11 + adoxq %r12,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r13 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r13 + adoxq %r9,%r14 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 24(%rbx),%rdx + adcxq %rbp,%r14 + adoxq %r9,%r15 + adcxq %r11,%r15 + adoxq %r11,%r10 + adcxq %r11,%r10 + adoxq %r11,%r11 + adcq $0,%r11 + movq %rax,%r12 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r15 + adcxq %r9,%r10 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r10 + adcxq %r11,%r9 + adoxq %r9,%r11 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r12 + adoxq %r13,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r14 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %rax,%rdx + adcxq %rbp,%r15 + adoxq %r9,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + adoxq %r12,%r12 + adcq $0,%r12 
+ imulq %r8,%rdx + + + xorq %rbp,%rbp + mulxq 0+128(%rcx),%r13,%r9 + adcxq %rax,%r13 + adoxq %r9,%r14 + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r15 + adoxq %r9,%r10 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %r14,%rdx + leaq 128(%rcx),%rcx + adcxq %rbp,%r10 + adoxq %r9,%r11 + movq %r15,%rax + adcxq %r13,%r11 + adoxq %r13,%r12 + adcq $0,%r12 + + + + + movq %r10,%rbp + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + sbbq 16(%rcx),%r10 + movq %r11,%r9 + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rdx,%r14 + cmovcq %rax,%r15 + cmovcq %rbp,%r10 + movq %r14,0(%rdi) + cmovcq %r9,%r11 + movq %r15,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_sparse_256,.-__mulx_mont_sparse_256 +.globl fromx_mont_256 +.hidden fromx_mont_256 +.type fromx_mont_256,@function +.align 32 +fromx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + + + + + movq %r15,%rdx + movq %r10,%r12 + movq %r11,%r13 + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size fromx_mont_256,.-fromx_mont_256 + +.globl redcx_mont_256 +.hidden redcx_mont_256 +.type redcx_mont_256,@function +.align 32 +redcx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + addq 32(%rsi),%r14 + adcq 40(%rsi),%r15 + movq %r14,%rax + adcq 48(%rsi),%r10 + movq %r15,%rdx + adcq 56(%rsi),%r11 + sbbq %rsi,%rsi + + + + + movq %r10,%r12 + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + movq %r11,%r13 + sbbq 24(%rbx),%r11 + sbbq $0,%rsi + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redcx_mont_256,.-redcx_mont_256 +.type __mulx_by_1_mont_256,@function +.align 32 
+__mulx_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r10 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r10 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + movq %r13,%r11 + imulq %rcx,%r13 + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_by_1_mont_256,.-__mulx_by_1_mont_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/temp-deps/blst-ts/deps/blst/build/elf/mulx_mont_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/elf/mulx_mont_384-x86_64.s new file mode 100644 index 0000000..9f9f740 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/mulx_mont_384-x86_64.s @@ -0,0 +1,2968 @@ +.text + + + + + + + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq 
%r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384,.-__sub_mod_384 +.globl mulx_mont_384x +.hidden mulx_mont_384x +.type mulx_mont_384x,@function +.align 32 +mulx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulx_384 + + + leaq 48(%rbx),%rbx + leaq 128+48(%rsi),%rsi + leaq 96(%rdi),%rdi + call __mulx_384 + + + movq 8(%rsp),%rcx + leaq (%rbx),%rsi + leaq -48(%rbx),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulx_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + leaq (%rcx),%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulx_by_1_mont_384 + call 
__redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_384x,.-mulx_mont_384x +.globl sqrx_mont_384x +.hidden sqrx_mont_384x +.type sqrx_mont_384x,@function +.align 32 +sqrx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + movq %rdx,%r8 + adcq %r12,%r12 + movq %r15,%r9 + adcq %rdi,%rdi + movq %rax,%r10 + adcq %rbp,%rbp + movq %r12,%r11 + sbbq %rsi,%rsi + + subq 0(%rcx),%rdx + sbbq 8(%rcx),%r15 + movq %rdi,%r13 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r12 + sbbq 32(%rcx),%rdi + movq %rbp,%r14 + sbbq 40(%rcx),%rbp + sbbq $0,%rsi + + cmovcq %r8,%rdx + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %rdx,48(%rbx) + cmovcq %r11,%r12 + movq %r15,56(%rbx) + cmovcq %r13,%rdi + movq %rax,64(%rbx) + cmovcq %r14,%rbp + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_384x,.-sqrx_mont_384x + +.globl mulx_382x +.hidden mulx_382x +.type mulx_382x,@function +.align 32 +mulx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq 
%rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulx_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulx_384 + + + leaq 48+128(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulx_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_382x,.-mulx_382x +.globl sqrx_382x +.hidden sqrx_382x +.type sqrx_382x,@function +.align 32 +sqrx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulx_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulx_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq 
%rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_382x,.-sqrx_382x +.globl mulx_384 +.hidden mulx_384 +.type mulx_384,@function +.align 32 +mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + movq %rdx,%rbx + call __mulx_384 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_384,.-mulx_384 + +.type __mulx_384,@function +.align 32 +__mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq -128(%rsi),%rsi + + mulxq %r14,%r9,%rcx + xorq %rbp,%rbp + + mulxq %r15,%r8,%rax + adcxq %rcx,%r8 + movq %r9,0(%rdi) + + mulxq %r10,%r9,%rcx + adcxq %rax,%r9 + + mulxq %r11,%r10,%rax + adcxq %rcx,%r10 + + mulxq %r12,%r11,%rcx + adcxq %rax,%r11 + + mulxq %r13,%r12,%r13 + movq 8(%rbx),%rdx + adcxq %rcx,%r12 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,8(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 16(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,16(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 24(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,24(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 32(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,32(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq 
%rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 40(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,40(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq %rax,%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + movq %r10,64(%rdi) + movq %r11,72(%rdi) + movq %r12,80(%rdi) + movq %r13,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_384,.-__mulx_384 +.globl sqrx_384 +.hidden sqrx_384 +.type sqrx_384,@function +.align 32 +sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + call __sqrx_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_384,.-sqrx_384 +.type __sqrx_384,@function +.align 32 +__sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%rcx + movq 32(%rsi),%rbx + + + mulxq %r14,%r8,%rdi + movq 40(%rsi),%rbp + mulxq %r15,%r9,%rax + addq %rdi,%r9 + mulxq %rcx,%r10,%rdi + adcq %rax,%r10 + mulxq %rbx,%r11,%rax + adcq %rdi,%r11 + mulxq %rbp,%r12,%r13 + movq %r14,%rdx + adcq %rax,%r12 + adcq $0,%r13 + + + xorq %r14,%r14 + mulxq %r15,%rdi,%rax + adcxq %rdi,%r10 + adoxq %rax,%r11 + + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r11 + adoxq %rax,%r12 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbp,%rdi,%rax + movq %r15,%rdx + adcxq %rdi,%r13 + adoxq %r14,%rax + adcxq %rax,%r14 + + + xorq %r15,%r15 + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r13 + adoxq %rax,%r14 + + mulxq %rbp,%rdi,%rax + movq %rcx,%rdx + adcxq %rdi,%r14 + adoxq %r15,%rax + adcxq %rax,%r15 + + + xorq %rcx,%rcx + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r14 + adoxq %rax,%r15 + + mulxq %rbp,%rdi,%rax + movq %rbx,%rdx + adcxq %rdi,%r15 + adoxq %rcx,%rax + adcxq %rax,%rcx + + + mulxq %rbp,%rdi,%rbx + movq 0(%rsi),%rdx + addq %rdi,%rcx + movq 8(%rsp),%rdi + adcq $0,%rbx + + + xorq %rbp,%rbp + adcxq %r8,%r8 + adcxq %r9,%r9 + adcxq %r10,%r10 + adcxq %r11,%r11 + adcxq %r12,%r12 + + + mulxq %rdx,%rdx,%rax + movq %rdx,0(%rdi) + movq 8(%rsi),%rdx + adoxq %rax,%r8 + movq %r8,8(%rdi) + + mulxq %rdx,%r8,%rax + movq 16(%rsi),%rdx + adoxq %r8,%r9 + adoxq %rax,%r10 + movq %r9,16(%rdi) + movq %r10,24(%rdi) + + mulxq %rdx,%r8,%r9 + movq 24(%rsi),%rdx + adoxq %r8,%r11 + adoxq %r9,%r12 + adcxq %r13,%r13 + adcxq %r14,%r14 + movq %r11,32(%rdi) + movq %r12,40(%rdi) + + mulxq %rdx,%r8,%r9 + movq 32(%rsi),%rdx + adoxq %r8,%r13 + adoxq %r9,%r14 + adcxq 
%r15,%r15 + adcxq %rcx,%rcx + movq %r13,48(%rdi) + movq %r14,56(%rdi) + + mulxq %rdx,%r8,%r9 + movq 40(%rsi),%rdx + adoxq %r8,%r15 + adoxq %r9,%rcx + adcxq %rbx,%rbx + adcxq %rbp,%rbp + movq %r15,64(%rdi) + movq %rcx,72(%rdi) + + mulxq %rdx,%r8,%r9 + adoxq %r8,%rbx + adoxq %r9,%rbp + + movq %rbx,80(%rdi) + movq %rbp,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sqrx_384,.-__sqrx_384 + + + +.globl redcx_mont_384 +.hidden redcx_mont_384 +.type redcx_mont_384,@function +.align 32 +redcx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redcx_mont_384,.-redcx_mont_384 + + + + +.globl fromx_mont_384 +.hidden fromx_mont_384 +.type fromx_mont_384,@function +.align 32 +fromx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + + + + + movq %r14,%rax + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size fromx_mont_384,.-fromx_mont_384 +.type __mulx_by_1_mont_384,@function +.align 32 +__mulx_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq %rcx,%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + imulq %r8,%rdx + + + xorq %r14,%r14 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r13 + adoxq %r14,%rbp + adcxq 
%rbp,%r14 + imulq %r9,%rdx + + + xorq %r15,%r15 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r14 + adoxq %r15,%rbp + adcxq %rbp,%r15 + imulq %r10,%rdx + + + xorq %r8,%r8 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r15 + adoxq %r8,%rbp + adcxq %rbp,%r8 + imulq %r11,%rdx + + + xorq %r9,%r9 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r8 + adoxq %r9,%rbp + adcxq %rbp,%r9 + imulq %r12,%rdx + + + xorq %r10,%r10 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r9 + adoxq %r10,%rbp + adcxq %rbp,%r10 + imulq %r13,%rdx + + + xorq %r11,%r11 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r10 + adoxq %r11,%rbp + adcxq %rbp,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 + +.type __redc_tail_mont_384,@function +.align 32 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0x_pty_mont_384 +.hidden sgn0x_pty_mont_384 +.type sgn0x_pty_mont_384,@function +.align 32 +sgn0x_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset 
%r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 + +.globl sgn0x_pty_mont_384x +.hidden sgn0x_pty_mont_384x +.type sgn0x_pty_mont_384x,@function +.align 32 +sgn0x_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x +.globl mulx_mont_384 +.hidden mulx_mont_384 +.type mulx_mont_384,@function +.align 32 +mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + movq %r8,(%rsp) + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_384,.-mulx_mont_384 +.type __mulx_mont_384,@function +.align 32 +__mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + xorq %r15,%r15 + + movq %r8,16(%rsp) + imulq 8(%rsp),%r8 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %rbp,%r15 + adoxq %rax,%r15 + adoxq %rax,%rax + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %r8,%r14 + adoxq %r8,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r9,16(%rsp) + imulq 8(%rsp),%r9 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rbp,%rax + adoxq %r8,%rax + adoxq %r8,%r8 + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r9,%r15 + adoxq %r9,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r10,16(%rsp) + imulq 8(%rsp),%r10 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq 
%rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %rbp,%r8 + adoxq %r9,%r8 + adoxq %r9,%r9 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r10,%rax + adoxq %r10,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r11,16(%rsp) + imulq 8(%rsp),%r11 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %r10,%r9 + adoxq %r10,%r10 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r11,%r8 + adoxq %r11,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + movq %r12,16(%rsp) + imulq 8(%rsp),%r12 + + + xorq %r11,%r11 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %r11,%r10 + adoxq %r11,%r11 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r12,%r9 + adoxq %r12,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + movq %r15,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + movq %rax,%rsi + + mulxq 40+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + movq %r14,%rdx + adcxq %r12,%r10 + adoxq %r12,%r11 + leaq 128(%rcx),%rcx + movq %r8,%r12 + adcq $0,%r11 + + + + + subq 
0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r9,%rdi + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r8 + sbbq 32(%rcx),%r9 + movq %r10,%rbp + sbbq 40(%rcx),%r10 + sbbq $0,%r11 + + cmovncq %r14,%rdx + cmovcq %r13,%r15 + cmovcq %rsi,%rax + cmovncq %r8,%r12 + movq %rdx,0(%rbx) + cmovncq %r9,%rdi + movq %r15,8(%rbx) + cmovncq %r10,%rbp + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_384,.-__mulx_mont_384 +.globl sqrx_mont_384 +.hidden sqrx_mont_384 +.type sqrx_mont_384,@function +.align 32 +sqrx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rcx,%r8 + leaq -128(%rdx),%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + leaq (%rsi),%rbx + movq %r8,(%rsp) + leaq -128(%rsi),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_384,.-sqrx_mont_384 + +.globl sqrx_n_mul_mont_384 +.hidden sqrx_n_mul_mont_384 +.type sqrx_n_mul_mont_384,@function +.align 32 +sqrx_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + +.Loop_sqrx_384: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_384 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 + +.globl sqrx_n_mul_mont_383 +.hidden sqrx_n_mul_mont_383 +.type sqrx_n_mul_mont_383,@function +.align 32 +sqrx_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset 
%rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + leaq -128(%rcx),%rcx + +.Loop_sqrx_383: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_383_nonred + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_383 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 +.type __mulx_mont_383_nonred,@function +.align 32 +__mulx_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + movq %r8,%rax + imulq 8(%rsp),%r8 + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %r15,%rbp + adoxq %rbp,%r15 + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %rax,%r14 + adoxq %rax,%r15 + adcxq %rax,%r15 + movq %r9,%r8 + imulq 8(%rsp),%r9 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rax,%rbp + adoxq %rbp,%rax + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq 
%rbp,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r10,%r9 + imulq 8(%rsp),%r10 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %r8,%rbp + adoxq %rbp,%r8 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r11,%r10 + imulq 8(%rsp),%r11 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %r9,%rbp + adoxq %rbp,%r9 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r12,%r11 + imulq 8(%rsp),%r12 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %r10,%rbp + adoxq %rbp,%r10 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r14,%rdx + adcxq %rdi,%r9 + adoxq %rbp,%r10 + adcq 
$0,%r10 + movq %r8,%r12 + + movq %r14,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r9,%rdi + movq %r8,24(%rbx) + movq %r9,32(%rbx) + movq %r10,40(%rbx) + movq %r10,%rbp + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred +.globl sqrx_mont_382x +.hidden sqrx_mont_382x +.type sqrx_mont_382x,@function +.align 32 +sqrx_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + adcq %r12,%r12 + adcq %rdi,%rdi + adcq %rbp,%rbp + + movq %rdx,48(%rbx) + movq %r15,56(%rbx) + movq %rax,64(%rbx) + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32-128(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + + + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + movq 32+96(%rsp),%r14 + leaq 128(%rcx),%rcx + movq 32+0(%rsp),%r8 + andq %r14,%r8 + movq 32+8(%rsp),%r9 + andq %r14,%r9 + movq 32+16(%rsp),%r10 + andq %r14,%r10 + movq 32+24(%rsp),%r11 + andq %r14,%r11 + movq 32+32(%rsp),%r13 + andq %r14,%r13 + andq 32+40(%rsp),%r14 + + subq %r8,%rdx + movq 0(%rcx),%r8 + sbbq %r9,%r15 + movq 8(%rcx),%r9 + sbbq %r10,%rax + movq 16(%rcx),%r10 + sbbq %r11,%r12 + movq 24(%rcx),%r11 + sbbq %r13,%rdi + movq 32(%rcx),%r13 + sbbq %r14,%rbp + sbbq %r14,%r14 + + andq %r14,%r8 + andq %r14,%r9 + andq %r14,%r10 + andq %r14,%r11 + andq %r14,%r13 + andq 40(%rcx),%r14 + + addq %r8,%rdx + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%r12 + adcq %r13,%rdi + adcq %r14,%rbp + + movq %rdx,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 
40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_382x,.-sqrx_mont_382x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/temp-deps/blst-ts/deps/blst/build/elf/sha256-armv8.S b/temp-deps/blst-ts/deps/blst/build/elf/sha256-armv8.S new file mode 100644 index 0000000..7341dec --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/sha256-armv8.S @@ -0,0 +1,1077 @@ +// +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +// project. +// ==================================================================== +// +// sha256_block procedure for ARMv8. +// +// This module is stripped of scalar code paths, with raionale that all +// known processors are NEON-capable. +// +// See original module at CRYPTOGAMS for further details. + +.text + +.align 6 +.type .LK256,%object +.LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator +.size .LK256,.-.LK256 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.align 2 +.align 2 +.globl blst_sha256_block_armv8 +.type blst_sha256_block_armv8,%function +.align 6 +blst_sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,.LK256 + +.Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 
v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret +.size blst_sha256_block_armv8,.-blst_sha256_block_armv8 +.globl blst_sha256_block_data_order +.type blst_sha256_block_data_order,%function +.align 4 +blst_sha256_block_data_order: + stp x29, x30, [sp, #-16]! + mov x29, sp + sub sp,sp,#16*4 + + adr x16,.LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b .L_00_48 + +.align 4 +.L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + 
eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor 
w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + 
eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne .L_00_48 + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + 
eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size blst_sha256_block_data_order,.-blst_sha256_block_data_order +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,%function +.align 4 +blst_sha256_emit: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + 
lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] + str w6,[x0,#16] + str w7,[x0,#24] + ret +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,%function +.align 4 +blst_sha256_bcopy: +.Loop_bcopy: + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,.Loop_bcopy + ret +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,%function +.align 4 +blst_sha256_hcopy: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret +.size blst_sha256_hcopy,.-blst_sha256_hcopy diff --git a/temp-deps/blst-ts/deps/blst/build/elf/sha256-portable-x86_64.s b/temp-deps/blst-ts/deps/blst/build/elf/sha256-portable-x86_64.s new file mode 100644 index 0000000..20b5c41 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/sha256-portable-x86_64.s @@ -0,0 +1,1754 @@ +.text + +.globl blst_sha256_block_data_order +.type blst_sha256_block_data_order,@function +.align 16 +blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $64+24,%rsp +.cfi_adjust_cfa_offset 16*4+3*8 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp .Lloop + +.align 16 +.Lloop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 0(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 4(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 8(%rbp),%r12d + xorl %r10d,%r14d + + xorl 
%r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 12(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 16(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 20(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 24(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 28(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 32(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl 
%edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 36(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 40(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 44(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 48(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 52(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 56(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d 
+ movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 60(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 64(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 68(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 72(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl 
%ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 76(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 80(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 84(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 88(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl 
%r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 92(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 96(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 100(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 104(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + 
addl %edi,%r12d + + movl %r9d,%edi + addl 108(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 112(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 116(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 120(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 124(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl 
$6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 64(%rbp),%rbp + cmpb $0x19,3(%rbp) + jnz .Lrounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop + + leaq 64+24+48(%rsp),%r11 +.cfi_def_cfa %r11,8 + movq 64+24(%rsp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_block_data_order,.-blst_sha256_block_data_order + +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,@function +.align 16 +blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,@function +.align 16 +blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +.Loop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz .Loop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,@function +.align 16 +blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_hcopy,.-blst_sha256_hcopy + +.section .note.GNU-stack,"",@progbits +.section 
.note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/temp-deps/blst-ts/deps/blst/build/elf/sha256-x86_64.s b/temp-deps/blst-ts/deps/blst/build/elf/sha256-x86_64.s new file mode 100644 index 0000000..47fdc5b --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/elf/sha256-x86_64.s @@ -0,0 +1,1446 @@ +.text + +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_block_data_order_shaext +.hidden blst_sha256_block_data_order_shaext +.type blst_sha256_block_data_order_shaext,@function +.align 64 +blst_sha256_block_data_order_shaext: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 256-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 16-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 48-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 
15,56,203,202 + movdqa 96-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 112-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 144-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 176-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 208-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 224-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 240-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_block_data_order_shaext,.-blst_sha256_block_data_order_shaext +.globl blst_sha256_block_data_order +.hidden blst_sha256_block_data_order +.type blst_sha256_block_data_order,@function +.align 64 +blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $40,%rsp +.cfi_adjust_cfa_offset 40 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,0(%rsp) + + movq %rdx,16(%rsp) + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + + + leaq -64(%rsp),%rsp + movl 0(%rdi),%eax + andq $-64,%rsp + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa K256+256(%rip),%xmm7 + 
movq %rsi,8(%rbp) + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rsi +.byte 102,15,56,0,207 + movdqa 0(%rsi),%xmm4 + movdqa 16(%rsi),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 32(%rsi),%xmm6 +.byte 102,15,56,0,223 + movdqa 48(%rsi),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + subq $-64,%rsi + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 
102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 16(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl 
%edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 32(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + 
movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 48(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,67(%rsi) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + 
rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl 
%r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 0(%rbp),%rdi + movl %r14d,%eax + movq 8(%rbp),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + leaq 64(%rsi),%rsi + cmpq 16(%rbp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + xorps %xmm0,%xmm0 + leaq 40+48(%rbp),%r11 +.cfi_def_cfa %r11,8 + movaps %xmm0,0(%rsp) + movaps %xmm0,16(%rsp) + movaps %xmm0,32(%rsp) + movaps %xmm0,48(%rsp) + movq 40(%rbp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbx +.cfi_restore %rbx + movq -8(%r11),%rbp +.cfi_restore %rbp + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_block_data_order,.-blst_sha256_block_data_order +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,@function +.align 16 +blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,@function +.align 16 +blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +.Loop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz .Loop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,@function +.align 16 +blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_hcopy,.-blst_sha256_hcopy + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git 
a/temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_256-armv8.S b/temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_256-armv8.S new file mode 100644 index 0000000..198d65a --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_256-armv8.S @@ -0,0 +1,379 @@ +.text + +.globl _add_mod_256 +.private_extern _add_mod_256 + +.align 5 +_add_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret + + +.globl _mul_by_3_mod_256 +.private_extern _mul_by_3_mod_256 + +.align 5 +_mul_by_3_mod_256: + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] + + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret + + +.globl _lshift_mod_256 +.private_extern _lshift_mod_256 + +.align 5 +_lshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +Loop_lshift_mod_256: + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs x15,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x12,lo + csel x9,x9,x13,lo + csel x10,x10,x14,lo + csel x11,x11,x15,lo + + cbnz x2,Loop_lshift_mod_256 + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + + +.globl _rshift_mod_256 +.private_extern _rshift_mod_256 + +.align 5 +_rshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +Loop_rshift: + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + csel x12,x12,x8,ne + csel x13,x13,x9,ne + csel x14,x14,x10,ne + csel x15,x15,x11,ne + csel x3,x3,xzr,ne + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,Loop_rshift + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + + +.globl _cneg_mod_256 +.private_extern _cneg_mod_256 + +.align 5 +_cneg_mod_256: + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x8,x8,x12,eq + csel x9,x9,x13,eq + csel x10,x10,x14,eq + stp x8,x9,[x0] + csel x11,x11,x15,eq + stp x10,x11,[x0,#16] + + ret + + +.globl _sub_mod_256 +.private_extern _sub_mod_256 + +.align 5 +_sub_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc 
x11,x11,x7 + stp x10,x11,[x0,#16] + + ret + + +.globl _check_mod_256 +.private_extern _check_mod_256 + +.align 5 +_check_mod_256: + ldp x8,x9,[x0] + ldp x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + csel x0,x0,xzr,ne + and x0,x0,x1 + + ret + + +.globl _add_n_check_mod_256 +.private_extern _add_n_check_mod_256 + +.align 5 +_add_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret + + +.globl _sub_n_check_mod_256 +.private_extern _sub_n_check_mod_256 + +.align 5 +_sub_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_256-x86_64.s b/temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_256-x86_64.s new file mode 100644 index 0000000..19e5ba9 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_256-x86_64.s @@ -0,0 +1,564 @@ +.text + +.globl _add_mod_256 +.private_extern _add_mod_256 + +.p2align 5 +_add_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +L$oaded_a_add_mod_256: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _mul_by_3_mod_256 +.private_extern _mul_by_3_mod_256 + 
+.p2align 5 +_mul_by_3_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rcx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rsi,%rdx + movq 24(%rsi),%r11 + + call __lshift_mod_256 + movq 0(%rsp),%r12 +.cfi_restore %r12 + jmp L$oaded_a_add_mod_256 + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + movq %r8,%rax + adcq %r10,%r10 + movq %r9,%rsi + adcq %r11,%r11 + sbbq %r12,%r12 + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + cmovcq %rbx,%r10 + cmovcq %rbp,%r11 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _lshift_mod_256 +.private_extern _lshift_mod_256 + +.p2align 5 +_lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +L$oop_lshift_mod_256: + call __lshift_mod_256 + decl %edx + jnz L$oop_lshift_mod_256 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _rshift_mod_256 +.private_extern _rshift_mod_256 + +.p2align 5 +_rshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rbp + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +L$oop_rshift_mod_256: + movq %rbp,%r8 + andq $1,%rbp + movq 0(%rcx),%rax + negq %rbp + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + + andq %rbp,%rax + andq %rbp,%rsi + andq %rbp,%rbx + andq 24(%rcx),%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + sbbq %rax,%rax + + shrq $1,%r8 + movq %r9,%rbp + shrq $1,%r9 + movq %r10,%rbx + shrq $1,%r10 + movq %r11,%rsi + shrq $1,%r11 + + shlq $63,%rbp + shlq $63,%rbx + orq %r8,%rbp + shlq $63,%rsi + orq %rbx,%r9 + shlq $63,%rax + orq %rsi,%r10 + orq %rax,%r11 + + decl %edx + jnz L$oop_rshift_mod_256 + + movq %rbp,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _cneg_mod_256 +.private_extern _cneg_mod_256 + +.p2align 5 +_cneg_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r12 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r12,%r8 + movq 24(%rsi),%r11 + orq %r9,%r12 + orq %r10,%r12 + orq %r11,%r12 + movq $-1,%rbp + + 
movq 0(%rcx),%rax + cmovnzq %rbp,%r12 + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + andq %r12,%rax + movq 24(%rcx),%rbp + andq %r12,%rsi + andq %r12,%rbx + andq %r12,%rbp + + subq %r8,%rax + sbbq %r9,%rsi + sbbq %r10,%rbx + sbbq %r11,%rbp + + orq %rdx,%rdx + + cmovzq %r8,%rax + cmovzq %r9,%rsi + movq %rax,0(%rdi) + cmovzq %r10,%rbx + movq %rsi,8(%rdi) + cmovzq %r11,%rbp + movq %rbx,16(%rdi) + movq %rbp,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _sub_mod_256 +.private_extern _sub_mod_256 + +.p2align 5 +_sub_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _check_mod_256 +.private_extern _check_mod_256 + +.p2align 5 +_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + movq 0(%rdi),%rax + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + movq %rax,%r8 + orq %r9,%rax + orq %r10,%rax + orq %r11,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq %rsi,%rsi + + movq $1,%rdx + cmpq $0,%rax + cmovneq %rdx,%rax + andq %rsi,%rax + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _add_n_check_mod_256 +.private_extern _add_n_check_mod_256 + +.p2align 5 +_add_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _sub_n_check_mod_256 +.private_extern _sub_n_check_mod_256 + +.p2align 5 +_sub_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 
0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_384-armv8.S b/temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_384-armv8.S new file mode 100644 index 0000000..a62995f --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_384-armv8.S @@ -0,0 +1,1000 @@ +.text + +.globl _add_mod_384 +.private_extern _add_mod_384 + +.align 5 +_add_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + + +.align 5 +__add_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +__add_mod_384_ab_are_loaded: + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret + + +.globl _add_mod_384x +.private_extern _add_mod_384x + +.align 5 +_add_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _rshift_mod_384 +.private_extern _rshift_mod_384 + +.align 5 +_rshift_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +Loop_rshift_mod_384: + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + + +.align 5 +__rshift_mod_384: + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret + + +.globl _div_by_2_mod_384 +.private_extern _div_by_2_mod_384 + +.align 5 +_div_by_2_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _lshift_mod_384 +.private_extern _lshift_mod_384 + +.align 5 +_lshift_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +Loop_lshift_mod_384: + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + + +.align 5 +__lshift_mod_384: + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret + + +.globl _mul_by_3_mod_384 +.private_extern _mul_by_3_mod_384 + +.align 5 +_mul_by_3_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _mul_by_8_mod_384 +.private_extern _mul_by_8_mod_384 + +.align 5 +_mul_by_8_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _mul_by_3_mod_384x +.private_extern _mul_by_3_mod_384x + +.align 5 +_mul_by_3_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _mul_by_8_mod_384x +.private_extern _mul_by_8_mod_384x + +.align 5 +_mul_by_8_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _cneg_mod_384 +.private_extern _cneg_mod_384 + +.align 5 +_cneg_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x10,x10,x16,eq + csel x11,x11,x17,eq + csel x12,x12,x19,eq + csel x13,x13,x20,eq + stp x10,x11,[x0] + csel x14,x14,x21,eq + stp x12,x13,[x0,#16] + csel x15,x15,x22,eq + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _sub_mod_384 +.private_extern _sub_mod_384 + +.align 5 +_sub_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + + +.align 5 +__sub_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret + + +.globl _sub_mod_384x +.private_extern _sub_mod_384x + +.align 5 +_sub_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _mul_by_1_plus_i_mod_384x +.private_extern _mul_by_1_plus_i_mod_384x + +.align 5 +_mul_by_1_plus_i_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _sgn0_pty_mod_384 +.private_extern _sgn0_pty_mod_384 + +.align 5 +_sgn0_pty_mod_384: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret + + +.globl _sgn0_pty_mod_384x +.private_extern _sgn0_pty_mod_384x + +.align 5 +_sgn0_pty_mod_384x: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs 
x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret + +.globl _vec_select_32 +.private_extern _vec_select_32 + +.align 5 +_vec_select_32: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl _vec_select_48 +.private_extern _vec_select_48 + +.align 5 +_vec_select_48: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl _vec_select_96 +.private_extern _vec_select_96 + +.align 5 +_vec_select_96: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl _vec_select_192 +.private_extern _vec_select_192 + +.align 5 +_vec_select_192: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl _vec_select_144 +.private_extern _vec_select_144 + +.align 5 +_vec_select_144: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl _vec_select_288 +.private_extern _vec_select_288 + +.align 5 +_vec_select_288: + dup v6.2d, x3 + ld1 {v0.2d, 
v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl _vec_prefetch +.private_extern _vec_prefetch + +.align 5 +_vec_prefetch: + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + prfm pldl1keep, [x0] + ret + +.globl _vec_is_zero_16x +.private_extern _vec_is_zero_16x + +.align 5 +_vec_is_zero_16x: + ld1 {v0.2d}, [x0], #16 + lsr x1, x1, #4 + sub x1, x1, #1 + cbz x1, Loop_is_zero_done + +Loop_is_zero: + ld1 {v1.2d}, [x0], #16 + orr v0.16b, v0.16b, v1.16b + sub x1, x1, #1 + cbnz x1, Loop_is_zero + +Loop_is_zero_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret + +.globl _vec_is_equal_16x +.private_extern _vec_is_equal_16x + +.align 5 +_vec_is_equal_16x: + ld1 {v0.2d}, [x0], #16 + ld1 {v1.2d}, [x1], #16 + lsr x2, x2, #4 + eor v0.16b, v0.16b, v1.16b + +Loop_is_equal: + sub x2, x2, #1 + cbz x2, Loop_is_equal_done + ld1 {v1.2d}, [x0], #16 + ld1 {v2.2d}, [x1], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b Loop_is_equal + nop + +Loop_is_equal_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_384-x86_64.s new file mode 100644 index 0000000..974978e --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_384-x86_64.s @@ -0,0 +1,1899 @@ +.text + +.globl _add_mod_384 +.private_extern _add_mod_384 + +.p2align 5 +_add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset 
%r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__add_mod_384_a_is_loaded: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _add_mod_384x +.private_extern _add_mod_384x + +.p2align 5 +_add_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _rshift_mod_384 +.private_extern _rshift_mod_384 + +.p2align 5 +_rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +L$oop_rshift_mod_384: + call __rshift_mod_384 + decl %edx + jnz L$oop_rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 
+.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rsi + movq 0(%rcx),%r14 + andq %r8,%rsi + movq 8(%rcx),%r15 + negq %rsi + movq 16(%rcx),%rax + andq %rsi,%r14 + movq 24(%rcx),%rbx + andq %rsi,%r15 + movq 32(%rcx),%rbp + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rsi + sbbq %r13,%r13 + + shrq $1,%r14 + movq %r15,%r8 + shrq $1,%r15 + movq %rax,%r9 + shrq $1,%rax + movq %rbx,%r10 + shrq $1,%rbx + movq %rbp,%r11 + shrq $1,%rbp + movq %rsi,%r12 + shrq $1,%rsi + shlq $63,%r8 + shlq $63,%r9 + orq %r14,%r8 + shlq $63,%r10 + orq %r15,%r9 + shlq $63,%r11 + orq %rax,%r10 + shlq $63,%r12 + orq %rbx,%r11 + shlq $63,%r13 + orq %rbp,%r12 + orq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _div_by_2_mod_384 +.private_extern _div_by_2_mod_384 + +.p2align 5 +_div_by_2_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq %rdx,%rcx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + call __rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _lshift_mod_384 +.private_extern _lshift_mod_384 + +.p2align 5 +_lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +L$oop_lshift_mod_384: + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdi,%rdi + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdi + + movq (%rsp),%rdi + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + decl %edx + jnz L$oop_lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + 
movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _mul_by_3_mod_384 +.private_extern _mul_by_3_mod_384 + +.p2align 5 +_mul_by_3_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_by_8_mod_384 +.private_extern _mul_by_8_mod_384 + +.p2align 5 +_mul_by_8_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _mul_by_3_mod_384x +.private_extern _mul_by_3_mod_384x + +.p2align 5 +_mul_by_3_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq 
%r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq (%rsp),%rsi + leaq 48(%rdi),%rdi + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + + call __lshift_mod_384 + + movq $48,%rdx + addq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_by_8_mod_384x +.private_extern _mul_by_8_mod_384x + +.p2align 5 +_mul_by_8_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq (%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,48+0(%rdi) + movq %r9,48+8(%rdi) + movq %r10,48+16(%rdi) + movq %r11,48+24(%rdi) + movq %r12,48+32(%rdi) + movq %r13,48+40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _cneg_mod_384 +.private_extern _cneg_mod_384 + +.p2align 5 +_cneg_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdx +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rdx,%r8 + movq 24(%rsi),%r11 + orq %r9,%rdx + movq 32(%rsi),%r12 + orq %r10,%rdx + movq 40(%rsi),%r13 + orq %r11,%rdx + movq $-1,%rsi + orq %r12,%rdx + orq %r13,%rdx + + movq 0(%rcx),%r14 + cmovnzq %rsi,%rdx + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + andq 
%rdx,%r14 + movq 24(%rcx),%rbx + andq %rdx,%r15 + movq 32(%rcx),%rbp + andq %rdx,%rax + movq 40(%rcx),%rsi + andq %rdx,%rbx + movq 0(%rsp),%rcx + andq %rdx,%rbp + andq %rdx,%rsi + + subq %r8,%r14 + sbbq %r9,%r15 + sbbq %r10,%rax + sbbq %r11,%rbx + sbbq %r12,%rbp + sbbq %r13,%rsi + + orq %rcx,%rcx + + cmovzq %r8,%r14 + cmovzq %r9,%r15 + cmovzq %r10,%rax + movq %r14,0(%rdi) + cmovzq %r11,%rbx + movq %r15,8(%rdi) + cmovzq %r12,%rbp + movq %rax,16(%rdi) + cmovzq %r13,%rsi + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rsi,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _sub_mod_384 +.private_extern _sub_mod_384 + +.p2align 5 +_sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sub_mod_384x +.private_extern _sub_mod_384x + +.p2align 5 +_sub_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __sub_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + 
movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_by_1_plus_i_mod_384x +.private_extern _mul_by_1_plus_i_mod_384x + +.p2align 5 +_mul_by_1_plus_i_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $56,%rsp +.cfi_adjust_cfa_offset 56 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rbx + adcq 72(%rsi),%r11 + movq %r12,%rcx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + movq %rdi,48(%rsp) + sbbq %rdi,%rdi + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rbx + sbbq 80(%rsi),%rcx + sbbq 88(%rsi),%rbp + sbbq %rsi,%rsi + + movq %r8,0(%rsp) + movq 0(%rdx),%r8 + movq %r9,8(%rsp) + movq 8(%rdx),%r9 + movq %r10,16(%rsp) + movq 16(%rdx),%r10 + movq %r11,24(%rsp) + movq 24(%rdx),%r11 + movq %r12,32(%rsp) + andq %rsi,%r8 + movq 32(%rdx),%r12 + movq %r13,40(%rsp) + andq %rsi,%r9 + movq 40(%rdx),%r13 + andq %rsi,%r10 + andq %rsi,%r11 + andq %rsi,%r12 + andq %rsi,%r13 + movq 48(%rsp),%rsi + + addq %r8,%r14 + movq 0(%rsp),%r8 + adcq %r9,%r15 + movq 8(%rsp),%r9 + adcq %r10,%rax + movq 16(%rsp),%r10 + adcq %r11,%rbx + movq 24(%rsp),%r11 + adcq %r12,%rcx + movq 32(%rsp),%r12 + adcq %r13,%rbp + movq 40(%rsp),%r13 + + movq %r14,0(%rsi) + movq %r8,%r14 + movq %r15,8(%rsi) + movq %rax,16(%rsi) + movq %r9,%r15 + movq %rbx,24(%rsi) + movq %rcx,32(%rsi) + movq %r10,%rax + movq %rbp,40(%rsi) + + subq 0(%rdx),%r8 + movq %r11,%rbx + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + movq %r12,%rcx + sbbq 24(%rdx),%r11 + sbbq 32(%rdx),%r12 + movq %r13,%rbp + sbbq 40(%rdx),%r13 + sbbq $0,%rdi + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,48(%rsi) + cmovcq %rbx,%r11 + movq %r9,56(%rsi) + cmovcq %rcx,%r12 + movq %r10,64(%rsi) + cmovcq %rbp,%r13 + movq %r11,72(%rsi) + movq %r12,80(%rsi) + movq %r13,88(%rsi) + + movq 56+0(%rsp),%r15 +.cfi_restore %r15 + movq 56+8(%rsp),%r14 +.cfi_restore %r14 + movq 56+16(%rsp),%r13 +.cfi_restore %r13 + movq 56+24(%rsp),%r12 +.cfi_restore %r12 + movq 56+32(%rsp),%rbx +.cfi_restore %rbx + movq 56+40(%rsp),%rbp +.cfi_restore %rbp + leaq 56+48(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sgn0_pty_mod_384 +.private_extern _sgn0_pty_mod_384 + +.p2align 5 +_sgn0_pty_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + xorq %rax,%rax + movq %r8,%rdi + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + notq %rax + andq $1,%rdi + andq $2,%rax + orq %rdi,%rax + + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0_pty_mod_384x +.private_extern _sgn0_pty_mod_384x + +.p2align 5 +_sgn0_pty_mod_384x: 
+.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 48(%rdi),%r8 + movq 56(%rdi),%r9 + movq 64(%rdi),%r10 + movq 72(%rdi),%r11 + movq 80(%rdi),%rcx + movq 88(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + leaq 0(%rdi),%rax + xorq %rdi,%rdi + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rdi + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rdi + + movq %r8,0(%rsp) + notq %rdi + andq $1,%rbp + andq $2,%rdi + orq %rbp,%rdi + + movq 0(%rax),%r8 + movq 8(%rax),%r9 + movq 16(%rax),%r10 + movq 24(%rax),%r11 + movq 32(%rax),%rcx + movq 40(%rax),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rax,%rax + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + movq 0(%rsp),%rbx + + notq %rax + + testq %r8,%r8 + cmovzq %rdi,%rbp + + testq %rbx,%rbx + cmovnzq %rdi,%rax + + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_32 +.private_extern _vec_select_32 + +.p2align 5 +_vec_select_32: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 16(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 16(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 16(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-16(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-16(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-16(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,16-16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_48 +.private_extern _vec_select_48 + +.p2align 5 +_vec_select_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 24(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 24(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 24(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-24(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-24(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-24(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-24(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-24(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-24(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,32-24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_96 +.private_extern _vec_select_96 + +.p2align 5 +_vec_select_96: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 48(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 48(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 48(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-48(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + 
movdqu 16+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-48(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-48(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-48(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-48(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,80-48(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_192 +.private_extern _vec_select_192 + +.p2align 5 +_vec_select_192: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 96(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 96(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 96(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-96(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-96(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-96(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-96(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-96(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-96(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-96(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-96(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-96(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-96(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-96(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,176-96(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_144 +.private_extern _vec_select_144 + +.p2align 5 +_vec_select_144: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 72(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 72(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 72(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-72(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-72(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-72(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 
+ movdqu %xmm2,48-72(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-72(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-72(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-72(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-72(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,128-72(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_288 +.private_extern _vec_select_288 + +.p2align 5 +_vec_select_288: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 144(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 144(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 144(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-144(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-144(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-144(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-144(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-144(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-144(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-144(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-144(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-144(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-144(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-144(%rdi) + pand %xmm4,%xmm2 + movdqu 176+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 176+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,176-144(%rdi) + pand %xmm4,%xmm0 + movdqu 192+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 192+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,192-144(%rdi) + pand %xmm4,%xmm2 + movdqu 208+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 208+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,208-144(%rdi) + pand %xmm4,%xmm0 + movdqu 224+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 224+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,224-144(%rdi) + pand %xmm4,%xmm2 + movdqu 240+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 240+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,240-144(%rdi) + pand %xmm4,%xmm0 + movdqu 256+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 256+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + 
movdqu %xmm0,256-144(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,272-144(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_prefetch +.private_extern _vec_prefetch + +.p2align 5 +_vec_prefetch: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + leaq -1(%rdi,%rsi,1),%rsi + movq $64,%rax + xorq %r8,%r8 + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + prefetchnta (%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_is_zero_16x +.private_extern _vec_is_zero_16x + +.p2align 5 +_vec_is_zero_16x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%esi + movdqu (%rdi),%xmm0 + leaq 16(%rdi),%rdi + +L$oop_is_zero: + decl %esi + jz L$oop_is_zero_done + movdqu (%rdi),%xmm1 + leaq 16(%rdi),%rdi + por %xmm1,%xmm0 + jmp L$oop_is_zero + +L$oop_is_zero_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %esi + testq %rax,%rax + cmovnzl %esi,%eax + xorl $1,%eax + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_is_equal_16x +.private_extern _vec_is_equal_16x + +.p2align 5 +_vec_is_equal_16x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%edx + movdqu (%rdi),%xmm0 + movdqu (%rsi),%xmm1 + subq %rdi,%rsi + leaq 16(%rdi),%rdi + pxor %xmm1,%xmm0 + +L$oop_is_equal: + decl %edx + jz L$oop_is_equal_done + movdqu (%rdi),%xmm1 + movdqu (%rdi,%rsi,1),%xmm2 + leaq 16(%rdi),%rdi + pxor %xmm2,%xmm1 + por %xmm1,%xmm0 + jmp L$oop_is_equal + +L$oop_is_equal_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %edx + testq %rax,%rax + cmovnzl %edx,%eax + xorl $1,%eax + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_384x384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_384x384-x86_64.s new file mode 100644 index 0000000..2dc58f8 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/add_mod_384x384-x86_64.s @@ -0,0 +1,244 @@ +.text + + +.p2align 5 +__add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + addq 0(%rdx),%r8 + movq 56(%rsi),%r15 + adcq 8(%rdx),%r9 + movq 64(%rsi),%rax + adcq 16(%rdx),%r10 + movq 72(%rsi),%rbx + adcq 24(%rdx),%r11 + movq 80(%rsi),%rbp + adcq 32(%rdx),%r12 + movq 88(%rsi),%rsi + adcq 40(%rdx),%r13 + movq %r8,0(%rdi) + adcq 48(%rdx),%r14 + movq %r9,8(%rdi) + adcq 56(%rdx),%r15 + movq %r10,16(%rdi) + adcq 64(%rdx),%rax + movq %r12,32(%rdi) + movq %r14,%r8 + adcq 72(%rdx),%rbx + movq %r11,24(%rdi) + movq %r15,%r9 + adcq 80(%rdx),%rbp + movq %r13,40(%rdi) + movq %rax,%r10 + adcq 88(%rdx),%rsi + movq %rbx,%r11 + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %rbp,%r12 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%rbx + sbbq 32(%rcx),%rbp + movq %rsi,%r13 + sbbq 40(%rcx),%rsi + sbbq $0,%rdx + + cmovcq %r8,%r14 + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %r14,48(%rdi) + cmovcq %r11,%rbx + movq %r15,56(%rdi) + cmovcq %r12,%rbp + movq 
%rax,64(%rdi) + cmovcq %r13,%rsi + movq %rbx,72(%rdi) + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _add_mod_384x384 +.private_extern _add_mod_384x384 + +.p2align 5 +_add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sub_mod_384x384 +.private_extern _sub_mod_384x384 + +.p2align 5 +_sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/ct_inverse_mod_256-armv8.S b/temp-deps/blst-ts/deps/blst/build/mach-o/ct_inverse_mod_256-armv8.S new file mode 100644 index 0000000..f3a2c3b --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/ct_inverse_mod_256-armv8.S @@ -0,0 +1,784 @@ +.text + +.globl _ct_inverse_mod_256 + +.align 5 +_ct_inverse_mod_256: +.long 3573752639 + stp x29, x30, 
[sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... + str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl Lab_approximation_31_256_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // 
corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst 
|a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] 
last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 +.long 3573752767 + ret + + +//////////////////////////////////////////////////////////////////////// + +.align 5 +__smul_256x63: + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + csel x22, x22, xzr, ne + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in __smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + csel x23, x23, xzr, ne + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret + + + +.align 5 +__smul_512x63_tail: + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, [x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume |v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr x25, x24, #63 + + 
eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret + + + +.align 5 +__smul_256_n_shift_by_31: + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret + + +.align 4 +__ab_approximation_31_256: + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +Lab_approximation_31_256_loaded: + orr x19, x7, x11 // check top-most limbs, ... + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x5, ne + orr x19, x7, x11 // and ones before top-most, ... + csel x10, x10, x9, ne + + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x4, ne + orr x19, x7, x11 // and one more, ... 
+ csel x10, x10, x8, ne + + clz x19, x19 + cmp x19, #64 + csel x19, x19, xzr, ne + csel x7, x7, x6, ne + csel x11, x11, x10, ne + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret + + + +.align 4 +__inner_loop_31_256: + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +Loop_31_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x15, x15, x13, hs // exchange |fg0| and |fg1| + csel x13, x13, x19, hs + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, Loop_31_256 + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret + + + +.align 4 +__inner_loop_62_256: + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +Loop_62_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov x20, x13 + csel x12, x12, x14, hs // exchange |f0| and |f1| + csel x14, x14, x19, hs + csel x13, x13, x15, hs // exchange |g0| and |g1| + csel x15, x15, x20, hs + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, Loop_62_256 + + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/ct_inverse_mod_256-x86_64.s b/temp-deps/blst-ts/deps/blst/build/mach-o/ct_inverse_mod_256-x86_64.s new file mode 100644 index 0000000..b6441da --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/ct_inverse_mod_256-x86_64.s @@ -0,0 +1,1177 @@ +.text + +.globl _ct_inverse_mod_256 + +.p2align 5 +_ct_inverse_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1072,%rsp +.cfi_adjust_cfa_offset 1072 + + + leaq 48+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + movq 0(%rdx),%r12 + movq 8(%rdx),%r13 + movq 16(%rdx),%r14 + movq 24(%rdx),%r15 + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + + movq %r12,32(%rax) + movq %r13,40(%rax) + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rax,%rsi + + + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,64(%rdi) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,72(%rdi) + + + xorq $256,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + + movq 64(%rsi),%r8 + movq 104(%rsi),%r12 + movq %r8,%r9 + imulq 0(%rsp),%r8 + movq %r12,%r13 + imulq 8(%rsp),%r12 + addq %r12,%r8 + movq %r8,32(%rdi) + sarq $63,%r8 + movq %r8,40(%rdi) + movq %r8,48(%rdi) + movq %r8,56(%rdi) + movq %r8,64(%rdi) + leaq 64(%rsi),%rsi + + imulq %rdx,%r9 + imulq %rcx,%r13 + addq %r13,%r9 + movq %r9,72(%rdi) + sarq $63,%r9 + movq %r9,80(%rdi) + movq %r9,88(%rdi) + movq %r9,96(%rdi) + movq %r9,104(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call 
__ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + sarq $63,%rbp + movq %rbp,40(%rdi) + movq %rbp,48(%rdi) + movq %rbp,56(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call 
__smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + + xorq $256+64,%rsi + movl $47,%edx + + movq 0(%rsi),%r8 + + movq 32(%rsi),%r10 + + call __inner_loop_62_256 + + + + + + + + leaq 64(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_512x63 + adcq %rbp,%rdx + + movq 40(%rsp),%rsi + movq %rdx,%rax + sarq $63,%rdx + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + adcq $0,%rax + + movq %rax,%rdx + negq %rax + orq %rax,%rdx + sarq $63,%rax + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + xorq %rax,%r8 + xorq %rcx,%rcx + xorq %rax,%r9 + subq %rax,%rcx + xorq %rax,%r10 + xorq %rax,%rdx + addq %rcx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 
+ adcq %rdx,%r15 + + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 1072(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1072-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_512x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %r9,8(%rdi) + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %r10,16(%rdi) + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %r11,24(%rdi) + + movq 40(%rsi),%r8 + movq 48(%rsi),%r9 + movq 56(%rsi),%r10 + movq 64(%rsi),%r11 + movq 72(%rsi),%r12 + movq 80(%rsi),%r13 + movq 88(%rsi),%r14 + movq 96(%rsi),%r15 + + movq %rcx,%rdx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rcx + addq %rax,%rcx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rcx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rcx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rcx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rcx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rcx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rcx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rcx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + imulq %rcx + addq %rax,%r15 + adcq $0,%rdx + + movq %rbp,%rbx + sarq $63,%rbp + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %rbp,%r14 + adcq %rbp,%r15 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__smulq_256x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %rcx,%rdx + movq 40+0(%rsi),%r12 + movq 40+8(%rsi),%r13 + 
movq 40+16(%rsi),%r14 + movq 40+24(%rsi),%r15 + movq 40+32(%rsi),%rcx + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rcx + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rcx + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + andq %rbx,%rcx + negq %rcx + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rbp,32(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_256_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,0(%rdi) + movq %rcx,8(%rdi) + movq %rdx,%rbp + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + + movq %rbp,%rbx + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rbx + addq %rax,%rbx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + andq %rbx,%rbp + negq %rbp + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r14 + movq 32+24(%rsi),%r15 + + movq %rcx,%rbx + sarq $63,%rcx + xorq %rax,%rax + subq %rcx,%rax + + xorq %rcx,%rbx + addq %rax,%rbx + + xorq %rcx,%r12 + xorq %rcx,%r13 + xorq %rcx,%r14 + xorq %rcx,%r15 + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + andq %rbx,%rcx + negq %rcx + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq 0(%rdi),%rdx + movq 8(%rdi),%rcx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%rbp,%r11 + + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + xorq %rbp,%rdx + xorq %rbp,%rcx + addq %rax,%rdx + addq %rax,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__ab_approximation_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 24(%rsi),%r9 + movq 56(%rsi),%r11 + movq 16(%rsi),%rbx + movq 48(%rsi),%rbp + movq 8(%rsi),%r8 + movq 40(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 32(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + notq %rax + andq 
%rax,%r9 + andq %rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31_256 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__inner_loop_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +L$oop_31_256: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edx + jnz L$oop_31_256 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__inner_loop_62_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl %edx,%r15d + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq %rdx,%r13 + movq %rdx,%r14 + +L$oop_62_256: + xorq %rax,%rax + testq %r14,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq %r14,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%r15d + jnz L$oop_62_256 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/ct_inverse_mod_384-armv8.S b/temp-deps/blst-ts/deps/blst/build/mach-o/ct_inverse_mod_384-armv8.S new file mode 100644 index 0000000..c7d9ba8 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/ct_inverse_mod_384-armv8.S @@ -0,0 +1,717 @@ +.text + +.globl _ct_inverse_mod_383 + +.align 5 +_ct_inverse_mod_383: +.long 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl Lab_approximation_62_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 
// corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src 
|a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 +.long 3573752767 + ret + + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
+ +.align 5 +__smul_383x63: + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret + + + +.align 5 +__smul_767x63_tail: + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret + + + +.align 5 +__smul_383_n_shift_by_62: + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + adcs x6, x6, xzr + umulh x23, 
x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret + + +.align 4 +__ab_approximation_62: + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +Lab_approximation_62_loaded: + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x22, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x22, x8, x14 // ... and ones before that ... 
+ csel x13, x13, x11, ne + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x22, x8, x14 + csel x13, x13, x10, ne + + clz x22, x22 + cmp x22, #64 + csel x22, x22, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret + + +.align 4 +__inner_loop_62: + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +Loop_62: + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + csel x9, x9, x3, hs // |b_| = |a_| + csel x14, x14, x8, hs + csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x8, x27, x25, hs + csel x15, x15, x17, hs // exchange |f0| and |f1| + csel x17, x17, x22, hs + csel x16, x16, x19, hs // exchange |g0| and |g1| + csel x19, x19, x23, hs + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, Loop_62 + + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/ct_is_square_mod_384-armv8.S b/temp-deps/blst-ts/deps/blst/build/mach-o/ct_is_square_mod_384-armv8.S new file mode 100644 index 0000000..b5c953d --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/ct_is_square_mod_384-armv8.S @@ -0,0 +1,324 @@ +.text + +.globl _ct_is_square_mod_384 + +.align 5 +_ct_is_square_mod_384: +.long 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the Legendre symbol + mov x15, #24 // 24 is 768/30-1 + b Loop_is_square + +.align 4 +Loop_is_square: + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // and loaded + //ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 +.long 3573752767 + ret + + + +.align 5 +__smul_384_n_shift_by_30: + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr x7, x8, x7, #30 + extr x8, x9, x8, #30 + + eor x3, x3, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor 
x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret + + +.align 4 +__ab_approximation_30: + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x21, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x21, x8, x14 // ... and ones before that ... + csel x13, x13, x11, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x21, x8, x14 // and one more, ... + csel x13, x13, x10, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x3, ne + orr x21, x8, x14 + csel x13, x13, x9, ne + + clz x21, x21 + cmp x21, #64 + csel x21, x21, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret + + + +.align 4 +__inner_loop_30: + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +Loop_30: + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + csel x14, x14, x8, hs // |b_| = |a_| + csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x20, x20, x17, hs // exchange |fg0| and |fg1| + csel x17, x17, x21, hs + csel x2, x2, x25, hs + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, Loop_30 + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret + + +.align 4 +__inner_loop_48: +Loop_48: + sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x3, x9 + sub x15, x15, #1 + and x21, x9, x24 + sub x22, x9, x3 // |b_|-|a_| + subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + csel x9, x9, x3, hs // |b_| = |a_| + csel x3, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x2, x2, x25, hs + add x23, x9, #2 + lsr x3, x3, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, Loop_48 + + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/ct_is_square_mod_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/mach-o/ct_is_square_mod_384-x86_64.s new file mode 100644 index 0000000..f282394 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/ct_is_square_mod_384-x86_64.s @@ -0,0 +1,471 @@ +.text + +.globl _ct_is_square_mod_384 + +.p2align 5 
+_ct_is_square_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $536,%rsp +.cfi_adjust_cfa_offset 536 + + + leaq 24+255(%rsp),%rax + andq $-256,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbx + movq 24(%rsi),%rcx + movq 32(%rsi),%rdx + movq 40(%rsi),%rdi + movq %rax,%rsi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rcx,72(%rax) + movq %rdx,80(%rax) + movq %rdi,88(%rax) + + xorq %rbp,%rbp + movl $24,%ecx + jmp L$oop_is_square + +.p2align 5 +L$oop_is_square: + movl %ecx,16(%rsp) + + call __ab_approximation_30 + movq %rax,0(%rsp) + movq %rbx,8(%rsp) + + movq $128+48,%rdi + xorq %rsi,%rdi + call __smulq_384_n_shift_by_30 + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq -48(%rdi),%rdi + call __smulq_384_n_shift_by_30 + + movl 16(%rsp),%ecx + xorq $128,%rsi + + andq 48(%rdi),%r14 + shrq $1,%r14 + addq %r14,%rbp + + subl $1,%ecx + jnz L$oop_is_square + + + + + movq 48(%rsi),%r9 + call __inner_loop_48 + + movq $1,%rax + andq %rbp,%rax + xorq $1,%rax + + leaq 536(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -536-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__smulq_384_n_shift_by_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r14 + andq %rbx,%r14 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r14 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r14 + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r15 + 
andq %rbx,%r15 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r15 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r15 + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %r15,%r14 + + shrdq $30,%r9,%r8 + shrdq $30,%r10,%r9 + shrdq $30,%r11,%r10 + shrdq $30,%r12,%r11 + shrdq $30,%r13,%r12 + shrdq $30,%r14,%r13 + + sarq $63,%r14 + xorq %rbx,%rbx + subq %r14,%rbx + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__ab_approximation_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 88(%rsi),%rbx + movq 80(%rsi),%r15 + movq 72(%rsi),%r14 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r11,%r12 + movq 64(%rsi),%r11 + cmovzq %r14,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r10,%r12 + movq 56(%rsi),%r10 + cmovzq %r11,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r9,%r12 + movq 48(%rsi),%r9 + cmovzq %r10,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r8,%r12 + cmovzq %r9,%r15 + + movq %r13,%rax + orq %rbx,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r13 + cmovzq %r9,%rbx + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%r12,%r13 + shldq %cl,%r15,%rbx + + movq $0xFFFFFFFF00000000,%rax + movl %r8d,%r8d + movl %r9d,%r9d + andq %rax,%r13 + andq %rax,%rbx + orq %r13,%r8 + orq %rbx,%r9 + + jmp __inner_loop_30 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__inner_loop_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rbx + movq $0x800000007FFFFFFF,%rcx + leaq -1(%rbx),%r15 + movl $30,%edi + +L$oop_30: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbx,%r12 + movq %rcx,%r13 + movq %rbp,%r14 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rcx,%rbx + cmovbq %r12,%rcx + cmovbq %rax,%rbp + + subq %r9,%r8 + subq %rcx,%rbx + addq %r15,%rbx + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbx + cmovzq %r13,%rcx + cmovzq %r14,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rcx,%rcx + leaq (%rax,%rbp,1),%rbp + subq %r15,%rcx + + subl $1,%edi + jnz L$oop_30 + + shrq $32,%r15 + movl %ebx,%eax + shrq $32,%rbx + movl %ecx,%edx + shrq $32,%rcx + subq %r15,%rax + subq %r15,%rbx + subq %r15,%rdx + subq %r15,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__inner_loop_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl $48,%edi + +L$oop_48: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbp,%r12 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rax,%rbp + + subq %r9,%r8 + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq 
$2,%rax + addq %rax,%rbp + + subl $1,%edi + jnz L$oop_48 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/ctq_inverse_mod_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/mach-o/ctq_inverse_mod_384-x86_64.s new file mode 100644 index 0000000..185a876 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/ctq_inverse_mod_384-x86_64.s @@ -0,0 +1,1187 @@ +.text + +.globl _ct_inverse_mod_383 + +.p2align 5 +_ct_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) 
+ movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + 
leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + xorq $256+96,%rsi + movl $62,%edi + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 48(%rsi),%r10 + movq 56(%rsi),%r11 + call __inner_loop_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + movq %r8,0(%rdi) + movq %r10,48(%rdi) + + + + leaq 96(%rsi),%rsi + leaq 96(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + + xorq $256+96,%rsi + movl $22,%edi + + movq 0(%rsi),%r8 + xorq %r9,%r9 + movq 48(%rsi),%r10 + xorq %r11,%r11 + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + movq %r9,8(%rdi) + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + movq %r10,16(%rdi) + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %r11,24(%rdi) + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + movq %r12,32(%rdi) + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + movq %r13,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + + movq 0(%rsi),%r8 + 
movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + movq %rdx,%rsi + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rsi + addq %rax,%rsi + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rbx + xorq %rdx,%rbp + xorq %rdx,%rcx + xorq %rdx,%rdi + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulq %rsi + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rsi + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rsi + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rsi + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rsi + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rsi + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %rdx,%rbx + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + mulq %rsi + addq %rax,%rbp + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rcx + mulq %rsi + addq %rax,%rcx + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%rdi + movq 8(%rsp),%rdx + imulq %rsi,%rax + movq 16(%rsp),%rsi + addq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + 
xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_383_n_shift_by_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq 48(%rsi),%rsi + movq %rdx,%r14 + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $62,%r9,%r8 + shrdq $62,%r10,%r9 + shrdq $62,%r11,%r10 + shrdq $62,%r12,%r11 + shrdq $62,%r13,%r12 + shrdq $62,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + 
.byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__ab_approximation_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 16(%rsi),%r8 + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 8(%rsi),%r8 + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 0(%rsi),%r8 + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + jmp __inner_loop_62 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 3 +.long 0 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + movq %rsi,8(%rsp) + +L$oop_62: + xorq %rax,%rax + xorq %rbx,%rbx + testq $1,%r8 + movq %r10,%rbp + movq %r11,%r14 + cmovnzq %r10,%rax + cmovnzq %r11,%rbx + subq %r8,%rbp + sbbq %r9,%r14 + movq %r8,%r15 + movq %r9,%rsi + subq %rax,%r8 + sbbq %rbx,%r9 + cmovcq %rbp,%r8 + cmovcq %r14,%r9 + cmovcq %r15,%r10 + cmovcq %rsi,%r11 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrdq $1,%r9,%r8 + shrq $1,%r9 + testq $1,%r15 + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz L$oop_62 + + movq 8(%rsp),%rsi + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/ctx_inverse_mod_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/mach-o/ctx_inverse_mod_384-x86_64.s new file mode 100644 index 0000000..3e05df3 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/ctx_inverse_mod_384-x86_64.s @@ -0,0 +1,1566 @@ +.text + +.globl _ctx_inverse_mod_383 + +.p2align 5 +_ctx_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + movq 
%rdx,96(%rdi) + + + xorq $256,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 
56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 
48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) 
+ movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + + xorq $256+96,%rsi + movl $53,%edi + + movq 0(%rsi),%r8 + + movq 48(%rsi),%r10 + + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulx_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulx_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq 
%rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + movq %rcx,%rax + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + sarq $63,%rax + xorq %rsi,%rsi + subq %rax,%rsi + + xorq %rax,%rdx + addq %rsi,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %rax,%r13 + xorq %rax,%r14 + xorq %rax,%r15 + xorq %rax,%rbx + xorq %rax,%rbp + xorq %rax,%rcx + xorq %rax,%rdi + addq %rsi,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulxq %r8,%r8,%rax + mulxq %r9,%r9,%rsi + addq %rax,%r9 + mulxq %r10,%r10,%rax + adcq %rsi,%r10 + mulxq %r11,%r11,%rsi + adcq %rax,%r11 + mulxq %r12,%r12,%rax + adcq %rsi,%r12 + mulxq %r13,%r13,%rsi + adcq %rax,%r13 + mulxq %r14,%r14,%rax + adcq %rsi,%r14 + mulxq %r15,%r15,%rsi + adcq %rax,%r15 + mulxq %rbx,%rbx,%rax + adcq %rsi,%rbx + mulxq %rbp,%rbp,%rsi + adcq %rax,%rbp + mulxq %rcx,%rcx,%rax + adcq %rsi,%rcx + mulxq %rdi,%rdi,%rsi + movq 8(%rsp),%rdx + movq 16(%rsp),%rsi + adcq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulx_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + movq %rcx,%rdx + adcq %rbp,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + 
addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + adcq %rbp,%r13 + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulx_383_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + xorq %r14,%r14 + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq %rdx,%r14 + + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%rax + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%r12,%r11 + shrdq $31,%rax,%r12 + shrdq $31,%r14,%rax + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulx_191_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %r10,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r10 + addq %rbp,%r9 + adcq $0,%r10 + imulq %rdx + addq 
%rax,%r10 + adcq $0,%rdx + movq %rdx,%r14 + movq %rcx,%rdx + movq 48+0(%rsi),%r11 + movq 48+8(%rsi),%r12 + movq 48+16(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r11,%r11,%rbp + mulxq %r12,%r12,%r13 + addq %rbp,%r12 + adcq $0,%r13 + imulq %rdx + addq %rax,%r13 + adcq $0,%rdx + addq %r8,%r11 + adcq %r9,%r12 + adcq %r10,%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r12,%r11 + shrdq $31,%r13,%r12 + shrdq $31,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__ab_approximation_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 16(%rsi),%r8 + cmovzq %r10,%rbp + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 8(%rsi),%r8 + cmovzq %r10,%rbp + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + andnq %r9,%rax,%r9 + andnq %r11,%rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__inner_loop_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +L$oop_31: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edi + jnz L$oop_31 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + +L$oop_62: + xorq %rax,%rax + testq $1,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq $1,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz L$oop_62 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git 
a/temp-deps/blst-ts/deps/blst/build/mach-o/div3w-armv8.S b/temp-deps/blst-ts/deps/blst/build/mach-o/div3w-armv8.S new file mode 100644 index 0000000..5a5eb3a --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/div3w-armv8.S @@ -0,0 +1,88 @@ +.text + +.globl _div_3_limbs + +.align 5 +_div_3_limbs: + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +Loop: + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csel x4,x4,x6,lo // select between R and R - D + extr x1,x2,x1,#1 // D >>= 1 + csel x5,x5,x7,lo + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,Loop + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + specilative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret + +.globl _quot_rem_128 + +.align 5 +_quot_rem_128: + ldp x3,x4,[x1] + + mul x5,x3,x2 // divisor[0:1} * quotient + umulh x6,x3,x2 + mul x11, x4,x2 + umulh x7,x4,x2 + + ldp x8,x9,[x0] // load 3 limbs of the dividend + ldr x10,[x0,#16] + + adds x6,x6,x11 + adc x7,x7,xzr + + subs x8,x8,x5 // dividend - divisor * quotient + sbcs x9,x9,x6 + sbcs x10,x10,x7 + sbc x5,xzr,xzr // borrow -> mask + + add x2,x2,x5 // if borrowed, adjust the quotient ... + and x3,x3,x5 + and x4,x4,x5 + adds x8,x8,x3 // ... and add divisor + adc x9,x9,x4 + + stp x8,x9,[x0] // save 2 limbs of the remainder + str x2,[x0,#16] // and one limb of the quotient + + mov x0,x2 // return adjusted quotient + + ret + + +.globl _quot_rem_64 + +.align 5 +_quot_rem_64: + ldr x3,[x1] + ldr x8,[x0] // load 1 limb of the dividend + + mul x5,x3,x2 // divisor * quotient + + sub x8,x8,x5 // dividend - divisor * quotient + + stp x8,x2,[x0] // save remainder and quotient + + mov x0,x2 // return quotient + + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/div3w-x86_64.s b/temp-deps/blst-ts/deps/blst/build/mach-o/div3w-x86_64.s new file mode 100644 index 0000000..8075571 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/div3w-x86_64.s @@ -0,0 +1,115 @@ +.text + +.globl _div_3_limbs +.private_extern _div_3_limbs + +.p2align 5 +_div_3_limbs: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq (%rdi),%r8 + movq 8(%rdi),%r9 + xorq %rax,%rax + movl $64,%ecx + +L$oop: + movq %r8,%r10 + subq %rsi,%r8 + movq %r9,%r11 + sbbq %rdx,%r9 + leaq 1(%rax,%rax,1),%rax + movq %rdx,%rdi + cmovcq %r10,%r8 + cmovcq %r11,%r9 + sbbq $0,%rax + shlq $63,%rdi + shrq $1,%rsi + shrq $1,%rdx + orq %rdi,%rsi + subl $1,%ecx + jnz L$oop + + leaq 1(%rax,%rax,1),%rcx + sarq $63,%rax + + subq %rsi,%r8 + sbbq %rdx,%r9 + sbbq $0,%rcx + + orq %rcx,%rax + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _quot_rem_128 +.private_extern _quot_rem_128 + +.p2align 5 +_quot_rem_128: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax + movq %rdx,%rcx + + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + adcq $0,%rdx + + movq 0(%rdi),%r10 + movq 8(%rdi),%r11 + movq 16(%rdi),%rax + + subq %r8,%r10 + sbbq %r9,%r11 + sbbq %rdx,%rax + sbbq %r8,%r8 + + addq %r8,%rcx + movq %r8,%r9 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + addq %r8,%r10 + adcq %r9,%r11 + + movq %r10,0(%rdi) + movq %r11,8(%rdi) + movq %rcx,16(%rdi) + + movq %rcx,%rax + + .byte 0xf3,0xc3 +.cfi_endproc + + + + + + +.globl _quot_rem_64 +.private_extern _quot_rem_64 + +.p2align 5 +_quot_rem_64: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 
%rdx,%rax + imulq 0(%rsi),%rdx + + movq 0(%rdi),%r10 + + subq %rdx,%r10 + + movq %r10,0(%rdi) + movq %rax,8(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/mul_mont_256-armv8.S b/temp-deps/blst-ts/deps/blst/build/mach-o/mul_mont_256-armv8.S new file mode 100644 index 0000000..4f506b5 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/mul_mont_256-armv8.S @@ -0,0 +1,464 @@ +.text + +.globl _mul_mont_sparse_256 +.private_extern _mul_mont_sparse_256 + +.align 5 +_mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csel x19,x19,x14,lo + csel 
x20,x20,x15,lo + csel x21,x21,x16,lo + csel x22,x22,x17,lo + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret + +.globl _sqr_mont_sparse_256 +.private_extern _sqr_mont_sparse_256 + +.align 5 +_sqr_mont_sparse_256: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + +.globl _from_mont_256 +.private_extern _from_mont_256 + +.align 5 +_from_mont_256: +.long 3573752639 + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 +.long 3573752767 + ret + + +.globl _redc_mont_256 +.private_extern _redc_mont_256 + +.align 5 +_redc_mont_256: +.long 3573752639 + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 +.long 3573752767 + ret + + + +.align 5 +__mul_by_1_mont_256: + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/mul_mont_384-armv8.S b/temp-deps/blst-ts/deps/blst/build/mach-o/mul_mont_384-armv8.S new file mode 100644 index 0000000..5aa2e9f --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/mul_mont_384-armv8.S @@ -0,0 +1,2372 @@ +.text + +.globl _add_mod_384x384 + +.align 5 +_add_mod_384x384: +.long 3573752639 + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 +.long 3573752767 + ret + + + +.align 5 +__add_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + stp x11,x12,[x0,#48] + csel x15,x15,x23,lo + stp x13,x14,[x0,#64] + csel x16,x16,x24,lo + stp x15,x16,[x0,#80] + + ret + + +.globl _sub_mod_384x384 + +.align 5 +_sub_mod_384x384: +.long 3573752639 + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 +.long 3573752767 + ret + + + +.align 5 +__sub_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret + + + +.align 5 +__add_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + stp x11,x12,[x0] + csel x16,x16,x24,lo + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + + +.align 5 +__sub_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc 
x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.globl _mul_mont_384x +.private_extern _mul_mont_384x + +.align 5 +_mul_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sqr_mont_384x +.private_extern _sqr_mont_384x + +.align 5 +_sqr_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x19,x11,x19,lo + csel x20,x12,x20,lo + csel x21,x13,x21,lo + ldp x11,x12,[sp] + csel x22,x14,x22,lo + ldr x17, [sp,#48] + csel x23,x15,x23,lo + ldp x13,x14,[sp,#16] + csel x24,x16,x24,lo + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _mul_mont_384 +.private_extern _mul_mont_384 + +.align 5 +_mul_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + + +.align 5 +__mul_mont_384: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + 
adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh 
x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csel x11,x19,x26,lo + csel x12,x20,x27,lo + csel x13,x21,x28,lo + csel x14,x22,x0,lo + csel x15,x23,x1,lo + csel x16,x24,x3,lo + ret + + +.globl _sqr_mont_384 +.private_extern _sqr_mont_384 + +.align 5 +_sqr_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sqr_n_mul_mont_383 +.private_extern _sqr_n_mul_mont_383 + +.align 5 +_sqr_n_mul_mont_383: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +Loop_sqr_383: + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,Loop_sqr_383 + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.align 5 +__sqr_384: + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret + +.globl _sqr_384 +.private_extern _sqr_384 + +.align 5 +_sqr_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _redc_mont_384 +.private_extern _redc_mont_384 + +.align 5 +_redc_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _from_mont_384 +.private_extern _from_mont_384 + +.align 5 +_from_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + + +.align 5 +__mul_by_1_mont_384: + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul 
x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret + + + +.align 5 +__redc_tail_mont_384: + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.globl _mul_384 +.private_extern _mul_384 + +.align 5 +_mul_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + + +.align 5 +__mul_384: + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret + + +.globl _mul_382x +.private_extern _mul_382x + +.align 5 +_mul_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // _mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // _mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // _mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sqr_382x +.private_extern _sqr_382x + +.align 5 +_sqr_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // _mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // _mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sqr_mont_382x +.private_extern _sqr_mont_382x + +.align 5 +_sqr_mont_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // _mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // _mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + + +.align 5 +__mul_mont_383_nonred: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul 
x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 
+ mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret + + +.globl _sgn0_pty_mont_384 +.private_extern _sgn0_pty_mont_384 + +.align 5 +_sgn0_pty_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sgn0_pty_mont_384x +.private_extern _sgn0_pty_mont_384x + +.align 5 +_sgn0_pty_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/mulq_mont_256-x86_64.s b/temp-deps/blst-ts/deps/blst/build/mach-o/mulq_mont_256-x86_64.s new file mode 100644 index 0000000..d83f544 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/mulq_mont_256-x86_64.s @@ -0,0 +1,706 @@ +.text + +.globl _mul_mont_sparse_256 +.private_extern _mul_mont_sparse_256 + +.p2align 5 +_mul_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r13 + movq 8(%rsi),%r14 + movq 16(%rsi),%r12 + movq 24(%rsi),%rbp + movq %rdx,%rbx + + movq %rax,%r15 + mulq %r13 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqr_mont_sparse_256 +.private_extern _sqr_mont_sparse_256 + +.p2align 5 +_sqr_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rax + movq %rcx,%r8 + movq 8(%rsi),%r14 + movq %rdx,%rcx + movq 16(%rsi),%r12 + leaq (%rsi),%rbx + movq 24(%rsi),%rbp + + movq %rax,%r15 + mulq %rax + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulq %r14 + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq 8(%rbx),%rax + adcq $0,%rdx + xorq %r14,%r14 + movq %rdx,%r13 + + movq %r9,%rdi + imulq %r8,%r9 + + + movq %rax,%r15 + mulq 0(%rsi) + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq 
%rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + xorq %r15,%r15 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r9,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rdi,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + addq %rdx,%r13 + adcq $0,%r14 + adcq $0,%r15 + movq %r10,%rdi + imulq %r8,%r10 + + + movq %rax,%r9 + mulq 0(%rsi) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + xorq %r9,%r9 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r10,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rdi,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r13 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + addq %rdx,%r14 + adcq $0,%r15 + adcq $0,%r9 + movq %r11,%rdi + imulq %r8,%r11 + + + movq %rax,%r10 + mulq 0(%rsi) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r9 + xorq %r10,%r10 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r11,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rdi,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + addq %rdx,%r15 + adcq $0,%r9 + adcq $0,%r10 + imulq %r8,%rax + movq 8(%rsp),%rsi + + + movq %rax,%r11 + mulq 0(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r12,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + movq %r14,%rbx + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rdx,%r9 + adcq $0,%r10 + + + + + movq %r15,%r12 + subq 0(%rcx),%r13 + sbbq 8(%rcx),%r14 + sbbq 16(%rcx),%r15 + movq %r9,%rbp + sbbq 24(%rcx),%r9 + sbbq $0,%r10 + + cmovcq %rax,%r13 + cmovcq %rbx,%r14 + cmovcq %r12,%r15 + movq %r13,0(%rsi) + cmovcq %rbp,%r9 + movq %r14,8(%rsi) + movq %r15,16(%rsi) + movq %r9,24(%rsi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _from_mont_256 +.private_extern _from_mont_256 + +.p2align 5 +_from_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + 
pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + + + + + movq %r14,%r10 + movq %r15,%r11 + movq %r9,%r12 + + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + sbbq 24(%rbx),%r9 + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _redc_mont_256 +.private_extern _redc_mont_256 + +.p2align 5 +_redc_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + addq 32(%rsi),%r13 + adcq 40(%rsi),%r14 + movq %r13,%rax + adcq 48(%rsi),%r15 + movq %r14,%r10 + adcq 56(%rsi),%r9 + sbbq %rsi,%rsi + + + + + movq %r15,%r11 + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + movq %r9,%r12 + sbbq 24(%rbx),%r9 + sbbq $0,%rsi + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + movq %rax,%r13 + imulq %rcx,%rax + movq %rax,%r9 + + mulq 0(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq %rdx,%r13 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r10 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 16(%rbx) + movq %r10,%r14 + imulq %rcx,%r10 + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r11 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r13,%r12 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 
+ + mulq 16(%rbx) + movq %r12,%r9 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/mulq_mont_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/mach-o/mulq_mont_384-x86_64.s new file mode 100644 index 0000000..0d8ac89 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/mulq_mont_384-x86_64.s @@ -0,0 +1,3612 @@ +.text + + + + + + + + +.p2align 5 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + 
sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_mont_384x +.private_extern _mul_mont_384x + +.p2align 5 +_mul_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulq_384 + + + leaq 48(%rbx),%rbx + leaq 48(%rsi),%rsi + leaq 40+96(%rsp),%rdi + call __mulq_384 + + + movq 8(%rsp),%rcx + leaq -48(%rsi),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulq_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + movq %rcx,%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_mont_384x +.private_extern _sqr_mont_384x + +.p2align 5 +_sqr_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + call __mulq_mont_384 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + movq %r14,%r12 + adcq %r9,%r9 + movq %r15,%r13 + adcq %r10,%r10 + movq %r8,%rax + adcq %r11,%r11 + movq %r9,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r10,%rbp + sbbq 16(%rcx),%r8 + sbbq 24(%rcx),%r9 + sbbq 32(%rcx),%r10 + 
movq %r11,%rsi + sbbq 40(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r14 + cmovcq %r13,%r15 + cmovcq %rax,%r8 + movq %r14,48(%rdi) + cmovcq %rbx,%r9 + movq %r15,56(%rdi) + cmovcq %rbp,%r10 + movq %r8,64(%rdi) + cmovcq %rsi,%r11 + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_382x +.private_extern _mul_382x + +.p2align 5 +_mul_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulq_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulq_384 + + + leaq 48(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulq_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_382x +.private_extern _sqr_382x + +.p2align 5 +_sqr_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulq_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulq_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_384 +.private_extern _mul_384 + +.p2align 5 +_mul_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rbx + call __mulq_384 + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__mulq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rax + + movq %rax,%rbp + mulq 0(%rsi) + movq %rax,0(%rdi) + movq %rbp,%rax + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r11 + movq 8(%rbx),%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,8(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq 
$0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,16(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,24(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,32(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,40(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq %rax,%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rcx,48(%rdi) + movq %r8,56(%rdi) + movq %r9,64(%rdi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_384 +.private_extern _sqr_384 + +.p2align 5 +_sqr_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset 
%r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sqrq_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sqrq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r15 + movq 16(%rsi),%rcx + movq 24(%rsi),%rbx + + + movq %rax,%r14 + mulq %r15 + movq %rax,%r9 + movq %r14,%rax + movq 32(%rsi),%rbp + movq %rdx,%r10 + + mulq %rcx + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + movq 40(%rsi),%rsi + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r11 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq %rax + xorq %r8,%r8 + movq %rax,0(%rdi) + movq %r15,%rax + addq %r9,%r9 + adcq $0,%r8 + addq %rdx,%r9 + adcq $0,%r8 + movq %r9,8(%rdi) + + mulq %rcx + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbp + addq %rax,%r13 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rax + xorq %r9,%r9 + addq %rax,%r8 + movq %rcx,%rax + addq %r10,%r10 + adcq %r11,%r11 + adcq $0,%r9 + addq %r8,%r10 + adcq %rdx,%r11 + adcq $0,%r9 + movq %r10,16(%rdi) + + mulq %rbx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%r8 + + mulq %rbp + addq %rax,%r14 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq %rsi + addq %rax,%r15 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + xorq %r11,%r11 + addq %rax,%r9 + movq %rbx,%rax + addq %r12,%r12 + adcq %r13,%r13 + adcq $0,%r11 + addq %r9,%r12 + adcq %rdx,%r13 + adcq $0,%r11 + movq %r12,32(%rdi) + + + mulq %rbp + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %r13,40(%rdi) + movq %rdx,%r8 + + mulq %rsi + addq %rax,%rcx + movq %rbx,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%rbx + + mulq %rax + xorq %r12,%r12 + addq %rax,%r11 + movq %rbp,%rax + addq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r12 + addq %r11,%r14 + adcq %rdx,%r15 + movq %r14,48(%rdi) + adcq $0,%r12 + movq %r15,56(%rdi) + + + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + xorq %r13,%r13 + addq %rax,%r12 + movq %rsi,%rax + addq %rcx,%rcx + adcq %rbx,%rbx + adcq $0,%r13 + addq %r12,%rcx + adcq %rdx,%rbx + movq %rcx,64(%rdi) + adcq $0,%r13 + movq %rbx,72(%rdi) + + + mulq %rax + addq %r13,%rax + addq %rbp,%rbp + adcq $0,%rdx + addq %rbp,%rax + adcq $0,%rdx + movq %rax,80(%rdi) + movq %rdx,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqr_mont_384 +.private_extern _sqr_mont_384 + +.p2align 5 +_sqr_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + 
pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $120,%rsp +.cfi_adjust_cfa_offset 8*15 + + + movq %rcx,96(%rsp) + movq %rdx,104(%rsp) + movq %rdi,112(%rsp) + + movq %rsp,%rdi + call __sqrq_384 + + leaq 0(%rsp),%rsi + movq 96(%rsp),%rcx + movq 104(%rsp),%rbx + movq 112(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 120(%rsp),%r8 + movq 120(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*21 + + .byte 0xf3,0xc3 +.cfi_endproc + + + + +.globl _redc_mont_384 +.private_extern _redc_mont_384 + +.p2align 5 +_redc_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + + + +.globl _from_mont_384 +.private_extern _from_mont_384 + +.p2align 5 +_from_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + + + + + + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r8 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r9 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r10 + adcq 
$0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %r9,%r15 + imulq %rcx,%r9 + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 32(%rbx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 40(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r9,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %r10,%r8 + imulq %rcx,%r10 + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r8 + movq %r10,%rax + adcq %rdx,%r8 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %r11,%r9 + imulq %rcx,%r11 + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r11,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %r12,%r10 + imulq %rcx,%r12 + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %r13,%r11 + imulq %rcx,%r13 + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rbx) + addq %rax,%r9 + movq 
%r13,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0_pty_mont_384 +.private_extern _sgn0_pty_mont_384 + +.p2align 5 +_sgn0_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0_pty_mont_384x +.private_extern _sgn0_pty_mont_384x + +.p2align 5 +_sgn0_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq 
%rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_mont_384 +.private_extern _mul_mont_384 + +.p2align 5 +_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq %rdx,%rbx + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + + call __mulq_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -72 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rdi + mulq %r14 + movq %rax,%r8 + movq %rdi,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%rbp + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + xorq %r15,%r15 + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r8,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r9,%rbp + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq 
%rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r14 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r14 + movq %r9,%rax + adcq %rdx,%r15 + adcq $0,%r8 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r9,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + adcq $0,%r8 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r10,%rbp + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r15 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r15 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r10,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r11,%rbp + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r11,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + 
addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq %rdx,%r9 + adcq $0,%r10 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r12,%rbp + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r9 + adcq $0,%rdx + xorq %r11,%r11 + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r10 + adcq $0,%r11 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r12,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + movq %r13,%rbp + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r8 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rsi) + addq %r12,%r10 + adcq $0,%rdx + xorq %r12,%r12 + addq %rax,%r10 + movq %r13,%rax + adcq %rdx,%r11 + adcq $0,%r12 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r13,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq %rdx,%r11 + adcq $0,%r12 + + + + + movq 16(%rsp),%rdi + subq 0(%rcx),%r14 + movq %r15,%rdx + sbbq 8(%rcx),%r15 + movq %r8,%rbx + sbbq 16(%rcx),%r8 + movq %r9,%rsi + sbbq 24(%rcx),%r9 + movq %r10,%rbp + sbbq 32(%rcx),%r10 + movq %r11,%r13 + sbbq 40(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rdx,%r15 + cmovcq %rbx,%r8 + movq %r14,0(%rdi) + cmovcq %rsi,%r9 + movq %r15,8(%rdi) + cmovcq %rbp,%r10 + movq %r8,16(%rdi) + cmovcq %r13,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_n_mul_mont_384 +.private_extern _sqr_n_mul_mont_384 + +.p2align 5 +_sqr_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +L$oop_sqr_384: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1,%edx + leaq 0(%rdi),%rsi + decl %edx + jnz L$oop_sqr_384 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqr_n_mul_mont_383 +.private_extern _sqr_n_mul_mont_383 + +.p2align 5 +_sqr_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +L$oop_sqr_383: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + + movd %xmm1,%edx + addq 48(%rsi),%r14 + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + leaq 0(%rdi),%rsi + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + decl %edx + jnz L$oop_sqr_383 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rbp + mulq %r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%r15 + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%r15 + movq %r8,%rax + adcq 
%rdx,%r15 + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r9 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rcx) + addq %r15,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %r15,%r13 + adcq %rdx,%r14 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + movq %r9,%r8 + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rsi) + addq %r15,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rcx) + addq %rax,%r8 + movq %r9,%rax + adcq %rdx,%r8 + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rcx) + addq %r8,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r8,%r14 + adcq %rdx,%r15 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r10,%r9 + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rcx) + addq %rax,%r9 + movq %r10,%rax + adcq %rdx,%r9 + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rcx) + addq %r9,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r9,%r15 + adcq %rdx,%r8 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq 
%rdx,%r9 + + movq %r11,%r10 + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rcx) + addq %rax,%r10 + movq %r11,%rax + adcq %rdx,%r10 + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rcx) + addq %r10,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r10,%r8 + adcq %rdx,%r9 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r12,%r11 + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rcx) + addq %rax,%r11 + movq %r12,%rax + adcq %rdx,%r11 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rcx) + addq %r11,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r11,%r9 + adcq %rdx,%r10 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r13,%r12 + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 0(%rcx) + addq %rax,%r12 + movq %r13,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 24(%rcx) + addq %r12,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq 
$0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r12,%r10 + adcq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_mont_382x +.private_extern _sqr_mont_382x + +.p2align 5 +_sqr_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq 24(%rsp),%rdi + call __mulq_mont_383_nonred + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_383_nonred + movq 32+96(%rsp),%rsi + movq 32+0(%rsp),%r12 + movq 32+8(%rsp),%r13 + andq %rsi,%r12 + movq 32+16(%rsp),%rax + andq %rsi,%r13 + movq 32+24(%rsp),%rbx + andq %rsi,%rax + movq 32+32(%rsp),%rbp + andq %rsi,%rbx + andq %rsi,%rbp + andq 32+40(%rsp),%rsi + + subq %r12,%r14 + movq 0(%rcx),%r12 + sbbq %r13,%r15 + movq 8(%rcx),%r13 + sbbq %rax,%r8 + movq 16(%rcx),%rax + sbbq %rbx,%r9 + movq 24(%rcx),%rbx + sbbq %rbp,%r10 + movq 32(%rcx),%rbp + sbbq %rsi,%r11 + sbbq %rsi,%rsi + + andq %rsi,%r12 + andq %rsi,%r13 + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r12,%r14 + adcq %r13,%r15 + adcq %rax,%r8 + adcq %rbx,%r9 + adcq %rbp,%r10 + adcq %rsi,%r11 + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/mulx_mont_256-x86_64.s b/temp-deps/blst-ts/deps/blst/build/mach-o/mulx_mont_256-x86_64.s new file mode 100644 index 
0000000..178372f --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/mulx_mont_256-x86_64.s @@ -0,0 +1,619 @@ +.text + +.globl _mulx_mont_sparse_256 +.private_extern _mulx_mont_sparse_256 + +.p2align 5 +_mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqrx_mont_sparse_256 +.private_extern _sqrx_mont_sparse_256 + +.p2align 5 +_sqrx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + movq %rcx,%r8 + movq %rdx,%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulxq %r15,%r15,%r12 + mulxq %rbp,%rbp,%r13 + addq %r15,%r11 + mulxq %r9,%r9,%r14 + movq 8(%rbx),%rdx + adcq %rbp,%r12 + adcq %r9,%r13 + adcq $0,%r14 + + movq %rax,%r10 + imulq %r8,%rax + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r11 + adcxq %r9,%r12 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r14 + adcxq %r15,%r9 + adoxq %r9,%r15 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r10 + adoxq %r11,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r12 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r12 + adoxq %r9,%r13 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 16(%rbx),%rdx + adcxq %rbp,%r13 + adoxq %r9,%r14 + adcxq %r10,%r14 + adoxq %r10,%r15 + adcxq %r10,%r15 + adoxq %r10,%r10 + adcq $0,%r10 + movq %rax,%r11 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 16+128(%rsi),%rbp,%r9 + 
adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r15 + adcxq %r10,%r9 + adoxq %r9,%r10 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r11 + adoxq %r12,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r13 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r13 + adoxq %r9,%r14 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 24(%rbx),%rdx + adcxq %rbp,%r14 + adoxq %r9,%r15 + adcxq %r11,%r15 + adoxq %r11,%r10 + adcxq %r11,%r10 + adoxq %r11,%r11 + adcq $0,%r11 + movq %rax,%r12 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r15 + adcxq %r9,%r10 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r10 + adcxq %r11,%r9 + adoxq %r9,%r11 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r12 + adoxq %r13,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r14 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %rax,%rdx + adcxq %rbp,%r15 + adoxq %r9,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + adoxq %r12,%r12 + adcq $0,%r12 + imulq %r8,%rdx + + + xorq %rbp,%rbp + mulxq 0+128(%rcx),%r13,%r9 + adcxq %rax,%r13 + adoxq %r9,%r14 + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r15 + adoxq %r9,%r10 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %r14,%rdx + leaq 128(%rcx),%rcx + adcxq %rbp,%r10 + adoxq %r9,%r11 + movq %r15,%rax + adcxq %r13,%r11 + adoxq %r13,%r12 + adcq $0,%r12 + + + + + movq %r10,%rbp + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + sbbq 16(%rcx),%r10 + movq %r11,%r9 + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rdx,%r14 + cmovcq %rax,%r15 + cmovcq %rbp,%r10 + movq %r14,0(%rdi) + cmovcq %r9,%r11 + movq %r15,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _fromx_mont_256 +.private_extern _fromx_mont_256 + +.p2align 5 +_fromx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + + + + + movq %r15,%rdx + movq %r10,%r12 + movq %r11,%r13 + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _redcx_mont_256 +.private_extern _redcx_mont_256 + +.p2align 5 +_redcx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + addq 32(%rsi),%r14 + adcq 40(%rsi),%r15 + movq %r14,%rax + adcq 48(%rsi),%r10 + movq %r15,%rdx + adcq 56(%rsi),%r11 + sbbq %rsi,%rsi + + + + + movq %r10,%r12 + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + movq %r11,%r13 + sbbq 24(%rbx),%r11 + sbbq $0,%rsi + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r10 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r10 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + movq %r13,%r11 + imulq %rcx,%r13 + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/mulx_mont_384-x86_64.s b/temp-deps/blst-ts/deps/blst/build/mach-o/mulx_mont_384-x86_64.s new file mode 100644 index 0000000..95d3dad --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/mulx_mont_384-x86_64.s @@ -0,0 +1,2960 @@ +.text + + + + + + + + +.p2align 5 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 
40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mulx_mont_384x +.private_extern _mulx_mont_384x + +.p2align 5 +_mulx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulx_384 + + + leaq 48(%rbx),%rbx + leaq 128+48(%rsi),%rsi + leaq 96(%rdi),%rdi + call __mulx_384 + + + movq 
8(%rsp),%rcx + leaq (%rbx),%rsi + leaq -48(%rbx),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulx_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + leaq (%rcx),%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_mont_384x +.private_extern _sqrx_mont_384x + +.p2align 5 +_sqrx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + movq %rdx,%r8 + adcq %r12,%r12 + movq %r15,%r9 + adcq %rdi,%rdi + movq %rax,%r10 + adcq %rbp,%rbp + movq %r12,%r11 + sbbq %rsi,%rsi + + subq 0(%rcx),%rdx + sbbq 8(%rcx),%r15 + movq %rdi,%r13 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r12 + sbbq 32(%rcx),%rdi + movq %rbp,%r14 + sbbq 40(%rcx),%rbp + sbbq $0,%rsi + + cmovcq %r8,%rdx + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %rdx,48(%rbx) + cmovcq %r11,%r12 + movq %r15,56(%rbx) + cmovcq %r13,%rdi + movq %rax,64(%rbx) + cmovcq %r14,%rbp + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mulx_382x +.private_extern _mulx_382x + +.p2align 5 +_mulx_382x: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulx_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulx_384 + + + leaq 48+128(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulx_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_382x +.private_extern _sqrx_382x + +.p2align 5 +_sqrx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulx_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulx_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + 
movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mulx_384 +.private_extern _mulx_384 + +.p2align 5 +_mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + movq %rdx,%rbx + call __mulx_384 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq -128(%rsi),%rsi + + mulxq %r14,%r9,%rcx + xorq %rbp,%rbp + + mulxq %r15,%r8,%rax + adcxq %rcx,%r8 + movq %r9,0(%rdi) + + mulxq %r10,%r9,%rcx + adcxq %rax,%r9 + + mulxq %r11,%r10,%rax + adcxq %rcx,%r10 + + mulxq %r12,%r11,%rcx + adcxq %rax,%r11 + + mulxq %r13,%r12,%r13 + movq 8(%rbx),%rdx + adcxq %rcx,%r12 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,8(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 16(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,16(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 24(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,24(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq 
%r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 32(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,32(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 40(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,40(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq %rax,%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + movq %r10,64(%rdi) + movq %r11,72(%rdi) + movq %r12,80(%rdi) + movq %r13,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_384 +.private_extern _sqrx_384 + +.p2align 5 +_sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + call __sqrx_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%rcx + movq 32(%rsi),%rbx + + + mulxq %r14,%r8,%rdi + movq 40(%rsi),%rbp + mulxq %r15,%r9,%rax + addq %rdi,%r9 + mulxq %rcx,%r10,%rdi + adcq %rax,%r10 + mulxq %rbx,%r11,%rax + adcq %rdi,%r11 + mulxq %rbp,%r12,%r13 + movq %r14,%rdx + adcq %rax,%r12 + adcq $0,%r13 + + + xorq %r14,%r14 + mulxq %r15,%rdi,%rax + adcxq %rdi,%r10 + adoxq %rax,%r11 + + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r11 + adoxq %rax,%r12 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbp,%rdi,%rax + movq %r15,%rdx + adcxq %rdi,%r13 + adoxq %r14,%rax + adcxq %rax,%r14 + + + xorq %r15,%r15 + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r13 + adoxq %rax,%r14 + + mulxq %rbp,%rdi,%rax + movq %rcx,%rdx + adcxq %rdi,%r14 + adoxq %r15,%rax + adcxq %rax,%r15 + + + xorq %rcx,%rcx + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r14 + adoxq %rax,%r15 + + mulxq %rbp,%rdi,%rax + movq %rbx,%rdx + adcxq %rdi,%r15 + adoxq %rcx,%rax + adcxq %rax,%rcx + + + mulxq %rbp,%rdi,%rbx + movq 0(%rsi),%rdx + addq %rdi,%rcx + movq 8(%rsp),%rdi + adcq $0,%rbx + + + xorq %rbp,%rbp + adcxq %r8,%r8 + adcxq %r9,%r9 + adcxq %r10,%r10 + adcxq %r11,%r11 + adcxq %r12,%r12 + + + mulxq %rdx,%rdx,%rax + movq %rdx,0(%rdi) + movq 8(%rsi),%rdx + adoxq %rax,%r8 + movq %r8,8(%rdi) + + mulxq %rdx,%r8,%rax + movq 16(%rsi),%rdx + adoxq 
%r8,%r9 + adoxq %rax,%r10 + movq %r9,16(%rdi) + movq %r10,24(%rdi) + + mulxq %rdx,%r8,%r9 + movq 24(%rsi),%rdx + adoxq %r8,%r11 + adoxq %r9,%r12 + adcxq %r13,%r13 + adcxq %r14,%r14 + movq %r11,32(%rdi) + movq %r12,40(%rdi) + + mulxq %rdx,%r8,%r9 + movq 32(%rsi),%rdx + adoxq %r8,%r13 + adoxq %r9,%r14 + adcxq %r15,%r15 + adcxq %rcx,%rcx + movq %r13,48(%rdi) + movq %r14,56(%rdi) + + mulxq %rdx,%r8,%r9 + movq 40(%rsi),%rdx + adoxq %r8,%r15 + adoxq %r9,%rcx + adcxq %rbx,%rbx + adcxq %rbp,%rbp + movq %r15,64(%rdi) + movq %rcx,72(%rdi) + + mulxq %rdx,%r8,%r9 + adoxq %r8,%rbx + adoxq %r9,%rbp + + movq %rbx,80(%rdi) + movq %rbp,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + + +.globl _redcx_mont_384 +.private_extern _redcx_mont_384 + +.p2align 5 +_redcx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + + + +.globl _fromx_mont_384 +.private_extern _fromx_mont_384 + +.p2align 5 +_fromx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + + + + + movq %r14,%rax + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq %rcx,%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + imulq %r8,%rdx + + + xorq %r14,%r14 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 
32(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r13 + adoxq %r14,%rbp + adcxq %rbp,%r14 + imulq %r9,%rdx + + + xorq %r15,%r15 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r14 + adoxq %r15,%rbp + adcxq %rbp,%r15 + imulq %r10,%rdx + + + xorq %r8,%r8 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r15 + adoxq %r8,%rbp + adcxq %rbp,%r8 + imulq %r11,%rdx + + + xorq %r9,%r9 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r8 + adoxq %r9,%rbp + adcxq %rbp,%r9 + imulq %r12,%rdx + + + xorq %r10,%r10 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r9 + adoxq %r10,%rbp + adcxq %rbp,%r10 + imulq %r13,%rdx + + + xorq %r11,%r11 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r10 + adoxq %r11,%rbp + adcxq %rbp,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0x_pty_mont_384 +.private_extern _sgn0x_pty_mont_384 + +.p2align 5 +_sgn0x_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + 
pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0x_pty_mont_384x +.private_extern _sgn0x_pty_mont_384x + +.p2align 5 +_sgn0x_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mulx_mont_384 +.private_extern _mulx_mont_384 + +.p2align 5 +_mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp 
+.cfi_adjust_cfa_offset 8*3 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + movq %r8,(%rsp) + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + xorq %r15,%r15 + + movq %r8,16(%rsp) + imulq 8(%rsp),%r8 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %rbp,%r15 + adoxq %rax,%r15 + adoxq %rax,%rax + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %r8,%r14 + adoxq %r8,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r9,16(%rsp) + imulq 8(%rsp),%r9 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rbp,%rax + adoxq %r8,%rax + adoxq %r8,%r8 + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r9,%r15 + adoxq %r9,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r10,16(%rsp) + imulq 8(%rsp),%r10 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + 
adoxq %rdi,%rax + adcxq %rbp,%r8 + adoxq %r9,%r8 + adoxq %r9,%r9 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r10,%rax + adoxq %r10,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r11,16(%rsp) + imulq 8(%rsp),%r11 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %r10,%r9 + adoxq %r10,%r10 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r11,%r8 + adoxq %r11,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + movq %r12,16(%rsp) + imulq 8(%rsp),%r12 + + + xorq %r11,%r11 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %r11,%r10 + adoxq %r11,%r11 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r12,%r9 + adoxq %r12,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + movq %r15,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + movq %rax,%rsi + + mulxq 40+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + movq %r14,%rdx + adcxq %r12,%r10 + adoxq %r12,%r11 + leaq 128(%rcx),%rcx + movq %r8,%r12 + adcq $0,%r11 + + + + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r9,%rdi + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r8 + sbbq 32(%rcx),%r9 + movq %r10,%rbp + sbbq 40(%rcx),%r10 + sbbq $0,%r11 + + cmovncq %r14,%rdx + cmovcq 
%r13,%r15 + cmovcq %rsi,%rax + cmovncq %r8,%r12 + movq %rdx,0(%rbx) + cmovncq %r9,%rdi + movq %r15,8(%rbx) + cmovncq %r10,%rbp + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_mont_384 +.private_extern _sqrx_mont_384 + +.p2align 5 +_sqrx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rcx,%r8 + leaq -128(%rdx),%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + leaq (%rsi),%rbx + movq %r8,(%rsp) + leaq -128(%rsi),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqrx_n_mul_mont_384 +.private_extern _sqrx_n_mul_mont_384 + +.p2align 5 +_sqrx_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + +L$oop_sqrx_384: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movd %xmm1,%r10d + decl %r10d + jnz L$oop_sqrx_384 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqrx_n_mul_mont_383 +.private_extern _sqrx_n_mul_mont_383 + +.p2align 5 +_sqrx_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 
16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + leaq -128(%rcx),%rcx + +L$oop_sqrx_383: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_383_nonred + + movd %xmm1,%r10d + decl %r10d + jnz L$oop_sqrx_383 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + movq %r8,%rax + imulq 8(%rsp),%r8 + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %r15,%rbp + adoxq %rbp,%r15 + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %rax,%r14 + adoxq %rax,%r15 + adcxq %rax,%r15 + movq %r9,%r8 + imulq 8(%rsp),%r9 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rax,%rbp + adoxq %rbp,%rax + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r10,%r9 + imulq 8(%rsp),%r10 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + 
mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %r8,%rbp + adoxq %rbp,%r8 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r11,%r10 + imulq 8(%rsp),%r11 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %r9,%rbp + adoxq %rbp,%r9 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r12,%r11 + imulq 8(%rsp),%r12 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %r10,%rbp + adoxq %rbp,%r10 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r14,%rdx + adcxq %rdi,%r9 + adoxq %rbp,%r10 + adcq $0,%r10 + movq %r8,%r12 + + movq %r14,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r9,%rdi + movq %r8,24(%rbx) + movq %r9,32(%rbx) + movq %r10,40(%rbx) + movq %r10,%rbp + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_mont_382x +.private_extern _sqrx_mont_382x + +.p2align 5 +_sqrx_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 
+.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + adcq %r12,%r12 + adcq %rdi,%rdi + adcq %rbp,%rbp + + movq %rdx,48(%rbx) + movq %r15,56(%rbx) + movq %rax,64(%rbx) + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32-128(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + + + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + movq 32+96(%rsp),%r14 + leaq 128(%rcx),%rcx + movq 32+0(%rsp),%r8 + andq %r14,%r8 + movq 32+8(%rsp),%r9 + andq %r14,%r9 + movq 32+16(%rsp),%r10 + andq %r14,%r10 + movq 32+24(%rsp),%r11 + andq %r14,%r11 + movq 32+32(%rsp),%r13 + andq %r14,%r13 + andq 32+40(%rsp),%r14 + + subq %r8,%rdx + movq 0(%rcx),%r8 + sbbq %r9,%r15 + movq 8(%rcx),%r9 + sbbq %r10,%rax + movq 16(%rcx),%r10 + sbbq %r11,%r12 + movq 24(%rcx),%r11 + sbbq %r13,%rdi + movq 32(%rcx),%r13 + sbbq %r14,%rbp + sbbq %r14,%r14 + + andq %r14,%r8 + andq %r14,%r9 + andq %r14,%r10 + andq %r14,%r11 + andq %r14,%r13 + andq 40(%rcx),%r14 + + addq %r8,%rdx + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%r12 + adcq %r13,%rdi + adcq %r14,%rbp + + movq %rdx,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/sha256-armv8.S b/temp-deps/blst-ts/deps/blst/build/mach-o/sha256-armv8.S new file mode 100644 index 0000000..c928f75 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/sha256-armv8.S @@ -0,0 +1,1077 @@ +// +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +// +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +// project. +// ==================================================================== +// +// sha256_block procedure for ARMv8. +// +// This module is stripped of scalar code paths, with raionale that all +// known processors are NEON-capable. +// +// See original module at CRYPTOGAMS for further details. + +.text + +.align 6 + +LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.align 2 +.align 2 +.globl _blst_sha256_block_armv8 + +.align 6 +_blst_sha256_block_armv8: +Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,LK256 + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 
v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +.globl _blst_sha256_block_data_order + +.align 4 +_blst_sha256_block_data_order: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + sub sp,sp,#16*4 + + adr x16,LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b L_00_48 + +.align 4 +L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli 
v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + 
add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne L_00_48 + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add 
w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor 
w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret + +.globl _blst_sha256_emit +.private_extern _blst_sha256_emit + +.align 4 +_blst_sha256_emit: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] + str w6,[x0,#16] + str w7,[x0,#24] + ret + + +.globl _blst_sha256_bcopy +.private_extern _blst_sha256_bcopy + +.align 4 +_blst_sha256_bcopy: +Loop_bcopy: + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,Loop_bcopy + ret + + +.globl _blst_sha256_hcopy +.private_extern _blst_sha256_hcopy + +.align 4 +_blst_sha256_hcopy: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/sha256-portable-x86_64.s b/temp-deps/blst-ts/deps/blst/build/mach-o/sha256-portable-x86_64.s new file mode 100644 index 0000000..3f00072 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/sha256-portable-x86_64.s @@ -0,0 +1,1746 @@ +.text + +.globl _blst_sha256_block_data_order + +.p2align 4 +_blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $64+24,%rsp +.cfi_adjust_cfa_offset 16*4+3*8 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp L$loop + +.p2align 4 +L$loop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 0(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 4(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 8(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 12(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 16(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 20(%rsi),%r12d + 
movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 20(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 24(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 28(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 32(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 36(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 40(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl 
%ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 44(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 48(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 52(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 56(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 60(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + jmp L$rounds_16_xx +.p2align 4 +L$rounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 64(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi 
+ rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 68(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 72(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 76(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 80(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 24(%rsp),%r13d 
+ movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 84(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 88(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 92(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 96(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl 
$2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 100(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 104(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 108(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 112(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + 
rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 116(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 120(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 124(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 64(%rbp),%rbp + cmpb $0x19,3(%rbp) + jnz L$rounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop + + leaq 64+24+48(%rsp),%r11 +.cfi_def_cfa %r11,8 + movq 64+24(%rsp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 
0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl _blst_sha256_emit +.private_extern _blst_sha256_emit + +.p2align 4 +_blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _blst_sha256_bcopy +.private_extern _blst_sha256_bcopy + +.p2align 4 +_blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +L$oop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz L$oop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _blst_sha256_hcopy +.private_extern _blst_sha256_hcopy + +.p2align 4 +_blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/temp-deps/blst-ts/deps/blst/build/mach-o/sha256-x86_64.s b/temp-deps/blst-ts/deps/blst/build/mach-o/sha256-x86_64.s new file mode 100644 index 0000000..dee75e3 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/mach-o/sha256-x86_64.s @@ -0,0 +1,1438 @@ +.text + +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl _blst_sha256_block_data_order_shaext +.private_extern _blst_sha256_block_data_order_shaext + +.p2align 6 +_blst_sha256_block_data_order_shaext: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 256-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp L$oop_shaext + +.p2align 4 +L$oop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 16-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 48-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 96-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 112-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 144-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 176-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 208-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 
+.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 224-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 240-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz L$oop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _blst_sha256_block_data_order +.private_extern _blst_sha256_block_data_order + +.p2align 6 +_blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $40,%rsp +.cfi_adjust_cfa_offset 40 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,0(%rsp) + + movq %rdx,16(%rsp) + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + + + leaq -64(%rsp),%rsp + movl 0(%rdi),%eax + andq $-64,%rsp + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp L$loop_ssse3 +.p2align 4 +L$loop_ssse3: + movdqa K256+256(%rip),%xmm7 + movq %rsi,8(%rbp) + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rsi +.byte 102,15,56,0,207 + movdqa 0(%rsi),%xmm4 + movdqa 16(%rsi),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 32(%rsi),%xmm6 +.byte 102,15,56,0,223 + movdqa 48(%rsi),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp L$ssse3_00_47 + +.p2align 4 +L$ssse3_00_47: + subq $-64,%rsi + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + 
psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + 
psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 16(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 32(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + 
movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 48(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,67(%rsi) + jne L$ssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl 
%r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + 
addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 0(%rbp),%rdi + movl %r14d,%eax + movq 8(%rbp),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + leaq 64(%rsi),%rsi + cmpq 16(%rbp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop_ssse3 + + xorps %xmm0,%xmm0 + leaq 40+48(%rbp),%r11 +.cfi_def_cfa %r11,8 + movaps %xmm0,0(%rsp) + movaps %xmm0,16(%rsp) + movaps %xmm0,32(%rsp) + movaps %xmm0,48(%rsp) + movq 40(%rbp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbx +.cfi_restore %rbx + movq -8(%r11),%rbp +.cfi_restore %rbp + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _blst_sha256_emit +.private_extern 
_blst_sha256_emit + +.p2align 4 +_blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _blst_sha256_bcopy +.private_extern _blst_sha256_bcopy + +.p2align 4 +_blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +L$oop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz L$oop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _blst_sha256_hcopy +.private_extern _blst_sha256_hcopy + +.p2align 4 +_blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/temp-deps/blst-ts/deps/blst/build/refresh.sh b/temp-deps/blst-ts/deps/blst/build/refresh.sh new file mode 100755 index 0000000..e8c8137 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/refresh.sh @@ -0,0 +1,49 @@ +#!/bin/sh + +HERE=`dirname $0` +cd "${HERE}" + +PERL=${PERL:-perl} + +for pl in ../src/asm/*-x86_64.pl; do + s=`basename $pl .pl`.asm + expr $s : '.*portable' > /dev/null || (set -x; ${PERL} $pl masm > win64/$s) + s=`basename $pl .pl`.s + (set -x; ${PERL} $pl elf > elf/$s) + (set -x; ${PERL} $pl mingw64 > coff/$s) + (set -x; ${PERL} $pl macosx > mach-o/$s) +done + +for pl in ../src/asm/*-armv8.pl; do + s=`basename $pl .pl`.asm + (set -x; ${PERL} $pl win64 > win64/$s) + s=`basename $pl .pl`.S + (set -x; ${PERL} $pl linux64 > elf/$s) + (set -x; ${PERL} $pl coff64 > coff/$s) + (set -x; ${PERL} $pl ios64 > mach-o/$s) +done + +( cd ../bindings; + echo "LIBRARY blst" + echo + echo "EXPORTS" + cc -E blst.h | \ + ${PERL} -ne '{ (/(blst_[\w]+)\s*\(/ || /(BLS12_[\w]+);/) && print "\t$1\n" }' + echo +) > win64/blst.def + +if which bindgen > /dev/null 2>&1; then + ( cd ../bindings; set -x; + bindgen --opaque-type blst_pairing \ + --opaque-type blst_uniq \ + --with-derive-default \ + --with-derive-eq \ + --size_t-is-usize \ + --rustified-enum BLST.\* \ + blst.h -- -D__BLST_RUST_BINDGEN__ \ + | ${PERL} ../build/bindings_trim.pl > rust/src/bindings.rs + ) +else + echo "Install Rust bindgen with 'cargo install bindgen'" 1>&2 + exit 1 +fi diff --git a/temp-deps/blst-ts/deps/blst/build/win64/add_mod_256-armv8.asm b/temp-deps/blst-ts/deps/blst/build/win64/add_mod_256-armv8.asm new file mode 100644 index 0000000..8d69751 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/add_mod_256-armv8.asm @@ -0,0 +1,380 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + + EXPORT |add_mod_256|[FUNC] + ALIGN 32 +|add_mod_256| PROC + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x16 + csello x9,x9,x17 + csello x10,x10,x1 + stp x8,x9,[x0] + csello x11,x11,x2 + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |mul_by_3_mod_256|[FUNC] + ALIGN 32 +|mul_by_3_mod_256| PROC + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] 
+ + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x16 + csello x9,x9,x17 + csello x10,x10,x1 + csello x11,x11,x2 + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x16 + csello x9,x9,x17 + csello x10,x10,x1 + stp x8,x9,[x0] + csello x11,x11,x2 + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |lshift_mod_256|[FUNC] + ALIGN 32 +|lshift_mod_256| PROC + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +|$Loop_lshift_mod_256| + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs x15,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x12 + csello x9,x9,x13 + csello x10,x10,x14 + csello x11,x11,x15 + + cbnz x2,|$Loop_lshift_mod_256| + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |rshift_mod_256|[FUNC] + ALIGN 32 +|rshift_mod_256| PROC + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +|$Loop_rshift| + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + cselne x12,x12,x8 + cselne x13,x13,x9 + cselne x14,x14,x10 + cselne x15,x15,x11 + cselne x3,x3,xzr + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,|$Loop_rshift| + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |cneg_mod_256|[FUNC] + ALIGN 32 +|cneg_mod_256| PROC + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetmne x3 + ands x2,x2,x3 + + cseleq x8,x8,x12 + cseleq x9,x9,x13 + cseleq x10,x10,x14 + stp x8,x9,[x0] + cseleq x11,x11,x15 + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |sub_mod_256|[FUNC] + ALIGN 32 +|sub_mod_256| PROC + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc x11,x11,x7 + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |check_mod_256|[FUNC] + ALIGN 32 +|check_mod_256| PROC + ldp x8,x9,[x0] + ldp x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + cselne x0,x0,xzr + and x0,x0,x1 + + ret + ENDP + + + + EXPORT |add_n_check_mod_256|[FUNC] + ALIGN 32 +|add_n_check_mod_256| PROC + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + 
sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x16 + csello x9,x9,x17 + csello x10,x10,x1 + csello x11,x11,x2 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + cselne x0,x17,xzr + + ret + ENDP + + + + EXPORT |sub_n_check_mod_256|[FUNC] + ALIGN 32 +|sub_n_check_mod_256| PROC + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + cselne x0,x17,xzr + + ret + ENDP + END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/add_mod_256-x86_64.asm b/temp-deps/blst-ts/deps/blst/build/win64/add_mod_256-x86_64.asm new file mode 100644 index 0000000..09a5c17 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/add_mod_256-x86_64.asm @@ -0,0 +1,934 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC add_mod_256 + + +ALIGN 32 +add_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_add_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + +$L$oaded_a_add_mod_256:: + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + mov rax,r8 + adc r10,QWORD PTR[16+rdx] + mov rsi,r9 + adc r11,QWORD PTR[24+rdx] + sbb rdx,rdx + + mov rbx,r10 + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov rbp,r11 + sbb r11,QWORD PTR[24+rcx] + sbb rdx,0 + + cmovc r8,rax + cmovc r9,rsi + mov QWORD PTR[rdi],r8 + cmovc r10,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r11,rbp + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_add_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_256:: +add_mod_256 ENDP + + +PUBLIC mul_by_3_mod_256 + + +ALIGN 32 +mul_by_3_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_3_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + +$L$SEH_body_mul_by_3_mod_256:: + + + mov rcx,rdx + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov rdx,rsi + mov r11,QWORD PTR[24+rsi] + + call __lshift_mod_256 + mov r12,QWORD PTR[rsp] + + jmp $L$oaded_a_add_mod_256 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_mul_by_3_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_3_mod_256:: +mul_by_3_mod_256 ENDP + + +ALIGN 32 
+__lshift_mod_256 PROC PRIVATE + DB 243,15,30,250 + add r8,r8 + adc r9,r9 + mov rax,r8 + adc r10,r10 + mov rsi,r9 + adc r11,r11 + sbb r12,r12 + + mov rbx,r10 + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov rbp,r11 + sbb r11,QWORD PTR[24+rcx] + sbb r12,0 + + cmovc r8,rax + cmovc r9,rsi + cmovc r10,rbx + cmovc r11,rbp + + DB 0F3h,0C3h ;repret +__lshift_mod_256 ENDP + + +PUBLIC lshift_mod_256 + + +ALIGN 32 +lshift_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_lshift_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + +$L$SEH_body_lshift_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + +$L$oop_lshift_mod_256:: + call __lshift_mod_256 + dec edx + jnz $L$oop_lshift_mod_256 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov r12,QWORD PTR[rsp] + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_lshift_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_lshift_mod_256:: +lshift_mod_256 ENDP + + +PUBLIC rshift_mod_256 + + +ALIGN 32 +rshift_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_rshift_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_rshift_mod_256:: + + + mov rbp,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + +$L$oop_rshift_mod_256:: + mov r8,rbp + and rbp,1 + mov rax,QWORD PTR[rcx] + neg rbp + mov rsi,QWORD PTR[8+rcx] + mov rbx,QWORD PTR[16+rcx] + + and rax,rbp + and rsi,rbp + and rbx,rbp + and rbp,QWORD PTR[24+rcx] + + add r8,rax + adc r9,rsi + adc r10,rbx + adc r11,rbp + sbb rax,rax + + shr r8,1 + mov rbp,r9 + shr r9,1 + mov rbx,r10 + shr r10,1 + mov rsi,r11 + shr r11,1 + + shl rbp,63 + shl rbx,63 + or rbp,r8 + shl rsi,63 + or r9,rbx + shl rax,63 + or r10,rsi + or r11,rax + + dec edx + jnz $L$oop_rshift_mod_256 + + mov QWORD PTR[rdi],rbp + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_rshift_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_rshift_mod_256:: +rshift_mod_256 ENDP + + +PUBLIC cneg_mod_256 + + +ALIGN 32 +cneg_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_cneg_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + +$L$SEH_body_cneg_mod_256:: + + + mov r12,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r8,r12 + mov r11,QWORD PTR[24+rsi] + or r12,r9 + or r12,r10 + or r12,r11 + mov rbp,-1 + + mov rax,QWORD PTR[rcx] + cmovnz r12,rbp + mov rsi,QWORD PTR[8+rcx] + mov rbx,QWORD PTR[16+rcx] + and rax,r12 + mov rbp,QWORD PTR[24+rcx] + and rsi,r12 + and rbx,r12 + and rbp,r12 + + sub rax,r8 + sbb rsi,r9 + sbb rbx,r10 + sbb rbp,r11 + + or rdx,rdx + + cmovz rax,r8 + cmovz rsi,r9 + mov QWORD PTR[rdi],rax + cmovz rbx,r10 + mov QWORD PTR[8+rdi],rsi + cmovz 
rbp,r11 + mov QWORD PTR[16+rdi],rbx + mov QWORD PTR[24+rdi],rbp + + mov r12,QWORD PTR[rsp] + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_cneg_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_cneg_mod_256:: +cneg_mod_256 ENDP + + +PUBLIC sub_mod_256 + + +ALIGN 32 +sub_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_sub_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + + sub r8,QWORD PTR[rdx] + mov rax,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov rsi,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[24+rcx] + sbb rdx,rdx + + and rax,rdx + and rsi,rdx + and rbx,rdx + and rbp,rdx + + add r8,rax + adc r9,rsi + mov QWORD PTR[rdi],r8 + adc r10,rbx + mov QWORD PTR[8+rdi],r9 + adc r11,rbp + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_sub_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_256:: +sub_mod_256 ENDP + + +PUBLIC check_mod_256 + + +ALIGN 32 +check_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_check_mod_256:: + mov rdi,rcx + mov rsi,rdx + + + + mov rax,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + + mov r8,rax + or rax,r9 + or rax,r10 + or rax,r11 + + sub r8,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rsi,rsi + + mov rdx,1 + cmp rax,0 + cmovne rax,rdx + and rax,rsi +$L$SEH_epilogue_check_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_check_mod_256:: +check_mod_256 ENDP + + +PUBLIC add_n_check_mod_256 + + +ALIGN 32 +add_n_check_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_n_check_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_add_n_check_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + mov rax,r8 + adc r10,QWORD PTR[16+rdx] + mov rsi,r9 + adc r11,QWORD PTR[24+rdx] + sbb rdx,rdx + + mov rbx,r10 + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov rbp,r11 + sbb r11,QWORD PTR[24+rcx] + sbb rdx,0 + + cmovc r8,rax + cmovc r9,rsi + mov QWORD PTR[rdi],r8 + cmovc r10,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r11,rbp + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + or r8,r9 + or r10,r11 + or r8,r10 + mov rax,1 + cmovz rax,r8 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_add_n_check_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_n_check_mod_256:: +add_n_check_mod_256 ENDP + + +PUBLIC 
sub_n_check_mod_256 + + +ALIGN 32 +sub_n_check_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_n_check_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_sub_n_check_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + + sub r8,QWORD PTR[rdx] + mov rax,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov rsi,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[24+rcx] + sbb rdx,rdx + + and rax,rdx + and rsi,rdx + and rbx,rdx + and rbp,rdx + + add r8,rax + adc r9,rsi + mov QWORD PTR[rdi],r8 + adc r10,rbx + mov QWORD PTR[8+rdi],r9 + adc r11,rbp + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + or r8,r9 + or r10,r11 + or r8,r10 + mov rax,1 + cmovz rax,r8 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_sub_n_check_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_n_check_mod_256:: +sub_n_check_mod_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_add_mod_256 + DD imagerel $L$SEH_body_add_mod_256 + DD imagerel $L$SEH_info_add_mod_256_prologue + + DD imagerel $L$SEH_body_add_mod_256 + DD imagerel $L$SEH_epilogue_add_mod_256 + DD imagerel $L$SEH_info_add_mod_256_body + + DD imagerel $L$SEH_epilogue_add_mod_256 + DD imagerel $L$SEH_end_add_mod_256 + DD imagerel $L$SEH_info_add_mod_256_epilogue + + DD imagerel $L$SEH_begin_mul_by_3_mod_256 + DD imagerel $L$SEH_body_mul_by_3_mod_256 + DD imagerel $L$SEH_info_mul_by_3_mod_256_prologue + + DD imagerel $L$SEH_body_mul_by_3_mod_256 + DD imagerel $L$SEH_epilogue_mul_by_3_mod_256 + DD imagerel $L$SEH_info_mul_by_3_mod_256_body + + DD imagerel $L$SEH_epilogue_mul_by_3_mod_256 + DD imagerel $L$SEH_end_mul_by_3_mod_256 + DD imagerel $L$SEH_info_mul_by_3_mod_256_epilogue + + DD imagerel $L$SEH_begin_lshift_mod_256 + DD imagerel $L$SEH_body_lshift_mod_256 + DD imagerel $L$SEH_info_lshift_mod_256_prologue + + DD imagerel $L$SEH_body_lshift_mod_256 + DD imagerel $L$SEH_epilogue_lshift_mod_256 + DD imagerel $L$SEH_info_lshift_mod_256_body + + DD imagerel $L$SEH_epilogue_lshift_mod_256 + DD imagerel $L$SEH_end_lshift_mod_256 + DD imagerel $L$SEH_info_lshift_mod_256_epilogue + + DD imagerel $L$SEH_begin_rshift_mod_256 + DD imagerel $L$SEH_body_rshift_mod_256 + DD imagerel $L$SEH_info_rshift_mod_256_prologue + + DD imagerel $L$SEH_body_rshift_mod_256 + DD imagerel $L$SEH_epilogue_rshift_mod_256 + DD imagerel $L$SEH_info_rshift_mod_256_body + + DD imagerel $L$SEH_epilogue_rshift_mod_256 + DD imagerel $L$SEH_end_rshift_mod_256 + DD imagerel $L$SEH_info_rshift_mod_256_epilogue + + DD imagerel $L$SEH_begin_cneg_mod_256 + DD imagerel $L$SEH_body_cneg_mod_256 + DD imagerel $L$SEH_info_cneg_mod_256_prologue + + DD imagerel $L$SEH_body_cneg_mod_256 + DD imagerel $L$SEH_epilogue_cneg_mod_256 + DD imagerel $L$SEH_info_cneg_mod_256_body + + DD imagerel $L$SEH_epilogue_cneg_mod_256 + DD imagerel $L$SEH_end_cneg_mod_256 + DD imagerel $L$SEH_info_cneg_mod_256_epilogue + + DD imagerel $L$SEH_begin_sub_mod_256 + DD imagerel $L$SEH_body_sub_mod_256 + DD imagerel $L$SEH_info_sub_mod_256_prologue + + DD imagerel $L$SEH_body_sub_mod_256 + DD imagerel $L$SEH_epilogue_sub_mod_256 + DD imagerel 
$L$SEH_info_sub_mod_256_body + + DD imagerel $L$SEH_epilogue_sub_mod_256 + DD imagerel $L$SEH_end_sub_mod_256 + DD imagerel $L$SEH_info_sub_mod_256_epilogue + + DD imagerel $L$SEH_epilogue_check_mod_256 + DD imagerel $L$SEH_end_check_mod_256 + DD imagerel $L$SEH_info_check_mod_256_epilogue + + DD imagerel $L$SEH_begin_add_n_check_mod_256 + DD imagerel $L$SEH_body_add_n_check_mod_256 + DD imagerel $L$SEH_info_add_n_check_mod_256_prologue + + DD imagerel $L$SEH_body_add_n_check_mod_256 + DD imagerel $L$SEH_epilogue_add_n_check_mod_256 + DD imagerel $L$SEH_info_add_n_check_mod_256_body + + DD imagerel $L$SEH_epilogue_add_n_check_mod_256 + DD imagerel $L$SEH_end_add_n_check_mod_256 + DD imagerel $L$SEH_info_add_n_check_mod_256_epilogue + + DD imagerel $L$SEH_begin_sub_n_check_mod_256 + DD imagerel $L$SEH_body_sub_n_check_mod_256 + DD imagerel $L$SEH_info_sub_n_check_mod_256_prologue + + DD imagerel $L$SEH_body_sub_n_check_mod_256 + DD imagerel $L$SEH_epilogue_sub_n_check_mod_256 + DD imagerel $L$SEH_info_sub_n_check_mod_256_body + + DD imagerel $L$SEH_epilogue_sub_n_check_mod_256 + DD imagerel $L$SEH_end_sub_n_check_mod_256 + DD imagerel $L$SEH_info_sub_n_check_mod_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_add_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_add_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_add_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_3_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_3_mod_256_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_mul_by_3_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_lshift_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_lshift_mod_256_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_lshift_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_rshift_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_rshift_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_rshift_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_cneg_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_cneg_mod_256_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_cneg_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 
000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_sub_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_check_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_add_n_check_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_add_n_check_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_add_n_check_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_n_check_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_n_check_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_sub_n_check_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/add_mod_384-armv8.asm b/temp-deps/blst-ts/deps/blst/build/win64/add_mod_384-armv8.asm new file mode 100644 index 0000000..4bf703a --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/add_mod_384-armv8.asm @@ -0,0 +1,1001 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + + EXPORT |add_mod_384|[FUNC] + ALIGN 32 +|add_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__add_mod_384| PROC + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +|__add_mod_384_ab_are_loaded| + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csello x10,x10,x16 + csello x11,x11,x17 + csello x12,x12,x19 + csello x13,x13,x20 + csello x14,x14,x21 + csello x15,x15,x22 + + ret + ENDP + + + + EXPORT |add_mod_384x|[FUNC] + ALIGN 32 +|add_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |rshift_mod_384|[FUNC] + ALIGN 32 +|rshift_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +|$Loop_rshift_mod_384| + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,|$Loop_rshift_mod_384| + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__rshift_mod_384| PROC + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret + ENDP + + + + EXPORT |div_by_2_mod_384|[FUNC] + ALIGN 32 +|div_by_2_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |lshift_mod_384|[FUNC] + ALIGN 32 +|lshift_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +|$Loop_lshift_mod_384| + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,|$Loop_lshift_mod_384| + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__lshift_mod_384| PROC + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csello x10,x10,x16 + csello x11,x11,x17 + csello x12,x12,x19 + csello x13,x13,x20 + csello x14,x14,x21 + csello x15,x15,x22 + + ret + ENDP + + + + EXPORT |mul_by_3_mod_384|[FUNC] + ALIGN 32 +|mul_by_3_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_by_8_mod_384|[FUNC] + ALIGN 32 +|mul_by_8_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_by_3_mod_384x|[FUNC] + ALIGN 32 +|mul_by_3_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_by_8_mod_384x|[FUNC] + ALIGN 32 +|mul_by_8_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |cneg_mod_384|[FUNC] + ALIGN 32 +|cneg_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetmne x3 + ands x2,x2,x3 + + cseleq x10,x10,x16 + cseleq x11,x11,x17 + cseleq x12,x12,x19 + cseleq x13,x13,x20 + stp x10,x11,[x0] + cseleq x14,x14,x21 + stp x12,x13,[x0,#16] + cseleq x15,x15,x22 + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sub_mod_384|[FUNC] + ALIGN 32 +|sub_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__sub_mod_384| PROC + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret + ENDP + + + + EXPORT |sub_mod_384x|[FUNC] + ALIGN 32 +|sub_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_by_1_plus_i_mod_384x|[FUNC] + ALIGN 32 +|mul_by_1_plus_i_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sgn0_pty_mod_384|[FUNC] + ALIGN 32 +|sgn0_pty_mod_384| PROC + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret + ENDP + + + + EXPORT |sgn0_pty_mod_384x|[FUNC] + ALIGN 32 +|sgn0_pty_mod_384x| PROC + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + 
orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + cseleq x3,x0,x2 + + cmp x1,#0 + cselne x1,x0,x2 + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret + ENDP + + + EXPORT |vec_select_32|[FUNC] + ALIGN 32 +|vec_select_32| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_48|[FUNC] + ALIGN 32 +|vec_select_48| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_96|[FUNC] + ALIGN 32 +|vec_select_96| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_192|[FUNC] + ALIGN 32 +|vec_select_192| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_144|[FUNC] + ALIGN 32 +|vec_select_144| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_288|[FUNC] + ALIGN 32 +|vec_select_288| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 
{v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + ENDP + + + EXPORT |vec_prefetch|[FUNC] + ALIGN 32 +|vec_prefetch| PROC + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + prfm pldl1keep, [x0] + ret + ENDP + + + EXPORT |vec_is_zero_16x|[FUNC] + ALIGN 32 +|vec_is_zero_16x| PROC + ld1 {v0.2d}, [x0], #16 + lsr x1, x1, #4 + sub x1, x1, #1 + cbz x1, |$Loop_is_zero_done| + +|$Loop_is_zero| + ld1 {v1.2d}, [x0], #16 + orr v0.16b, v0.16b, v1.16b + sub x1, x1, #1 + cbnz x1, |$Loop_is_zero| + +|$Loop_is_zero_done| + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + cseleq x0,x0,xzr + ret + ENDP + + + EXPORT |vec_is_equal_16x|[FUNC] + ALIGN 32 +|vec_is_equal_16x| PROC + ld1 {v0.2d}, [x0], #16 + ld1 {v1.2d}, [x1], #16 + lsr x2, x2, #4 + eor v0.16b, v0.16b, v1.16b + +|$Loop_is_equal| + sub x2, x2, #1 + cbz x2, |$Loop_is_equal_done| + ld1 {v1.2d}, [x0], #16 + ld1 {v2.2d}, [x1], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b |$Loop_is_equal| + nop + +|$Loop_is_equal_done| + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + cseleq x0,x0,xzr + ret + ENDP + END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/add_mod_384-x86_64.asm b/temp-deps/blst-ts/deps/blst/build/win64/add_mod_384-x86_64.asm new file mode 100644 index 0000000..8a7b9e2 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/add_mod_384-x86_64.asm @@ -0,0 +1,2504 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC add_mod_384 + + +ALIGN 32 +add_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_add_mod_384:: + + + call __add_mod_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD 
PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_add_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_384:: +add_mod_384 ENDP + + +ALIGN 32 +__add_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +__add_mod_384_a_is_loaded:: + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + mov r14,r8 + adc r11,QWORD PTR[24+rdx] + mov r15,r9 + adc r12,QWORD PTR[32+rdx] + mov rax,r10 + adc r13,QWORD PTR[40+rdx] + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[rdi],r8 + cmovc r11,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r12,rbp + mov QWORD PTR[16+rdi],r10 + cmovc r13,rsi + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__add_mod_384 ENDP + +PUBLIC add_mod_384x + + +ALIGN 32 +add_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,24 + +$L$SEH_body_add_mod_384x:: + + + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + lea rsi,QWORD PTR[48+rsi] + lea rdx,QWORD PTR[48+rdx] + lea rdi,QWORD PTR[48+rdi] + call __add_mod_384 + + mov rsi,QWORD PTR[rsp] + mov rdx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-48))+rdi] + call __add_mod_384 + + mov r15,QWORD PTR[((24+0))+rsp] + + mov r14,QWORD PTR[((24+8))+rsp] + + mov r13,QWORD PTR[((24+16))+rsp] + + mov r12,QWORD PTR[((24+24))+rsp] + + mov rbx,QWORD PTR[((24+32))+rsp] + + mov rbp,QWORD PTR[((24+40))+rsp] + + lea rsp,QWORD PTR[((24+48))+rsp] + +$L$SEH_epilogue_add_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_384x:: +add_mod_384x ENDP + + +PUBLIC rshift_mod_384 + + +ALIGN 32 +rshift_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_rshift_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_rshift_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +$L$oop_rshift_mod_384:: + call __rshift_mod_384 + dec edx + jnz $L$oop_rshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_rshift_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_rshift_mod_384:: +rshift_mod_384 ENDP + + 
+ALIGN 32 +__rshift_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov rsi,1 + mov r14,QWORD PTR[rcx] + and rsi,r8 + mov r15,QWORD PTR[8+rcx] + neg rsi + mov rax,QWORD PTR[16+rcx] + and r14,rsi + mov rbx,QWORD PTR[24+rcx] + and r15,rsi + mov rbp,QWORD PTR[32+rcx] + and rax,rsi + and rbx,rsi + and rbp,rsi + and rsi,QWORD PTR[40+rcx] + + add r14,r8 + adc r15,r9 + adc rax,r10 + adc rbx,r11 + adc rbp,r12 + adc rsi,r13 + sbb r13,r13 + + shr r14,1 + mov r8,r15 + shr r15,1 + mov r9,rax + shr rax,1 + mov r10,rbx + shr rbx,1 + mov r11,rbp + shr rbp,1 + mov r12,rsi + shr rsi,1 + shl r8,63 + shl r9,63 + or r8,r14 + shl r10,63 + or r9,r15 + shl r11,63 + or r10,rax + shl r12,63 + or r11,rbx + shl r13,63 + or r12,rbp + or r13,rsi + + DB 0F3h,0C3h ;repret +__rshift_mod_384 ENDP + +PUBLIC div_by_2_mod_384 + + +ALIGN 32 +div_by_2_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_div_by_2_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_div_by_2_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov rcx,rdx + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + call __rshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_div_by_2_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_div_by_2_mod_384:: +div_by_2_mod_384 ENDP + + +PUBLIC lshift_mod_384 + + +ALIGN 32 +lshift_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_lshift_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_lshift_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +$L$oop_lshift_mod_384:: + add r8,r8 + adc r9,r9 + adc r10,r10 + mov r14,r8 + adc r11,r11 + mov r15,r9 + adc r12,r12 + mov rax,r10 + adc r13,r13 + mov rbx,r11 + sbb rdi,rdi + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdi,0 + + mov rdi,QWORD PTR[rsp] + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + cmovc r11,rbx + cmovc r12,rbp + cmovc r13,rsi + + dec edx + jnz $L$oop_lshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_lshift_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_lshift_mod_384:: 
+lshift_mod_384 ENDP + + +ALIGN 32 +__lshift_mod_384 PROC PRIVATE + DB 243,15,30,250 + add r8,r8 + adc r9,r9 + adc r10,r10 + mov r14,r8 + adc r11,r11 + mov r15,r9 + adc r12,r12 + mov rax,r10 + adc r13,r13 + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + cmovc r11,rbx + cmovc r12,rbp + cmovc r13,rsi + + DB 0F3h,0C3h ;repret +__lshift_mod_384 ENDP + + +PUBLIC mul_by_3_mod_384 + + +ALIGN 32 +mul_by_3_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_3_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_mul_by_3_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + + mov rdx,QWORD PTR[rsp] + call __add_mod_384_a_is_loaded + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_3_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_3_mod_384:: +mul_by_3_mod_384 ENDP + +PUBLIC mul_by_8_mod_384 + + +ALIGN 32 +mul_by_8_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_8_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_mul_by_8_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_8_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_8_mod_384:: +mul_by_8_mod_384 ENDP + + +PUBLIC mul_by_3_mod_384x + + +ALIGN 32 +mul_by_3_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_3_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_mul_by_3_mod_384x:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + + mov rdx,QWORD PTR[rsp] + call __add_mod_384_a_is_loaded + + mov rsi,QWORD PTR[rsp] + lea rdi,QWORD PTR[48+rdi] + + mov r8,QWORD 
PTR[48+rsi] + mov r9,QWORD PTR[56+rsi] + mov r10,QWORD PTR[64+rsi] + mov r11,QWORD PTR[72+rsi] + mov r12,QWORD PTR[80+rsi] + mov r13,QWORD PTR[88+rsi] + + call __lshift_mod_384 + + mov rdx,8*6 + add rdx,QWORD PTR[rsp] + call __add_mod_384_a_is_loaded + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_3_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_3_mod_384x:: +mul_by_3_mod_384x ENDP + +PUBLIC mul_by_8_mod_384x + + +ALIGN 32 +mul_by_8_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_8_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_mul_by_8_mod_384x:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov rsi,QWORD PTR[rsp] + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r8,QWORD PTR[((48+0))+rsi] + mov r9,QWORD PTR[((48+8))+rsi] + mov r10,QWORD PTR[((48+16))+rsi] + mov r11,QWORD PTR[((48+24))+rsi] + mov r12,QWORD PTR[((48+32))+rsi] + mov r13,QWORD PTR[((48+40))+rsi] + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov QWORD PTR[((48+0))+rdi],r8 + mov QWORD PTR[((48+8))+rdi],r9 + mov QWORD PTR[((48+16))+rdi],r10 + mov QWORD PTR[((48+24))+rdi],r11 + mov QWORD PTR[((48+32))+rdi],r12 + mov QWORD PTR[((48+40))+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_8_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_8_mod_384x:: +mul_by_8_mod_384x ENDP + + +PUBLIC cneg_mod_384 + + +ALIGN 32 +cneg_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_cneg_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdx + +$L$SEH_body_cneg_mod_384:: + + + mov rdx,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r8,rdx + mov r11,QWORD PTR[24+rsi] + or rdx,r9 + mov r12,QWORD PTR[32+rsi] + or rdx,r10 + mov r13,QWORD PTR[40+rsi] + or rdx,r11 + mov rsi,-1 + or rdx,r12 + or rdx,r13 + + mov r14,QWORD PTR[rcx] + cmovnz rdx,rsi + mov r15,QWORD PTR[8+rcx] + mov rax,QWORD PTR[16+rcx] + and r14,rdx + mov rbx,QWORD PTR[24+rcx] + and r15,rdx + mov rbp,QWORD PTR[32+rcx] + and rax,rdx + mov rsi,QWORD PTR[40+rcx] + and rbx,rdx + mov rcx,QWORD PTR[rsp] + and rbp,rdx + and rsi,rdx + + sub r14,r8 + sbb r15,r9 + sbb rax,r10 + sbb rbx,r11 + sbb rbp,r12 + sbb rsi,r13 + + or rcx,rcx + + cmovz r14,r8 + cmovz r15,r9 + cmovz rax,r10 + mov QWORD PTR[rdi],r14 + cmovz rbx,r11 + mov QWORD PTR[8+rdi],r15 + cmovz rbp,r12 + mov QWORD PTR[16+rdi],rax 
+ cmovz rsi,r13 + mov QWORD PTR[24+rdi],rbx + mov QWORD PTR[32+rdi],rbp + mov QWORD PTR[40+rdi],rsi + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_cneg_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_cneg_mod_384:: +cneg_mod_384 ENDP + + +PUBLIC sub_mod_384 + + +ALIGN 32 +sub_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sub_mod_384:: + + + call __sub_mod_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sub_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_384:: +sub_mod_384 ENDP + + +ALIGN 32 +__sub_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + sub r8,QWORD PTR[rdx] + mov r14,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rax,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbx,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,QWORD PTR[32+rcx] + sbb r13,QWORD PTR[40+rdx] + mov rsi,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r14,rdx + and r15,rdx + and rax,rdx + and rbx,rdx + and rbp,rdx + and rsi,rdx + + add r8,r14 + adc r9,r15 + mov QWORD PTR[rdi],r8 + adc r10,rax + mov QWORD PTR[8+rdi],r9 + adc r11,rbx + mov QWORD PTR[16+rdi],r10 + adc r12,rbp + mov QWORD PTR[24+rdi],r11 + adc r13,rsi + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__sub_mod_384 ENDP + +PUBLIC sub_mod_384x + + +ALIGN 32 +sub_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,24 + +$L$SEH_body_sub_mod_384x:: + + + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + lea rsi,QWORD PTR[48+rsi] + lea rdx,QWORD PTR[48+rdx] + lea rdi,QWORD PTR[48+rdi] + call __sub_mod_384 + + mov rsi,QWORD PTR[rsp] + mov rdx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-48))+rdi] + call __sub_mod_384 + + mov r15,QWORD PTR[((24+0))+rsp] + + mov r14,QWORD PTR[((24+8))+rsp] + + mov r13,QWORD PTR[((24+16))+rsp] + + mov r12,QWORD PTR[((24+24))+rsp] + + mov rbx,QWORD PTR[((24+32))+rsp] + + mov rbp,QWORD PTR[((24+40))+rsp] + + lea rsp,QWORD PTR[((24+48))+rsp] + +$L$SEH_epilogue_sub_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_384x:: +sub_mod_384x ENDP +PUBLIC mul_by_1_plus_i_mod_384x + + +ALIGN 32 +mul_by_1_plus_i_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_1_plus_i_mod_384x:: + mov rdi,rcx + mov rsi,rdx + 
mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,56 + +$L$SEH_body_mul_by_1_plus_i_mod_384x:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,r8 + add r8,QWORD PTR[48+rsi] + mov r15,r9 + adc r9,QWORD PTR[56+rsi] + mov rax,r10 + adc r10,QWORD PTR[64+rsi] + mov rbx,r11 + adc r11,QWORD PTR[72+rsi] + mov rcx,r12 + adc r12,QWORD PTR[80+rsi] + mov rbp,r13 + adc r13,QWORD PTR[88+rsi] + mov QWORD PTR[48+rsp],rdi + sbb rdi,rdi + + sub r14,QWORD PTR[48+rsi] + sbb r15,QWORD PTR[56+rsi] + sbb rax,QWORD PTR[64+rsi] + sbb rbx,QWORD PTR[72+rsi] + sbb rcx,QWORD PTR[80+rsi] + sbb rbp,QWORD PTR[88+rsi] + sbb rsi,rsi + + mov QWORD PTR[rsp],r8 + mov r8,QWORD PTR[rdx] + mov QWORD PTR[8+rsp],r9 + mov r9,QWORD PTR[8+rdx] + mov QWORD PTR[16+rsp],r10 + mov r10,QWORD PTR[16+rdx] + mov QWORD PTR[24+rsp],r11 + mov r11,QWORD PTR[24+rdx] + mov QWORD PTR[32+rsp],r12 + and r8,rsi + mov r12,QWORD PTR[32+rdx] + mov QWORD PTR[40+rsp],r13 + and r9,rsi + mov r13,QWORD PTR[40+rdx] + and r10,rsi + and r11,rsi + and r12,rsi + and r13,rsi + mov rsi,QWORD PTR[48+rsp] + + add r14,r8 + mov r8,QWORD PTR[rsp] + adc r15,r9 + mov r9,QWORD PTR[8+rsp] + adc rax,r10 + mov r10,QWORD PTR[16+rsp] + adc rbx,r11 + mov r11,QWORD PTR[24+rsp] + adc rcx,r12 + mov r12,QWORD PTR[32+rsp] + adc rbp,r13 + mov r13,QWORD PTR[40+rsp] + + mov QWORD PTR[rsi],r14 + mov r14,r8 + mov QWORD PTR[8+rsi],r15 + mov QWORD PTR[16+rsi],rax + mov r15,r9 + mov QWORD PTR[24+rsi],rbx + mov QWORD PTR[32+rsi],rcx + mov rax,r10 + mov QWORD PTR[40+rsi],rbp + + sub r8,QWORD PTR[rdx] + mov rbx,r11 + sbb r9,QWORD PTR[8+rdx] + sbb r10,QWORD PTR[16+rdx] + mov rcx,r12 + sbb r11,QWORD PTR[24+rdx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,r13 + sbb r13,QWORD PTR[40+rdx] + sbb rdi,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[48+rsi],r8 + cmovc r11,rbx + mov QWORD PTR[56+rsi],r9 + cmovc r12,rcx + mov QWORD PTR[64+rsi],r10 + cmovc r13,rbp + mov QWORD PTR[72+rsi],r11 + mov QWORD PTR[80+rsi],r12 + mov QWORD PTR[88+rsi],r13 + + mov r15,QWORD PTR[((56+0))+rsp] + + mov r14,QWORD PTR[((56+8))+rsp] + + mov r13,QWORD PTR[((56+16))+rsp] + + mov r12,QWORD PTR[((56+24))+rsp] + + mov rbx,QWORD PTR[((56+32))+rsp] + + mov rbp,QWORD PTR[((56+40))+rsp] + + lea rsp,QWORD PTR[((56+48))+rsp] + +$L$SEH_epilogue_mul_by_1_plus_i_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_1_plus_i_mod_384x:: +mul_by_1_plus_i_mod_384x ENDP +PUBLIC sgn0_pty_mod_384 + + +ALIGN 32 +sgn0_pty_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mod_384:: + mov rdi,rcx + mov rsi,rdx + + + +$L$SEH_body_sgn0_pty_mod_384:: + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov rcx,QWORD PTR[32+rdi] + mov rdx,QWORD PTR[40+rdi] + + xor rax,rax + mov rdi,r8 + add r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rcx,rcx + adc rdx,rdx + adc rax,0 + + sub r8,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rcx,QWORD PTR[32+rsi] + sbb rdx,QWORD PTR[40+rsi] + sbb rax,0 + + not rax + and rdi,1 + and rax,2 + or rax,rdi + +$L$SEH_epilogue_sgn0_pty_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + 
+$L$SEH_end_sgn0_pty_mod_384:: +sgn0_pty_mod_384 ENDP + +PUBLIC sgn0_pty_mod_384x + + +ALIGN 32 +sgn0_pty_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mod_384x:: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_sgn0_pty_mod_384x:: + + + mov r8,QWORD PTR[48+rdi] + mov r9,QWORD PTR[56+rdi] + mov r10,QWORD PTR[64+rdi] + mov r11,QWORD PTR[72+rdi] + mov rcx,QWORD PTR[80+rdi] + mov rdx,QWORD PTR[88+rdi] + + mov rbx,r8 + or r8,r9 + or r8,r10 + or r8,r11 + or r8,rcx + or r8,rdx + + lea rax,QWORD PTR[rdi] + xor rdi,rdi + mov rbp,rbx + add rbx,rbx + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rcx,rcx + adc rdx,rdx + adc rdi,0 + + sub rbx,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rcx,QWORD PTR[32+rsi] + sbb rdx,QWORD PTR[40+rsi] + sbb rdi,0 + + mov QWORD PTR[rsp],r8 + not rdi + and rbp,1 + and rdi,2 + or rdi,rbp + + mov r8,QWORD PTR[rax] + mov r9,QWORD PTR[8+rax] + mov r10,QWORD PTR[16+rax] + mov r11,QWORD PTR[24+rax] + mov rcx,QWORD PTR[32+rax] + mov rdx,QWORD PTR[40+rax] + + mov rbx,r8 + or r8,r9 + or r8,r10 + or r8,r11 + or r8,rcx + or r8,rdx + + xor rax,rax + mov rbp,rbx + add rbx,rbx + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rcx,rcx + adc rdx,rdx + adc rax,0 + + sub rbx,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rcx,QWORD PTR[32+rsi] + sbb rdx,QWORD PTR[40+rsi] + sbb rax,0 + + mov rbx,QWORD PTR[rsp] + + not rax + + test r8,r8 + cmovz rbp,rdi + + test rbx,rbx + cmovnz rax,rdi + + and rbp,1 + and rax,2 + or rax,rbp + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_sgn0_pty_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mod_384x:: +sgn0_pty_mod_384x ENDP +PUBLIC vec_select_32 + + +ALIGN 32 +vec_select_32 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[16+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[16+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[16+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-16))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-16))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-16)+rcx],xmm0 + pand xmm2,xmm4 + pand xmm3,xmm5 + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-16)+rcx],xmm2 + DB 0F3h,0C3h ;repret +vec_select_32 ENDP +PUBLIC vec_select_48 + + +ALIGN 32 +vec_select_48 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[24+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[24+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[24+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-24))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-24))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-24)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-24))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-24))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-24)+rcx],xmm2 + pand xmm0,xmm4 + pand xmm1,xmm5 + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-24)+rcx],xmm0 + DB 0F3h,0C3h ;repret +vec_select_48 ENDP +PUBLIC vec_select_96 + + +ALIGN 32 +vec_select_96 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu 
xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[48+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[48+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[48+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-48))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-48))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-48)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-48))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-48))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-48)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((32+16-48))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((32+16-48))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-48)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((48+16-48))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((48+16-48))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(48-48)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((64+16-48))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((64+16-48))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(64-48)+rcx],xmm0 + pand xmm2,xmm4 + pand xmm3,xmm5 + por xmm2,xmm3 + movdqu XMMWORD PTR[(80-48)+rcx],xmm2 + DB 0F3h,0C3h ;repret +vec_select_96 ENDP +PUBLIC vec_select_192 + + +ALIGN 32 +vec_select_192 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[96+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[96+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[96+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((32+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((32+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((48+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((48+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(48-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((64+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((64+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(64-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((80+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((80+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(80-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((96+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((96+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(96-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((112+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((112+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(112-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((128+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((128+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(128-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((144+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((144+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(144-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((160+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((160+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(160-96)+rcx],xmm0 + pand xmm2,xmm4 + pand xmm3,xmm5 + por xmm2,xmm3 + movdqu XMMWORD PTR[(176-96)+rcx],xmm2 + DB 0F3h,0C3h 
;repret +vec_select_192 ENDP +PUBLIC vec_select_144 + + +ALIGN 32 +vec_select_144 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[72+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[72+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[72+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-72))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-72))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-72)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-72))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-72))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-72)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((32+16-72))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((32+16-72))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-72)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((48+16-72))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((48+16-72))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(48-72)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((64+16-72))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((64+16-72))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(64-72)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((80+16-72))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((80+16-72))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(80-72)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((96+16-72))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((96+16-72))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(96-72)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((112+16-72))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((112+16-72))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(112-72)+rcx],xmm2 + pand xmm0,xmm4 + pand xmm1,xmm5 + por xmm0,xmm1 + movdqu XMMWORD PTR[(128-72)+rcx],xmm0 + DB 0F3h,0C3h ;repret +vec_select_144 ENDP +PUBLIC vec_select_288 + + +ALIGN 32 +vec_select_288 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[144+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[144+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[144+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((32+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((32+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((48+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((48+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(48-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((64+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((64+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(64-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((80+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((80+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(80-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((96+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((96+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(96-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((112+16-144))+rdx] + pand xmm3,xmm5 + movdqu 
xmm1,XMMWORD PTR[((112+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(112-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((128+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((128+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(128-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((144+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((144+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(144-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((160+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((160+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(160-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((176+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((176+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(176-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((192+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((192+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(192-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((208+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((208+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(208-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((224+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((224+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(224-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((240+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((240+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(240-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((256+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((256+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(256-144)+rcx],xmm0 + pand xmm2,xmm4 + pand xmm3,xmm5 + por xmm2,xmm3 + movdqu XMMWORD PTR[(272-144)+rcx],xmm2 + DB 0F3h,0C3h ;repret +vec_select_288 ENDP +PUBLIC vec_prefetch + + +ALIGN 32 +vec_prefetch PROC PUBLIC + DB 243,15,30,250 + lea rdx,QWORD PTR[((-1))+rdx*1+rcx] + mov rax,64 + xor r8,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + prefetchnta [rcx] + DB 0F3h,0C3h ;repret +vec_prefetch ENDP +PUBLIC vec_is_zero_16x + + +ALIGN 32 +vec_is_zero_16x PROC PUBLIC + DB 243,15,30,250 + shr edx,4 + movdqu xmm0,XMMWORD PTR[rcx] + lea rcx,QWORD PTR[16+rcx] + +$L$oop_is_zero:: + dec edx + jz $L$oop_is_zero_done + movdqu xmm1,XMMWORD PTR[rcx] + lea rcx,QWORD PTR[16+rcx] + por xmm0,xmm1 + jmp $L$oop_is_zero + +$L$oop_is_zero_done:: + pshufd xmm1,xmm0,04eh + por xmm0,xmm1 +DB 102,72,15,126,192 + inc edx + test rax,rax + cmovnz eax,edx + xor eax,1 + DB 0F3h,0C3h ;repret +vec_is_zero_16x ENDP +PUBLIC vec_is_equal_16x + + +ALIGN 32 +vec_is_equal_16x PROC PUBLIC + DB 243,15,30,250 + shr r8d,4 + movdqu xmm0,XMMWORD PTR[rcx] + movdqu xmm1,XMMWORD PTR[rdx] + sub rdx,rcx + lea rcx,QWORD PTR[16+rcx] + pxor xmm0,xmm1 + +$L$oop_is_equal:: + dec r8d + jz $L$oop_is_equal_done + movdqu xmm1,XMMWORD PTR[rcx] + movdqu xmm2,XMMWORD PTR[rdx*1+rcx] + lea rcx,QWORD PTR[16+rcx] + pxor xmm1,xmm2 + por xmm0,xmm1 + jmp $L$oop_is_equal 
+ +$L$oop_is_equal_done:: + pshufd xmm1,xmm0,04eh + por xmm0,xmm1 +DB 102,72,15,126,192 + inc r8d + test rax,rax + cmovnz eax,r8d + xor eax,1 + DB 0F3h,0C3h ;repret +vec_is_equal_16x ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_add_mod_384 + DD imagerel $L$SEH_body_add_mod_384 + DD imagerel $L$SEH_info_add_mod_384_prologue + + DD imagerel $L$SEH_body_add_mod_384 + DD imagerel $L$SEH_epilogue_add_mod_384 + DD imagerel $L$SEH_info_add_mod_384_body + + DD imagerel $L$SEH_epilogue_add_mod_384 + DD imagerel $L$SEH_end_add_mod_384 + DD imagerel $L$SEH_info_add_mod_384_epilogue + + DD imagerel $L$SEH_begin_add_mod_384x + DD imagerel $L$SEH_body_add_mod_384x + DD imagerel $L$SEH_info_add_mod_384x_prologue + + DD imagerel $L$SEH_body_add_mod_384x + DD imagerel $L$SEH_epilogue_add_mod_384x + DD imagerel $L$SEH_info_add_mod_384x_body + + DD imagerel $L$SEH_epilogue_add_mod_384x + DD imagerel $L$SEH_end_add_mod_384x + DD imagerel $L$SEH_info_add_mod_384x_epilogue + + DD imagerel $L$SEH_begin_rshift_mod_384 + DD imagerel $L$SEH_body_rshift_mod_384 + DD imagerel $L$SEH_info_rshift_mod_384_prologue + + DD imagerel $L$SEH_body_rshift_mod_384 + DD imagerel $L$SEH_epilogue_rshift_mod_384 + DD imagerel $L$SEH_info_rshift_mod_384_body + + DD imagerel $L$SEH_epilogue_rshift_mod_384 + DD imagerel $L$SEH_end_rshift_mod_384 + DD imagerel $L$SEH_info_rshift_mod_384_epilogue + + DD imagerel $L$SEH_begin_div_by_2_mod_384 + DD imagerel $L$SEH_body_div_by_2_mod_384 + DD imagerel $L$SEH_info_div_by_2_mod_384_prologue + + DD imagerel $L$SEH_body_div_by_2_mod_384 + DD imagerel $L$SEH_epilogue_div_by_2_mod_384 + DD imagerel $L$SEH_info_div_by_2_mod_384_body + + DD imagerel $L$SEH_epilogue_div_by_2_mod_384 + DD imagerel $L$SEH_end_div_by_2_mod_384 + DD imagerel $L$SEH_info_div_by_2_mod_384_epilogue + + DD imagerel $L$SEH_begin_lshift_mod_384 + DD imagerel $L$SEH_body_lshift_mod_384 + DD imagerel $L$SEH_info_lshift_mod_384_prologue + + DD imagerel $L$SEH_body_lshift_mod_384 + DD imagerel $L$SEH_epilogue_lshift_mod_384 + DD imagerel $L$SEH_info_lshift_mod_384_body + + DD imagerel $L$SEH_epilogue_lshift_mod_384 + DD imagerel $L$SEH_end_lshift_mod_384 + DD imagerel $L$SEH_info_lshift_mod_384_epilogue + + DD imagerel $L$SEH_begin_mul_by_3_mod_384 + DD imagerel $L$SEH_body_mul_by_3_mod_384 + DD imagerel $L$SEH_info_mul_by_3_mod_384_prologue + + DD imagerel $L$SEH_body_mul_by_3_mod_384 + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384 + DD imagerel $L$SEH_info_mul_by_3_mod_384_body + + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384 + DD imagerel $L$SEH_end_mul_by_3_mod_384 + DD imagerel $L$SEH_info_mul_by_3_mod_384_epilogue + + DD imagerel $L$SEH_begin_mul_by_8_mod_384 + DD imagerel $L$SEH_body_mul_by_8_mod_384 + DD imagerel $L$SEH_info_mul_by_8_mod_384_prologue + + DD imagerel $L$SEH_body_mul_by_8_mod_384 + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384 + DD imagerel $L$SEH_info_mul_by_8_mod_384_body + + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384 + DD imagerel $L$SEH_end_mul_by_8_mod_384 + DD imagerel $L$SEH_info_mul_by_8_mod_384_epilogue + + DD imagerel $L$SEH_begin_mul_by_3_mod_384x + DD imagerel $L$SEH_body_mul_by_3_mod_384x + DD imagerel $L$SEH_info_mul_by_3_mod_384x_prologue + + DD imagerel $L$SEH_body_mul_by_3_mod_384x + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384x + DD imagerel $L$SEH_info_mul_by_3_mod_384x_body + + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384x + DD imagerel $L$SEH_end_mul_by_3_mod_384x + DD imagerel $L$SEH_info_mul_by_3_mod_384x_epilogue + + DD imagerel 
$L$SEH_begin_mul_by_8_mod_384x + DD imagerel $L$SEH_body_mul_by_8_mod_384x + DD imagerel $L$SEH_info_mul_by_8_mod_384x_prologue + + DD imagerel $L$SEH_body_mul_by_8_mod_384x + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384x + DD imagerel $L$SEH_info_mul_by_8_mod_384x_body + + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384x + DD imagerel $L$SEH_end_mul_by_8_mod_384x + DD imagerel $L$SEH_info_mul_by_8_mod_384x_epilogue + + DD imagerel $L$SEH_begin_cneg_mod_384 + DD imagerel $L$SEH_body_cneg_mod_384 + DD imagerel $L$SEH_info_cneg_mod_384_prologue + + DD imagerel $L$SEH_body_cneg_mod_384 + DD imagerel $L$SEH_epilogue_cneg_mod_384 + DD imagerel $L$SEH_info_cneg_mod_384_body + + DD imagerel $L$SEH_epilogue_cneg_mod_384 + DD imagerel $L$SEH_end_cneg_mod_384 + DD imagerel $L$SEH_info_cneg_mod_384_epilogue + + DD imagerel $L$SEH_begin_sub_mod_384 + DD imagerel $L$SEH_body_sub_mod_384 + DD imagerel $L$SEH_info_sub_mod_384_prologue + + DD imagerel $L$SEH_body_sub_mod_384 + DD imagerel $L$SEH_epilogue_sub_mod_384 + DD imagerel $L$SEH_info_sub_mod_384_body + + DD imagerel $L$SEH_epilogue_sub_mod_384 + DD imagerel $L$SEH_end_sub_mod_384 + DD imagerel $L$SEH_info_sub_mod_384_epilogue + + DD imagerel $L$SEH_begin_sub_mod_384x + DD imagerel $L$SEH_body_sub_mod_384x + DD imagerel $L$SEH_info_sub_mod_384x_prologue + + DD imagerel $L$SEH_body_sub_mod_384x + DD imagerel $L$SEH_epilogue_sub_mod_384x + DD imagerel $L$SEH_info_sub_mod_384x_body + + DD imagerel $L$SEH_epilogue_sub_mod_384x + DD imagerel $L$SEH_end_sub_mod_384x + DD imagerel $L$SEH_info_sub_mod_384x_epilogue + + DD imagerel $L$SEH_begin_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_body_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_prologue + + DD imagerel $L$SEH_body_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_epilogue_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_body + + DD imagerel $L$SEH_epilogue_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_end_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mod_384 + DD imagerel $L$SEH_body_sgn0_pty_mod_384 + DD imagerel $L$SEH_info_sgn0_pty_mod_384_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mod_384 + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384 + DD imagerel $L$SEH_info_sgn0_pty_mod_384_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384 + DD imagerel $L$SEH_end_sgn0_pty_mod_384 + DD imagerel $L$SEH_info_sgn0_pty_mod_384_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mod_384x + DD imagerel $L$SEH_body_sgn0_pty_mod_384x + DD imagerel $L$SEH_info_sgn0_pty_mod_384x_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mod_384x + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384x + DD imagerel $L$SEH_info_sgn0_pty_mod_384x_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384x + DD imagerel $L$SEH_end_sgn0_pty_mod_384x + DD imagerel $L$SEH_info_sgn0_pty_mod_384x_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_add_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_add_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_add_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_add_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 
+DB 0,003h +DB 0,0 +$L$SEH_info_add_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_add_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_rshift_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_rshift_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_rshift_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_div_by_2_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_div_by_2_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_div_by_2_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_lshift_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_lshift_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_lshift_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_3_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_3_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_3_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_8_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_8_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_8_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_3_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_3_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_3_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_8_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_8_mod_384x_body:: 
+DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_8_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_cneg_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_cneg_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_cneg_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sub_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_sub_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_1_plus_i_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_1_plus_i_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,007h,000h +DB 000h,0e4h,008h,000h +DB 000h,0d4h,009h,000h +DB 000h,0c4h,00ah,000h +DB 000h,034h,00bh,000h +DB 000h,054h,00ch,000h +DB 000h,074h,00eh,000h +DB 000h,064h,00fh,000h +DB 000h,0c2h +DB 000h,000h +$L$SEH_info_mul_by_1_plus_i_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0_pty_mod_384_body:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sgn0_pty_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0_pty_mod_384x_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_sgn0_pty_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/add_mod_384x384-x86_64.asm b/temp-deps/blst-ts/deps/blst/build/win64/add_mod_384x384-x86_64.asm new file mode 100644 index 0000000..57d1752 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/add_mod_384x384-x86_64.asm @@ -0,0 +1,334 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + + +ALIGN 32 +__add_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + mov 
r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + add r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + adc r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + adc r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + adc r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + adc r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + adc r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + adc r14,QWORD PTR[48+rdx] + mov QWORD PTR[8+rdi],r9 + adc r15,QWORD PTR[56+rdx] + mov QWORD PTR[16+rdi],r10 + adc rax,QWORD PTR[64+rdx] + mov QWORD PTR[32+rdi],r12 + mov r8,r14 + adc rbx,QWORD PTR[72+rdx] + mov QWORD PTR[24+rdi],r11 + mov r9,r15 + adc rbp,QWORD PTR[80+rdx] + mov QWORD PTR[40+rdi],r13 + mov r10,rax + adc rsi,QWORD PTR[88+rdx] + mov r11,rbx + sbb rdx,rdx + + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov r12,rbp + sbb rax,QWORD PTR[16+rcx] + sbb rbx,QWORD PTR[24+rcx] + sbb rbp,QWORD PTR[32+rcx] + mov r13,rsi + sbb rsi,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r14,r8 + cmovc r15,r9 + cmovc rax,r10 + mov QWORD PTR[48+rdi],r14 + cmovc rbx,r11 + mov QWORD PTR[56+rdi],r15 + cmovc rbp,r12 + mov QWORD PTR[64+rdi],rax + cmovc rsi,r13 + mov QWORD PTR[72+rdi],rbx + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__add_mod_384x384 ENDP + + +ALIGN 32 +__sub_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + sub r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + sbb r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + sbb r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + sbb r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + sbb r14,QWORD PTR[48+rdx] + mov r8,QWORD PTR[rcx] + mov QWORD PTR[8+rdi],r9 + sbb r15,QWORD PTR[56+rdx] + mov r9,QWORD PTR[8+rcx] + mov QWORD PTR[16+rdi],r10 + sbb rax,QWORD PTR[64+rdx] + mov r10,QWORD PTR[16+rcx] + mov QWORD PTR[24+rdi],r11 + sbb rbx,QWORD PTR[72+rdx] + mov r11,QWORD PTR[24+rcx] + mov QWORD PTR[32+rdi],r12 + sbb rbp,QWORD PTR[80+rdx] + mov r12,QWORD PTR[32+rcx] + mov QWORD PTR[40+rdi],r13 + sbb rsi,QWORD PTR[88+rdx] + mov r13,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r8,rdx + and r9,rdx + and r10,rdx + and r11,rdx + and r12,rdx + and r13,rdx + + add r14,r8 + adc r15,r9 + mov QWORD PTR[48+rdi],r14 + adc rax,r10 + mov QWORD PTR[56+rdi],r15 + adc rbx,r11 + mov QWORD PTR[64+rdi],rax + adc rbp,r12 + mov QWORD PTR[72+rdi],rbx + adc rsi,r13 + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__sub_mod_384x384 ENDP + +PUBLIC add_mod_384x384 + + +ALIGN 32 +add_mod_384x384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_384x384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_add_mod_384x384:: + + + call __add_mod_384x384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_add_mod_384x384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 
epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_384x384:: +add_mod_384x384 ENDP + +PUBLIC sub_mod_384x384 + + +ALIGN 32 +sub_mod_384x384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_384x384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sub_mod_384x384:: + + + call __sub_mod_384x384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sub_mod_384x384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_384x384:: +sub_mod_384x384 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_add_mod_384x384 + DD imagerel $L$SEH_body_add_mod_384x384 + DD imagerel $L$SEH_info_add_mod_384x384_prologue + + DD imagerel $L$SEH_body_add_mod_384x384 + DD imagerel $L$SEH_epilogue_add_mod_384x384 + DD imagerel $L$SEH_info_add_mod_384x384_body + + DD imagerel $L$SEH_epilogue_add_mod_384x384 + DD imagerel $L$SEH_end_add_mod_384x384 + DD imagerel $L$SEH_info_add_mod_384x384_epilogue + + DD imagerel $L$SEH_begin_sub_mod_384x384 + DD imagerel $L$SEH_body_sub_mod_384x384 + DD imagerel $L$SEH_info_sub_mod_384x384_prologue + + DD imagerel $L$SEH_body_sub_mod_384x384 + DD imagerel $L$SEH_epilogue_sub_mod_384x384 + DD imagerel $L$SEH_info_sub_mod_384x384_body + + DD imagerel $L$SEH_epilogue_sub_mod_384x384 + DD imagerel $L$SEH_end_sub_mod_384x384 + DD imagerel $L$SEH_info_sub_mod_384x384_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_add_mod_384x384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_add_mod_384x384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_add_mod_384x384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_384x384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_mod_384x384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sub_mod_384x384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/blst.def b/temp-deps/blst-ts/deps/blst/build/win64/blst.def new file mode 100644 index 0000000..3fbb6b3 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/blst.def @@ -0,0 +1,217 @@ +LIBRARY blst + +EXPORTS + blst_scalar_from_uint32 + blst_uint32_from_scalar + blst_scalar_from_uint64 + blst_uint64_from_scalar + blst_scalar_from_bendian + blst_bendian_from_scalar + blst_scalar_from_lendian + blst_lendian_from_scalar + blst_scalar_fr_check + blst_sk_check + blst_sk_add_n_check + blst_sk_sub_n_check + blst_sk_mul_n_check + blst_sk_inverse + blst_scalar_from_le_bytes + blst_scalar_from_be_bytes + 
blst_fr_add + blst_fr_sub + blst_fr_mul_by_3 + blst_fr_lshift + blst_fr_rshift + blst_fr_mul + blst_fr_sqr + blst_fr_cneg + blst_fr_eucl_inverse + blst_fr_inverse + blst_fr_from_uint64 + blst_uint64_from_fr + blst_fr_from_scalar + blst_scalar_from_fr + blst_fp_add + blst_fp_sub + blst_fp_mul_by_3 + blst_fp_mul_by_8 + blst_fp_lshift + blst_fp_mul + blst_fp_sqr + blst_fp_cneg + blst_fp_eucl_inverse + blst_fp_inverse + blst_fp_sqrt + blst_fp_from_uint32 + blst_uint32_from_fp + blst_fp_from_uint64 + blst_uint64_from_fp + blst_fp_from_bendian + blst_bendian_from_fp + blst_fp_from_lendian + blst_lendian_from_fp + blst_fp2_add + blst_fp2_sub + blst_fp2_mul_by_3 + blst_fp2_mul_by_8 + blst_fp2_lshift + blst_fp2_mul + blst_fp2_sqr + blst_fp2_cneg + blst_fp2_eucl_inverse + blst_fp2_inverse + blst_fp2_sqrt + blst_fp12_sqr + blst_fp12_cyclotomic_sqr + blst_fp12_mul + blst_fp12_mul_by_xy00z0 + blst_fp12_conjugate + blst_fp12_inverse + blst_fp12_frobenius_map + blst_fp12_is_equal + blst_fp12_is_one + blst_fp12_in_group + blst_fp12_one + blst_p1_add + blst_p1_add_or_double + blst_p1_add_affine + blst_p1_add_or_double_affine + blst_p1_double + blst_p1_mult + blst_p1_cneg + blst_p1_to_affine + blst_p1_from_affine + blst_p1_on_curve + blst_p1_in_g1 + blst_p1_is_equal + blst_p1_is_inf + blst_p1_generator + blst_p1_affine_on_curve + blst_p1_affine_in_g1 + blst_p1_affine_is_equal + blst_p1_affine_is_inf + blst_p1_affine_generator + blst_p2_add + blst_p2_add_or_double + blst_p2_add_affine + blst_p2_add_or_double_affine + blst_p2_double + blst_p2_mult + blst_p2_cneg + blst_p2_to_affine + blst_p2_from_affine + blst_p2_on_curve + blst_p2_in_g2 + blst_p2_is_equal + blst_p2_is_inf + blst_p2_generator + blst_p2_affine_on_curve + blst_p2_affine_in_g2 + blst_p2_affine_is_equal + blst_p2_affine_is_inf + blst_p2_affine_generator + blst_p1s_to_affine + blst_p1s_add + blst_p1s_mult_wbits_precompute_sizeof + blst_p1s_mult_wbits_precompute + blst_p1s_mult_wbits_scratch_sizeof + blst_p1s_mult_wbits + blst_p1s_mult_pippenger_scratch_sizeof + blst_p1s_mult_pippenger + blst_p1s_tile_pippenger + blst_p2s_to_affine + blst_p2s_add + blst_p2s_mult_wbits_precompute_sizeof + blst_p2s_mult_wbits_precompute + blst_p2s_mult_wbits_scratch_sizeof + blst_p2s_mult_wbits + blst_p2s_mult_pippenger_scratch_sizeof + blst_p2s_mult_pippenger + blst_p2s_tile_pippenger + blst_map_to_g1 + blst_map_to_g2 + blst_encode_to_g1 + blst_hash_to_g1 + blst_encode_to_g2 + blst_hash_to_g2 + blst_p1_serialize + blst_p1_compress + blst_p1_affine_serialize + blst_p1_affine_compress + blst_p1_uncompress + blst_p1_deserialize + blst_p2_serialize + blst_p2_compress + blst_p2_affine_serialize + blst_p2_affine_compress + blst_p2_uncompress + blst_p2_deserialize + blst_keygen + blst_sk_to_pk_in_g1 + blst_sign_pk_in_g1 + blst_sk_to_pk_in_g2 + blst_sign_pk_in_g2 + blst_miller_loop + blst_final_exp + blst_precompute_lines + blst_miller_loop_lines + blst_fp12_finalverify + blst_pairing_sizeof + blst_pairing_init + blst_pairing_get_dst + blst_pairing_commit + blst_pairing_aggregate_pk_in_g2 + blst_pairing_chk_n_aggr_pk_in_g2 + blst_pairing_mul_n_aggregate_pk_in_g2 + blst_pairing_chk_n_mul_n_aggr_pk_in_g2 + blst_pairing_aggregate_pk_in_g1 + blst_pairing_chk_n_aggr_pk_in_g1 + blst_pairing_mul_n_aggregate_pk_in_g1 + blst_pairing_chk_n_mul_n_aggr_pk_in_g1 + blst_pairing_merge + blst_pairing_finalverify + blst_aggregate_in_g1 + blst_aggregate_in_g2 + blst_aggregated_in_g1 + blst_aggregated_in_g2 + blst_core_verify_pk_in_g1 + blst_core_verify_pk_in_g2 + BLS12_381_G1 + 
BLS12_381_NEG_G1 + BLS12_381_G2 + BLS12_381_NEG_G2 + blst_fr_to + blst_fr_from + blst_fp_to + blst_fp_from + blst_fp_is_square + blst_fp2_is_square + blst_p1_from_jacobian + blst_p2_from_jacobian + blst_sk_to_pk2_in_g1 + blst_sign_pk2_in_g1 + blst_sk_to_pk2_in_g2 + blst_sign_pk2_in_g2 + blst_uniq_sizeof + blst_uniq_init + blst_uniq_test + blst_expand_message_xmd + blst_p1_unchecked_mult + blst_p2_unchecked_mult + blst_pairing_raw_aggregate + blst_pairing_as_fp12 + blst_bendian_from_fp12 + blst_keygen_v3 + blst_keygen_v4_5 + blst_keygen_v5 + blst_derive_master_eip2333 + blst_derive_child_eip2333 + blst_scalar_from_hexascii + blst_fr_from_hexascii + blst_fp_from_hexascii + blst_p1_sizeof + blst_p1_affine_sizeof + blst_p2_sizeof + blst_p2_affine_sizeof + blst_fp12_sizeof + diff --git a/temp-deps/blst-ts/deps/blst/build/win64/ct_inverse_mod_256-armv8.asm b/temp-deps/blst-ts/deps/blst/build/win64/ct_inverse_mod_256-armv8.asm new file mode 100644 index 0000000..f3c2f0d --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/ct_inverse_mod_256-armv8.asm @@ -0,0 +1,785 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + EXPORT |ct_inverse_mod_256|[FUNC] + ALIGN 32 +|ct_inverse_mod_256| PROC + DCDU 3573752639 + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... + str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl |$Lab_approximation_31_256_loaded| + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl 
__smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + 
bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl 
__smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 + DCDU 3573752767 + ret + ENDP + +//////////////////////////////////////////////////////////////////////// + + ALIGN 32 +|__smul_256x63| PROC + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + cselne x22,x22,xzr + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, 
x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in __smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + cselne x23,x23,xzr + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret + ENDP + + + ALIGN 32 +|__smul_512x63_tail| PROC + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, [x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume |v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr x25, x24, #63 + + eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret + ENDP + + + ALIGN 32 +|__smul_256_n_shift_by_31| PROC + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret + ENDP + + ALIGN 16 +|__ab_approximation_31_256| PROC + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +|$Lab_approximation_31_256_loaded| + orr x19, x7, x11 // check 
top-most limbs, ... + cmp x19, #0 + cselne x7,x7,x6 + cselne x11,x11,x10 + cselne x6,x6,x5 + orr x19, x7, x11 // and ones before top-most, ... + cselne x10,x10,x9 + + cmp x19, #0 + cselne x7,x7,x6 + cselne x11,x11,x10 + cselne x6,x6,x4 + orr x19, x7, x11 // and one more, ... + cselne x10,x10,x8 + + clz x19, x19 + cmp x19, #64 + cselne x19,x19,xzr + cselne x7,x7,x6 + cselne x11,x11,x10 + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret + ENDP + + + ALIGN 16 +|__inner_loop_31_256| PROC + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +|$Loop_31_256| + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + cselhs x11,x11,x7 + cselhs x7,x21,x20 + cselhs x15,x15,x13 + cselhs x13,x13,x19 + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, |$Loop_31_256| + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret + ENDP + + + ALIGN 16 +|__inner_loop_62_256| PROC + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +|$Loop_62_256| + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + cselhs x11,x11,x7 + cselhs x7,x21,x20 + mov x20, x13 + cselhs x12,x12,x14 + cselhs x14,x14,x19 + cselhs x13,x13,x15 + cselhs x15,x15,x20 + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, |$Loop_62_256| + + ret + ENDP + END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/ct_inverse_mod_256-x86_64.asm b/temp-deps/blst-ts/deps/blst/build/win64/ct_inverse_mod_256-x86_64.asm new file mode 100644 index 0000000..65665c9 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/ct_inverse_mod_256-x86_64.asm @@ -0,0 +1,1211 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC ct_inverse_mod_256 + +ALIGN 32 +ct_inverse_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_ct_inverse_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,1072 + +$L$SEH_body_ct_inverse_mod_256:: + + + lea rax,QWORD PTR[((48+511))+rsp] + and rax,-512 + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[40+rsp],rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + + mov r12,QWORD PTR[rdx] + mov r13,QWORD PTR[8+rdx] + mov r14,QWORD PTR[16+rdx] + mov r15,QWORD PTR[24+rdx] + + mov QWORD PTR[rax],r8 + mov QWORD PTR[8+rax],r9 + mov QWORD PTR[16+rax],r10 + mov QWORD PTR[24+rax],r11 + + mov QWORD PTR[32+rax],r12 + mov QWORD PTR[40+rax],r13 + mov QWORD PTR[48+rax],r14 + mov QWORD PTR[56+rax],r15 + mov rsi,rax + + + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + + + mov QWORD PTR[64+rdi],rdx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + + + mov QWORD PTR[72+rdi],rdx + + + xor rsi,256 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + + + + mov r8,QWORD PTR[64+rsi] + mov r12,QWORD PTR[104+rsi] + mov r9,r8 + imul r8,QWORD PTR[rsp] + mov r13,r12 + imul r12,QWORD PTR[8+rsp] + add r8,r12 + mov QWORD PTR[32+rdi],r8 + sar r8,63 + mov QWORD PTR[40+rdi],r8 + mov QWORD PTR[48+rdi],r8 + mov QWORD PTR[56+rdi],r8 + mov QWORD PTR[64+rdi],r8 + lea rsi,QWORD PTR[64+rsi] + + imul r9,rdx + imul r13,rcx + add r9,r13 + mov QWORD PTR[72+rdi],r9 + sar r9,63 + mov QWORD PTR[80+rdi],r9 + mov QWORD PTR[88+rdi],r9 + mov QWORD PTR[96+rdi],r9 + mov QWORD PTR[104+rdi],r9 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD 
PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD 
PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + sar rbp,63 + mov QWORD PTR[40+rdi],rbp + mov QWORD PTR[48+rdi],rbp + mov QWORD PTR[56+rdi],rbp + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + 
mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + + xor rsi,256+8*8 + mov edx,47 + + mov r8,QWORD PTR[rsi] + + mov r10,QWORD PTR[32+rsi] + + call __inner_loop_62_256 + + + + + + + + lea rsi,QWORD PTR[64+rsi] + + + + + + mov rdx,r12 + mov rcx,r13 + mov rdi,QWORD PTR[32+rsp] + call __smulq_512x63 + adc rdx,rbp + + mov rsi,QWORD PTR[40+rsp] + mov rax,rdx + sar rdx,63 + + mov r8,rdx + mov r9,rdx + and r8,QWORD PTR[rsi] + mov r10,rdx + and r9,QWORD PTR[8+rsi] + and r10,QWORD PTR[16+rsi] + and rdx,QWORD PTR[24+rsi] + + add r12,r8 + adc r13,r9 + adc r14,r10 + adc r15,rdx + adc rax,0 + + mov rdx,rax + neg rax + or rdx,rax + sar rax,63 + + mov r8,rdx + mov r9,rdx + and r8,QWORD PTR[rsi] + mov r10,rdx + and r9,QWORD PTR[8+rsi] + and r10,QWORD PTR[16+rsi] + and rdx,QWORD PTR[24+rsi] + + xor r8,rax + xor rcx,rcx + xor r9,rax + sub rcx,rax + xor r10,rax + xor rdx,rax + add r8,rcx + adc r9,0 + adc r10,0 + adc rdx,0 + + add r12,r8 + adc r13,r9 + adc r14,r10 + adc r15,rdx + + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + + lea r8,QWORD PTR[1072+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_ct_inverse_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_ct_inverse_mod_256:: +ct_inverse_mod_256 ENDP + +ALIGN 32 +__smulq_512x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov rbp,QWORD PTR[32+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor rbp,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc rbp,0 + + mul rbx + mov QWORD PTR[rdi],rax + mov rax,r9 + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov QWORD PTR[8+rdi],r9 + mov r10,rdx + mul rbx + add r10,rax + mov rax,r11 + adc rdx,0 + mov QWORD PTR[16+rdi],r10 + mov r11,rdx + and rbp,rbx + neg rbp + mul rbx + add r11,rax + adc rbp,rdx + mov QWORD PTR[24+rdi],r11 + + mov r8,QWORD PTR[40+rsi] + mov r9,QWORD PTR[48+rsi] + mov r10,QWORD PTR[56+rsi] + mov r11,QWORD PTR[64+rsi] + mov r12,QWORD PTR[72+rsi] + mov r13,QWORD PTR[80+rsi] + mov r14,QWORD PTR[88+rsi] + mov r15,QWORD PTR[96+rsi] + + mov rdx,rcx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rcx,rdx + add rcx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + xor r14,rdx + xor r15,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + + mul rcx + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rcx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rcx + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rcx + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rcx + add r12,rax + mov 
rax,r13 + adc rdx,0 + mov r13,rdx + mul rcx + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + mul rcx + add r14,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + imul rcx + add r15,rax + adc rdx,0 + + mov rbx,rbp + sar rbp,63 + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,rbx + adc r13,rbp + adc r14,rbp + adc r15,rbp + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + + DB 0F3h,0C3h ;repret +__smulq_512x63 ENDP + + +ALIGN 32 +__smulq_256x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + mov r11,QWORD PTR[((0+24))+rsi] + mov rbp,QWORD PTR[((0+32))+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor rbp,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc rbp,0 + + mul rbx + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbx + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + and rbp,rbx + neg rbp + mul rbx + add r11,rax + adc rbp,rdx + mov rdx,rcx + mov r12,QWORD PTR[((40+0))+rsi] + mov r13,QWORD PTR[((40+8))+rsi] + mov r14,QWORD PTR[((40+16))+rsi] + mov r15,QWORD PTR[((40+24))+rsi] + mov rcx,QWORD PTR[((40+32))+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r12,rdx + xor r13,rdx + xor r14,rdx + xor r15,rdx + xor rcx,rdx + add rax,r12 + adc r13,0 + adc r14,0 + adc r15,0 + adc rcx,0 + + mul rbx + mov r12,rax + mov rax,r13 + mov r13,rdx + mul rbx + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + mul rbx + add r14,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + and rcx,rbx + neg rcx + mul rbx + add r15,rax + adc rcx,rdx + add r8,r12 + adc r9,r13 + adc r10,r14 + adc r11,r15 + adc rbp,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],rbp + + DB 0F3h,0C3h ;repret +__smulq_256x63 ENDP + +ALIGN 32 +__smulq_256_n_shift_by_31 PROC PRIVATE + DB 243,15,30,250 + mov QWORD PTR[rdi],rdx + mov QWORD PTR[8+rdi],rcx + mov rbp,rdx + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + mov r11,QWORD PTR[((0+24))+rsi] + + mov rbx,rbp + sar rbp,63 + xor rax,rax + sub rax,rbp + + xor rbx,rbp + add rbx,rax + + xor r8,rbp + xor r9,rbp + xor r10,rbp + xor r11,rbp + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + + mul rbx + mov r8,rax + mov rax,r9 + and rbp,rbx + neg rbp + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbx + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbx + add r11,rax + adc rbp,rdx + mov r12,QWORD PTR[((32+0))+rsi] + mov r13,QWORD PTR[((32+8))+rsi] + mov r14,QWORD PTR[((32+16))+rsi] + mov r15,QWORD PTR[((32+24))+rsi] + + mov rbx,rcx + sar rcx,63 + xor rax,rax + sub rax,rcx + + xor rbx,rcx + add rbx,rax + + xor r12,rcx + xor r13,rcx + xor r14,rcx + xor r15,rcx + add rax,r12 + adc r13,0 + adc r14,0 + adc r15,0 + + mul rbx + mov r12,rax + mov rax,r13 + and rcx,rbx + neg rcx + mov r13,rdx + mul rbx + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + mul rbx + add r14,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + mul rbx + add r15,rax + adc rcx,rdx + add r8,r12 + adc r9,r13 + adc 
r10,r14 + adc r11,r15 + adc rbp,rcx + + mov rdx,QWORD PTR[rdi] + mov rcx,QWORD PTR[8+rdi] + + shrd r8,r9,31 + shrd r9,r10,31 + shrd r10,r11,31 + shrd r11,rbp,31 + + sar rbp,63 + xor rax,rax + sub rax,rbp + + xor r8,rbp + xor r9,rbp + xor r10,rbp + xor r11,rbp + add r8,rax + adc r9,0 + adc r10,0 + adc r11,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + xor rdx,rbp + xor rcx,rbp + add rdx,rax + add rcx,rax + + DB 0F3h,0C3h ;repret +__smulq_256_n_shift_by_31 ENDP + +ALIGN 32 +__ab_approximation_31_256 PROC PRIVATE + DB 243,15,30,250 + mov r9,QWORD PTR[24+rsi] + mov r11,QWORD PTR[56+rsi] + mov rbx,QWORD PTR[16+rsi] + mov rbp,QWORD PTR[48+rsi] + mov r8,QWORD PTR[8+rsi] + mov r10,QWORD PTR[40+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + mov r8,QWORD PTR[rsi] + cmovz rbp,r10 + mov r10,QWORD PTR[32+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + + mov rax,r9 + or rax,r11 + bsr rcx,rax + lea rcx,QWORD PTR[1+rcx] + cmovz r9,r8 + cmovz r11,r10 + cmovz rcx,rax + neg rcx + + + shld r9,rbx,cl + shld r11,rbp,cl + + mov eax,07FFFFFFFh + and r8,rax + and r10,rax + not rax + and r9,rax + and r11,rax + or r8,r9 + or r10,r11 + + jmp __inner_loop_31_256 + + DB 0F3h,0C3h ;repret +__ab_approximation_31_256 ENDP + +ALIGN 32 +__inner_loop_31_256 PROC PRIVATE + DB 243,15,30,250 + mov rcx,07FFFFFFF80000000h + mov r13,0800000007FFFFFFFh + mov r15,07FFFFFFF7FFFFFFFh + +$L$oop_31_256:: + cmp r8,r10 + mov rax,r8 + mov rbx,r10 + mov rbp,rcx + mov r14,r13 + cmovb r8,r10 + cmovb r10,rax + cmovb rcx,r13 + cmovb r13,rbp + + sub r8,r10 + sub rcx,r13 + add rcx,r15 + + test rax,1 + cmovz r8,rax + cmovz r10,rbx + cmovz rcx,rbp + cmovz r13,r14 + + shr r8,1 + add r13,r13 + sub r13,r15 + sub edx,1 + jnz $L$oop_31_256 + + shr r15,32 + mov edx,ecx + mov r12d,r13d + shr rcx,32 + shr r13,32 + sub rdx,r15 + sub rcx,r15 + sub r12,r15 + sub r13,r15 + + DB 0F3h,0C3h ;repret +__inner_loop_31_256 ENDP + + +ALIGN 32 +__inner_loop_62_256 PROC PRIVATE + DB 243,15,30,250 + mov r15d,edx + mov rdx,1 + xor rcx,rcx + xor r12,r12 + mov r13,rdx + mov r14,rdx + +$L$oop_62_256:: + xor rax,rax + test r8,r14 + mov rbx,r10 + cmovnz rax,r10 + sub rbx,r8 + mov rbp,r8 + sub r8,rax + cmovc r8,rbx + cmovc r10,rbp + mov rax,rdx + cmovc rdx,r12 + cmovc r12,rax + mov rbx,rcx + cmovc rcx,r13 + cmovc r13,rbx + xor rax,rax + xor rbx,rbx + shr r8,1 + test rbp,r14 + cmovnz rax,r12 + cmovnz rbx,r13 + add r12,r12 + add r13,r13 + sub rdx,rax + sub rcx,rbx + sub r15d,1 + jnz $L$oop_62_256 + + DB 0F3h,0C3h ;repret +__inner_loop_62_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_ct_inverse_mod_256 + DD imagerel $L$SEH_body_ct_inverse_mod_256 + DD imagerel $L$SEH_info_ct_inverse_mod_256_prologue + + DD imagerel $L$SEH_body_ct_inverse_mod_256 + DD imagerel $L$SEH_epilogue_ct_inverse_mod_256 + DD imagerel $L$SEH_info_ct_inverse_mod_256_body + + DD imagerel $L$SEH_epilogue_ct_inverse_mod_256 + DD imagerel $L$SEH_end_ct_inverse_mod_256 + DD imagerel $L$SEH_info_ct_inverse_mod_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_ct_inverse_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_ct_inverse_mod_256_body:: +DB 1,0,18,0 +DB 000h,0f4h,086h,000h +DB 000h,0e4h,087h,000h +DB 000h,0d4h,088h,000h +DB 000h,0c4h,089h,000h +DB 000h,034h,08ah,000h +DB 000h,054h,08bh,000h +DB 000h,074h,08dh,000h +DB 
000h,064h,08eh,000h +DB 000h,001h,08ch,000h +$L$SEH_info_ct_inverse_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/ct_inverse_mod_384-armv8.asm b/temp-deps/blst-ts/deps/blst/build/win64/ct_inverse_mod_384-armv8.asm new file mode 100644 index 0000000..4ab12e0 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/ct_inverse_mod_384-armv8.asm @@ -0,0 +1,718 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + EXPORT |ct_inverse_mod_383|[FUNC] + ALIGN 32 +|ct_inverse_mod_383| PROC + DCDU 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... + stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl |$Lab_approximation_62_loaded| + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to 
destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add 
x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + DCDU 3573752767 + ret + ENDP + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
+ + ALIGN 32 +|__smul_383x63| PROC + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret + ENDP + + + ALIGN 32 +|__smul_767x63_tail| PROC + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret + ENDP + + + ALIGN 32 +|__smul_383_n_shift_by_62| PROC + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + 
adcs x6, x6, xzr + umulh x23, x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret + ENDP + + ALIGN 16 +|__ab_approximation_62| PROC + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +|$Lab_approximation_62_loaded| + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x6 + orr x22, x8, x14 // ... ones before top-most, ... + cselne x13,x13,x12 + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x5 + orr x22, x8, x14 // ... and ones before that ... 
+ cselne x13,x13,x11 + + cmp x22, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x4 + orr x22, x8, x14 + cselne x13,x13,x10 + + clz x22, x22 + cmp x22, #64 + cselne x22,x22,xzr + cselne x8,x8,x7 + cselne x14,x14,x13 + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret + ENDP + + ALIGN 16 +|__inner_loop_62| PROC + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +|$Loop_62| + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + cselhs x9,x9,x3 + cselhs x14,x14,x8 + cselhs x3,x26,x24 + cselhs x8,x27,x25 + cselhs x15,x15,x17 + cselhs x17,x17,x22 + cselhs x16,x16,x19 + cselhs x19,x19,x23 + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, |$Loop_62| + + ret + ENDP + END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/ct_is_square_mod_384-armv8.asm b/temp-deps/blst-ts/deps/blst/build/win64/ct_is_square_mod_384-armv8.asm new file mode 100644 index 0000000..ab72328 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/ct_is_square_mod_384-armv8.asm @@ -0,0 +1,325 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + EXPORT |ct_is_square_mod_384|[FUNC] + ALIGN 32 +|ct_is_square_mod_384| PROC + DCDU 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the |$Legendre| symbol + mov x15, #24 // 24 is 768/30-1 + b |$Loop_is_square| + + ALIGN 16 +|$Loop_is_square| + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, |$Loop_is_square| + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // and loaded + //ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__smul_384_n_shift_by_30| PROC + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr x7, x8, x7, #30 + extr x8, x9, x8, #30 + + eor x3, x3, x27 + eor x4, x4, x27 + adds 
x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret + ENDP + + ALIGN 16 +|__ab_approximation_30| PROC + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x6 + orr x21, x8, x14 // ... ones before top-most, ... + cselne x13,x13,x12 + + cmp x21, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x5 + orr x21, x8, x14 // ... and ones before that ... + cselne x13,x13,x11 + + cmp x21, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x4 + orr x21, x8, x14 // and one more, ... + cselne x13,x13,x10 + + cmp x21, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x3 + orr x21, x8, x14 + cselne x13,x13,x9 + + clz x21, x21 + cmp x21, #64 + cselne x21,x21,xzr + cselne x8,x8,x7 + cselne x14,x14,x13 + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret + ENDP + + + ALIGN 16 +|__inner_loop_30| PROC + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +|$Loop_30| + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + cselhs x14,x14,x8 + cselhs x8,x23,x22 + cselhs x20,x20,x17 + cselhs x17,x17,x21 + cselhs x2,x2,x25 + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, |$Loop_30| + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret + ENDP + + ALIGN 16 +|__inner_loop_48| PROC +|$Loop_48| + sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x3, x9 + sub x15, x15, #1 + and x21, x9, x24 + sub x22, x9, x3 // |b_|-|a_| + subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + cselhs x9,x9,x3 + cselhs x3,x23,x22 + cselhs x2,x2,x25 + add x23, x9, #2 + lsr x3, x3, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, |$Loop_48| + + ret + ENDP + END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/ct_is_square_mod_384-x86_64.asm b/temp-deps/blst-ts/deps/blst/build/win64/ct_is_square_mod_384-x86_64.asm new file mode 100644 index 0000000..38de6fc --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/ct_is_square_mod_384-x86_64.asm @@ -0,0 +1,509 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC ct_is_square_mod_384 + +ALIGN 32 +ct_is_square_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp 
+$L$SEH_begin_ct_is_square_mod_384:: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,536 + +$L$SEH_body_ct_is_square_mod_384:: + + + lea rax,QWORD PTR[((24+255))+rsp] + and rax,-256 + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov r12,QWORD PTR[32+rdi] + mov r13,QWORD PTR[40+rdi] + + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rbx,QWORD PTR[16+rsi] + mov rcx,QWORD PTR[24+rsi] + mov rdx,QWORD PTR[32+rsi] + mov rdi,QWORD PTR[40+rsi] + mov rsi,rax + + mov QWORD PTR[rax],r8 + mov QWORD PTR[8+rax],r9 + mov QWORD PTR[16+rax],r10 + mov QWORD PTR[24+rax],r11 + mov QWORD PTR[32+rax],r12 + mov QWORD PTR[40+rax],r13 + + mov QWORD PTR[48+rax],r14 + mov QWORD PTR[56+rax],r15 + mov QWORD PTR[64+rax],rbx + mov QWORD PTR[72+rax],rcx + mov QWORD PTR[80+rax],rdx + mov QWORD PTR[88+rax],rdi + + xor rbp,rbp + mov ecx,24 + jmp $L$oop_is_square + +ALIGN 32 +$L$oop_is_square:: + mov DWORD PTR[16+rsp],ecx + + call __ab_approximation_30 + mov QWORD PTR[rsp],rax + mov QWORD PTR[8+rsp],rbx + + mov rdi,128+8*6 + xor rdi,rsi + call __smulq_384_n_shift_by_30 + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-48))+rdi] + call __smulq_384_n_shift_by_30 + + mov ecx,DWORD PTR[16+rsp] + xor rsi,128 + + and r14,QWORD PTR[48+rdi] + shr r14,1 + add rbp,r14 + + sub ecx,1 + jnz $L$oop_is_square + + + + + mov r9,QWORD PTR[48+rsi] + call __inner_loop_48 + + mov rax,1 + and rax,rbp + xor rax,1 + + lea r8,QWORD PTR[536+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_ct_is_square_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_ct_is_square_mod_384:: +ct_is_square_mod_384 ENDP + + +ALIGN 32 +__smulq_384_n_shift_by_30 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mov r14,rdx + and r14,rbx + mul rbx + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbx + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbx + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbx + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + neg r14 + mul rbx + add r13,rax + adc r14,rdx + lea rsi,QWORD PTR[48+rsi] + mov rdx,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mov r15,rdx + and r15,rbx + mul rbx + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbx + add r9,rax + 
mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbx + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbx + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbx + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + neg r15 + mul rbx + add r13,rax + adc r15,rdx + lea rsi,QWORD PTR[((-48))+rsi] + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc r13,QWORD PTR[40+rdi] + adc r14,r15 + + shrd r8,r9,30 + shrd r9,r10,30 + shrd r10,r11,30 + shrd r11,r12,30 + shrd r12,r13,30 + shrd r13,r14,30 + + sar r14,63 + xor rbx,rbx + sub rbx,r14 + + xor r8,r14 + xor r9,r14 + xor r10,r14 + xor r11,r14 + xor r12,r14 + xor r13,r14 + add r8,rbx + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__smulq_384_n_shift_by_30 ENDP + +ALIGN 32 +__ab_approximation_30 PROC PRIVATE + DB 243,15,30,250 + mov rbx,QWORD PTR[88+rsi] + mov r15,QWORD PTR[80+rsi] + mov r14,QWORD PTR[72+rsi] + + mov rax,r13 + or rax,rbx + cmovz r13,r12 + cmovz rbx,r15 + cmovz r12,r11 + mov r11,QWORD PTR[64+rsi] + cmovz r15,r14 + + mov rax,r13 + or rax,rbx + cmovz r13,r12 + cmovz rbx,r15 + cmovz r12,r10 + mov r10,QWORD PTR[56+rsi] + cmovz r15,r11 + + mov rax,r13 + or rax,rbx + cmovz r13,r12 + cmovz rbx,r15 + cmovz r12,r9 + mov r9,QWORD PTR[48+rsi] + cmovz r15,r10 + + mov rax,r13 + or rax,rbx + cmovz r13,r12 + cmovz rbx,r15 + cmovz r12,r8 + cmovz r15,r9 + + mov rax,r13 + or rax,rbx + bsr rcx,rax + lea rcx,QWORD PTR[1+rcx] + cmovz r13,r8 + cmovz rbx,r9 + cmovz rcx,rax + neg rcx + + + shld r13,r12,cl + shld rbx,r15,cl + + mov rax,0FFFFFFFF00000000h + mov r8d,r8d + mov r9d,r9d + and r13,rax + and rbx,rax + or r8,r13 + or r9,rbx + + jmp __inner_loop_30 + + DB 0F3h,0C3h ;repret +__ab_approximation_30 ENDP + +ALIGN 32 +__inner_loop_30 PROC PRIVATE + DB 243,15,30,250 + mov rbx,07FFFFFFF80000000h + mov rcx,0800000007FFFFFFFh + lea r15,QWORD PTR[((-1))+rbx] + mov edi,30 + +$L$oop_30:: + mov rax,r8 + and rax,r9 + shr rax,1 + + cmp r8,r9 + mov r10,r8 + mov r11,r9 + lea rax,QWORD PTR[rbp*1+rax] + mov r12,rbx + mov r13,rcx + mov r14,rbp + cmovb r8,r9 + cmovb r9,r10 + cmovb rbx,rcx + cmovb rcx,r12 + cmovb rbp,rax + + sub r8,r9 + sub rbx,rcx + add rbx,r15 + + test r10,1 + cmovz r8,r10 + cmovz r9,r11 + cmovz rbx,r12 + cmovz rcx,r13 + cmovz rbp,r14 + + lea rax,QWORD PTR[2+r9] + shr r8,1 + shr rax,2 + add rcx,rcx + lea rbp,QWORD PTR[rbp*1+rax] + sub rcx,r15 + + sub edi,1 + jnz $L$oop_30 + + shr r15,32 + mov eax,ebx + shr rbx,32 + mov edx,ecx + shr rcx,32 + sub rax,r15 + sub rbx,r15 + sub rdx,r15 + sub rcx,r15 + + DB 0F3h,0C3h ;repret +__inner_loop_30 ENDP + + +ALIGN 32 +__inner_loop_48 PROC PRIVATE + DB 243,15,30,250 + mov edi,48 + +$L$oop_48:: + mov rax,r8 + and rax,r9 + shr rax,1 + + cmp r8,r9 + mov r10,r8 + mov r11,r9 + lea rax,QWORD PTR[rbp*1+rax] + mov r12,rbp + cmovb r8,r9 + cmovb r9,r10 + cmovb rbp,rax + + sub r8,r9 + + test r10,1 + cmovz r8,r10 + cmovz r9,r11 + cmovz rbp,r12 + + lea rax,QWORD PTR[2+r9] + shr r8,1 + shr rax,2 + add rbp,rax + + sub edi,1 + jnz $L$oop_48 + + DB 0F3h,0C3h ;repret +__inner_loop_48 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_ct_is_square_mod_384 + DD imagerel $L$SEH_body_ct_is_square_mod_384 + DD imagerel $L$SEH_info_ct_is_square_mod_384_prologue + + DD imagerel 
$L$SEH_body_ct_is_square_mod_384 + DD imagerel $L$SEH_epilogue_ct_is_square_mod_384 + DD imagerel $L$SEH_info_ct_is_square_mod_384_body + + DD imagerel $L$SEH_epilogue_ct_is_square_mod_384 + DD imagerel $L$SEH_end_ct_is_square_mod_384 + DD imagerel $L$SEH_info_ct_is_square_mod_384_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_ct_is_square_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_ct_is_square_mod_384_body:: +DB 1,0,18,0 +DB 000h,0f4h,043h,000h +DB 000h,0e4h,044h,000h +DB 000h,0d4h,045h,000h +DB 000h,0c4h,046h,000h +DB 000h,034h,047h,000h +DB 000h,054h,048h,000h +DB 000h,074h,04ah,000h +DB 000h,064h,04bh,000h +DB 000h,001h,049h,000h +$L$SEH_info_ct_is_square_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/ctq_inverse_mod_384-x86_64.asm b/temp-deps/blst-ts/deps/blst/build/win64/ctq_inverse_mod_384-x86_64.asm new file mode 100644 index 0000000..de79f8e --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/ctq_inverse_mod_384-x86_64.asm @@ -0,0 +1,1224 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC ct_inverse_mod_383 + +ALIGN 32 +ct_inverse_mod_383 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_ct_inverse_mod_383:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,1112 + +$L$SEH_body_ct_inverse_mod_383:: + + + lea rax,QWORD PTR[((88+511))+rsp] + and rax,-512 + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[40+rsp],rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,QWORD PTR[rdx] + mov r15,QWORD PTR[8+rdx] + mov rbx,QWORD PTR[16+rdx] + mov rbp,QWORD PTR[24+rdx] + mov rsi,QWORD PTR[32+rdx] + mov rdi,QWORD PTR[40+rdx] + + mov QWORD PTR[rax],r8 + mov QWORD PTR[8+rax],r9 + mov QWORD PTR[16+rax],r10 + mov QWORD PTR[24+rax],r11 + mov QWORD PTR[32+rax],r12 + mov QWORD PTR[40+rax],r13 + + mov QWORD PTR[48+rax],r14 + mov QWORD PTR[56+rax],r15 + mov QWORD PTR[64+rax],rbx + mov QWORD PTR[72+rax],rbp + mov QWORD PTR[80+rax],rsi + mov rsi,rax + mov QWORD PTR[88+rax],rdi + + + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + + + mov QWORD PTR[96+rdi],rdx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + + + mov QWORD PTR[96+rdi],rdx + + + xor rsi,256 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + + + + mov rax,QWORD PTR[96+rsi] + mov r11,QWORD PTR[144+rsi] + mov rbx,rdx + mov r10,rax + imul QWORD PTR[56+rsp] + mov r8,rax + mov rax,r11 + mov r9,rdx + imul QWORD PTR[64+rsp] + add r8,rax + adc r9,rdx + mov QWORD PTR[48+rdi],r8 + mov QWORD PTR[56+rdi],r9 + sar r9,63 + mov QWORD PTR[64+rdi],r9 + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r9 + mov QWORD PTR[88+rdi],r9 + lea rsi,QWORD PTR[96+rsi] + + 
mov rax,r10 + imul rbx + mov r8,rax + mov rax,r11 + mov r9,rdx + imul rcx + add r8,rax + adc r9,rdx + mov QWORD PTR[96+rdi],r8 + mov QWORD PTR[104+rdi],r9 + sar r9,63 + mov QWORD PTR[112+rdi],r9 + mov QWORD PTR[120+rdi],r9 + mov QWORD PTR[128+rdi],r9 + mov QWORD PTR[136+rdi],r9 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + sar r13,63 + mov QWORD PTR[48+rdi],r13 + mov QWORD PTR[56+rdi],r13 + mov QWORD PTR[64+rdi],r13 + mov QWORD PTR[72+rdi],r13 + mov QWORD PTR[80+rdi],r13 + mov QWORD PTR[88+rdi],r13 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov 
rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + + xor rsi,256+8*12 + mov edi,62 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[48+rsi] + mov r11,QWORD PTR[56+rsi] + call __inner_loop_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + mov QWORD PTR[rdi],r8 + mov QWORD PTR[48+rdi],r10 + + + + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[96+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + + + xor rsi,256+8*12 + mov edi,22 + + mov r8,QWORD PTR[rsi] + xor r9,r9 + mov r10,QWORD PTR[48+rsi] + xor r11,r11 + call __inner_loop_62 + + + + + + + + lea rsi,QWORD PTR[96+rsi] + + + + + + mov rdx,r12 + mov 
rcx,r13 + mov rdi,QWORD PTR[32+rsp] + call __smulq_767x63 + + mov rsi,QWORD PTR[40+rsp] + mov rdx,rax + sar rax,63 + + mov r8,rax + mov r9,rax + mov r10,rax + and r8,QWORD PTR[rsi] + and r9,QWORD PTR[8+rsi] + mov r11,rax + and r10,QWORD PTR[16+rsi] + and r11,QWORD PTR[24+rsi] + mov r12,rax + and r12,QWORD PTR[32+rsi] + and rax,QWORD PTR[40+rsi] + + add r14,r8 + adc r15,r9 + adc rbx,r10 + adc rbp,r11 + adc rcx,r12 + adc rdx,rax + + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + mov QWORD PTR[64+rdi],rbx + mov QWORD PTR[72+rdi],rbp + mov QWORD PTR[80+rdi],rcx + mov QWORD PTR[88+rdi],rdx + + lea r8,QWORD PTR[1112+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_ct_inverse_mod_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_ct_inverse_mod_383:: +ct_inverse_mod_383 ENDP + +ALIGN 32 +__smulq_767x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rsi + lea rsi,QWORD PTR[48+rsi] + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov QWORD PTR[rdi],rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mov QWORD PTR[8+rdi],r9 + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mov QWORD PTR[16+rdi],r10 + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mov QWORD PTR[24+rdi],r11 + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + mov QWORD PTR[32+rdi],r12 + imul rbp + add r13,rax + adc rdx,0 + + mov QWORD PTR[40+rdi],r13 + mov QWORD PTR[48+rdi],rdx + sar rdx,63 + mov QWORD PTR[56+rdi],rdx + mov rdx,rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + mov r15,QWORD PTR[56+rsi] + mov rbx,QWORD PTR[64+rsi] + mov rbp,QWORD PTR[72+rsi] + mov rcx,QWORD PTR[80+rsi] + mov rdi,QWORD PTR[88+rsi] + + mov rsi,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rsi,rdx + add rsi,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + xor r14,rdx + xor r15,rdx + xor rbx,rdx + xor rbp,rdx + xor rcx,rdx + xor rdi,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + adc rbx,0 + adc rbp,0 + adc rcx,0 + adc rdi,0 + + mul rsi + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rsi + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rsi + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rsi + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rsi + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + mul rsi + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + mul rsi + add r14,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + mul rsi + add r15,rax + mov rax,rbx + adc rdx,0 + mov rbx,rdx + mul rsi + add rbx,rax + mov rax,rbp + adc rdx,0 + mov rbp,rdx + mul rsi + add rbp,rax + mov rax,rcx + adc rdx,0 + mov rcx,rdx + mul rsi + add rcx,rax + mov rax,rdi + adc 
rdx,0 + mov rdi,rdx + mov rdx,QWORD PTR[8+rsp] + imul rax,rsi + mov rsi,QWORD PTR[16+rsp] + add rax,rdi + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + adc r11,QWORD PTR[24+rdx] + adc r12,QWORD PTR[32+rdx] + adc r13,QWORD PTR[40+rdx] + adc r14,QWORD PTR[48+rdx] + mov rdi,QWORD PTR[56+rdx] + adc r15,rdi + adc rbx,rdi + adc rbp,rdi + adc rcx,rdi + adc rax,rdi + + mov rdi,rdx + + mov QWORD PTR[rdx],r8 + mov QWORD PTR[8+rdx],r9 + mov QWORD PTR[16+rdx],r10 + mov QWORD PTR[24+rdx],r11 + mov QWORD PTR[32+rdx],r12 + mov QWORD PTR[40+rdx],r13 + mov QWORD PTR[48+rdx],r14 + mov QWORD PTR[56+rdx],r15 + mov QWORD PTR[64+rdx],rbx + mov QWORD PTR[72+rdx],rbp + mov QWORD PTR[80+rdx],rcx + mov QWORD PTR[88+rdx],rax + + DB 0F3h,0C3h ;repret +__smulq_767x63 ENDP + +ALIGN 32 +__smulq_383x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + imul rax,rbp + add r13,rax + + lea rsi,QWORD PTR[48+rsi] + mov rdx,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + imul rax,rbp + add r13,rax + + lea rsi,QWORD PTR[((-48))+rsi] + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc r13,QWORD PTR[40+rdi] + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__smulq_383x63 ENDP + +ALIGN 32 +__smulq_383_n_shift_by_62 PROC PRIVATE + DB 243,15,30,250 + mov rbx,rdx + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul 
rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + imul rbp + add r13,rax + adc rdx,0 + + lea rsi,QWORD PTR[48+rsi] + mov r14,rdx + mov rdx,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + imul rbp + add r13,rax + adc rdx,0 + + lea rsi,QWORD PTR[((-48))+rsi] + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc r13,QWORD PTR[40+rdi] + adc r14,rdx + mov rdx,rbx + + shrd r8,r9,62 + shrd r9,r10,62 + shrd r10,r11,62 + shrd r11,r12,62 + shrd r12,r13,62 + shrd r13,r14,62 + + sar r14,63 + xor rbp,rbp + sub rbp,r14 + + xor r8,r14 + xor r9,r14 + xor r10,r14 + xor r11,r14 + xor r12,r14 + xor r13,r14 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + xor rdx,r14 + xor rcx,r14 + add rdx,rbp + add rcx,rbp + + DB 0F3h,0C3h ;repret +__smulq_383_n_shift_by_62 ENDP + +ALIGN 32 +__ab_approximation_62 PROC PRIVATE + DB 243,15,30,250 + mov r9,QWORD PTR[40+rsi] + mov r11,QWORD PTR[88+rsi] + mov rbx,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[80+rsi] + mov r8,QWORD PTR[24+rsi] + mov r10,QWORD PTR[72+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + mov r8,QWORD PTR[16+rsi] + mov r10,QWORD PTR[64+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + mov r8,QWORD PTR[8+rsi] + mov r10,QWORD PTR[56+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + mov r8,QWORD PTR[rsi] + mov r10,QWORD PTR[48+rsi] + + mov rax,r9 + or rax,r11 + bsr rcx,rax + lea rcx,QWORD PTR[1+rcx] + cmovz r9,rbx + cmovz r11,rbp + cmovz rcx,rax + neg rcx + + + shld r9,rbx,cl + shld r11,rbp,cl + + jmp __inner_loop_62 + + DB 0F3h,0C3h ;repret +__ab_approximation_62 ENDP + +ALIGN 8 + DD 0 +__inner_loop_62 PROC PRIVATE + DB 243,15,30,250 + mov rdx,1 + xor rcx,rcx + xor r12,r12 + mov r13,1 + mov QWORD PTR[8+rsp],rsi + +$L$oop_62:: + xor rax,rax + xor rbx,rbx + test r8,1 + mov rbp,r10 + mov r14,r11 + cmovnz rax,r10 + cmovnz rbx,r11 + sub rbp,r8 + sbb r14,r9 + mov r15,r8 + mov rsi,r9 + sub r8,rax + sbb r9,rbx + cmovc r8,rbp + cmovc r9,r14 + cmovc r10,r15 + cmovc r11,rsi + mov rax,rdx + cmovc rdx,r12 + cmovc r12,rax + mov rbx,rcx + cmovc rcx,r13 + cmovc r13,rbx + xor rax,rax + xor rbx,rbx + shrd r8,r9,1 + shr r9,1 + test r15,1 + cmovnz rax,r12 + cmovnz rbx,r13 + add r12,r12 + add r13,r13 + sub rdx,rax + sub 
rcx,rbx + sub edi,1 + jnz $L$oop_62 + + mov rsi,QWORD PTR[8+rsp] + DB 0F3h,0C3h ;repret +__inner_loop_62 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_ct_inverse_mod_383 + DD imagerel $L$SEH_body_ct_inverse_mod_383 + DD imagerel $L$SEH_info_ct_inverse_mod_383_prologue + + DD imagerel $L$SEH_body_ct_inverse_mod_383 + DD imagerel $L$SEH_epilogue_ct_inverse_mod_383 + DD imagerel $L$SEH_info_ct_inverse_mod_383_body + + DD imagerel $L$SEH_epilogue_ct_inverse_mod_383 + DD imagerel $L$SEH_end_ct_inverse_mod_383 + DD imagerel $L$SEH_info_ct_inverse_mod_383_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_ct_inverse_mod_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_ct_inverse_mod_383_body:: +DB 1,0,18,0 +DB 000h,0f4h,08bh,000h +DB 000h,0e4h,08ch,000h +DB 000h,0d4h,08dh,000h +DB 000h,0c4h,08eh,000h +DB 000h,034h,08fh,000h +DB 000h,054h,090h,000h +DB 000h,074h,092h,000h +DB 000h,064h,093h,000h +DB 000h,001h,091h,000h +$L$SEH_info_ct_inverse_mod_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/ctx_inverse_mod_384-x86_64.asm b/temp-deps/blst-ts/deps/blst/build/win64/ctx_inverse_mod_384-x86_64.asm new file mode 100644 index 0000000..df4c46a --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/ctx_inverse_mod_384-x86_64.asm @@ -0,0 +1,1597 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC ctx_inverse_mod_383 + +ALIGN 32 +ctx_inverse_mod_383 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_ctx_inverse_mod_383:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,1112 + +$L$SEH_body_ctx_inverse_mod_383:: + + + lea rax,QWORD PTR[((88+511))+rsp] + and rax,-512 + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[40+rsp],rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,QWORD PTR[rdx] + mov r15,QWORD PTR[8+rdx] + mov rbx,QWORD PTR[16+rdx] + mov rbp,QWORD PTR[24+rdx] + mov rsi,QWORD PTR[32+rdx] + mov rdi,QWORD PTR[40+rdx] + + mov QWORD PTR[rax],r8 + mov QWORD PTR[8+rax],r9 + mov QWORD PTR[16+rax],r10 + mov QWORD PTR[24+rax],r11 + mov QWORD PTR[32+rax],r12 + mov QWORD PTR[40+rax],r13 + + mov QWORD PTR[48+rax],r14 + mov QWORD PTR[56+rax],r15 + mov QWORD PTR[64+rax],rbx + mov QWORD PTR[72+rax],rbp + mov QWORD PTR[80+rax],rsi + mov rsi,rax + mov QWORD PTR[88+rax],rdi + + + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + + + mov QWORD PTR[96+rdi],rdx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + + + mov QWORD PTR[96+rdi],rdx + + + xor rsi,256 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + + + + mov rax,QWORD PTR[96+rsi] + mov r11,QWORD PTR[144+rsi] + mov rbx,rdx + mov r10,rax + imul 
QWORD PTR[56+rsp] + mov r8,rax + mov rax,r11 + mov r9,rdx + imul QWORD PTR[64+rsp] + add r8,rax + adc r9,rdx + mov QWORD PTR[48+rdi],r8 + mov QWORD PTR[56+rdi],r9 + sar r9,63 + mov QWORD PTR[64+rdi],r9 + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r9 + mov QWORD PTR[88+rdi],r9 + lea rsi,QWORD PTR[96+rsi] + + mov rax,r10 + imul rbx + mov r8,rax + mov rax,r11 + mov r9,rdx + imul rcx + add r8,rax + adc r9,rdx + mov QWORD PTR[96+rdi],r8 + mov QWORD PTR[104+rdi],r9 + sar r9,63 + mov QWORD PTR[112+rdi],r9 + mov QWORD PTR[120+rdi],r9 + mov QWORD PTR[128+rdi],r9 + mov QWORD PTR[136+rdi],r9 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD 
PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD 
PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + sar r13,63 + mov QWORD PTR[48+rdi],r13 + mov QWORD PTR[56+rdi],r13 + mov QWORD PTR[64+rdi],r13 + mov QWORD PTR[72+rdi],r13 + mov QWORD PTR[80+rdi],r13 + mov QWORD PTR[88+rdi],r13 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call 
__smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_191_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_191_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_191_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_191_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_191_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_191_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_191_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + 
mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_191_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + + xor rsi,256+8*12 + mov edi,53 + + mov r8,QWORD PTR[rsi] + + mov r10,QWORD PTR[48+rsi] + + call __inner_loop_62 + + + + + + + + lea rsi,QWORD PTR[96+rsi] + + + + + + mov rdx,r12 + mov rcx,r13 + mov rdi,QWORD PTR[32+rsp] + call __smulx_767x63 + + mov rsi,QWORD PTR[40+rsp] + mov rdx,rax + sar rax,63 + + mov r8,rax + mov r9,rax + mov r10,rax + and r8,QWORD PTR[rsi] + and r9,QWORD PTR[8+rsi] + mov r11,rax + and r10,QWORD PTR[16+rsi] + and r11,QWORD PTR[24+rsi] + mov r12,rax + and r12,QWORD PTR[32+rsi] + and rax,QWORD PTR[40+rsi] + + add r14,r8 + adc r15,r9 + adc rbx,r10 + adc rbp,r11 + adc rcx,r12 + adc rdx,rax + + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + mov QWORD PTR[64+rdi],rbx + mov QWORD PTR[72+rdi],rbp + mov QWORD PTR[80+rdi],rcx + mov QWORD PTR[88+rdi],rdx + + lea r8,QWORD PTR[1112+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_ctx_inverse_mod_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_ctx_inverse_mod_383:: +ctx_inverse_mod_383 ENDP + +ALIGN 32 +__smulx_767x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rsi + lea rsi,QWORD PTR[48+rsi] + + xor rdx,rax + add rdx,rbp + + xor r8,rax + xor r9,rax + xor r10,rax + xor r11,rax + xor r12,rax + xor rax,r13 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc rax,0 + + mulx rbp,r8,r8 + mulx r13,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,r13 + mulx r13,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,r13 + adc rbp,0 + imul rdx + add rax,rbp + adc rdx,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],rax + mov QWORD PTR[48+rdi],rdx + sar rdx,63 + mov QWORD PTR[56+rdi],rdx + mov rdx,rcx + mov rax,rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + mov r15,QWORD PTR[56+rsi] + mov rbx,QWORD PTR[64+rsi] + mov rbp,QWORD PTR[72+rsi] + mov rcx,QWORD PTR[80+rsi] + mov rdi,QWORD PTR[88+rsi] + + sar rax,63 + xor rsi,rsi + sub rsi,rax + + xor rdx,rax + add rdx,rsi + + xor r8,rax + xor r9,rax + xor r10,rax + xor r11,rax + xor r12,rax + xor r13,rax + xor r14,rax + xor r15,rax + xor rbx,rax + xor rbp,rax + xor rcx,rax + xor rdi,rax + add r8,rsi + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + adc rbx,0 + adc rbp,0 + adc rcx,0 + adc rdi,0 + + mulx rax,r8,r8 + mulx rsi,r9,r9 + add r9,rax + mulx rax,r10,r10 + adc r10,rsi + mulx rsi,r11,r11 + adc r11,rax + mulx rax,r12,r12 + adc r12,rsi + mulx rsi,r13,r13 + adc r13,rax + mulx rax,r14,r14 
+ adc r14,rsi + mulx rsi,r15,r15 + adc r15,rax + mulx rax,rbx,rbx + adc rbx,rsi + mulx rsi,rbp,rbp + adc rbp,rax + mulx rax,rcx,rcx + adc rcx,rsi + mulx rsi,rdi,rdi + mov rdx,QWORD PTR[8+rsp] + mov rsi,QWORD PTR[16+rsp] + adc rax,rdi + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + adc r11,QWORD PTR[24+rdx] + adc r12,QWORD PTR[32+rdx] + adc r13,QWORD PTR[40+rdx] + adc r14,QWORD PTR[48+rdx] + mov rdi,QWORD PTR[56+rdx] + adc r15,rdi + adc rbx,rdi + adc rbp,rdi + adc rcx,rdi + adc rax,rdi + + mov rdi,rdx + + mov QWORD PTR[rdx],r8 + mov QWORD PTR[8+rdx],r9 + mov QWORD PTR[16+rdx],r10 + mov QWORD PTR[24+rdx],r11 + mov QWORD PTR[32+rdx],r12 + mov QWORD PTR[40+rdx],r13 + mov QWORD PTR[48+rdx],r14 + mov QWORD PTR[56+rdx],r15 + mov QWORD PTR[64+rdx],rbx + mov QWORD PTR[72+rdx],rbp + mov QWORD PTR[80+rdx],rcx + mov QWORD PTR[88+rdx],rax + + DB 0F3h,0C3h ;repret +__smulx_767x63 ENDP + +ALIGN 32 +__smulx_383x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + mov r11,QWORD PTR[((0+24))+rsi] + mov r12,QWORD PTR[((0+32))+rsi] + mov r13,QWORD PTR[((0+40))+rsi] + + mov rbp,rdx + sar rbp,63 + xor rax,rax + sub rax,rbp + + xor rdx,rbp + add rdx,rax + + xor r8,rbp + xor r9,rbp + xor r10,rbp + xor r11,rbp + xor r12,rbp + xor r13,rbp + add r8,rax + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mulx rbp,r8,r8 + mulx rax,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,rax + mulx rax,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,rax + mulx rax,r13,r13 + mov rdx,rcx + adc r13,rbp + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov r8,QWORD PTR[((48+0))+rsi] + mov r9,QWORD PTR[((48+8))+rsi] + mov r10,QWORD PTR[((48+16))+rsi] + mov r11,QWORD PTR[((48+24))+rsi] + mov r12,QWORD PTR[((48+32))+rsi] + mov r13,QWORD PTR[((48+40))+rsi] + + mov rbp,rdx + sar rbp,63 + xor rax,rax + sub rax,rbp + + xor rdx,rbp + add rdx,rax + + xor r8,rbp + xor r9,rbp + xor r10,rbp + xor r11,rbp + xor r12,rbp + xor r13,rbp + add r8,rax + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mulx rbp,r8,r8 + mulx rax,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,rax + mulx rax,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,rax + mulx rax,r13,r13 + adc r13,rbp + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc r13,QWORD PTR[40+rdi] + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__smulx_383x63 ENDP + +ALIGN 32 +__smulx_383_n_shift_by_31 PROC PRIVATE + DB 243,15,30,250 + mov rbx,rdx + xor r14,r14 + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + mov r11,QWORD PTR[((0+24))+rsi] + mov r12,QWORD PTR[((0+32))+rsi] + mov r13,QWORD PTR[((0+40))+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + xor rdx,rax + add rdx,rbp + + xor r8,rax + xor r9,rax + xor r10,rax + xor r11,rax + xor r12,rax + xor rax,r13 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc rax,0 + + mulx rbp,r8,r8 + mulx r13,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,r13 + mulx r13,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,r13 + adc rbp,0 + imul rdx + add rax,rbp + adc r14,rdx + + mov rdx,rcx + + mov QWORD 
PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],rax + mov r8,QWORD PTR[((48+0))+rsi] + mov r9,QWORD PTR[((48+8))+rsi] + mov r10,QWORD PTR[((48+16))+rsi] + mov r11,QWORD PTR[((48+24))+rsi] + mov r12,QWORD PTR[((48+32))+rsi] + mov r13,QWORD PTR[((48+40))+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + xor rdx,rax + add rdx,rbp + + xor r8,rax + xor r9,rax + xor r10,rax + xor r11,rax + xor r12,rax + xor rax,r13 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc rax,0 + + mulx rbp,r8,r8 + mulx r13,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,r13 + mulx r13,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,r13 + adc rbp,0 + imul rdx + add rax,rbp + adc rdx,0 + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc rax,QWORD PTR[40+rdi] + adc r14,rdx + mov rdx,rbx + + shrd r8,r9,31 + shrd r9,r10,31 + shrd r10,r11,31 + shrd r11,r12,31 + shrd r12,rax,31 + shrd rax,r14,31 + + sar r14,63 + xor rbp,rbp + sub rbp,r14 + + xor r8,r14 + xor r9,r14 + xor r10,r14 + xor r11,r14 + xor r12,r14 + xor rax,r14 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc rax,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],rax + + xor rdx,r14 + xor rcx,r14 + add rdx,rbp + add rcx,rbp + + DB 0F3h,0C3h ;repret +__smulx_383_n_shift_by_31 ENDP + +ALIGN 32 +__smulx_191_n_shift_by_31 PROC PRIVATE + DB 243,15,30,250 + mov rbx,rdx + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + xor rdx,rax + add rdx,rbp + + xor r8,rax + xor r9,rax + xor rax,r10 + add r8,rbp + adc r9,0 + adc rax,0 + + mulx rbp,r8,r8 + mulx r10,r9,r9 + add r9,rbp + adc r10,0 + imul rdx + add r10,rax + adc rdx,0 + mov r14,rdx + mov rdx,rcx + mov r11,QWORD PTR[((48+0))+rsi] + mov r12,QWORD PTR[((48+8))+rsi] + mov r13,QWORD PTR[((48+16))+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + xor rdx,rax + add rdx,rbp + + xor r11,rax + xor r12,rax + xor rax,r13 + add r11,rbp + adc r12,0 + adc rax,0 + + mulx rbp,r11,r11 + mulx r13,r12,r12 + add r12,rbp + adc r13,0 + imul rdx + add r13,rax + adc rdx,0 + add r11,r8 + adc r12,r9 + adc r13,r10 + adc r14,rdx + mov rdx,rbx + + shrd r11,r12,31 + shrd r12,r13,31 + shrd r13,r14,31 + + sar r14,63 + xor rbp,rbp + sub rbp,r14 + + xor r11,r14 + xor r12,r14 + xor r13,r14 + add r11,rbp + adc r12,0 + adc r13,0 + + mov QWORD PTR[rdi],r11 + mov QWORD PTR[8+rdi],r12 + mov QWORD PTR[16+rdi],r13 + + xor rdx,r14 + xor rcx,r14 + add rdx,rbp + add rcx,rbp + + DB 0F3h,0C3h ;repret +__smulx_191_n_shift_by_31 ENDP + +ALIGN 32 +__ab_approximation_31 PROC PRIVATE + DB 243,15,30,250 + mov r9,QWORD PTR[40+rsi] + mov r11,QWORD PTR[88+rsi] + mov rbx,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[80+rsi] + mov r8,QWORD PTR[24+rsi] + mov r10,QWORD PTR[72+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + mov r8,QWORD PTR[16+rsi] + cmovz rbp,r10 + mov r10,QWORD PTR[64+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + mov r8,QWORD PTR[8+rsi] + cmovz rbp,r10 + mov r10,QWORD PTR[56+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + mov r8,QWORD PTR[rsi] + cmovz rbp,r10 + mov r10,QWORD PTR[48+rsi] + + mov rax,r9 + or 
rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + + mov rax,r9 + or rax,r11 + bsr rcx,rax + lea rcx,QWORD PTR[1+rcx] + cmovz r9,r8 + cmovz r11,r10 + cmovz rcx,rax + neg rcx + + + shld r9,rbx,cl + shld r11,rbp,cl + + mov eax,07FFFFFFFh + and r8,rax + and r10,rax + andn r9,rax,r9 + andn r11,rax,r11 + or r8,r9 + or r10,r11 + + jmp __inner_loop_31 + + DB 0F3h,0C3h ;repret +__ab_approximation_31 ENDP + +ALIGN 32 +__inner_loop_31 PROC PRIVATE + DB 243,15,30,250 + mov rcx,07FFFFFFF80000000h + mov r13,0800000007FFFFFFFh + mov r15,07FFFFFFF7FFFFFFFh + +$L$oop_31:: + cmp r8,r10 + mov rax,r8 + mov rbx,r10 + mov rbp,rcx + mov r14,r13 + cmovb r8,r10 + cmovb r10,rax + cmovb rcx,r13 + cmovb r13,rbp + + sub r8,r10 + sub rcx,r13 + add rcx,r15 + + test rax,1 + cmovz r8,rax + cmovz r10,rbx + cmovz rcx,rbp + cmovz r13,r14 + + shr r8,1 + add r13,r13 + sub r13,r15 + sub edi,1 + jnz $L$oop_31 + + shr r15,32 + mov edx,ecx + mov r12d,r13d + shr rcx,32 + shr r13,32 + sub rdx,r15 + sub rcx,r15 + sub r12,r15 + sub r13,r15 + + DB 0F3h,0C3h ;repret +__inner_loop_31 ENDP + + +ALIGN 32 +__inner_loop_62 PROC PRIVATE + DB 243,15,30,250 + mov rdx,1 + xor rcx,rcx + xor r12,r12 + mov r13,1 + +$L$oop_62:: + xor rax,rax + test r8,1 + mov rbx,r10 + cmovnz rax,r10 + sub rbx,r8 + mov rbp,r8 + sub r8,rax + cmovc r8,rbx + cmovc r10,rbp + mov rax,rdx + cmovc rdx,r12 + cmovc r12,rax + mov rbx,rcx + cmovc rcx,r13 + cmovc r13,rbx + xor rax,rax + xor rbx,rbx + shr r8,1 + test rbp,1 + cmovnz rax,r12 + cmovnz rbx,r13 + add r12,r12 + add r13,r13 + sub rdx,rax + sub rcx,rbx + sub edi,1 + jnz $L$oop_62 + + DB 0F3h,0C3h ;repret +__inner_loop_62 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_ctx_inverse_mod_383 + DD imagerel $L$SEH_body_ctx_inverse_mod_383 + DD imagerel $L$SEH_info_ctx_inverse_mod_383_prologue + + DD imagerel $L$SEH_body_ctx_inverse_mod_383 + DD imagerel $L$SEH_epilogue_ctx_inverse_mod_383 + DD imagerel $L$SEH_info_ctx_inverse_mod_383_body + + DD imagerel $L$SEH_epilogue_ctx_inverse_mod_383 + DD imagerel $L$SEH_end_ctx_inverse_mod_383 + DD imagerel $L$SEH_info_ctx_inverse_mod_383_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_ctx_inverse_mod_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_ctx_inverse_mod_383_body:: +DB 1,0,18,0 +DB 000h,0f4h,08bh,000h +DB 000h,0e4h,08ch,000h +DB 000h,0d4h,08dh,000h +DB 000h,0c4h,08eh,000h +DB 000h,034h,08fh,000h +DB 000h,054h,090h,000h +DB 000h,074h,092h,000h +DB 000h,064h,093h,000h +DB 000h,001h,091h,000h +$L$SEH_info_ctx_inverse_mod_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/div3w-armv8.asm b/temp-deps/blst-ts/deps/blst/build/win64/div3w-armv8.asm new file mode 100644 index 0000000..7114ccf --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/div3w-armv8.asm @@ -0,0 +1,89 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + EXPORT |div_3_limbs|[FUNC] + ALIGN 32 +|div_3_limbs| PROC + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +|$Loop| + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csello x4,x4,x6 + extr x1,x2,x1,#1 // D >>= 1 + csello x5,x5,x7 + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,|$Loop| + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add 
x0,x0,#1 // Q + speculative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret + ENDP + + EXPORT |quot_rem_128|[FUNC] + ALIGN 32 +|quot_rem_128| PROC + ldp x3,x4,[x1] + + mul x5,x3,x2 // divisor[0:1] * quotient + umulh x6,x3,x2 + mul x11, x4,x2 + umulh x7,x4,x2 + + ldp x8,x9,[x0] // load 3 limbs of the dividend + ldr x10,[x0,#16] + + adds x6,x6,x11 + adc x7,x7,xzr + + subs x8,x8,x5 // dividend - divisor * quotient + sbcs x9,x9,x6 + sbcs x10,x10,x7 + sbc x5,xzr,xzr // borrow -> mask + + add x2,x2,x5 // if borrowed, adjust the quotient ... + and x3,x3,x5 + and x4,x4,x5 + adds x8,x8,x3 // ... and add divisor + adc x9,x9,x4 + + stp x8,x9,[x0] // save 2 limbs of the remainder + str x2,[x0,#16] // and one limb of the quotient + + mov x0,x2 // return adjusted quotient + + ret + ENDP + + + EXPORT |quot_rem_64|[FUNC] + ALIGN 32 +|quot_rem_64| PROC + ldr x3,[x1] + ldr x8,[x0] // load 1 limb of the dividend + + mul x5,x3,x2 // divisor * quotient + + sub x8,x8,x5 // dividend - divisor * quotient + + stp x8,x2,[x0] // save remainder and quotient + + mov x0,x2 // return quotient + + ret + ENDP + END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/div3w-x86_64.asm b/temp-deps/blst-ts/deps/blst/build/win64/div3w-x86_64.asm new file mode 100644 index 0000000..c35f426 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/div3w-x86_64.asm @@ -0,0 +1,152 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC div_3_limbs + + +ALIGN 32 +div_3_limbs PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_div_3_limbs:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + xor rax,rax + mov ecx,64 + +$L$oop:: + mov r10,r8 + sub r8,rsi + mov r11,r9 + sbb r9,rdx + lea rax,QWORD PTR[1+rax*1+rax] + mov rdi,rdx + cmovc r8,r10 + cmovc r9,r11 + sbb rax,0 + shl rdi,63 + shr rsi,1 + shr rdx,1 + or rsi,rdi + sub ecx,1 + jnz $L$oop + + lea rcx,QWORD PTR[1+rax*1+rax] + sar rax,63 + + sub r8,rsi + sbb r9,rdx + sbb rcx,0 + + or rax,rcx + + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_div_3_limbs:: +div_3_limbs ENDP +PUBLIC quot_rem_128 + + +ALIGN 32 +quot_rem_128 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_quot_rem_128:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + mov rax,rdx + mov rcx,rdx + + mul QWORD PTR[rsi] + mov r8,rax + mov rax,rcx + mov r9,rdx + + mul QWORD PTR[8+rsi] + add r9,rax + adc rdx,0 + + mov r10,QWORD PTR[rdi] + mov r11,QWORD PTR[8+rdi] + mov rax,QWORD PTR[16+rdi] + + sub r10,r8 + sbb r11,r9 + sbb rax,rdx + sbb r8,r8 + + add rcx,r8 + mov r9,r8 + and r8,QWORD PTR[rsi] + and r9,QWORD PTR[8+rsi] + add r10,r8 + adc r11,r9 + + mov QWORD PTR[rdi],r10 + mov QWORD PTR[8+rdi],r11 + mov QWORD PTR[16+rdi],rcx + + mov rax,rcx + + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_quot_rem_128:: +quot_rem_128 ENDP + + + + + +PUBLIC quot_rem_64 + + +ALIGN 32 +quot_rem_64 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_quot_rem_64:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + mov rax,rdx + imul rdx,QWORD PTR[rsi] + + mov r10,QWORD PTR[rdi] + + sub r10,rdx + + mov QWORD PTR[rdi],r10 + mov QWORD PTR[8+rdi],rax + + mov rdi,QWORD PTR[8+rsp] ;WIN64 
epilogue + mov rsi,QWORD PTR[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_quot_rem_64:: +quot_rem_64 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 + +.xdata ENDS +END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/dll.c b/temp-deps/blst-ts/deps/blst/build/win64/dll.c new file mode 100644 index 0000000..a70d0c9 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/dll.c @@ -0,0 +1,32 @@ +#include <windows.h> + +#if defined(_MSC_VER) +/* + * Even though we don't have memcpy/memset anywhere, MSVC compiler + * generates calls to them as it recognizes corresponding patterns. + */ +void *memcpy(unsigned char *dst, const unsigned char *src, size_t n) +{ + void *ret = dst; + + while(n--) + *dst++ = *src++; + + return ret; +} + +void *memset(unsigned char *dst, int c, size_t n) +{ + void *ret = dst; + + while(n--) + *dst++ = (unsigned char)c; + + return ret; +} +#elif defined(__GNUC__) +# pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) +{ return TRUE; } diff --git a/temp-deps/blst-ts/deps/blst/build/win64/mul_mont_256-armv8.asm b/temp-deps/blst-ts/deps/blst/build/win64/mul_mont_256-armv8.asm new file mode 100644 index 0000000..bb2dfe0 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/mul_mont_256-armv8.asm @@ -0,0 +1,465 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + + EXPORT |mul_mont_sparse_256|[FUNC] + ALIGN 32 +|mul_mont_sparse_256| PROC + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs 
x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csello x19,x19,x14 + csello x20,x20,x15 + csello x21,x21,x16 + csello x22,x22,x17 + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret + ENDP + + + EXPORT |sqr_mont_sparse_256|[FUNC] + ALIGN 32 +|sqr_mont_sparse_256| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. 
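+ // [editor's sketch, not part of the generated file] In formula form the
+ // diagram above computes, for a = a0 + a1*2^64 + a2*2^128 + a3*2^192:
+ //   a^2 = sum_i a_i^2 * 2^(128*i) + 2 * sum_{i>j} a_i*a_j * 2^(64*(i+j))
+ // i.e. the cross products a[i]*a[j] are accumulated once, doubled (the "*2"
+ // row above), and the squares a[i]*a[i] are then added on the diagonal.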
+ + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csello x10,x10,x14 + csello x11,x11,x15 + csello x12,x12,x16 + csello x13,x13,x17 + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + EXPORT |from_mont_256|[FUNC] + ALIGN 32 +|from_mont_256| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csello x10,x10,x14 + csello x11,x11,x15 + csello x12,x12,x16 + csello x13,x13,x17 + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |redc_mont_256|[FUNC] + ALIGN 32 +|redc_mont_256| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-16]! 
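+ // [editor's note, not part of the generated file] redc_mont_256 performs a
+ // full Montgomery reduction of a 512-bit input: __mul_by_1_mont_256 below
+ // eliminates the low 256 bits limb by limb, the upper 256 bits are then
+ // added in, and a final conditional subtraction keeps the result below the
+ // modulus.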
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csello x10,x10,x14 + csello x11,x11,x15 + csello x12,x12,x16 + csello x13,x13,x17 + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_by_1_mont_256| PROC + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret + ENDP + END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/mul_mont_384-armv8.asm b/temp-deps/blst-ts/deps/blst/build/win64/mul_mont_384-armv8.asm new file mode 100644 index 0000000..a309dfa --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/mul_mont_384-armv8.asm @@ -0,0 +1,2373 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + EXPORT |add_mod_384x384|[FUNC] + ALIGN 32 +|add_mod_384x384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__add_mod_384x384| PROC + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csello x11,x11,x19 + csello x12,x12,x20 + csello x13,x13,x21 + csello x14,x14,x22 + stp x11,x12,[x0,#48] + csello x15,x15,x23 + stp x13,x14,[x0,#64] + csello x16,x16,x24 + stp x15,x16,[x0,#80] + + ret + ENDP + + + EXPORT |sub_mod_384x384|[FUNC] + ALIGN 32 +|sub_mod_384x384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__sub_mod_384x384| PROC + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret + ENDP + + + ALIGN 32 +|__add_mod_384| PROC + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csello x11,x11,x19 + csello x12,x12,x20 + csello x13,x13,x21 + csello x14,x14,x22 + csello x15,x15,x23 + stp x11,x12,[x0] + csello x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + ENDP + + + ALIGN 32 +|__sub_mod_384| PROC + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs 
x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + ENDP + + + + EXPORT |mul_mont_384x|[FUNC] + ALIGN 32 +|mul_mont_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sqr_mont_384x|[FUNC] + ALIGN 32 +|sqr_mont_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
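+ // [editor's note, not part of the generated file] sqr_mont_384x squares an
+ // Fp2 element: with t0 = a->re + a->im and t1 = a->re - a->im (computed
+ // below), ret->re = mont_mul(t0, t1) and ret->im = 2*mont_mul(a->re, a->im),
+ // matching (x + y*i)^2 = (x^2 - y^2) + 2*x*y*i.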
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csello x19,x11,x19 + csello x20,x12,x20 + csello x21,x13,x21 + ldp x11,x12,[sp] + csello x22,x14,x22 + ldr x17, [sp,#48] + csello x23,x15,x23 + ldp x13,x14,[sp,#16] + csello x24,x16,x24 + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_mont_384|[FUNC] + ALIGN 32 +|mul_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_mont_384| PROC + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs 
x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh 
x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csello x11,x19,x26 + csello x12,x20,x27 + csello x13,x21,x28 + csello x14,x22,x0 + csello x15,x23,x1 + csello x16,x24,x3 + ret + ENDP + + + + EXPORT |sqr_mont_384|[FUNC] + ALIGN 32 +|sqr_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sqr_n_mul_mont_383|[FUNC] + ALIGN 32 +|sqr_n_mul_mont_383| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
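+ // [editor's note, not part of the generated file] sqr_n_mul_mont_383 runs
+ // the number of Montgomery squarings given by the counter in x2
+ // (|$Loop_sqr_383| below), then finishes with one Montgomery multiplication
+ // by the operand whose pointer arrives in x5, a pattern typically used in
+ // exponentiation ladders.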
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +|$Loop_sqr_383| + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,|$Loop_sqr_383| + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + ALIGN 32 +|__sqr_384| PROC + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret + ENDP + + + EXPORT |sqr_384|[FUNC] + ALIGN 32 +|sqr_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |redc_mont_384|[FUNC] + ALIGN 32 +|redc_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |from_mont_384|[FUNC] + ALIGN 32 +|from_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csello x11,x11,x19 + csello x12,x12,x20 + csello x13,x13,x21 + csello x14,x14,x22 + csello x15,x15,x23 + csello x16,x16,x24 + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_by_1_mont_384| PROC + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul 
x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret + ENDP + + + ALIGN 32 +|__redc_tail_mont_384| PROC + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csello x11,x11,x19 + csello x12,x12,x20 + csello x13,x13,x21 + csello x14,x14,x22 + csello x15,x15,x23 + csello x16,x16,x24 + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + ENDP + + + + EXPORT |mul_384|[FUNC] + ALIGN 32 +|mul_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_384| PROC + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret + ENDP + + + + EXPORT |mul_382x|[FUNC] + ALIGN 32 +|mul_382x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sqr_382x|[FUNC] + ALIGN 32 +|sqr_382x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 
3573752767 + ret + ENDP + + + + EXPORT |sqr_mont_382x|[FUNC] + ALIGN 32 +|sqr_mont_382x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_mont_383_nonred| PROC + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs 
x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds 
x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret + ENDP + + + + EXPORT |sgn0_pty_mont_384|[FUNC] + ALIGN 32 +|sgn0_pty_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sgn0_pty_mont_384x|[FUNC] + ALIGN 32 +|sgn0_pty_mont_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + cseleq x3,x0,x2 + + cmp x1,#0 + cselne x1,x0,x2 + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/mulq_mont_256-x86_64.asm b/temp-deps/blst-ts/deps/blst/build/win64/mulq_mont_256-x86_64.asm new file mode 100644 index 0000000..c3bf863 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/mulq_mont_256-x86_64.asm @@ -0,0 +1,884 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC mul_mont_sparse_256 + + +ALIGN 32 +mul_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_mont_sparse_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_mul_mont_sparse_256:: + + + mov rax,QWORD PTR[rdx] + mov r13,QWORD PTR[rsi] + mov r14,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov rbp,QWORD PTR[24+rsi] + mov rbx,rdx + + mov r15,rax + mul r13 + mov r9,rax + mov rax,r15 + mov r10,rdx + call __mulq_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_mont_sparse_256:: +mul_mont_sparse_256 ENDP + +PUBLIC sqr_mont_sparse_256 + + +ALIGN 32 +sqr_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_sparse_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_sqr_mont_sparse_256:: + + + mov rax,QWORD PTR[rsi] + mov r8,rcx + mov r14,QWORD PTR[8+rsi] + mov rcx,rdx + mov r12,QWORD PTR[16+rsi] + lea rbx,QWORD PTR[rsi] + mov rbp,QWORD PTR[24+rsi] + + mov r15,rax + mul rax + mov r9,rax + mov rax,r15 + mov r10,rdx + call __mulq_mont_sparse_256 + + mov r15,QWORD 
PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqr_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_sparse_256:: +sqr_mont_sparse_256 ENDP + +ALIGN 32 +__mulq_mont_sparse_256 PROC PRIVATE + DB 243,15,30,250 + mul r14 + add r10,rax + mov rax,r15 + adc rdx,0 + mov r11,rdx + + mul r12 + add r11,rax + mov rax,r15 + adc rdx,0 + mov r12,rdx + + mul rbp + add r12,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + xor r14,r14 + mov r13,rdx + + mov rdi,r9 + imul r9,r8 + + + mov r15,rax + mul QWORD PTR[rsi] + add r10,rax + mov rax,r15 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[8+rsi] + add r11,rax + mov rax,r15 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rsi] + add r12,rax + mov rax,r15 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rsi] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,rbp + adc r14,rdx + xor r15,r15 + + + mul QWORD PTR[rcx] + add rdi,rax + mov rax,r9 + adc rdi,rdx + + mul QWORD PTR[8+rcx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,rdi + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r12,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r12,rbp + adc rdx,0 + add r13,rdx + adc r14,0 + adc r15,0 + mov rdi,r10 + imul r10,r8 + + + mov r9,rax + mul QWORD PTR[rsi] + add r11,rax + mov rax,r9 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[8+rsi] + add r12,rax + mov rax,r9 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rsi] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rsi] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,rbp + adc r15,rdx + xor r9,r9 + + + mul QWORD PTR[rcx] + add rdi,rax + mov rax,r10 + adc rdi,rdx + + mul QWORD PTR[8+rcx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,rdi + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r13,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r13,rbp + adc rdx,0 + add r14,rdx + adc r15,0 + adc r9,0 + mov rdi,r11 + imul r11,r8 + + + mov r10,rax + mul QWORD PTR[rsi] + add r12,rax + mov rax,r10 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[8+rsi] + add r13,rax + mov rax,r10 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rsi] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rsi] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,rbp + adc r9,rdx + xor r10,r10 + + + mul QWORD PTR[rcx] + add rdi,rax + mov rax,r11 + adc rdi,rdx + + mul QWORD PTR[8+rcx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,rdi + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,rbp + adc rdx,0 + add r15,rdx + adc r9,0 + adc r10,0 + imul rax,r8 + mov rsi,QWORD PTR[8+rsp] + + + mov r11,rax + mul QWORD PTR[rcx] + add r12,rax + mov rax,r11 + adc r12,rdx + + mul QWORD PTR[8+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r12 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r14,rax + mov rax,r11 + adc rdx,0 + add r14,rbp + adc 
rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + mov rbx,r14 + add r15,rbp + adc rdx,0 + add r15,rax + mov rax,r13 + adc rdx,0 + add r9,rdx + adc r10,0 + + + + + mov r12,r15 + sub r13,QWORD PTR[rcx] + sbb r14,QWORD PTR[8+rcx] + sbb r15,QWORD PTR[16+rcx] + mov rbp,r9 + sbb r9,QWORD PTR[24+rcx] + sbb r10,0 + + cmovc r13,rax + cmovc r14,rbx + cmovc r15,r12 + mov QWORD PTR[rsi],r13 + cmovc r9,rbp + mov QWORD PTR[8+rsi],r14 + mov QWORD PTR[16+rsi],r15 + mov QWORD PTR[24+rsi],r9 + + DB 0F3h,0C3h ;repret + +__mulq_mont_sparse_256 ENDP +PUBLIC from_mont_256 + + +ALIGN 32 +from_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_from_mont_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_from_mont_256:: + + + mov rbx,rdx + call __mulq_by_1_mont_256 + + + + + + mov r10,r14 + mov r11,r15 + mov r12,r9 + + sub r13,QWORD PTR[rbx] + sbb r14,QWORD PTR[8+rbx] + sbb r15,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + + cmovnc rax,r13 + cmovnc r10,r14 + cmovnc r11,r15 + mov QWORD PTR[rdi],rax + cmovnc r12,r9 + mov QWORD PTR[8+rdi],r10 + mov QWORD PTR[16+rdi],r11 + mov QWORD PTR[24+rdi],r12 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_from_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_from_mont_256:: +from_mont_256 ENDP + +PUBLIC redc_mont_256 + + +ALIGN 32 +redc_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redc_mont_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redc_mont_256:: + + + mov rbx,rdx + call __mulq_by_1_mont_256 + + add r13,QWORD PTR[32+rsi] + adc r14,QWORD PTR[40+rsi] + mov rax,r13 + adc r15,QWORD PTR[48+rsi] + mov r10,r14 + adc r9,QWORD PTR[56+rsi] + sbb rsi,rsi + + + + + mov r11,r15 + sub r13,QWORD PTR[rbx] + sbb r14,QWORD PTR[8+rbx] + sbb r15,QWORD PTR[16+rbx] + mov r12,r9 + sbb r9,QWORD PTR[24+rbx] + sbb rsi,0 + + cmovnc rax,r13 + cmovnc r10,r14 + cmovnc r11,r15 + mov QWORD PTR[rdi],rax + cmovnc r12,r9 + mov QWORD PTR[8+rdi],r10 + mov QWORD PTR[16+rdi],r11 + mov QWORD PTR[24+rdi],r12 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redc_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redc_mont_256:: +redc_mont_256 ENDP + +ALIGN 32 +__mulq_by_1_mont_256 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rsi] + mov r10,QWORD PTR[8+rsi] + mov r11,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + + mov r13,rax + imul rax,rcx + mov r9,rax + + mul QWORD PTR[rbx] + add r13,rax + mov rax,r9 + adc r13,rdx + + mul QWORD PTR[8+rbx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,r13 + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[16+rbx] + mov r14,r10 + imul r10,rcx + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,r13 + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[24+rbx] + add r12,rax + mov 
rax,r10 + adc rdx,0 + add r12,r13 + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[rbx] + add r14,rax + mov rax,r10 + adc r14,rdx + + mul QWORD PTR[8+rbx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[16+rbx] + mov r15,r11 + imul r11,rcx + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[24+rbx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rbx] + add r15,rax + mov rax,r11 + adc r15,rdx + + mul QWORD PTR[8+rbx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rbx] + mov r9,r12 + imul r12,rcx + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rbx] + add r9,rax + mov rax,r12 + adc r9,rdx + + mul QWORD PTR[8+rbx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r9 + adc rdx,0 + mov r9,rdx + DB 0F3h,0C3h ;repret +__mulq_by_1_mont_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mul_mont_sparse_256 + DD imagerel $L$SEH_body_mul_mont_sparse_256 + DD imagerel $L$SEH_info_mul_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_mul_mont_sparse_256 + DD imagerel $L$SEH_epilogue_mul_mont_sparse_256 + DD imagerel $L$SEH_info_mul_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_mul_mont_sparse_256 + DD imagerel $L$SEH_end_mul_mont_sparse_256 + DD imagerel $L$SEH_info_mul_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_sparse_256 + DD imagerel $L$SEH_body_sqr_mont_sparse_256 + DD imagerel $L$SEH_info_sqr_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_sqr_mont_sparse_256 + DD imagerel $L$SEH_epilogue_sqr_mont_sparse_256 + DD imagerel $L$SEH_info_sqr_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_sqr_mont_sparse_256 + DD imagerel $L$SEH_end_sqr_mont_sparse_256 + DD imagerel $L$SEH_info_sqr_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_from_mont_256 + DD imagerel $L$SEH_body_from_mont_256 + DD imagerel $L$SEH_info_from_mont_256_prologue + + DD imagerel $L$SEH_body_from_mont_256 + DD imagerel $L$SEH_epilogue_from_mont_256 + DD imagerel $L$SEH_info_from_mont_256_body + + DD imagerel $L$SEH_epilogue_from_mont_256 + DD imagerel $L$SEH_end_from_mont_256 + DD imagerel $L$SEH_info_from_mont_256_epilogue + + DD imagerel $L$SEH_begin_redc_mont_256 + DD imagerel $L$SEH_body_redc_mont_256 + DD imagerel $L$SEH_info_redc_mont_256_prologue + + DD imagerel $L$SEH_body_redc_mont_256 + DD imagerel $L$SEH_epilogue_redc_mont_256 + DD imagerel $L$SEH_info_redc_mont_256_body + + DD imagerel $L$SEH_epilogue_redc_mont_256 + DD imagerel $L$SEH_end_redc_mont_256 + DD imagerel $L$SEH_info_redc_mont_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mul_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 
000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqr_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_from_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_from_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_from_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redc_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_redc_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_redc_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/mulq_mont_384-x86_64.asm b/temp-deps/blst-ts/deps/blst/build/win64/mulq_mont_384-x86_64.asm new file mode 100644 index 0000000..0ccb467 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/mulq_mont_384-x86_64.asm @@ -0,0 +1,4233 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + + + + + + + + +ALIGN 32 +__sub_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + sub r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + sbb r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + sbb r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + sbb r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + sbb r14,QWORD PTR[48+rdx] + mov r8,QWORD PTR[rcx] + mov QWORD PTR[8+rdi],r9 + sbb r15,QWORD PTR[56+rdx] + mov r9,QWORD PTR[8+rcx] + mov QWORD PTR[16+rdi],r10 + sbb rax,QWORD PTR[64+rdx] + mov r10,QWORD PTR[16+rcx] + mov QWORD PTR[24+rdi],r11 + sbb rbx,QWORD PTR[72+rdx] + mov r11,QWORD PTR[24+rcx] + mov QWORD PTR[32+rdi],r12 + sbb rbp,QWORD PTR[80+rdx] + mov r12,QWORD PTR[32+rcx] + mov QWORD PTR[40+rdi],r13 + sbb rsi,QWORD PTR[88+rdx] + mov r13,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r8,rdx + and r9,rdx + and r10,rdx + and r11,rdx + and r12,rdx + and r13,rdx + + add r14,r8 + adc r15,r9 + mov QWORD PTR[48+rdi],r14 + adc rax,r10 + mov QWORD PTR[56+rdi],r15 + adc rbx,r11 + mov QWORD PTR[64+rdi],rax + adc rbp,r12 + mov QWORD PTR[72+rdi],rbx + adc rsi,r13 + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__sub_mod_384x384 ENDP + + +ALIGN 32 +__add_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD 
PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + mov r14,r8 + adc r11,QWORD PTR[24+rdx] + mov r15,r9 + adc r12,QWORD PTR[32+rdx] + mov rax,r10 + adc r13,QWORD PTR[40+rdx] + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[rdi],r8 + cmovc r11,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r12,rbp + mov QWORD PTR[16+rdi],r10 + cmovc r13,rsi + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__add_mod_384 ENDP + + +ALIGN 32 +__sub_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +__sub_mod_384_a_is_loaded:: + sub r8,QWORD PTR[rdx] + mov r14,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rax,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbx,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,QWORD PTR[32+rcx] + sbb r13,QWORD PTR[40+rdx] + mov rsi,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r14,rdx + and r15,rdx + and rax,rdx + and rbx,rdx + and rbp,rdx + and rsi,rdx + + add r8,r14 + adc r9,r15 + mov QWORD PTR[rdi],r8 + adc r10,rax + mov QWORD PTR[8+rdi],r9 + adc r11,rbx + mov QWORD PTR[16+rdi],r10 + adc r12,rbp + mov QWORD PTR[24+rdi],r11 + adc r13,rsi + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__sub_mod_384 ENDP +PUBLIC mul_mont_384x + + +ALIGN 32 +mul_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,328 + +$L$SEH_body_mul_mont_384x:: + + + mov rbx,rdx + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[24+rsp],rsi + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[8+rsp],rcx + mov QWORD PTR[rsp],r8 + + + + + lea rdi,QWORD PTR[40+rsp] + call __mulq_384 + + + lea rbx,QWORD PTR[48+rbx] + lea rsi,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((40+96))+rsp] + call __mulq_384 + + + mov rcx,QWORD PTR[8+rsp] + lea rdx,QWORD PTR[((-48))+rsi] + lea rdi,QWORD PTR[((40+192+48))+rsp] + call __add_mod_384 + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((-48))+rdi] + call __add_mod_384 + + lea rbx,QWORD PTR[rdi] + lea rsi,QWORD PTR[48+rdi] + call __mulq_384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[8+rsp] + call __sub_mod_384x384 + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[40+rsp] + lea rdx,QWORD PTR[((40+96))+rsp] + lea rdi,QWORD PTR[40+rsp] + call __sub_mod_384x384 + + mov rbx,rcx + + + lea rsi,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[rsp] + mov rdi,QWORD PTR[32+rsp] + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + lea rsi,QWORD PTR[((40+192))+rsp] + mov rcx,QWORD PTR[rsp] + lea rdi,QWORD PTR[48+rdi] + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea r8,QWORD PTR[328+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov 
r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mul_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_mont_384x:: +mul_mont_384x ENDP +PUBLIC sqr_mont_384x + + +ALIGN 32 +sqr_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqr_mont_384x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rsi + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[32+rsp] + call __add_mod_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((32+48))+rsp] + call __sub_mod_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rbx,QWORD PTR[48+rsi] + + mov rax,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + + call __mulq_mont_384 + add r14,r14 + adc r15,r15 + adc r8,r8 + mov r12,r14 + adc r9,r9 + mov r13,r15 + adc r10,r10 + mov rax,r8 + adc r11,r11 + mov rbx,r9 + sbb rdx,rdx + + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov rbp,r10 + sbb r8,QWORD PTR[16+rcx] + sbb r9,QWORD PTR[24+rcx] + sbb r10,QWORD PTR[32+rcx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r14,r12 + cmovc r15,r13 + cmovc r8,rax + mov QWORD PTR[48+rdi],r14 + cmovc r9,rbx + mov QWORD PTR[56+rdi],r15 + cmovc r10,rbp + mov QWORD PTR[64+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r10 + mov QWORD PTR[88+rdi],r11 + + lea rsi,QWORD PTR[32+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rax,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov r12,QWORD PTR[((32+16))+rsp] + mov r13,QWORD PTR[((32+24))+rsp] + + call __mulq_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_384x:: +sqr_mont_384x ENDP + +PUBLIC mul_382x + + +ALIGN 32 +mul_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_mul_382x:: + + + lea rdi,QWORD PTR[96+rdi] + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rcx + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[48+rsi] + adc r9,QWORD PTR[56+rsi] + adc r10,QWORD PTR[64+rsi] + adc r11,QWORD PTR[72+rsi] + adc r12,QWORD PTR[80+rsi] + adc r13,QWORD PTR[88+rsi] + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + + mov r8,QWORD PTR[rdx] + mov 
r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + mov r11,QWORD PTR[24+rdx] + mov r12,QWORD PTR[32+rdx] + mov r13,QWORD PTR[40+rdx] + + add r8,QWORD PTR[48+rdx] + adc r9,QWORD PTR[56+rdx] + adc r10,QWORD PTR[64+rdx] + adc r11,QWORD PTR[72+rdx] + adc r12,QWORD PTR[80+rdx] + adc r13,QWORD PTR[88+rdx] + + mov QWORD PTR[((32+48))+rsp],r8 + mov QWORD PTR[((32+56))+rsp],r9 + mov QWORD PTR[((32+64))+rsp],r10 + mov QWORD PTR[((32+72))+rsp],r11 + mov QWORD PTR[((32+80))+rsp],r12 + mov QWORD PTR[((32+88))+rsp],r13 + + + lea rsi,QWORD PTR[((32+0))+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + call __mulq_384 + + + mov rsi,QWORD PTR[rsp] + mov rbx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __mulq_384 + + + lea rsi,QWORD PTR[48+rsi] + lea rbx,QWORD PTR[48+rbx] + lea rdi,QWORD PTR[32+rsp] + call __mulq_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[32+rsp] + mov rcx,QWORD PTR[24+rsp] + mov rdi,rsi + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[((-96))+rdi] + lea rdx,QWORD PTR[32+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mul_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_382x:: +mul_382x ENDP +PUBLIC sqr_382x + + +ALIGN 32 +sqr_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_sqr_382x:: + + + mov rcx,rdx + + + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,QWORD PTR[24+rsi] + mov rbp,QWORD PTR[32+rsi] + mov rdx,QWORD PTR[40+rsi] + + mov r8,r14 + add r14,QWORD PTR[48+rsi] + mov r9,r15 + adc r15,QWORD PTR[56+rsi] + mov r10,rax + adc rax,QWORD PTR[64+rsi] + mov r11,rbx + adc rbx,QWORD PTR[72+rsi] + mov r12,rbp + adc rbp,QWORD PTR[80+rsi] + mov r13,rdx + adc rdx,QWORD PTR[88+rsi] + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],rax + mov QWORD PTR[24+rdi],rbx + mov QWORD PTR[32+rdi],rbp + mov QWORD PTR[40+rdi],rdx + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[48+rdi] + call __sub_mod_384_a_is_loaded + + + lea rsi,QWORD PTR[rdi] + lea rbx,QWORD PTR[((-48))+rdi] + lea rdi,QWORD PTR[((-48))+rdi] + call __mulq_384 + + + mov rsi,QWORD PTR[rsp] + lea rbx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[96+rdi] + call __mulq_384 + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov r12,QWORD PTR[32+rdi] + mov r13,QWORD PTR[40+rdi] + mov r14,QWORD PTR[48+rdi] + mov r15,QWORD PTR[56+rdi] + mov rax,QWORD PTR[64+rdi] + mov rbx,QWORD PTR[72+rdi] + mov rbp,QWORD PTR[80+rdi] + add r8,r8 + mov rdx,QWORD PTR[88+rdi] + adc r9,r9 + mov QWORD PTR[rdi],r8 + adc r10,r10 + mov QWORD PTR[8+rdi],r9 + adc r11,r11 + mov QWORD PTR[16+rdi],r10 + adc r12,r12 + mov QWORD PTR[24+rdi],r11 + adc r13,r13 + mov QWORD PTR[32+rdi],r12 + adc r14,r14 + mov QWORD PTR[40+rdi],r13 + adc r15,r15 + mov QWORD PTR[48+rdi],r14 + adc rax,rax + mov QWORD PTR[56+rdi],r15 + adc rbx,rbx + mov QWORD PTR[64+rdi],rax + adc rbp,rbp + mov QWORD 
PTR[72+rdi],rbx + adc rdx,rdx + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rdx + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqr_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_382x:: +sqr_382x ENDP +PUBLIC mul_384 + + +ALIGN 32 +mul_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + +$L$SEH_body_mul_384:: + + + mov rbx,rdx + call __mulq_384 + + mov r12,QWORD PTR[rsp] + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_mul_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_384:: +mul_384 ENDP + + +ALIGN 32 +__mulq_384 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rbx] + + mov rbp,rax + mul QWORD PTR[rsi] + mov QWORD PTR[rdi],rax + mov rax,rbp + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r11,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[8+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[16+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[24+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[32+rbx] + adc 
rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[32+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[40+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[40+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,rax + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov QWORD PTR[48+rdi],rcx + mov QWORD PTR[56+rdi],r8 + mov QWORD PTR[64+rdi],r9 + mov QWORD PTR[72+rdi],r10 + mov QWORD PTR[80+rdi],r11 + mov QWORD PTR[88+rdi],r12 + + DB 0F3h,0C3h ;repret +__mulq_384 ENDP +PUBLIC sqr_384 + + +ALIGN 32 +sqr_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_384:: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sqr_384:: + + + call __sqrq_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqr_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_384:: +sqr_384 ENDP + + +ALIGN 32 +__sqrq_384 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rcx,QWORD PTR[16+rsi] + mov rbx,QWORD PTR[24+rsi] + + + mov r14,rax + mul r15 + mov r9,rax + mov rax,r14 + mov rbp,QWORD PTR[32+rsi] + mov r10,rdx + + mul rcx + add r10,rax + mov rax,r14 + adc rdx,0 + mov rsi,QWORD PTR[40+rsi] + mov r11,rdx + + mul rbx + add r11,rax + mov rax,r14 + adc rdx,0 + mov r12,rdx + + mul rbp + add r12,rax + mov rax,r14 + adc rdx,0 + mov r13,rdx + + mul rsi + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + + mul rax + xor r8,r8 + mov QWORD PTR[rdi],rax + mov rax,r15 + add r9,r9 + adc r8,0 + add r9,rdx + adc r8,0 + mov QWORD PTR[8+rdi],r9 + + mul rcx + add r11,rax + mov rax,r15 + adc rdx,0 + mov r9,rdx + + mul rbx + add r12,rax + mov rax,r15 + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul rbp + add r13,rax + mov rax,r15 + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul rsi + add r14,rax + mov rax,r15 + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r15,rdx + + mul rax + xor r9,r9 + add r8,rax + mov rax,rcx + add r10,r10 + adc r11,r11 + adc r9,0 + add r10,r8 + adc r11,rdx + adc r9,0 + mov QWORD PTR[16+rdi],r10 + + mul rbx + add r13,rax + mov rax,rcx + adc rdx,0 + mov QWORD PTR[24+rdi],r11 + mov r8,rdx + + mul rbp 
+ add r14,rax + mov rax,rcx + adc rdx,0 + add r14,r8 + adc rdx,0 + mov r8,rdx + + mul rsi + add r15,rax + mov rax,rcx + adc rdx,0 + add r15,r8 + adc rdx,0 + mov rcx,rdx + + mul rax + xor r11,r11 + add r9,rax + mov rax,rbx + add r12,r12 + adc r13,r13 + adc r11,0 + add r12,r9 + adc r13,rdx + adc r11,0 + mov QWORD PTR[32+rdi],r12 + + + mul rbp + add r15,rax + mov rax,rbx + adc rdx,0 + mov QWORD PTR[40+rdi],r13 + mov r8,rdx + + mul rsi + add rcx,rax + mov rax,rbx + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov rbx,rdx + + mul rax + xor r12,r12 + add r11,rax + mov rax,rbp + add r14,r14 + adc r15,r15 + adc r12,0 + add r14,r11 + adc r15,rdx + mov QWORD PTR[48+rdi],r14 + adc r12,0 + mov QWORD PTR[56+rdi],r15 + + + mul rsi + add rbx,rax + mov rax,rbp + adc rdx,0 + mov rbp,rdx + + mul rax + xor r13,r13 + add r12,rax + mov rax,rsi + add rcx,rcx + adc rbx,rbx + adc r13,0 + add rcx,r12 + adc rbx,rdx + mov QWORD PTR[64+rdi],rcx + adc r13,0 + mov QWORD PTR[72+rdi],rbx + + + mul rax + add rax,r13 + add rbp,rbp + adc rdx,0 + add rax,rbp + adc rdx,0 + mov QWORD PTR[80+rdi],rax + mov QWORD PTR[88+rdi],rdx + + DB 0F3h,0C3h ;repret +__sqrq_384 ENDP + +PUBLIC sqr_mont_384 + + +ALIGN 32 +sqr_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*15 + +$L$SEH_body_sqr_mont_384:: + + + mov QWORD PTR[96+rsp],rcx + mov QWORD PTR[104+rsp],rdx + mov QWORD PTR[112+rsp],rdi + + mov rdi,rsp + call __sqrq_384 + + lea rsi,QWORD PTR[rsp] + mov rcx,QWORD PTR[96+rsp] + mov rbx,QWORD PTR[104+rsp] + mov rdi,QWORD PTR[112+rsp] + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea r8,QWORD PTR[120+rsp] + mov r15,QWORD PTR[120+rsp] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_384:: +sqr_mont_384 ENDP + + + +PUBLIC redc_mont_384 + + +ALIGN 32 +redc_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redc_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redc_mont_384:: + + + mov rbx,rdx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redc_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redc_mont_384:: +redc_mont_384 ENDP + + + + +PUBLIC from_mont_384 + + +ALIGN 32 +from_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_from_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_from_mont_384:: + + + mov rbx,rdx + call __mulq_by_1_mont_384 + + + + + + mov rcx,r15 + mov rdx,r8 + mov rbp,r9 + 
+ sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_from_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_from_mont_384:: +from_mont_384 ENDP + +ALIGN 32 +__mulq_by_1_mont_384 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,rax + imul rax,rcx + mov r8,rax + + mul QWORD PTR[rbx] + add r14,rax + mov rax,r8 + adc r14,rdx + + mul QWORD PTR[8+rbx] + add r9,rax + mov rax,r8 + adc rdx,0 + add r9,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[16+rbx] + add r10,rax + mov rax,r8 + adc rdx,0 + add r10,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[24+rbx] + add r11,rax + mov rax,r8 + adc rdx,0 + mov r15,r9 + imul r9,rcx + add r11,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[32+rbx] + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[40+rbx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rbx] + add r15,rax + mov rax,r9 + adc r15,rdx + + mul QWORD PTR[8+rbx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rbx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rbx] + add r12,rax + mov rax,r9 + adc rdx,0 + mov r8,r10 + imul r10,rcx + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[32+rbx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[40+rbx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rbx] + add r8,rax + mov rax,r10 + adc r8,rdx + + mul QWORD PTR[8+rbx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rbx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[24+rbx] + add r13,rax + mov rax,r10 + adc rdx,0 + mov r9,r11 + imul r11,rcx + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rbx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rbx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[rbx] + add r9,rax + mov rax,r11 + adc r9,rdx + + mul QWORD PTR[8+rbx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rbx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rbx] + add r14,rax + mov rax,r11 + adc rdx,0 + mov r10,r12 + imul r12,rcx + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rbx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rbx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,r9 + adc rdx,0 + mov 
r9,rdx + + mul QWORD PTR[rbx] + add r10,rax + mov rax,r12 + adc r10,rdx + + mul QWORD PTR[8+rbx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[24+rbx] + add r15,rax + mov rax,r12 + adc rdx,0 + mov r11,r13 + imul r13,rcx + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rbx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rbx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[rbx] + add r11,rax + mov rax,r13 + adc r11,rdx + + mul QWORD PTR[8+rbx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[24+rbx] + add r8,rax + mov rax,r13 + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rbx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rbx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + DB 0F3h,0C3h ;repret +__mulq_by_1_mont_384 ENDP + + +ALIGN 32 +__redc_tail_mont_384 PROC PRIVATE + DB 243,15,30,250 + add r14,QWORD PTR[48+rsi] + mov rax,r14 + adc r15,QWORD PTR[56+rsi] + adc r8,QWORD PTR[64+rsi] + adc r9,QWORD PTR[72+rsi] + mov rcx,r15 + adc r10,QWORD PTR[80+rsi] + adc r11,QWORD PTR[88+rsi] + sbb r12,r12 + + + + + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + sbb r12,0 + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + DB 0F3h,0C3h ;repret +__redc_tail_mont_384 ENDP + +PUBLIC sgn0_pty_mont_384 + + +ALIGN 32 +sgn0_pty_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0_pty_mont_384:: + + + mov rbx,rsi + lea rsi,QWORD PTR[rdi] + mov rcx,rdx + call __mulq_by_1_mont_384 + + xor rax,rax + mov r13,r14 + add r14,r14 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + not rax + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0_pty_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mont_384:: +sgn0_pty_mont_384 ENDP + +PUBLIC sgn0_pty_mont_384x + + +ALIGN 32 +sgn0_pty_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov 
rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0_pty_mont_384x:: + + + mov rbx,rsi + lea rsi,QWORD PTR[48+rdi] + mov rcx,rdx + call __mulq_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + lea rsi,QWORD PTR[rdi] + xor rdi,rdi + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rdi,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rdi,0 + + mov QWORD PTR[rsp],r14 + not rdi + and r13,1 + and rdi,2 + or rdi,r13 + + call __mulq_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + xor rax,rax + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + mov r12,QWORD PTR[rsp] + + not rax + + test r14,r14 + cmovz r13,rdi + + test r12,r12 + cmovnz rax,rdi + + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0_pty_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mont_384x:: +sgn0_pty_mont_384x ENDP +PUBLIC mul_mont_384 + + +ALIGN 32 +mul_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*3 + +$L$SEH_body_mul_mont_384:: + + + mov rax,QWORD PTR[rdx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + mov rbx,rdx + mov QWORD PTR[rsp],r8 + mov QWORD PTR[8+rsp],rdi + + call __mulq_mont_384 + + mov r15,QWORD PTR[24+rsp] + + mov r14,QWORD PTR[32+rsp] + + mov r13,QWORD PTR[40+rsp] + + mov r12,QWORD PTR[48+rsp] + + mov rbx,QWORD PTR[56+rsp] + + mov rbp,QWORD PTR[64+rsp] + + lea rsp,QWORD PTR[72+rsp] + +$L$SEH_epilogue_mul_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_mont_384:: +mul_mont_384 ENDP + +ALIGN 32 +__mulq_mont_384 PROC PRIVATE + DB 243,15,30,250 + mov rdi,rax + mul r14 + mov r8,rax + mov rax,rdi + mov r9,rdx + + mul r15 + add r9,rax + mov rax,rdi + adc rdx,0 + mov r10,rdx + + mul r12 + add r10,rax + mov rax,rdi + adc rdx,0 + mov r11,rdx + + mov rbp,r8 + imul r8,QWORD PTR[8+rsp] + + mul r13 + add r11,rax + mov rax,rdi + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[40+rsi] + add r13,rax + mov rax,r8 + adc rdx,0 + xor r15,r15 + mov r14,rdx + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r8 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r9,rax + mov rax,r8 + adc rdx,0 + add r9,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r10,rax + mov rax,r8 + adc rdx,0 + add r10,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r11,rbp + adc rdx,0 + add r11,rax + 
mov rax,r8 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r13,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + add r13,rbp + adc r14,rdx + adc r15,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r9,rax + mov rax,rdi + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[8+rsi] + add r10,rax + mov rax,rdi + adc rdx,0 + add r10,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r11,rax + mov rax,rdi + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mov rbp,r9 + imul r9,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + add r12,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rsi] + add r14,r8 + adc rdx,0 + xor r8,r8 + add r14,rax + mov rax,r9 + adc r15,rdx + adc r8,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r9 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,r9 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r14,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r14,rbp + adc r15,rdx + adc r8,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r10,rax + mov rax,rdi + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[8+rsi] + add r11,rax + mov rax,rdi + adc rdx,0 + add r11,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mov rbp,r10 + imul r10,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rsi] + add r15,r9 + adc rdx,0 + xor r9,r9 + add r15,rax + mov rax,r10 + adc r8,rdx + adc r9,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r10 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r13,rbp + adc rdx,0 + add r13,rax + mov rax,r10 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r15,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r15,rbp + adc r8,rdx + adc r9,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r11,rax + mov rax,rdi + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[8+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + add r12,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mov rbp,r11 + imul r11,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r15,rax + mov rax,rdi + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rsi] + add r8,r10 + adc rdx,0 + xor r10,r10 + add r8,rax + mov rax,r11 + adc r9,rdx + adc r10,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r11 + adc rbp,rdx + + mul QWORD 
PTR[8+rcx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r14,rbp + adc rdx,0 + add r14,rax + mov rax,r11 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r8,rax + mov rax,QWORD PTR[32+rbx] + adc rdx,0 + add r8,rbp + adc r9,rdx + adc r10,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[8+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mov rbp,r12 + imul r12,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r15,rax + mov rax,rdi + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rsi] + add r8,rax + mov rax,rdi + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r9,r11 + adc rdx,0 + xor r11,r11 + add r9,rax + mov rax,r12 + adc r10,rdx + adc r11,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r12 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r15,rbp + adc rdx,0 + add r15,rax + mov rax,r12 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r9,rax + mov rax,QWORD PTR[40+rbx] + adc rdx,0 + add r9,rbp + adc r10,rdx + adc r11,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[8+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[16+rsi] + add r15,rax + mov rax,rdi + adc rdx,0 + add r15,r12 + adc rdx,0 + mov r12,rdx + + mov rbp,r13 + imul r13,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r8,rax + mov rax,rdi + adc rdx,0 + add r8,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rsi] + add r9,rax + mov rax,rdi + adc rdx,0 + add r9,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[40+rsi] + add r10,r12 + adc rdx,0 + xor r12,r12 + add r10,rax + mov rax,r13 + adc r11,rdx + adc r12,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r13 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r8,rbp + adc rdx,0 + add r8,rax + mov rax,r13 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,rbp + adc r11,rdx + adc r12,0 + + + + + mov rdi,QWORD PTR[16+rsp] + sub r14,QWORD PTR[rcx] + mov rdx,r15 + sbb r15,QWORD PTR[8+rcx] + mov rbx,r8 + sbb r8,QWORD PTR[16+rcx] + mov rsi,r9 + sbb r9,QWORD PTR[24+rcx] + mov rbp,r10 + sbb r10,QWORD PTR[32+rcx] + mov r13,r11 + sbb r11,QWORD PTR[40+rcx] + sbb r12,0 + + cmovc r14,rax + cmovc r15,rdx + cmovc r8,rbx + mov QWORD PTR[rdi],r14 + cmovc r9,rsi + mov QWORD PTR[8+rdi],r15 + cmovc r10,rbp + mov QWORD PTR[16+rdi],r8 + cmovc r11,r13 + mov QWORD PTR[24+rdi],r9 + mov 
QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + DB 0F3h,0C3h ;repret +__mulq_mont_384 ENDP +PUBLIC sqr_n_mul_mont_384 + + +ALIGN 32 +sqr_n_mul_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_n_mul_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*17 + +$L$SEH_body_sqr_n_mul_mont_384:: + + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rcx + lea rdi,QWORD PTR[32+rsp] + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + +$L$oop_sqr_384:: + movd xmm1,edx + + call __sqrq_384 + + lea rsi,QWORD PTR[rdi] + mov rcx,QWORD PTR[rsp] + mov rbx,QWORD PTR[16+rsp] + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd edx,xmm1 + lea rsi,QWORD PTR[rdi] + dec edx + jnz $L$oop_sqr_384 + +DB 102,72,15,126,208 + mov rcx,rbx + mov rbx,QWORD PTR[24+rsp] + + + + + + + mov r12,r8 + mov r13,r9 + + call __mulq_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[136+rsp] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_n_mul_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_n_mul_mont_384:: +sqr_n_mul_mont_384 ENDP + +PUBLIC sqr_n_mul_mont_383 + + +ALIGN 32 +sqr_n_mul_mont_383 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_n_mul_mont_383:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*17 + +$L$SEH_body_sqr_n_mul_mont_383:: + + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rcx + lea rdi,QWORD PTR[32+rsp] + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + +$L$oop_sqr_383:: + movd xmm1,edx + + call __sqrq_384 + + lea rsi,QWORD PTR[rdi] + mov rcx,QWORD PTR[rsp] + mov rbx,QWORD PTR[16+rsp] + call __mulq_by_1_mont_384 + + movd edx,xmm1 + add r14,QWORD PTR[48+rsi] + adc r15,QWORD PTR[56+rsi] + adc r8,QWORD PTR[64+rsi] + adc r9,QWORD PTR[72+rsi] + adc r10,QWORD PTR[80+rsi] + adc r11,QWORD PTR[88+rsi] + lea rsi,QWORD PTR[rdi] + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],r8 + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + dec edx + jnz $L$oop_sqr_383 + +DB 102,72,15,126,208 + mov rcx,rbx + mov rbx,QWORD PTR[24+rsp] + + + + + + + mov r12,r8 + mov r13,r9 + + call __mulq_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[136+rsp] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_n_mul_mont_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_n_mul_mont_383:: +sqr_n_mul_mont_383 ENDP + +ALIGN 32 +__mulq_mont_383_nonred PROC PRIVATE + DB 243,15,30,250 + mov rbp,rax + mul r14 + mov r8,rax + mov rax,rbp + mov r9,rdx + + mul r15 + add r9,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul r12 + add r10,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + 
mov r15,r8 + imul r8,QWORD PTR[8+rsp] + + mul r13 + add r11,rax + mov rax,rbp + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[40+rsi] + add r13,rax + mov rax,r8 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rcx] + add r15,rax + mov rax,r8 + adc r15,rdx + + mul QWORD PTR[8+rcx] + add r9,rax + mov rax,r8 + adc rdx,0 + add r9,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rcx] + add r10,rax + mov rax,r8 + adc rdx,0 + add r10,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rcx] + add r11,r15 + adc rdx,0 + add r11,rax + mov rax,r8 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[32+rcx] + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[40+rcx] + add r13,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + add r13,r15 + adc r14,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[8+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r10,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r11,r15 + adc rdx,0 + mov r15,rdx + + mov r8,r9 + imul r9,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[32+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[40+rsi] + add r14,r15 + adc rdx,0 + add r14,rax + mov rax,r9 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rcx] + add r8,rax + mov rax,r9 + adc r8,rdx + + mul QWORD PTR[8+rcx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rcx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[24+rcx] + add r12,r8 + adc rdx,0 + add r12,rax + mov rax,r9 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rcx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rcx] + add r14,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r14,r8 + adc r15,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[8+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + add r12,r8 + adc rdx,0 + mov r8,rdx + + mov r9,r10 + imul r10,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rsi] + add r15,r8 + adc rdx,0 + add r15,rax + mov rax,r10 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[rcx] + add r9,rax + mov rax,r10 + adc r9,rdx + + mul QWORD PTR[8+rcx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rcx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rcx] + add r13,r9 + adc rdx,0 + add r13,rax + mov rax,r10 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rcx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rcx] + add r15,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r15,r9 + adc r8,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[8+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rsi] + add 
r13,rax + mov rax,rbp + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mov r10,r11 + imul r11,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rsi] + add r15,rax + mov rax,rbp + adc rdx,0 + add r15,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rsi] + add r8,r9 + adc rdx,0 + add r8,rax + mov rax,r11 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[rcx] + add r10,rax + mov rax,r11 + adc r10,rdx + + mul QWORD PTR[8+rcx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[24+rcx] + add r14,r10 + adc rdx,0 + add r14,rax + mov rax,r11 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rcx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rcx] + add r8,rax + mov rax,QWORD PTR[32+rbx] + adc rdx,0 + add r8,r10 + adc r9,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[8+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mov r11,r12 + imul r12,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r15,rax + mov rax,rbp + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add r8,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rsi] + add r9,r10 + adc rdx,0 + add r9,rax + mov rax,r12 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[rcx] + add r11,rax + mov rax,r12 + adc r11,rdx + + mul QWORD PTR[8+rcx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rcx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[24+rcx] + add r15,r11 + adc rdx,0 + add r15,rax + mov rax,r12 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rcx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rcx] + add r9,rax + mov rax,QWORD PTR[40+rbx] + adc rdx,0 + add r9,r11 + adc r10,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[8+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rsi] + add r15,rax + mov rax,rbp + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mov r12,r13 + imul r13,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r9,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r10,r11 + adc rdx,0 + add r10,rax + mov rax,r13 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[rcx] + add r12,rax + mov rax,r13 + adc r12,rdx + + mul QWORD PTR[8+rcx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[16+rcx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[24+rcx] + add r8,r12 + adc rdx,0 + add r8,rax + mov rax,r13 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rcx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[40+rcx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,r12 + adc r11,rdx + DB 0F3h,0C3h ;repret 
+__mulq_mont_383_nonred ENDP +PUBLIC sqr_mont_382x + + +ALIGN 32 +sqr_mont_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqr_mont_382x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + mov QWORD PTR[16+rsp],rsi + mov QWORD PTR[24+rsp],rdi + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,r8 + add r8,QWORD PTR[48+rsi] + mov r15,r9 + adc r9,QWORD PTR[56+rsi] + mov rax,r10 + adc r10,QWORD PTR[64+rsi] + mov rdx,r11 + adc r11,QWORD PTR[72+rsi] + mov rbx,r12 + adc r12,QWORD PTR[80+rsi] + mov rbp,r13 + adc r13,QWORD PTR[88+rsi] + + sub r14,QWORD PTR[48+rsi] + sbb r15,QWORD PTR[56+rsi] + sbb rax,QWORD PTR[64+rsi] + sbb rdx,QWORD PTR[72+rsi] + sbb rbx,QWORD PTR[80+rsi] + sbb rbp,QWORD PTR[88+rsi] + sbb rdi,rdi + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + mov QWORD PTR[((32+48))+rsp],r14 + mov QWORD PTR[((32+56))+rsp],r15 + mov QWORD PTR[((32+64))+rsp],rax + mov QWORD PTR[((32+72))+rsp],rdx + mov QWORD PTR[((32+80))+rsp],rbx + mov QWORD PTR[((32+88))+rsp],rbp + mov QWORD PTR[((32+96))+rsp],rdi + + + + lea rbx,QWORD PTR[48+rsi] + + mov rax,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + + mov rdi,QWORD PTR[24+rsp] + call __mulq_mont_383_nonred + add r14,r14 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + mov QWORD PTR[64+rdi],r8 + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r10 + mov QWORD PTR[88+rdi],r11 + + lea rsi,QWORD PTR[32+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rax,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov r12,QWORD PTR[((32+16))+rsp] + mov r13,QWORD PTR[((32+24))+rsp] + + call __mulq_mont_383_nonred + mov rsi,QWORD PTR[((32+96))+rsp] + mov r12,QWORD PTR[((32+0))+rsp] + mov r13,QWORD PTR[((32+8))+rsp] + and r12,rsi + mov rax,QWORD PTR[((32+16))+rsp] + and r13,rsi + mov rbx,QWORD PTR[((32+24))+rsp] + and rax,rsi + mov rbp,QWORD PTR[((32+32))+rsp] + and rbx,rsi + and rbp,rsi + and rsi,QWORD PTR[((32+40))+rsp] + + sub r14,r12 + mov r12,QWORD PTR[rcx] + sbb r15,r13 + mov r13,QWORD PTR[8+rcx] + sbb r8,rax + mov rax,QWORD PTR[16+rcx] + sbb r9,rbx + mov rbx,QWORD PTR[24+rcx] + sbb r10,rbp + mov rbp,QWORD PTR[32+rcx] + sbb r11,rsi + sbb rsi,rsi + + and r12,rsi + and r13,rsi + and rax,rsi + and rbx,rsi + and rbp,rsi + and rsi,QWORD PTR[40+rcx] + + add r14,r12 + adc r15,r13 + adc r8,rax + adc r9,rbx + adc r10,rbp + adc r11,rsi + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],r8 + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_mont_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 
0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_382x:: +sqr_mont_382x ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mul_mont_384x + DD imagerel $L$SEH_body_mul_mont_384x + DD imagerel $L$SEH_info_mul_mont_384x_prologue + + DD imagerel $L$SEH_body_mul_mont_384x + DD imagerel $L$SEH_epilogue_mul_mont_384x + DD imagerel $L$SEH_info_mul_mont_384x_body + + DD imagerel $L$SEH_epilogue_mul_mont_384x + DD imagerel $L$SEH_end_mul_mont_384x + DD imagerel $L$SEH_info_mul_mont_384x_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_384x + DD imagerel $L$SEH_body_sqr_mont_384x + DD imagerel $L$SEH_info_sqr_mont_384x_prologue + + DD imagerel $L$SEH_body_sqr_mont_384x + DD imagerel $L$SEH_epilogue_sqr_mont_384x + DD imagerel $L$SEH_info_sqr_mont_384x_body + + DD imagerel $L$SEH_epilogue_sqr_mont_384x + DD imagerel $L$SEH_end_sqr_mont_384x + DD imagerel $L$SEH_info_sqr_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mul_382x + DD imagerel $L$SEH_body_mul_382x + DD imagerel $L$SEH_info_mul_382x_prologue + + DD imagerel $L$SEH_body_mul_382x + DD imagerel $L$SEH_epilogue_mul_382x + DD imagerel $L$SEH_info_mul_382x_body + + DD imagerel $L$SEH_epilogue_mul_382x + DD imagerel $L$SEH_end_mul_382x + DD imagerel $L$SEH_info_mul_382x_epilogue + + DD imagerel $L$SEH_begin_sqr_382x + DD imagerel $L$SEH_body_sqr_382x + DD imagerel $L$SEH_info_sqr_382x_prologue + + DD imagerel $L$SEH_body_sqr_382x + DD imagerel $L$SEH_epilogue_sqr_382x + DD imagerel $L$SEH_info_sqr_382x_body + + DD imagerel $L$SEH_epilogue_sqr_382x + DD imagerel $L$SEH_end_sqr_382x + DD imagerel $L$SEH_info_sqr_382x_epilogue + + DD imagerel $L$SEH_begin_mul_384 + DD imagerel $L$SEH_body_mul_384 + DD imagerel $L$SEH_info_mul_384_prologue + + DD imagerel $L$SEH_body_mul_384 + DD imagerel $L$SEH_epilogue_mul_384 + DD imagerel $L$SEH_info_mul_384_body + + DD imagerel $L$SEH_epilogue_mul_384 + DD imagerel $L$SEH_end_mul_384 + DD imagerel $L$SEH_info_mul_384_epilogue + + DD imagerel $L$SEH_begin_sqr_384 + DD imagerel $L$SEH_body_sqr_384 + DD imagerel $L$SEH_info_sqr_384_prologue + + DD imagerel $L$SEH_body_sqr_384 + DD imagerel $L$SEH_epilogue_sqr_384 + DD imagerel $L$SEH_info_sqr_384_body + + DD imagerel $L$SEH_epilogue_sqr_384 + DD imagerel $L$SEH_end_sqr_384 + DD imagerel $L$SEH_info_sqr_384_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_384 + DD imagerel $L$SEH_body_sqr_mont_384 + DD imagerel $L$SEH_info_sqr_mont_384_prologue + + DD imagerel $L$SEH_body_sqr_mont_384 + DD imagerel $L$SEH_epilogue_sqr_mont_384 + DD imagerel $L$SEH_info_sqr_mont_384_body + + DD imagerel $L$SEH_epilogue_sqr_mont_384 + DD imagerel $L$SEH_end_sqr_mont_384 + DD imagerel $L$SEH_info_sqr_mont_384_epilogue + + DD imagerel $L$SEH_begin_redc_mont_384 + DD imagerel $L$SEH_body_redc_mont_384 + DD imagerel $L$SEH_info_redc_mont_384_prologue + + DD imagerel $L$SEH_body_redc_mont_384 + DD imagerel $L$SEH_epilogue_redc_mont_384 + DD imagerel $L$SEH_info_redc_mont_384_body + + DD imagerel $L$SEH_epilogue_redc_mont_384 + DD imagerel $L$SEH_end_redc_mont_384 + DD imagerel $L$SEH_info_redc_mont_384_epilogue + + DD imagerel $L$SEH_begin_from_mont_384 + DD imagerel $L$SEH_body_from_mont_384 + DD imagerel $L$SEH_info_from_mont_384_prologue + + DD imagerel $L$SEH_body_from_mont_384 + DD imagerel $L$SEH_epilogue_from_mont_384 + DD imagerel $L$SEH_info_from_mont_384_body + + DD imagerel $L$SEH_epilogue_from_mont_384 + DD imagerel $L$SEH_end_from_mont_384 + DD imagerel $L$SEH_info_from_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mont_384 + DD imagerel 
$L$SEH_body_sgn0_pty_mont_384 + DD imagerel $L$SEH_info_sgn0_pty_mont_384_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mont_384 + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384 + DD imagerel $L$SEH_info_sgn0_pty_mont_384_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384 + DD imagerel $L$SEH_end_sgn0_pty_mont_384 + DD imagerel $L$SEH_info_sgn0_pty_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mont_384x + DD imagerel $L$SEH_body_sgn0_pty_mont_384x + DD imagerel $L$SEH_info_sgn0_pty_mont_384x_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mont_384x + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384x + DD imagerel $L$SEH_info_sgn0_pty_mont_384x_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384x + DD imagerel $L$SEH_end_sgn0_pty_mont_384x + DD imagerel $L$SEH_info_sgn0_pty_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mul_mont_384 + DD imagerel $L$SEH_body_mul_mont_384 + DD imagerel $L$SEH_info_mul_mont_384_prologue + + DD imagerel $L$SEH_body_mul_mont_384 + DD imagerel $L$SEH_epilogue_mul_mont_384 + DD imagerel $L$SEH_info_mul_mont_384_body + + DD imagerel $L$SEH_epilogue_mul_mont_384 + DD imagerel $L$SEH_end_mul_mont_384 + DD imagerel $L$SEH_info_mul_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqr_n_mul_mont_384 + DD imagerel $L$SEH_body_sqr_n_mul_mont_384 + DD imagerel $L$SEH_info_sqr_n_mul_mont_384_prologue + + DD imagerel $L$SEH_body_sqr_n_mul_mont_384 + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_384 + DD imagerel $L$SEH_info_sqr_n_mul_mont_384_body + + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_384 + DD imagerel $L$SEH_end_sqr_n_mul_mont_384 + DD imagerel $L$SEH_info_sqr_n_mul_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqr_n_mul_mont_383 + DD imagerel $L$SEH_body_sqr_n_mul_mont_383 + DD imagerel $L$SEH_info_sqr_n_mul_mont_383_prologue + + DD imagerel $L$SEH_body_sqr_n_mul_mont_383 + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_383 + DD imagerel $L$SEH_info_sqr_n_mul_mont_383_body + + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_383 + DD imagerel $L$SEH_end_sqr_n_mul_mont_383 + DD imagerel $L$SEH_info_sqr_n_mul_mont_383_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_382x + DD imagerel $L$SEH_body_sqr_mont_382x + DD imagerel $L$SEH_info_sqr_mont_382x_prologue + + DD imagerel $L$SEH_body_sqr_mont_382x + DD imagerel $L$SEH_epilogue_sqr_mont_382x + DD imagerel $L$SEH_info_sqr_mont_382x_body + + DD imagerel $L$SEH_epilogue_sqr_mont_382x + DD imagerel $L$SEH_end_sqr_mont_382x + DD imagerel $L$SEH_info_sqr_mont_382x_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mul_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,029h,000h +DB 000h,0e4h,02ah,000h +DB 000h,0d4h,02bh,000h +DB 000h,0c4h,02ch,000h +DB 000h,034h,02dh,000h +DB 000h,054h,02eh,000h +DB 000h,074h,030h,000h +DB 000h,064h,031h,000h +DB 000h,001h,02fh,000h +$L$SEH_info_mul_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqr_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_382x_prologue:: 
+DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_mul_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_382x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqr_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_384_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_mul_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqr_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_mont_384_body:: +DB 1,0,18,0 +DB 000h,0f4h,00fh,000h +DB 000h,0e4h,010h,000h +DB 000h,0d4h,011h,000h +DB 000h,0c4h,012h,000h +DB 000h,034h,013h,000h +DB 000h,054h,014h,000h +DB 000h,074h,016h,000h +DB 000h,064h,017h,000h +DB 000h,001h,015h,000h +$L$SEH_info_sqr_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redc_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_redc_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_redc_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_from_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_from_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_from_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0_pty_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 
000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sgn0_pty_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0_pty_mont_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sgn0_pty_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_mul_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_n_mul_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_n_mul_mont_384_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqr_n_mul_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_n_mul_mont_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_n_mul_mont_383_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqr_n_mul_mont_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_mont_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqr_mont_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/mulx_mont_256-x86_64.asm b/temp-deps/blst-ts/deps/blst/build/win64/mulx_mont_256-x86_64.asm new file mode 100644 index 0000000..83534c6 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/mulx_mont_256-x86_64.asm @@ -0,0 +1,796 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC mulx_mont_sparse_256 + + +ALIGN 32 +mulx_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_mont_sparse_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_mulx_mont_sparse_256:: + + + mov 
rbx,rdx + mov rdx,QWORD PTR[rdx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rbp,QWORD PTR[16+rsi] + mov r9,QWORD PTR[24+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r11,rax,r14 + call __mulx_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mulx_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_mont_sparse_256:: +mulx_mont_sparse_256 ENDP + +PUBLIC sqrx_mont_sparse_256 + + +ALIGN 32 +sqrx_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_sparse_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sqrx_mont_sparse_256:: + + + mov rbx,rsi + mov r8,rcx + mov rcx,rdx + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rbp,QWORD PTR[16+rsi] + mov r9,QWORD PTR[24+rsi] + lea rsi,QWORD PTR[((-128))+rbx] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r11,rax,rdx + call __mulx_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqrx_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_sparse_256:: +sqrx_mont_sparse_256 ENDP + +ALIGN 32 +__mulx_mont_sparse_256 PROC PRIVATE + DB 243,15,30,250 + mulx r12,r15,r15 + mulx r13,rbp,rbp + add r11,r15 + mulx r14,r9,r9 + mov rdx,QWORD PTR[8+rbx] + adc r12,rbp + adc r13,r9 + adc r14,0 + + mov r10,rax + imul rax,r8 + + + xor r15,r15 + mulx r9,rbp,QWORD PTR[((0+128))+rsi] + adox r11,rbp + adcx r12,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rsi] + adox r12,rbp + adcx r13,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rsi] + adox r13,rbp + adcx r14,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rsi] + mov rdx,rax + adox r14,rbp + adcx r9,r15 + adox r15,r9 + + + mulx rax,rbp,QWORD PTR[((0+128))+rcx] + adcx r10,rbp + adox rax,r11 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx rax,rbp + adox r12,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r12,rbp + adox r13,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,QWORD PTR[16+rbx] + adcx r13,rbp + adox r14,r9 + adcx r14,r10 + adox r15,r10 + adcx r15,r10 + adox r10,r10 + adc r10,0 + mov r11,rax + imul rax,r8 + + + xor rbp,rbp + mulx r9,rbp,QWORD PTR[((0+128))+rsi] + adox r12,rbp + adcx r13,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rsi] + adox r13,rbp + adcx r14,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rsi] + adox r14,rbp + adcx r15,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rsi] + mov rdx,rax + adox r15,rbp + adcx r9,r10 + adox r10,r9 + + + mulx rax,rbp,QWORD PTR[((0+128))+rcx] + adcx r11,rbp + adox rax,r12 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx rax,rbp + adox r13,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r13,rbp + adox r14,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,QWORD PTR[24+rbx] + adcx r14,rbp + adox r15,r9 + adcx r15,r11 + adox r10,r11 + adcx r10,r11 + adox r11,r11 + adc r11,0 + mov r12,rax + imul rax,r8 + + + xor rbp,rbp + mulx r9,rbp,QWORD PTR[((0+128))+rsi] + adox r13,rbp + adcx 
r14,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rsi] + adox r14,rbp + adcx r15,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rsi] + adox r15,rbp + adcx r10,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rsi] + mov rdx,rax + adox r10,rbp + adcx r9,r11 + adox r11,r9 + + + mulx rax,rbp,QWORD PTR[((0+128))+rcx] + adcx r12,rbp + adox rax,r13 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx rax,rbp + adox r14,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r14,rbp + adox r15,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,rax + adcx r15,rbp + adox r10,r9 + adcx r10,r12 + adox r11,r12 + adcx r11,r12 + adox r12,r12 + adc r12,0 + imul rdx,r8 + + + xor rbp,rbp + mulx r9,r13,QWORD PTR[((0+128))+rcx] + adcx r13,rax + adox r14,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx r14,rbp + adox r15,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r15,rbp + adox r10,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,r14 + lea rcx,QWORD PTR[128+rcx] + adcx r10,rbp + adox r11,r9 + mov rax,r15 + adcx r11,r13 + adox r12,r13 + adc r12,0 + + + + + mov rbp,r10 + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov r9,r11 + sbb r11,QWORD PTR[24+rcx] + sbb r12,0 + + cmovc r14,rdx + cmovc r15,rax + cmovc r10,rbp + mov QWORD PTR[rdi],r14 + cmovc r11,r9 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + DB 0F3h,0C3h ;repret +__mulx_mont_sparse_256 ENDP +PUBLIC fromx_mont_256 + + +ALIGN 32 +fromx_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_fromx_mont_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_fromx_mont_256:: + + + mov rbx,rdx + call __mulx_by_1_mont_256 + + + + + + mov rdx,r15 + mov r12,r10 + mov r13,r11 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r10,QWORD PTR[16+rbx] + sbb r11,QWORD PTR[24+rbx] + + cmovnc rax,r14 + cmovnc rdx,r15 + cmovnc r12,r10 + mov QWORD PTR[rdi],rax + cmovnc r13,r11 + mov QWORD PTR[8+rdi],rdx + mov QWORD PTR[16+rdi],r12 + mov QWORD PTR[24+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_fromx_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_fromx_mont_256:: +fromx_mont_256 ENDP + +PUBLIC redcx_mont_256 + + +ALIGN 32 +redcx_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redcx_mont_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redcx_mont_256:: + + + mov rbx,rdx + call __mulx_by_1_mont_256 + + add r14,QWORD PTR[32+rsi] + adc r15,QWORD PTR[40+rsi] + mov rax,r14 + adc r10,QWORD PTR[48+rsi] + mov rdx,r15 + adc r11,QWORD PTR[56+rsi] + sbb rsi,rsi + + + + + mov r12,r10 + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r10,QWORD PTR[16+rbx] + mov r13,r11 + sbb r11,QWORD PTR[24+rbx] + sbb rsi,0 + + cmovnc rax,r14 + cmovnc rdx,r15 + cmovnc r12,r10 + mov QWORD PTR[rdi],rax + cmovnc r13,r11 + mov QWORD PTR[8+rdi],rdx + mov QWORD PTR[16+rdi],r12 + mov QWORD PTR[24+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + 
+ mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redcx_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redcx_mont_256:: +redcx_mont_256 ENDP + +ALIGN 32 +__mulx_by_1_mont_256 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rsi] + mov r11,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + + mov r14,rax + imul rax,rcx + mov r10,rax + + mul QWORD PTR[rbx] + add r14,rax + mov rax,r10 + adc r14,rdx + + mul QWORD PTR[8+rbx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[16+rbx] + mov r15,r11 + imul r11,rcx + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[24+rbx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rbx] + add r15,rax + mov rax,r11 + adc r15,rdx + + mul QWORD PTR[8+rbx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rbx] + mov r10,r12 + imul r12,rcx + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rbx] + add r10,rax + mov rax,r12 + adc r10,rdx + + mul QWORD PTR[8+rbx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rbx] + mov r11,r13 + imul r13,rcx + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[24+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[rbx] + add r11,rax + mov rax,r13 + adc r11,rdx + + mul QWORD PTR[8+rbx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[24+rbx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + DB 0F3h,0C3h ;repret +__mulx_by_1_mont_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mulx_mont_sparse_256 + DD imagerel $L$SEH_body_mulx_mont_sparse_256 + DD imagerel $L$SEH_info_mulx_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_mulx_mont_sparse_256 + DD imagerel $L$SEH_epilogue_mulx_mont_sparse_256 + DD imagerel $L$SEH_info_mulx_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_mulx_mont_sparse_256 + DD imagerel $L$SEH_end_mulx_mont_sparse_256 + DD imagerel $L$SEH_info_mulx_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_sparse_256 + DD imagerel $L$SEH_body_sqrx_mont_sparse_256 + DD imagerel $L$SEH_info_sqrx_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_sqrx_mont_sparse_256 + DD imagerel $L$SEH_epilogue_sqrx_mont_sparse_256 + DD imagerel $L$SEH_info_sqrx_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_sparse_256 + DD imagerel $L$SEH_end_sqrx_mont_sparse_256 + DD imagerel $L$SEH_info_sqrx_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_fromx_mont_256 + DD imagerel $L$SEH_body_fromx_mont_256 + DD imagerel $L$SEH_info_fromx_mont_256_prologue + + DD imagerel $L$SEH_body_fromx_mont_256 + DD imagerel $L$SEH_epilogue_fromx_mont_256 + DD imagerel $L$SEH_info_fromx_mont_256_body + + DD imagerel $L$SEH_epilogue_fromx_mont_256 + DD imagerel $L$SEH_end_fromx_mont_256 + DD 
imagerel $L$SEH_info_fromx_mont_256_epilogue + + DD imagerel $L$SEH_begin_redcx_mont_256 + DD imagerel $L$SEH_body_redcx_mont_256 + DD imagerel $L$SEH_info_redcx_mont_256_prologue + + DD imagerel $L$SEH_body_redcx_mont_256 + DD imagerel $L$SEH_epilogue_redcx_mont_256 + DD imagerel $L$SEH_info_redcx_mont_256_body + + DD imagerel $L$SEH_epilogue_redcx_mont_256 + DD imagerel $L$SEH_end_redcx_mont_256 + DD imagerel $L$SEH_info_redcx_mont_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mulx_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mulx_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqrx_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_fromx_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_fromx_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_fromx_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redcx_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_redcx_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_redcx_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/mulx_mont_384-x86_64.asm b/temp-deps/blst-ts/deps/blst/build/win64/mulx_mont_384-x86_64.asm new file mode 100644 index 0000000..25bee97 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/mulx_mont_384-x86_64.asm @@ -0,0 +1,3586 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + + + + + + + + +ALIGN 32 +__sub_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + sub r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + sbb r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + sbb r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + sbb r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + sbb r14,QWORD PTR[48+rdx] + mov r8,QWORD PTR[rcx] + mov QWORD 
PTR[8+rdi],r9 + sbb r15,QWORD PTR[56+rdx] + mov r9,QWORD PTR[8+rcx] + mov QWORD PTR[16+rdi],r10 + sbb rax,QWORD PTR[64+rdx] + mov r10,QWORD PTR[16+rcx] + mov QWORD PTR[24+rdi],r11 + sbb rbx,QWORD PTR[72+rdx] + mov r11,QWORD PTR[24+rcx] + mov QWORD PTR[32+rdi],r12 + sbb rbp,QWORD PTR[80+rdx] + mov r12,QWORD PTR[32+rcx] + mov QWORD PTR[40+rdi],r13 + sbb rsi,QWORD PTR[88+rdx] + mov r13,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r8,rdx + and r9,rdx + and r10,rdx + and r11,rdx + and r12,rdx + and r13,rdx + + add r14,r8 + adc r15,r9 + mov QWORD PTR[48+rdi],r14 + adc rax,r10 + mov QWORD PTR[56+rdi],r15 + adc rbx,r11 + mov QWORD PTR[64+rdi],rax + adc rbp,r12 + mov QWORD PTR[72+rdi],rbx + adc rsi,r13 + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__sub_mod_384x384 ENDP + + +ALIGN 32 +__add_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + mov r14,r8 + adc r11,QWORD PTR[24+rdx] + mov r15,r9 + adc r12,QWORD PTR[32+rdx] + mov rax,r10 + adc r13,QWORD PTR[40+rdx] + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[rdi],r8 + cmovc r11,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r12,rbp + mov QWORD PTR[16+rdi],r10 + cmovc r13,rsi + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__add_mod_384 ENDP + + +ALIGN 32 +__sub_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +__sub_mod_384_a_is_loaded:: + sub r8,QWORD PTR[rdx] + mov r14,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rax,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbx,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,QWORD PTR[32+rcx] + sbb r13,QWORD PTR[40+rdx] + mov rsi,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r14,rdx + and r15,rdx + and rax,rdx + and rbx,rdx + and rbp,rdx + and rsi,rdx + + add r8,r14 + adc r9,r15 + mov QWORD PTR[rdi],r8 + adc r10,rax + mov QWORD PTR[8+rdi],r9 + adc r11,rbx + mov QWORD PTR[16+rdi],r10 + adc r12,rbp + mov QWORD PTR[24+rdi],r11 + adc r13,rsi + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__sub_mod_384 ENDP +PUBLIC mulx_mont_384x + + +ALIGN 32 +mulx_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,328 + +$L$SEH_body_mulx_mont_384x:: + + + mov rbx,rdx + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[24+rsp],rsi + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[8+rsp],rcx + mov QWORD PTR[rsp],r8 + + + + + lea rdi,QWORD PTR[40+rsp] + call __mulx_384 + + + lea rbx,QWORD PTR[48+rbx] + lea rsi,QWORD PTR[((128+48))+rsi] + lea rdi,QWORD PTR[96+rdi] + call __mulx_384 + + + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[rbx] + lea rdx,QWORD 
PTR[((-48))+rbx] + lea rdi,QWORD PTR[((40+192+48))+rsp] + call __add_mod_384 + + mov rsi,QWORD PTR[24+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((-48))+rdi] + call __add_mod_384 + + lea rbx,QWORD PTR[rdi] + lea rsi,QWORD PTR[48+rdi] + call __mulx_384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[8+rsp] + call __sub_mod_384x384 + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[40+rsp] + lea rdx,QWORD PTR[((40+96))+rsp] + lea rdi,QWORD PTR[40+rsp] + call __sub_mod_384x384 + + lea rbx,QWORD PTR[rcx] + + + lea rsi,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[rsp] + mov rdi,QWORD PTR[32+rsp] + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + + lea rsi,QWORD PTR[((40+192))+rsp] + mov rcx,QWORD PTR[rsp] + lea rdi,QWORD PTR[48+rdi] + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + lea r8,QWORD PTR[328+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mulx_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_mont_384x:: +mulx_mont_384x ENDP +PUBLIC sqrx_mont_384x + + +ALIGN 32 +sqrx_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqrx_mont_384x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rsi + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[32+rsp] + call __add_mod_384 + + + mov rsi,QWORD PTR[24+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((32+48))+rsp] + call __sub_mod_384 + + + mov rsi,QWORD PTR[24+rsp] + lea rbx,QWORD PTR[48+rsi] + + mov rdx,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_384 + add rdx,rdx + adc r15,r15 + adc rax,rax + mov r8,rdx + adc r12,r12 + mov r9,r15 + adc rdi,rdi + mov r10,rax + adc rbp,rbp + mov r11,r12 + sbb rsi,rsi + + sub rdx,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov r13,rdi + sbb rax,QWORD PTR[16+rcx] + sbb r12,QWORD PTR[24+rcx] + sbb rdi,QWORD PTR[32+rcx] + mov r14,rbp + sbb rbp,QWORD PTR[40+rcx] + sbb rsi,0 + + cmovc rdx,r8 + cmovc r15,r9 + cmovc rax,r10 + mov QWORD PTR[48+rbx],rdx + cmovc r12,r11 + mov QWORD PTR[56+rbx],r15 + cmovc rdi,r13 + mov QWORD PTR[64+rbx],rax + cmovc rbp,r14 + mov QWORD PTR[72+rbx],r12 + mov QWORD PTR[80+rbx],rdi + mov QWORD PTR[88+rbx],rbp + + lea rsi,QWORD PTR[32+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rdx,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov rax,QWORD PTR[((32+16))+rsp] + mov r12,QWORD PTR[((32+24))+rsp] + mov rdi,QWORD PTR[((32+32))+rsp] + mov rbp,QWORD PTR[((32+40))+rsp] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD 
PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqrx_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_384x:: +sqrx_mont_384x ENDP + +PUBLIC mulx_382x + + +ALIGN 32 +mulx_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_mulx_382x:: + + + lea rdi,QWORD PTR[96+rdi] + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rcx + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[48+rsi] + adc r9,QWORD PTR[56+rsi] + adc r10,QWORD PTR[64+rsi] + adc r11,QWORD PTR[72+rsi] + adc r12,QWORD PTR[80+rsi] + adc r13,QWORD PTR[88+rsi] + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + mov r11,QWORD PTR[24+rdx] + mov r12,QWORD PTR[32+rdx] + mov r13,QWORD PTR[40+rdx] + + add r8,QWORD PTR[48+rdx] + adc r9,QWORD PTR[56+rdx] + adc r10,QWORD PTR[64+rdx] + adc r11,QWORD PTR[72+rdx] + adc r12,QWORD PTR[80+rdx] + adc r13,QWORD PTR[88+rdx] + + mov QWORD PTR[((32+48))+rsp],r8 + mov QWORD PTR[((32+56))+rsp],r9 + mov QWORD PTR[((32+64))+rsp],r10 + mov QWORD PTR[((32+72))+rsp],r11 + mov QWORD PTR[((32+80))+rsp],r12 + mov QWORD PTR[((32+88))+rsp],r13 + + + lea rsi,QWORD PTR[((32+0))+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + call __mulx_384 + + + mov rsi,QWORD PTR[rsp] + mov rbx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __mulx_384 + + + lea rsi,QWORD PTR[((48+128))+rsi] + lea rbx,QWORD PTR[48+rbx] + lea rdi,QWORD PTR[32+rsp] + call __mulx_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[32+rsp] + mov rcx,QWORD PTR[24+rsp] + mov rdi,rsi + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[((-96))+rdi] + lea rdx,QWORD PTR[32+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mulx_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_382x:: +mulx_382x ENDP +PUBLIC sqrx_382x + + +ALIGN 32 +sqrx_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_sqrx_382x:: + + + mov rcx,rdx + + + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,QWORD PTR[24+rsi] + mov rbp,QWORD PTR[32+rsi] + mov rdx,QWORD PTR[40+rsi] + + mov r8,r14 + add r14,QWORD PTR[48+rsi] + mov r9,r15 + adc r15,QWORD PTR[56+rsi] + mov r10,rax + adc rax,QWORD PTR[64+rsi] + 
mov r11,rbx + adc rbx,QWORD PTR[72+rsi] + mov r12,rbp + adc rbp,QWORD PTR[80+rsi] + mov r13,rdx + adc rdx,QWORD PTR[88+rsi] + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],rax + mov QWORD PTR[24+rdi],rbx + mov QWORD PTR[32+rdi],rbp + mov QWORD PTR[40+rdi],rdx + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[48+rdi] + call __sub_mod_384_a_is_loaded + + + lea rsi,QWORD PTR[rdi] + lea rbx,QWORD PTR[((-48))+rdi] + lea rdi,QWORD PTR[((-48))+rdi] + call __mulx_384 + + + mov rsi,QWORD PTR[rsp] + lea rbx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[96+rdi] + call __mulx_384 + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov r12,QWORD PTR[32+rdi] + mov r13,QWORD PTR[40+rdi] + mov r14,QWORD PTR[48+rdi] + mov r15,QWORD PTR[56+rdi] + mov rax,QWORD PTR[64+rdi] + mov rbx,QWORD PTR[72+rdi] + mov rbp,QWORD PTR[80+rdi] + add r8,r8 + mov rdx,QWORD PTR[88+rdi] + adc r9,r9 + mov QWORD PTR[rdi],r8 + adc r10,r10 + mov QWORD PTR[8+rdi],r9 + adc r11,r11 + mov QWORD PTR[16+rdi],r10 + adc r12,r12 + mov QWORD PTR[24+rdi],r11 + adc r13,r13 + mov QWORD PTR[32+rdi],r12 + adc r14,r14 + mov QWORD PTR[40+rdi],r13 + adc r15,r15 + mov QWORD PTR[48+rdi],r14 + adc rax,rax + mov QWORD PTR[56+rdi],r15 + adc rbx,rbx + mov QWORD PTR[64+rdi],rax + adc rbp,rbp + mov QWORD PTR[72+rdi],rbx + adc rdx,rdx + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rdx + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqrx_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_382x:: +sqrx_382x ENDP +PUBLIC mulx_384 + + +ALIGN 32 +mulx_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$SEH_body_mulx_384:: + + + mov rbx,rdx + call __mulx_384 + + mov r15,QWORD PTR[rsp] + + mov r14,QWORD PTR[8+rsp] + + mov r13,QWORD PTR[16+rsp] + + mov r12,QWORD PTR[24+rsp] + + mov rbx,QWORD PTR[32+rsp] + + mov rbp,QWORD PTR[40+rsp] + + lea rsp,QWORD PTR[48+rsp] + +$L$SEH_epilogue_mulx_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_384:: +mulx_384 ENDP + + +ALIGN 32 +__mulx_384 PROC PRIVATE + DB 243,15,30,250 + mov rdx,QWORD PTR[rbx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + + mulx rcx,r9,r14 + xor rbp,rbp + + mulx rax,r8,r15 + adcx r8,rcx + mov QWORD PTR[rdi],r9 + + mulx rcx,r9,r10 + adcx r9,rax + + mulx rax,r10,r11 + adcx r10,rcx + + mulx rcx,r11,r12 + adcx r11,rax + + mulx r13,r12,r13 + mov rdx,QWORD PTR[8+rbx] + adcx r12,rcx + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[8+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[16+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp 
+ mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[16+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[24+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[24+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[32+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[32+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[40+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[40+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,rax + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mov QWORD PTR[48+rdi],r8 + mov QWORD PTR[56+rdi],r9 + mov QWORD PTR[64+rdi],r10 + mov QWORD PTR[72+rdi],r11 + mov QWORD PTR[80+rdi],r12 + mov QWORD PTR[88+rdi],r13 + + DB 0F3h,0C3h ;repret +__mulx_384 ENDP +PUBLIC sqrx_384 + + +ALIGN 32 +sqrx_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_384:: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_sqrx_384:: + + + call __sqrx_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqrx_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_384:: +sqrx_384 ENDP + +ALIGN 32 +__sqrx_384 PROC PRIVATE + DB 243,15,30,250 + mov rdx,QWORD PTR[rsi] + mov r14,QWORD PTR[8+rsi] + mov r15,QWORD PTR[16+rsi] + mov rcx,QWORD PTR[24+rsi] + mov rbx,QWORD PTR[32+rsi] + + + mulx rdi,r8,r14 + mov rbp,QWORD PTR[40+rsi] + mulx rax,r9,r15 + add r9,rdi + mulx rdi,r10,rcx + adc r10,rax + mulx rax,r11,rbx + adc r11,rdi + mulx r13,r12,rbp + mov rdx,r14 + adc r12,rax + adc r13,0 + + + xor r14,r14 + mulx rax,rdi,r15 + adcx r10,rdi + adox r11,rax + + mulx rax,rdi,rcx + adcx r11,rdi + adox r12,rax + + mulx rax,rdi,rbx + adcx r12,rdi + adox r13,rax + + mulx rax,rdi,rbp + mov rdx,r15 + adcx r13,rdi + adox rax,r14 + adcx r14,rax + + + xor r15,r15 + mulx rax,rdi,rcx + adcx r12,rdi + adox r13,rax + + mulx rax,rdi,rbx + adcx r13,rdi + adox r14,rax + + mulx rax,rdi,rbp + mov rdx,rcx + adcx 
r14,rdi + adox rax,r15 + adcx r15,rax + + + xor rcx,rcx + mulx rax,rdi,rbx + adcx r14,rdi + adox r15,rax + + mulx rax,rdi,rbp + mov rdx,rbx + adcx r15,rdi + adox rax,rcx + adcx rcx,rax + + + mulx rbx,rdi,rbp + mov rdx,QWORD PTR[rsi] + add rcx,rdi + mov rdi,QWORD PTR[8+rsp] + adc rbx,0 + + + xor rbp,rbp + adcx r8,r8 + adcx r9,r9 + adcx r10,r10 + adcx r11,r11 + adcx r12,r12 + + + mulx rax,rdx,rdx + mov QWORD PTR[rdi],rdx + mov rdx,QWORD PTR[8+rsi] + adox r8,rax + mov QWORD PTR[8+rdi],r8 + + mulx rax,r8,rdx + mov rdx,QWORD PTR[16+rsi] + adox r9,r8 + adox r10,rax + mov QWORD PTR[16+rdi],r9 + mov QWORD PTR[24+rdi],r10 + + mulx r9,r8,rdx + mov rdx,QWORD PTR[24+rsi] + adox r11,r8 + adox r12,r9 + adcx r13,r13 + adcx r14,r14 + mov QWORD PTR[32+rdi],r11 + mov QWORD PTR[40+rdi],r12 + + mulx r9,r8,rdx + mov rdx,QWORD PTR[32+rsi] + adox r13,r8 + adox r14,r9 + adcx r15,r15 + adcx rcx,rcx + mov QWORD PTR[48+rdi],r13 + mov QWORD PTR[56+rdi],r14 + + mulx r9,r8,rdx + mov rdx,QWORD PTR[40+rsi] + adox r15,r8 + adox rcx,r9 + adcx rbx,rbx + adcx rbp,rbp + mov QWORD PTR[64+rdi],r15 + mov QWORD PTR[72+rdi],rcx + + mulx r9,r8,rdx + adox rbx,r8 + adox rbp,r9 + + mov QWORD PTR[80+rdi],rbx + mov QWORD PTR[88+rdi],rbp + + DB 0F3h,0C3h ;repret +__sqrx_384 ENDP + + + +PUBLIC redcx_mont_384 + + +ALIGN 32 +redcx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redcx_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redcx_mont_384:: + + + mov rbx,rdx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redcx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redcx_mont_384:: +redcx_mont_384 ENDP + + + + +PUBLIC fromx_mont_384 + + +ALIGN 32 +fromx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_fromx_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_fromx_mont_384:: + + + mov rbx,rdx + call __mulx_by_1_mont_384 + + + + + mov rax,r14 + mov rcx,r15 + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_fromx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_fromx_mont_384:: +fromx_mont_384 ENDP + +ALIGN 32 +__mulx_by_1_mont_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov rdx,rcx + 
mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + imul rdx,r8 + + + xor r14,r14 + mulx rbp,rax,QWORD PTR[rbx] + adcx r8,rax + adox r9,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r9,rax + adox r10,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r10,rax + adox r11,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r13,rax + adox rbp,r14 + adcx r14,rbp + imul rdx,r9 + + + xor r15,r15 + mulx rbp,rax,QWORD PTR[rbx] + adcx r9,rax + adox r10,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r10,rax + adox r11,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r14,rax + adox rbp,r15 + adcx r15,rbp + imul rdx,r10 + + + xor r8,r8 + mulx rbp,rax,QWORD PTR[rbx] + adcx r10,rax + adox r11,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r15,rax + adox rbp,r8 + adcx r8,rbp + imul rdx,r11 + + + xor r9,r9 + mulx rbp,rax,QWORD PTR[rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r15,rax + adox r8,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r8,rax + adox rbp,r9 + adcx r9,rbp + imul rdx,r12 + + + xor r10,r10 + mulx rbp,rax,QWORD PTR[rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r15,rax + adox r8,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r8,rax + adox r9,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r9,rax + adox rbp,r10 + adcx r10,rbp + imul rdx,r13 + + + xor r11,r11 + mulx rbp,rax,QWORD PTR[rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r15,rax + adox r8,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r8,rax + adox r9,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r9,rax + adox r10,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r10,rax + adox rbp,r11 + adcx r11,rbp + DB 0F3h,0C3h ;repret +__mulx_by_1_mont_384 ENDP + + +ALIGN 32 +__redc_tail_mont_384 PROC PRIVATE + DB 243,15,30,250 + add r14,QWORD PTR[48+rsi] + mov rax,r14 + adc r15,QWORD PTR[56+rsi] + adc r8,QWORD PTR[64+rsi] + adc r9,QWORD PTR[72+rsi] + mov rcx,r15 + adc r10,QWORD PTR[80+rsi] + adc r11,QWORD PTR[88+rsi] + sbb r12,r12 + + + + + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + sbb r12,0 + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 
+ mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + DB 0F3h,0C3h ;repret +__redc_tail_mont_384 ENDP + +PUBLIC sgn0x_pty_mont_384 + + +ALIGN 32 +sgn0x_pty_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0x_pty_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0x_pty_mont_384:: + + + mov rbx,rsi + lea rsi,QWORD PTR[rdi] + mov rcx,rdx + call __mulx_by_1_mont_384 + + xor rax,rax + mov r13,r14 + add r14,r14 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + not rax + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0x_pty_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0x_pty_mont_384:: +sgn0x_pty_mont_384 ENDP + +PUBLIC sgn0x_pty_mont_384x + + +ALIGN 32 +sgn0x_pty_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0x_pty_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0x_pty_mont_384x:: + + + mov rbx,rsi + lea rsi,QWORD PTR[48+rdi] + mov rcx,rdx + call __mulx_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + lea rsi,QWORD PTR[rdi] + xor rdi,rdi + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rdi,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rdi,0 + + mov QWORD PTR[rsp],r14 + not rdi + and r13,1 + and rdi,2 + or rdi,r13 + + call __mulx_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + xor rax,rax + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + mov r12,QWORD PTR[rsp] + + not rax + + test r14,r14 + cmovz r13,rdi + + test r12,r12 + cmovnz rax,rdi + + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0x_pty_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0x_pty_mont_384x:: +sgn0x_pty_mont_384x ENDP +PUBLIC mulx_mont_384 + + +ALIGN 32 +mulx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx 
+ + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-24))+rsp] + +$L$SEH_body_mulx_mont_384:: + + + mov rbx,rdx + mov rdx,QWORD PTR[rdx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov QWORD PTR[16+rsp],rdi + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + mov QWORD PTR[rsp],r8 + + mulx r9,r8,r14 + call __mulx_mont_384 + + mov r15,QWORD PTR[24+rsp] + + mov r14,QWORD PTR[32+rsp] + + mov r13,QWORD PTR[40+rsp] + + mov r12,QWORD PTR[48+rsp] + + mov rbx,QWORD PTR[56+rsp] + + mov rbp,QWORD PTR[64+rsp] + + lea rsp,QWORD PTR[72+rsp] + +$L$SEH_epilogue_mulx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_mont_384:: +mulx_mont_384 ENDP + +ALIGN 32 +__mulx_mont_384 PROC PRIVATE + DB 243,15,30,250 + + mulx r10,r14,r15 + mulx r11,r15,rax + add r9,r14 + mulx r12,rax,r12 + adc r10,r15 + mulx r13,rdi,rdi + adc r11,rax + mulx r14,rbp,rbp + mov rdx,QWORD PTR[8+rbx] + adc r12,rdi + adc r13,rbp + adc r14,0 + xor r15,r15 + + mov QWORD PTR[16+rsp],r8 + imul r8,QWORD PTR[8+rsp] + + + xor rax,rax + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r9,rdi + adcx r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r8 + adox r14,rdi + adcx r15,rbp + adox r15,rax + adox rax,rax + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r9,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r9,rdi + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[16+rbx] + adcx r13,rdi + adox r14,rbp + adcx r14,r8 + adox r15,r8 + adcx r15,r8 + adox rax,r8 + adcx rax,r8 + mov QWORD PTR[16+rsp],r9 + imul r9,QWORD PTR[8+rsp] + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r9 + adox r15,rdi + adcx rax,rbp + adox rax,r8 + adox r8,r8 + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[24+rbx] + adcx r14,rdi + adox r15,rbp + adcx r15,r9 + adox rax,r9 + adcx rax,r9 + adox r8,r9 + adcx r8,r9 + mov QWORD PTR[16+rsp],r10 + imul r10,QWORD PTR[8+rsp] + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD 
PTR[((8+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r10 + adox rax,rdi + adcx r8,rbp + adox r8,r9 + adox r9,r9 + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[32+rbx] + adcx r15,rdi + adox rax,rbp + adcx rax,r10 + adox r8,r10 + adcx r8,r10 + adox r9,r10 + adcx r9,r10 + mov QWORD PTR[16+rsp],r11 + imul r11,QWORD PTR[8+rsp] + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r11 + adox r8,rdi + adcx r9,rbp + adox r9,r10 + adox r10,r10 + + + xor r11,r11 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[40+rbx] + adcx rax,rdi + adox r8,rbp + adcx r8,r11 + adox r9,r11 + adcx r9,r11 + adox r10,r11 + adcx r10,r11 + mov QWORD PTR[16+rsp],r12 + imul r12,QWORD PTR[8+rsp] + + + xor r11,r11 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r8,rdi + adcx r9,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r12 + adox r9,rdi + adcx r10,rbp + adox r10,r11 + adox r11,r11 + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx rax,rdi + adox r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,r13 + adcx r8,rdi + adox r9,rbp + adcx r9,r12 + adox r10,r12 + adcx r10,r12 + adox r11,r12 + adcx r11,r12 + imul rdx,QWORD PTR[8+rsp] + mov rbx,QWORD PTR[24+rsp] + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx rax,rdi + adox r8,rbp + mov r13,r15 + + mulx rbp,rdi,QWORD 
PTR[((32+128))+rcx] + adcx r8,rdi + adox r9,rbp + mov rsi,rax + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + adcx r9,rdi + adox r10,rbp + mov rdx,r14 + adcx r10,r12 + adox r11,r12 + lea rcx,QWORD PTR[128+rcx] + mov r12,r8 + adc r11,0 + + + + + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov rdi,r9 + sbb rax,QWORD PTR[16+rcx] + sbb r8,QWORD PTR[24+rcx] + sbb r9,QWORD PTR[32+rcx] + mov rbp,r10 + sbb r10,QWORD PTR[40+rcx] + sbb r11,0 + + cmovnc rdx,r14 + cmovc r15,r13 + cmovc rax,rsi + cmovnc r12,r8 + mov QWORD PTR[rbx],rdx + cmovnc rdi,r9 + mov QWORD PTR[8+rbx],r15 + cmovnc rbp,r10 + mov QWORD PTR[16+rbx],rax + mov QWORD PTR[24+rbx],r12 + mov QWORD PTR[32+rbx],rdi + mov QWORD PTR[40+rbx],rbp + + DB 0F3h,0C3h ;repret + +__mulx_mont_384 ENDP +PUBLIC sqrx_mont_384 + + +ALIGN 32 +sqrx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-24))+rsp] + +$L$SEH_body_sqrx_mont_384:: + + + mov r8,rcx + lea rcx,QWORD PTR[((-128))+rdx] + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov QWORD PTR[16+rsp],rdi + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + + lea rbx,QWORD PTR[rsi] + mov QWORD PTR[rsp],r8 + lea rsi,QWORD PTR[((-128))+rsi] + + mulx r9,r8,rdx + call __mulx_mont_384 + + mov r15,QWORD PTR[24+rsp] + + mov r14,QWORD PTR[32+rsp] + + mov r13,QWORD PTR[40+rsp] + + mov r12,QWORD PTR[48+rsp] + + mov rbx,QWORD PTR[56+rsp] + + mov rbp,QWORD PTR[64+rsp] + + lea rsp,QWORD PTR[72+rsp] + +$L$SEH_epilogue_sqrx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_384:: +sqrx_mont_384 ENDP + +PUBLIC sqrx_n_mul_mont_384 + + +ALIGN 32 +sqrx_n_mul_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_n_mul_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-40))+rsp] + +$L$SEH_body_sqrx_n_mul_mont_384:: + + + mov r10,rdx + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,rsi + mov r12,QWORD PTR[24+rsi] + mov QWORD PTR[16+rsp],rdi + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + +$L$oop_sqrx_384:: + movd xmm1,r10d + lea rsi,QWORD PTR[((-128))+rbx] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,rdx + call __mulx_mont_384 + + movd r10d,xmm1 + dec r10d + jnz $L$oop_sqrx_384 + + mov r14,rdx +DB 102,72,15,126,210 + lea rsi,QWORD PTR[((-128))+rbx] + mov rbx,QWORD PTR[24+rsp] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_384 + + mov r15,QWORD PTR[40+rsp] + + mov r14,QWORD PTR[48+rsp] + + mov r13,QWORD PTR[56+rsp] + + mov r12,QWORD PTR[64+rsp] + + mov rbx,QWORD PTR[72+rsp] + + mov rbp,QWORD PTR[80+rsp] + + lea rsp,QWORD PTR[88+rsp] + +$L$SEH_epilogue_sqrx_n_mul_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_n_mul_mont_384:: +sqrx_n_mul_mont_384 ENDP + +PUBLIC sqrx_n_mul_mont_383 + + +ALIGN 32 +sqrx_n_mul_mont_383 PROC 
PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_n_mul_mont_383:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-40))+rsp] + +$L$SEH_body_sqrx_n_mul_mont_383:: + + + mov r10,rdx + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,rsi + mov r12,QWORD PTR[24+rsi] + mov QWORD PTR[16+rsp],rdi + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + lea rcx,QWORD PTR[((-128))+rcx] + +$L$oop_sqrx_383:: + movd xmm1,r10d + lea rsi,QWORD PTR[((-128))+rbx] + + mulx r9,r8,rdx + call __mulx_mont_383_nonred + + movd r10d,xmm1 + dec r10d + jnz $L$oop_sqrx_383 + + mov r14,rdx +DB 102,72,15,126,210 + lea rsi,QWORD PTR[((-128))+rbx] + mov rbx,QWORD PTR[24+rsp] + + mulx r9,r8,r14 + call __mulx_mont_384 + + mov r15,QWORD PTR[40+rsp] + + mov r14,QWORD PTR[48+rsp] + + mov r13,QWORD PTR[56+rsp] + + mov r12,QWORD PTR[64+rsp] + + mov rbx,QWORD PTR[72+rsp] + + mov rbp,QWORD PTR[80+rsp] + + lea rsp,QWORD PTR[88+rsp] + +$L$SEH_epilogue_sqrx_n_mul_mont_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_n_mul_mont_383:: +sqrx_n_mul_mont_383 ENDP + +ALIGN 32 +__mulx_mont_383_nonred PROC PRIVATE + DB 243,15,30,250 + + mulx r10,r14,r15 + mulx r11,r15,rax + add r9,r14 + mulx r12,rax,r12 + adc r10,r15 + mulx r13,rdi,rdi + adc r11,rax + mulx r14,rbp,rbp + mov rdx,QWORD PTR[8+rbx] + adc r12,rdi + adc r13,rbp + adc r14,0 + mov rax,r8 + imul r8,QWORD PTR[8+rsp] + + + xor r15,r15 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r9,rdi + adcx r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r8 + adox r14,rdi + adcx rbp,r15 + adox r15,rbp + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rax,rdi + adox r9,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r9,rdi + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[16+rbx] + adcx r13,rdi + adox r14,rbp + adcx r14,rax + adox r15,rax + adcx r15,rax + mov r8,r9 + imul r9,QWORD PTR[8+rsp] + + + xor rax,rax + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r9 + adox r15,rdi + adcx rbp,rax + adox rax,rbp + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r8,rdi + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD 
PTR[((24+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[24+rbx] + adcx r14,rdi + adox r15,rbp + adcx r15,r8 + adox rax,r8 + adcx rax,r8 + mov r9,r10 + imul r10,QWORD PTR[8+rsp] + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r10 + adox rax,rdi + adcx rbp,r8 + adox r8,rbp + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r9,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[32+rbx] + adcx r15,rdi + adox rax,rbp + adcx rax,r9 + adox r8,r9 + adcx r8,r9 + mov r10,r11 + imul r11,QWORD PTR[8+rsp] + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r11 + adox r8,rdi + adcx rbp,r9 + adox r9,rbp + + + xor r11,r11 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r10,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[40+rbx] + adcx rax,rdi + adox r8,rbp + adcx r8,r10 + adox r9,r10 + adcx r9,r10 + mov r11,r12 + imul r12,QWORD PTR[8+rsp] + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r8,rdi + adcx r9,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r12 + adox r9,rdi + adcx rbp,r10 + adox r10,rbp + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r11,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx rax,rdi + adox r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,r13 + adcx r8,rdi + adox r9,rbp + adcx r9,r11 + adox r10,r11 + adcx r10,r11 + imul rdx,QWORD PTR[8+rsp] + mov rbx,QWORD PTR[24+rsp] + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + 
adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx rax,rdi + adox r8,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r8,rdi + adox r9,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,r14 + adcx r9,rdi + adox r10,rbp + adc r10,0 + mov r12,r8 + + mov QWORD PTR[rbx],r14 + mov QWORD PTR[8+rbx],r15 + mov QWORD PTR[16+rbx],rax + mov rdi,r9 + mov QWORD PTR[24+rbx],r8 + mov QWORD PTR[32+rbx],r9 + mov QWORD PTR[40+rbx],r10 + mov rbp,r10 + + DB 0F3h,0C3h ;repret + +__mulx_mont_383_nonred ENDP +PUBLIC sqrx_mont_382x + + +ALIGN 32 +sqrx_mont_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqrx_mont_382x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rsi + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,r8 + add r8,QWORD PTR[48+rsi] + mov r15,r9 + adc r9,QWORD PTR[56+rsi] + mov rax,r10 + adc r10,QWORD PTR[64+rsi] + mov rdx,r11 + adc r11,QWORD PTR[72+rsi] + mov rbx,r12 + adc r12,QWORD PTR[80+rsi] + mov rbp,r13 + adc r13,QWORD PTR[88+rsi] + + sub r14,QWORD PTR[48+rsi] + sbb r15,QWORD PTR[56+rsi] + sbb rax,QWORD PTR[64+rsi] + sbb rdx,QWORD PTR[72+rsi] + sbb rbx,QWORD PTR[80+rsi] + sbb rbp,QWORD PTR[88+rsi] + sbb rdi,rdi + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + mov QWORD PTR[((32+48))+rsp],r14 + mov QWORD PTR[((32+56))+rsp],r15 + mov QWORD PTR[((32+64))+rsp],rax + mov QWORD PTR[((32+72))+rsp],rdx + mov QWORD PTR[((32+80))+rsp],rbx + mov QWORD PTR[((32+88))+rsp],rbp + mov QWORD PTR[((32+96))+rsp],rdi + + + + lea rbx,QWORD PTR[48+rsi] + + mov rdx,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_383_nonred + add rdx,rdx + adc r15,r15 + adc rax,rax + adc r12,r12 + adc rdi,rdi + adc rbp,rbp + + mov QWORD PTR[48+rbx],rdx + mov QWORD PTR[56+rbx],r15 + mov QWORD PTR[64+rbx],rax + mov QWORD PTR[72+rbx],r12 + mov QWORD PTR[80+rbx],rdi + mov QWORD PTR[88+rbx],rbp + + lea rsi,QWORD PTR[((32-128))+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rdx,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov rax,QWORD PTR[((32+16))+rsp] + mov r12,QWORD PTR[((32+24))+rsp] + mov rdi,QWORD PTR[((32+32))+rsp] + mov rbp,QWORD PTR[((32+40))+rsp] + + + + mulx r9,r8,r14 + call __mulx_mont_383_nonred + mov r14,QWORD PTR[((32+96))+rsp] + lea rcx,QWORD PTR[128+rcx] + mov r8,QWORD PTR[((32+0))+rsp] + and r8,r14 + mov r9,QWORD PTR[((32+8))+rsp] + and r9,r14 + mov r10,QWORD PTR[((32+16))+rsp] + and r10,r14 + mov r11,QWORD PTR[((32+24))+rsp] + and r11,r14 + mov r13,QWORD PTR[((32+32))+rsp] + and r13,r14 + and r14,QWORD PTR[((32+40))+rsp] + + sub rdx,r8 + mov r8,QWORD PTR[rcx] + sbb r15,r9 + mov r9,QWORD PTR[8+rcx] + sbb rax,r10 + mov 
r10,QWORD PTR[16+rcx] + sbb r12,r11 + mov r11,QWORD PTR[24+rcx] + sbb rdi,r13 + mov r13,QWORD PTR[32+rcx] + sbb rbp,r14 + sbb r14,r14 + + and r8,r14 + and r9,r14 + and r10,r14 + and r11,r14 + and r13,r14 + and r14,QWORD PTR[40+rcx] + + add rdx,r8 + adc r15,r9 + adc rax,r10 + adc r12,r11 + adc rdi,r13 + adc rbp,r14 + + mov QWORD PTR[rbx],rdx + mov QWORD PTR[8+rbx],r15 + mov QWORD PTR[16+rbx],rax + mov QWORD PTR[24+rbx],r12 + mov QWORD PTR[32+rbx],rdi + mov QWORD PTR[40+rbx],rbp + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqrx_mont_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_382x:: +sqrx_mont_382x ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mulx_mont_384x + DD imagerel $L$SEH_body_mulx_mont_384x + DD imagerel $L$SEH_info_mulx_mont_384x_prologue + + DD imagerel $L$SEH_body_mulx_mont_384x + DD imagerel $L$SEH_epilogue_mulx_mont_384x + DD imagerel $L$SEH_info_mulx_mont_384x_body + + DD imagerel $L$SEH_epilogue_mulx_mont_384x + DD imagerel $L$SEH_end_mulx_mont_384x + DD imagerel $L$SEH_info_mulx_mont_384x_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_384x + DD imagerel $L$SEH_body_sqrx_mont_384x + DD imagerel $L$SEH_info_sqrx_mont_384x_prologue + + DD imagerel $L$SEH_body_sqrx_mont_384x + DD imagerel $L$SEH_epilogue_sqrx_mont_384x + DD imagerel $L$SEH_info_sqrx_mont_384x_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_384x + DD imagerel $L$SEH_end_sqrx_mont_384x + DD imagerel $L$SEH_info_sqrx_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mulx_382x + DD imagerel $L$SEH_body_mulx_382x + DD imagerel $L$SEH_info_mulx_382x_prologue + + DD imagerel $L$SEH_body_mulx_382x + DD imagerel $L$SEH_epilogue_mulx_382x + DD imagerel $L$SEH_info_mulx_382x_body + + DD imagerel $L$SEH_epilogue_mulx_382x + DD imagerel $L$SEH_end_mulx_382x + DD imagerel $L$SEH_info_mulx_382x_epilogue + + DD imagerel $L$SEH_begin_sqrx_382x + DD imagerel $L$SEH_body_sqrx_382x + DD imagerel $L$SEH_info_sqrx_382x_prologue + + DD imagerel $L$SEH_body_sqrx_382x + DD imagerel $L$SEH_epilogue_sqrx_382x + DD imagerel $L$SEH_info_sqrx_382x_body + + DD imagerel $L$SEH_epilogue_sqrx_382x + DD imagerel $L$SEH_end_sqrx_382x + DD imagerel $L$SEH_info_sqrx_382x_epilogue + + DD imagerel $L$SEH_begin_mulx_384 + DD imagerel $L$SEH_body_mulx_384 + DD imagerel $L$SEH_info_mulx_384_prologue + + DD imagerel $L$SEH_body_mulx_384 + DD imagerel $L$SEH_epilogue_mulx_384 + DD imagerel $L$SEH_info_mulx_384_body + + DD imagerel $L$SEH_epilogue_mulx_384 + DD imagerel $L$SEH_end_mulx_384 + DD imagerel $L$SEH_info_mulx_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_384 + DD imagerel $L$SEH_body_sqrx_384 + DD imagerel $L$SEH_info_sqrx_384_prologue + + DD imagerel $L$SEH_body_sqrx_384 + DD imagerel $L$SEH_epilogue_sqrx_384 + DD imagerel $L$SEH_info_sqrx_384_body + + DD imagerel $L$SEH_epilogue_sqrx_384 + DD imagerel $L$SEH_end_sqrx_384 + DD imagerel $L$SEH_info_sqrx_384_epilogue + + DD imagerel $L$SEH_begin_redcx_mont_384 + DD imagerel $L$SEH_body_redcx_mont_384 + DD imagerel $L$SEH_info_redcx_mont_384_prologue + + DD imagerel $L$SEH_body_redcx_mont_384 + DD imagerel $L$SEH_epilogue_redcx_mont_384 + DD imagerel $L$SEH_info_redcx_mont_384_body + + DD imagerel $L$SEH_epilogue_redcx_mont_384 + DD imagerel $L$SEH_end_redcx_mont_384 + DD imagerel 
$L$SEH_info_redcx_mont_384_epilogue + + DD imagerel $L$SEH_begin_fromx_mont_384 + DD imagerel $L$SEH_body_fromx_mont_384 + DD imagerel $L$SEH_info_fromx_mont_384_prologue + + DD imagerel $L$SEH_body_fromx_mont_384 + DD imagerel $L$SEH_epilogue_fromx_mont_384 + DD imagerel $L$SEH_info_fromx_mont_384_body + + DD imagerel $L$SEH_epilogue_fromx_mont_384 + DD imagerel $L$SEH_end_fromx_mont_384 + DD imagerel $L$SEH_info_fromx_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0x_pty_mont_384 + DD imagerel $L$SEH_body_sgn0x_pty_mont_384 + DD imagerel $L$SEH_info_sgn0x_pty_mont_384_prologue + + DD imagerel $L$SEH_body_sgn0x_pty_mont_384 + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384 + DD imagerel $L$SEH_info_sgn0x_pty_mont_384_body + + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384 + DD imagerel $L$SEH_end_sgn0x_pty_mont_384 + DD imagerel $L$SEH_info_sgn0x_pty_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0x_pty_mont_384x + DD imagerel $L$SEH_body_sgn0x_pty_mont_384x + DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_prologue + + DD imagerel $L$SEH_body_sgn0x_pty_mont_384x + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384x + DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_body + + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384x + DD imagerel $L$SEH_end_sgn0x_pty_mont_384x + DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mulx_mont_384 + DD imagerel $L$SEH_body_mulx_mont_384 + DD imagerel $L$SEH_info_mulx_mont_384_prologue + + DD imagerel $L$SEH_body_mulx_mont_384 + DD imagerel $L$SEH_epilogue_mulx_mont_384 + DD imagerel $L$SEH_info_mulx_mont_384_body + + DD imagerel $L$SEH_epilogue_mulx_mont_384 + DD imagerel $L$SEH_end_mulx_mont_384 + DD imagerel $L$SEH_info_mulx_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_384 + DD imagerel $L$SEH_body_sqrx_mont_384 + DD imagerel $L$SEH_info_sqrx_mont_384_prologue + + DD imagerel $L$SEH_body_sqrx_mont_384 + DD imagerel $L$SEH_epilogue_sqrx_mont_384 + DD imagerel $L$SEH_info_sqrx_mont_384_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_384 + DD imagerel $L$SEH_end_sqrx_mont_384 + DD imagerel $L$SEH_info_sqrx_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_body_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_prologue + + DD imagerel $L$SEH_body_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_body + + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_end_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_body_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_prologue + + DD imagerel $L$SEH_body_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_body + + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_end_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_382x + DD imagerel $L$SEH_body_sqrx_mont_382x + DD imagerel $L$SEH_info_sqrx_mont_382x_prologue + + DD imagerel $L$SEH_body_sqrx_mont_382x + DD imagerel $L$SEH_epilogue_sqrx_mont_382x + DD imagerel $L$SEH_info_sqrx_mont_382x_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_382x + DD imagerel $L$SEH_end_sqrx_mont_382x + DD imagerel $L$SEH_info_sqrx_mont_382x_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mulx_mont_384x_prologue:: +DB 1,0,5,00bh +DB 
0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,029h,000h +DB 000h,0e4h,02ah,000h +DB 000h,0d4h,02bh,000h +DB 000h,0c4h,02ch,000h +DB 000h,034h,02dh,000h +DB 000h,054h,02eh,000h +DB 000h,074h,030h,000h +DB 000h,064h,031h,000h +DB 000h,001h,02fh,000h +$L$SEH_info_mulx_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqrx_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mulx_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_mulx_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_382x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqrx_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mulx_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,000h,000h +DB 000h,0e4h,001h,000h +DB 000h,0d4h,002h,000h +DB 000h,0c4h,003h,000h +DB 000h,034h,004h,000h +DB 000h,054h,005h,000h +DB 000h,074h,007h,000h +DB 000h,064h,008h,000h +DB 000h,052h +DB 000h,000h +$L$SEH_info_mulx_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqrx_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redcx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_redcx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_redcx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_fromx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_fromx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 
000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_fromx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0x_pty_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0x_pty_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sgn0x_pty_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0x_pty_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0x_pty_mont_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sgn0x_pty_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mulx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_mulx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_sqrx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_n_mul_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_n_mul_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,005h,000h +DB 000h,0e4h,006h,000h +DB 000h,0d4h,007h,000h +DB 000h,0c4h,008h,000h +DB 000h,034h,009h,000h +DB 000h,054h,00ah,000h +DB 000h,074h,00ch,000h +DB 000h,064h,00dh,000h +DB 000h,0a2h +DB 000h,000h +$L$SEH_info_sqrx_n_mul_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_n_mul_mont_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_n_mul_mont_383_body:: +DB 1,0,17,0 +DB 000h,0f4h,005h,000h +DB 000h,0e4h,006h,000h +DB 000h,0d4h,007h,000h +DB 000h,0c4h,008h,000h +DB 000h,034h,009h,000h +DB 000h,054h,00ah,000h +DB 000h,074h,00ch,000h +DB 000h,064h,00dh,000h +DB 000h,0a2h +DB 000h,000h +$L$SEH_info_sqrx_n_mul_mont_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_mont_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 
000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqrx_mont_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/sha256-armv8.asm b/temp-deps/blst-ts/deps/blst/build/win64/sha256-armv8.asm new file mode 100644 index 0000000..0e0c54c --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/sha256-armv8.asm @@ -0,0 +1,1078 @@ +// +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +// project. +// ==================================================================== +// +// sha256_block procedure for ARMv8. +// +// This module is stripped of scalar code paths, with raionale that all +// known processors are NEON-capable. +// +// See original module at CRYPTOGAMS for further details. + + AREA |.text|,CODE,ALIGN=8,ARM64 + + ALIGN 64 + +|$LK256| + DCDU 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + DCDU 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + DCDU 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + DCDU 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + DCDU 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + DCDU 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + DCDU 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + DCDU 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + DCDU 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + DCDU 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + DCDU 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + DCDU 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + DCDU 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + DCDU 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + DCDU 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + DCDU 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + DCDU 0 //terminator + + DCB "SHA256 block transform for ARMv8, CRYPTOGAMS by @dot-asm",0 + ALIGN 4 + ALIGN 4 + + EXPORT |blst_sha256_block_armv8|[FUNC] + ALIGN 64 +|blst_sha256_block_armv8| PROC +|$Lv8_entry| + stp x29,x30,[sp,#-16]! 
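+	// Apparent calling convention, inferred from the loads/stores below:
+	// x0 = 8x32-bit hash state (kept in v0/v1), x1 = 64-byte message blocks,
+	// x2 = block count, decremented once per |$Loop_hw| iteration.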
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,|$LK256| + +|$Loop_hw| + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0660a7 //sha256su1 
v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,|$Loop_hw| + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + ENDP + + EXPORT |blst_sha256_block_data_order|[FUNC] + ALIGN 16 +|blst_sha256_block_data_order| PROC + stp x29, x30, [sp, #-16]! + mov x29, sp + sub sp,sp,#16*4 + + adr x16,|$LK256| + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b |$L_00_48| + + ALIGN 16 +|$L_00_48| + ext8 v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext8 v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor 
v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext8 v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext8 v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext8 v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext8 v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr 
v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext8 v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext8 v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp 
w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne |$L_00_48| + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + cseleq x17,x17,xzr + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + 
add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + bne |$L_00_48| + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret + ENDP + + + EXPORT |blst_sha256_emit|[FUNC] + ALIGN 16 +|blst_sha256_emit| PROC + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] 
+ str w6,[x0,#16] + str w7,[x0,#24] + ret + ENDP + + + + EXPORT |blst_sha256_bcopy|[FUNC] + ALIGN 16 +|blst_sha256_bcopy| PROC +|$Loop_bcopy| + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,|$Loop_bcopy| + ret + ENDP + + + + EXPORT |blst_sha256_hcopy|[FUNC] + ALIGN 16 +|blst_sha256_hcopy| PROC + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret + ENDP + END diff --git a/temp-deps/blst-ts/deps/blst/build/win64/sha256-x86_64.asm b/temp-deps/blst-ts/deps/blst/build/win64/sha256-x86_64.asm new file mode 100644 index 0000000..d3b4092 --- /dev/null +++ b/temp-deps/blst-ts/deps/blst/build/win64/sha256-x86_64.asm @@ -0,0 +1,1570 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +ALIGN 64 + +K256:: + DD 0428a2f98h,071374491h,0b5c0fbcfh,0e9b5dba5h + DD 03956c25bh,059f111f1h,0923f82a4h,0ab1c5ed5h + DD 0d807aa98h,012835b01h,0243185beh,0550c7dc3h + DD 072be5d74h,080deb1feh,09bdc06a7h,0c19bf174h + DD 0e49b69c1h,0efbe4786h,00fc19dc6h,0240ca1cch + DD 02de92c6fh,04a7484aah,05cb0a9dch,076f988dah + DD 0983e5152h,0a831c66dh,0b00327c8h,0bf597fc7h + DD 0c6e00bf3h,0d5a79147h,006ca6351h,014292967h + DD 027b70a85h,02e1b2138h,04d2c6dfch,053380d13h + DD 0650a7354h,0766a0abbh,081c2c92eh,092722c85h + DD 0a2bfe8a1h,0a81a664bh,0c24b8b70h,0c76c51a3h + DD 0d192e819h,0d6990624h,0f40e3585h,0106aa070h + DD 019a4c116h,01e376c08h,02748774ch,034b0bcb5h + DD 0391c0cb3h,04ed8aa4ah,05b9cca4fh,0682e6ff3h + DD 0748f82eeh,078a5636fh,084c87814h,08cc70208h + DD 090befffah,0a4506cebh,0bef9a3f7h,0c67178f2h + + DD 000010203h,004050607h,008090a0bh,00c0d0e0fh + DD 003020100h,00b0a0908h,0ffffffffh,0ffffffffh + DD 0ffffffffh,0ffffffffh,003020100h,00b0a0908h +DB 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 +DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 +DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +DB 32,64,100,111,116,45,97,115,109,0 +PUBLIC blst_sha256_block_data_order_shaext + + +ALIGN 64 +blst_sha256_block_data_order_shaext PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_blst_sha256_block_data_order_shaext:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + sub rsp,058h + + movaps XMMWORD PTR[(-88)+r11],xmm6 + + movaps XMMWORD PTR[(-72)+r11],xmm7 + + movaps XMMWORD PTR[(-56)+r11],xmm8 + + movaps XMMWORD PTR[(-40)+r11],xmm9 + + movaps XMMWORD PTR[(-24)+r11],xmm10 + +$L$SEH_body_blst_sha256_block_data_order_shaext:: + + lea rcx,QWORD PTR[((K256+128))] + movdqu xmm1,XMMWORD PTR[rdi] + movdqu xmm2,XMMWORD PTR[16+rdi] + movdqa xmm7,XMMWORD PTR[((256-128))+rcx] + + pshufd xmm0,xmm1,01bh + pshufd xmm1,xmm1,0b1h + pshufd xmm2,xmm2,01bh + movdqa xmm8,xmm7 +DB 102,15,58,15,202,8 + punpcklqdq xmm2,xmm0 + jmp $L$oop_shaext + +ALIGN 16 +$L$oop_shaext:: + movdqu xmm3,XMMWORD PTR[rsi] + movdqu xmm4,XMMWORD PTR[16+rsi] + movdqu xmm5,XMMWORD PTR[32+rsi] +DB 102,15,56,0,223 + movdqu xmm6,XMMWORD PTR[48+rsi] + + movdqa xmm0,XMMWORD PTR[((0-128))+rcx] + paddd xmm0,xmm3 +DB 102,15,56,0,231 + movdqa xmm10,xmm2 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + nop + movdqa xmm9,xmm1 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD PTR[((16-128))+rcx] + paddd xmm0,xmm4 +DB 102,15,56,0,239 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + lea rsi,QWORD PTR[64+rsi] +DB 15,56,204,220 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD PTR[((32-128))+rcx] + paddd xmm0,xmm5 +DB 102,15,56,0,247 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 +DB 15,56,204,229 +DB 15,56,203,202 + + 
movdqa xmm0,XMMWORD PTR[((48-128))+rcx] + paddd xmm0,xmm6 +DB 15,56,205,222 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 +DB 15,56,204,238 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((64-128))+rcx] + paddd xmm0,xmm3 +DB 15,56,205,227 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 +DB 15,56,204,243 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((80-128))+rcx] + paddd xmm0,xmm4 +DB 15,56,205,236 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 + nop + paddd xmm6,xmm7 +DB 15,56,204,220 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((96-128))+rcx] + paddd xmm0,xmm5 +DB 15,56,205,245 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 +DB 15,56,204,229 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((112-128))+rcx] + paddd xmm0,xmm6 +DB 15,56,205,222 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 +DB 15,56,204,238 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((128-128))+rcx] + paddd xmm0,xmm3 +DB 15,56,205,227 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 +DB 15,56,204,243 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((144-128))+rcx] + paddd xmm0,xmm4 +DB 15,56,205,236 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 + nop + paddd xmm6,xmm7 +DB 15,56,204,220 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((160-128))+rcx] + paddd xmm0,xmm5 +DB 15,56,205,245 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 +DB 15,56,204,229 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((176-128))+rcx] + paddd xmm0,xmm6 +DB 15,56,205,222 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 +DB 15,56,204,238 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((192-128))+rcx] + paddd xmm0,xmm3 +DB 15,56,205,227 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 +DB 15,56,204,243 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((208-128))+rcx] + paddd xmm0,xmm4 +DB 15,56,205,236 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 +DB 15,56,203,202 + paddd xmm6,xmm7 + + movdqa xmm0,XMMWORD PTR[((224-128))+rcx] + paddd xmm0,xmm5 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh +DB 15,56,205,245 + movdqa xmm7,xmm8 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD PTR[((240-128))+rcx] + paddd xmm0,xmm6 + nop +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + dec rdx + nop +DB 15,56,203,202 + + paddd xmm2,xmm10 + paddd xmm1,xmm9 + jnz $L$oop_shaext + + pshufd xmm2,xmm2,0b1h + pshufd xmm7,xmm1,01bh + pshufd xmm1,xmm1,0b1h + punpckhqdq xmm1,xmm2 +DB 102,15,58,15,215,8 + + movdqu XMMWORD PTR[rdi],xmm1 + movdqu XMMWORD PTR[16+rdi],xmm2 + movaps xmm6,XMMWORD PTR[((-88))+r11] + movaps xmm7,XMMWORD PTR[((-72))+r11] + movaps xmm8,XMMWORD PTR[((-56))+r11] + movaps xmm9,XMMWORD PTR[((-40))+r11] + movaps xmm10,XMMWORD PTR[((-24))+r11] + mov rsp,r11 + +$L$SEH_epilogue_blst_sha256_block_data_order_shaext:: + mov rdi,QWORD PTR[8+r11] ;WIN64 epilogue + mov rsi,QWORD PTR[16+r11] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_blst_sha256_block_data_order_shaext:: +blst_sha256_block_data_order_shaext ENDP +PUBLIC blst_sha256_block_data_order + + +ALIGN 64 +blst_sha256_block_data_order PROC PUBLIC + DB 243,15,30,250 + mov QWORD 
PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_blst_sha256_block_data_order:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,104 + + lea rdx,QWORD PTR[rdx*4+rsi] + mov QWORD PTR[rsp],rdi + + mov QWORD PTR[16+rsp],rdx + movaps XMMWORD PTR[32+rsp],xmm6 + + movaps XMMWORD PTR[48+rsp],xmm7 + + movaps XMMWORD PTR[64+rsp],xmm8 + + movaps XMMWORD PTR[80+rsp],xmm9 + + mov rbp,rsp + +$L$SEH_body_blst_sha256_block_data_order:: + + + lea rsp,QWORD PTR[((-64))+rsp] + mov eax,DWORD PTR[rdi] + and rsp,-64 + mov ebx,DWORD PTR[4+rdi] + mov ecx,DWORD PTR[8+rdi] + mov edx,DWORD PTR[12+rdi] + mov r8d,DWORD PTR[16+rdi] + mov r9d,DWORD PTR[20+rdi] + mov r10d,DWORD PTR[24+rdi] + mov r11d,DWORD PTR[28+rdi] + + + jmp $L$loop_ssse3 +ALIGN 16 +$L$loop_ssse3:: + movdqa xmm7,XMMWORD PTR[((K256+256))] + mov QWORD PTR[8+rbp],rsi + movdqu xmm0,XMMWORD PTR[rsi] + movdqu xmm1,XMMWORD PTR[16+rsi] + movdqu xmm2,XMMWORD PTR[32+rsi] +DB 102,15,56,0,199 + movdqu xmm3,XMMWORD PTR[48+rsi] + lea rsi,QWORD PTR[K256] +DB 102,15,56,0,207 + movdqa xmm4,XMMWORD PTR[rsi] + movdqa xmm5,XMMWORD PTR[16+rsi] +DB 102,15,56,0,215 + paddd xmm4,xmm0 + movdqa xmm6,XMMWORD PTR[32+rsi] +DB 102,15,56,0,223 + movdqa xmm7,XMMWORD PTR[48+rsi] + paddd xmm5,xmm1 + paddd xmm6,xmm2 + paddd xmm7,xmm3 + movdqa XMMWORD PTR[rsp],xmm4 + mov r14d,eax + movdqa XMMWORD PTR[16+rsp],xmm5 + mov edi,ebx + movdqa XMMWORD PTR[32+rsp],xmm6 + xor edi,ecx + movdqa XMMWORD PTR[48+rsp],xmm7 + mov r13d,r8d + jmp $L$ssse3_00_47 + +ALIGN 16 +$L$ssse3_00_47:: + sub rsi,-64 + ror r13d,14 + movdqa xmm4,xmm1 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm3 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax +DB 102,15,58,15,224,4 + and r12d,r8d + xor r13d,r8d +DB 102,15,58,15,250,4 + add r11d,DWORD PTR[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm0,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm3,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d + pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + pslld xmm5,11 + add r10d,DWORD PTR[4+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm0,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD PTR[8+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + paddd xmm0,xmm7 + and edi,r15d + xor r14d,r10d + add r9d,r13d + pshufd xmm7,xmm0,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and 
r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[12+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD PTR[rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm0,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm0 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD PTR[rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm2 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm0 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d +DB 102,15,58,15,225,4 + and r12d,eax + xor r13d,eax +DB 102,15,58,15,251,4 + add edx,DWORD PTR[16+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor edi,r9d + paddd xmm1,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm0,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD PTR[20+rsp] + mov edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + paddd xmm1,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD PTR[24+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm1,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm1,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[28+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,ecx + add eax,r12d + movdqa xmm6,XMMWORD PTR[16+rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm1,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm1 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD PTR[16+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm3 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm1 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax +DB 102,15,58,15,226,4 + and r12d,r8d + xor r13d,r8d +DB 102,15,58,15,248,4 + add r11d,DWORD PTR[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm2,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm1,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d + pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + 
pslld xmm5,11 + add r10d,DWORD PTR[36+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm2,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD PTR[40+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + paddd xmm2,xmm7 + and edi,r15d + xor r14d,r10d + add r9d,r13d + pshufd xmm7,xmm2,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[44+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD PTR[32+rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm2,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm2 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD PTR[32+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm0 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm2 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d +DB 102,15,58,15,227,4 + and r12d,eax + xor r13d,eax +DB 102,15,58,15,249,4 + add edx,DWORD PTR[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor edi,r9d + paddd xmm3,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm2,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD PTR[52+rsp] + mov edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + paddd xmm3,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD PTR[56+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm3,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm3,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[60+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 
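+	; The xmm4-xmm7 shuffle/shift/xor operations interleaved with the scalar
+	; rounds above appear to compute the sigma0/sigma1 message-schedule update
+	; for the next group of W[t] words stored back to the stack.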
+ xor edi,ecx + add eax,r12d + movdqa xmm6,XMMWORD PTR[48+rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm3,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm3 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD PTR[48+rsp],xmm6 + cmp BYTE PTR[67+rsi],0 + jne $L$ssse3_00_47 + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD PTR[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD PTR[4+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + xor r13d,ecx + xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD PTR[8+rsp] + mov r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[12+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD PTR[16+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD PTR[20+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD PTR[24+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[28+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + 
xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD PTR[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD PTR[36+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + xor r13d,ecx + xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD PTR[40+rsp] + mov r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[44+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD PTR[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD PTR[52+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD PTR[56+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[60+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + mov rdi,QWORD PTR[rbp] + mov eax,r14d + mov rsi,QWORD PTR[8+rbp] + + add eax,DWORD PTR[rdi] + add ebx,DWORD PTR[4+rdi] + add ecx,DWORD PTR[8+rdi] + add edx,DWORD PTR[12+rdi] + add r8d,DWORD PTR[16+rdi] + add r9d,DWORD PTR[20+rdi] + add r10d,DWORD PTR[24+rdi] + add r11d,DWORD PTR[28+rdi] + + lea rsi,QWORD PTR[64+rsi] + cmp rsi,QWORD PTR[16+rbp] + + mov DWORD PTR[rdi],eax + mov DWORD PTR[4+rdi],ebx + mov DWORD PTR[8+rdi],ecx + mov DWORD 
PTR[12+rdi],edx + mov DWORD PTR[16+rdi],r8d + mov DWORD PTR[20+rdi],r9d + mov DWORD PTR[24+rdi],r10d + mov DWORD PTR[28+rdi],r11d + jb $L$loop_ssse3 + + xorps xmm0,xmm0 + lea r11,QWORD PTR[((104+48))+rbp] + + movaps XMMWORD PTR[rsp],xmm0 + movaps XMMWORD PTR[16+rsp],xmm0 + movaps XMMWORD PTR[32+rsp],xmm0 + movaps XMMWORD PTR[48+rsp],xmm0 + movaps xmm6,XMMWORD PTR[32+rbp] + movaps xmm7,XMMWORD PTR[48+rbp] + movaps xmm8,XMMWORD PTR[64+rbp] + movaps xmm9,XMMWORD PTR[80+rbp] + mov r15,QWORD PTR[104+rbp] + + mov r14,QWORD PTR[((-40))+r11] + + mov r13,QWORD PTR[((-32))+r11] + + mov r12,QWORD PTR[((-24))+r11] + + mov rbx,QWORD PTR[((-16))+r11] + + mov rbp,QWORD PTR[((-8))+r11] + +$L$SEH_epilogue_blst_sha256_block_data_order:: + mov rdi,QWORD PTR[8+r11] ;WIN64 epilogue + mov rsi,QWORD PTR[16+r11] + + lea rsp,QWORD PTR[r11] + DB 0F3h,0C3h ;repret + +$L$SEH_end_blst_sha256_block_data_order:: +blst_sha256_block_data_order ENDP +PUBLIC blst_sha256_emit + + +ALIGN 16 +blst_sha256_emit PROC PUBLIC + DB 243,15,30,250 + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + bswap r8 + mov r11,QWORD PTR[24+rdx] + bswap r9 + mov DWORD PTR[4+rcx],r8d + bswap r10 + mov DWORD PTR[12+rcx],r9d + bswap r11 + mov DWORD PTR[20+rcx],r10d + shr r8,32 + mov DWORD PTR[28+rcx],r11d + shr r9,32 + mov DWORD PTR[rcx],r8d + shr r10,32 + mov DWORD PTR[8+rcx],r9d + shr r11,32 + mov DWORD PTR[16+rcx],r10d + mov DWORD PTR[24+rcx],r11d + DB 0F3h,0C3h ;repret +blst_sha256_emit ENDP + +PUBLIC blst_sha256_bcopy + + +ALIGN 16 +blst_sha256_bcopy PROC PUBLIC + DB 243,15,30,250 + sub rcx,rdx +$L$oop_bcopy:: + movzx eax,BYTE PTR[rdx] + lea rdx,QWORD PTR[1+rdx] + mov BYTE PTR[((-1))+rdx*1+rcx],al + dec r8 + jnz $L$oop_bcopy + DB 0F3h,0C3h ;repret +blst_sha256_bcopy ENDP + +PUBLIC blst_sha256_hcopy + + +ALIGN 16 +blst_sha256_hcopy PROC PUBLIC + DB 243,15,30,250 + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + mov r11,QWORD PTR[24+rdx] + mov QWORD PTR[rcx],r8 + mov QWORD PTR[8+rcx],r9 + mov QWORD PTR[16+rcx],r10 + mov QWORD PTR[24+rcx],r11 + DB 0F3h,0C3h ;repret +blst_sha256_hcopy ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_body_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_prologue + + DD imagerel $L$SEH_body_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_body + + DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_end_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_epilogue + + DD imagerel $L$SEH_begin_blst_sha256_block_data_order + DD imagerel $L$SEH_body_blst_sha256_block_data_order + DD imagerel $L$SEH_info_blst_sha256_block_data_order_prologue + + DD imagerel $L$SEH_body_blst_sha256_block_data_order + DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order + DD imagerel $L$SEH_info_blst_sha256_block_data_order_body + + DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order + DD imagerel $L$SEH_end_blst_sha256_block_data_order + DD imagerel $L$SEH_info_blst_sha256_block_data_order_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_blst_sha256_block_data_order_shaext_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_blst_sha256_block_data_order_shaext_body:: +DB 
1,0,15,0 +DB 000h,068h,000h,000h +DB 000h,078h,001h,000h +DB 000h,088h,002h,000h +DB 000h,098h,003h,000h +DB 000h,0a8h,004h,000h +DB 000h,074h,00ch,000h +DB 000h,064h,00dh,000h +DB 000h,0a2h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_blst_sha256_block_data_order_shaext_epilogue:: +DB 1,0,5,11 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,003h +DB 000h,000h + +$L$SEH_info_blst_sha256_block_data_order_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_blst_sha256_block_data_order_body:: +DB 1,0,26,5 +DB 000h,068h,002h,000h +DB 000h,078h,003h,000h +DB 000h,088h,004h,000h +DB 000h,098h,005h,000h +DB 000h,0f4h,00dh,000h +DB 000h,0e4h,00eh,000h +DB 000h,0d4h,00fh,000h +DB 000h,0c4h,010h,000h +DB 000h,034h,011h,000h +DB 000h,074h,014h,000h +DB 000h,064h,015h,000h +DB 000h,003h +DB 000h,001h,012h,000h +DB 000h,050h +$L$SEH_info_blst_sha256_block_data_order_epilogue:: +DB 1,0,5,11 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,003h +DB 000h,000h + + +.xdata ENDS +END