diff --git a/internal/utils/min_max_neon_arm64.s b/internal/utils/min_max_neon_arm64.s index a31c5d2e..078971d1 100644 --- a/internal/utils/min_max_neon_arm64.s +++ b/internal/utils/min_max_neon_arm64.s @@ -1,9 +1,8 @@ //+build !noasm !appengine -// ARROW-15336 -// (C2GOASM doesn't work correctly for Arm64) -// Partly GENERATED BY asm2plan9s. - +// ARROW-15336: optimized NEON min/max for ARM64 +// 32-bit functions use .4s (128-bit Q registers, 4 lanes) processing 8 elements/iteration +// 64-bit functions use BIT/BIF instead of BSL+MOV to eliminate register saves // func _int32_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) TEXT ·_int32_max_min_neon(SB), $0-32 @@ -13,76 +12,74 @@ TEXT ·_int32_max_min_neon(SB), $0-32 MOVD minout+16(FP), R2 MOVD maxout+24(FP), R3 - // The Go ABI saves the frame pointer register one word below the - // caller's frame. Make room so we don't overwrite it. Needs to stay - // 16-byte aligned + // The Go ABI saves the frame pointer register one word below the + // caller's frame. Make room so we don't overwrite it. Needs to stay + // 16-byte aligned SUB $16, RSP - WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]! + WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]! WORD $0x7100043f // cmp w1, #1 WORD $0x910003fd // mov x29, sp - BLT LBB0_3 + BLT int32_early_exit - WORD $0x71000c3f // cmp w1, #3 + WORD $0x71001c3f // cmp w1, #7 WORD $0x2a0103e8 // mov w8, w1 - BHI LBB0_4 + BHI int32_neon WORD $0xaa1f03e9 // mov x9, xzr WORD $0x52b0000b // mov w11, #-2147483648 WORD $0x12b0000a // mov w10, #2147483647 - JMP LBB0_7 -LBB0_3: + JMP int32_scalar +int32_early_exit: WORD $0x12b0000a // mov w10, #2147483647 WORD $0x52b0000b // mov w11, #-2147483648 WORD $0xb900006b // str w11, [x3] WORD $0xb900004a // str w10, [x2] WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 - // Put the stack pointer back where it was + // Put the stack pointer back where it was ADD $16, RSP RET -LBB0_4: - WORD $0x927e7509 // and x9, x8, #0xfffffffc - WORD $0x9100200a // add x10, x0, #8 - WORD $0x0f046402 // movi v2.2s, #128, lsl #24 - WORD $0x2f046400 // mvni v0.2s, #128, lsl #24 - WORD $0x2f046401 // mvni v1.2s, #128, lsl #24 +int32_neon: + WORD $0x927d7109 // and x9, x8, #0xfffffff8 + WORD $0x9100400a // add x10, x0, #16 + WORD $0x4f046402 // movi v2.4s, #128, lsl #24 + WORD $0x6f046400 // mvni v0.4s, #128, lsl #24 + WORD $0x6f046401 // mvni v1.4s, #128, lsl #24 WORD $0xaa0903eb // mov x11, x9 - WORD $0x0f046403 // movi v3.2s, #128, lsl #24 -LBB0_5: - WORD $0x6d7f9544 // ldp d4, d5, [x10, #-8] - WORD $0xf100116b // subs x11, x11, #4 - WORD $0x9100414a // add x10, x10, #16 - WORD $0x0ea46c00 // smin v0.2s, v0.2s, v4.2s - WORD $0x0ea56c21 // smin v1.2s, v1.2s, v5.2s - WORD $0x0ea46442 // smax v2.2s, v2.2s, v4.2s - WORD $0x0ea56463 // smax v3.2s, v3.2s, v5.2s - BNE LBB0_5 + WORD $0x4f046403 // movi v3.4s, #128, lsl #24 +int32_loop: + WORD $0xad7f9544 // ldp q4, q5, [x10, #-16] + WORD $0xf100216b // subs x11, x11, #8 + WORD $0x9100814a // add x10, x10, #32 + WORD $0x4ea46c00 // smin v0.4s, v0.4s, v4.4s + WORD $0x4ea56c21 // smin v1.4s, v1.4s, v5.4s + WORD $0x4ea46442 // smax v2.4s, v2.4s, v4.4s + WORD $0x4ea56463 // smax v3.4s, v3.4s, v5.4s + BNE int32_loop - WORD $0x0ea36442 // smax v2.2s, v2.2s, v3.2s - WORD $0x0ea16c00 // smin v0.2s, v0.2s, v1.2s - WORD $0x0e0c0441 // dup v1.2s, v2.s[1] - WORD $0x0e0c0403 // dup v3.2s, v0.s[1] - WORD $0x0ea16441 // smax v1.2s, v2.2s, v1.2s - WORD $0x0ea36c00 // smin v0.2s, v0.2s, v3.2s + WORD $0x4ea36442 // smax v2.4s, v2.4s, v3.4s + WORD $0x4ea16c00 // smin v0.4s, v0.4s, v1.4s + WORD $0x4eb0a842 // smaxv s2, v2.4s + WORD $0x4eb1a800 // sminv s0, v0.4s WORD $0xeb08013f // cmp x9, x8 - WORD $0x1e26002b // fmov w11, s1 - WORD $0x1e26000a // fmov w10, s0 - BEQ LBB0_9 -LBB0_7: + WORD $0x1e26004b // fmov w11, s2 + WORD $0x1e26000a // fmov w10, s0 + BEQ int32_done +int32_scalar: WORD $0x8b09080c // add x12, x0, x9, lsl #2 WORD $0xcb090108 // sub x8, x8, x9 -LBB0_8: +int32_scalar_loop: WORD $0xb8404589 // ldr w9, [x12], #4 WORD $0x6b09015f // cmp w10, w9 - WORD $0x1a89b14a // csel w10, w10, w9, lt + WORD $0x1a89b14a // csel w10, w10, w9, lt WORD $0x6b09017f // cmp w11, w9 - WORD $0x1a89c16b // csel w11, w11, w9, gt - WORD $0xf1000508 // subs x8, x8, #1 - BNE LBB0_8 -LBB0_9: + WORD $0x1a89c16b // csel w11, w11, w9, gt + WORD $0xf1000508 // subs x8, x8, #1 + BNE int32_scalar_loop +int32_done: WORD $0xb900006b // str w11, [x3] WORD $0xb900004a // str w10, [x2] WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 - // Put the stack pointer back where it was + // Put the stack pointer back where it was ADD $16, RSP RET @@ -93,115 +90,113 @@ TEXT ·_uint32_max_min_neon(SB), $0-32 MOVD length+8(FP), R1 MOVD minout+16(FP), R2 MOVD maxout+24(FP), R3 - - // The Go ABI saves the frame pointer register one word below the - // caller's frame. Make room so we don't overwrite it. Needs to stay - // 16-byte aligned + + // The Go ABI saves the frame pointer register one word below the + // caller's frame. Make room so we don't overwrite it. Needs to stay + // 16-byte aligned SUB $16, RSP - WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]! + WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]! WORD $0x7100043f // cmp w1, #1 WORD $0x910003fd // mov x29, sp - BLT LBB1_3 + BLT uint32_early_exit - WORD $0x71000c3f // cmp w1, #3 + WORD $0x71001c3f // cmp w1, #7 WORD $0x2a0103e8 // mov w8, w1 - BHI LBB1_4 + BHI uint32_neon WORD $0xaa1f03e9 // mov x9, xzr WORD $0x2a1f03ea // mov w10, wzr WORD $0x1280000b // mov w11, #-1 - JMP LBB1_7 -LBB1_3: + JMP uint32_scalar +uint32_early_exit: WORD $0x2a1f03ea // mov w10, wzr WORD $0x1280000b // mov w11, #-1 WORD $0xb900006a // str w10, [x3] WORD $0xb900004b // str w11, [x2] WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 - // Put the stack pointer back where it was + // Put the stack pointer back where it was ADD $16, RSP RET -LBB1_4: - WORD $0x927e7509 // and x9, x8, #0xfffffffc - WORD $0x6f00e401 // movi v1.2d, #0000000000000000 - WORD $0x6f07e7e0 // movi v0.2d, #0xffffffffffffffff - WORD $0x9100200a // add x10, x0, #8 - WORD $0x6f07e7e2 // movi v2.2d, #0xffffffffffffffff +uint32_neon: + WORD $0x927d7109 // and x9, x8, #0xfffffff8 + WORD $0x6f00e401 // movi v1.2d, #0000000000000000 + WORD $0x6f07e7e0 // movi v0.2d, #0xffffffffffffffff + WORD $0x9100400a // add x10, x0, #16 + WORD $0x6f07e7e2 // movi v2.2d, #0xffffffffffffffff WORD $0xaa0903eb // mov x11, x9 - WORD $0x6f00e403 // movi v3.2d, #0000000000000000 -LBB1_5: - WORD $0x6d7f9544 // ldp d4, d5, [x10, #-8] - WORD $0xf100116b // subs x11, x11, #4 - WORD $0x9100414a // add x10, x10, #16 - WORD $0x2ea46c00 // umin v0.2s, v0.2s, v4.2s - WORD $0x2ea56c42 // umin v2.2s, v2.2s, v5.2s - WORD $0x2ea46421 // umax v1.2s, v1.2s, v4.2s - WORD $0x2ea56463 // umax v3.2s, v3.2s, v5.2s - BNE LBB1_5 + WORD $0x6f00e403 // movi v3.2d, #0000000000000000 +uint32_loop: + WORD $0xad7f9544 // ldp q4, q5, [x10, #-16] + WORD $0xf100216b // subs x11, x11, #8 + WORD $0x9100814a // add x10, x10, #32 + WORD $0x6ea46c00 // umin v0.4s, v0.4s, v4.4s + WORD $0x6ea56c42 // umin v2.4s, v2.4s, v5.4s + WORD $0x6ea46421 // umax v1.4s, v1.4s, v4.4s + WORD $0x6ea56463 // umax v3.4s, v3.4s, v5.4s + BNE uint32_loop - WORD $0x2ea36421 // umax v1.2s, v1.2s, v3.2s - WORD $0x2ea26c00 // umin v0.2s, v0.2s, v2.2s - WORD $0x0e0c0422 // dup v2.2s, v1.s[1] - WORD $0x0e0c0403 // dup v3.2s, v0.s[1] - WORD $0x2ea26421 // umax v1.2s, v1.2s, v2.2s - WORD $0x2ea36c00 // umin v0.2s, v0.2s, v3.2s + WORD $0x6ea36421 // umax v1.4s, v1.4s, v3.4s + WORD $0x6ea26c00 // umin v0.4s, v0.4s, v2.4s + WORD $0x6eb0a821 // umaxv s1, v1.4s + WORD $0x6eb1a800 // uminv s0, v0.4s WORD $0xeb08013f // cmp x9, x8 - WORD $0x1e26002a // fmov w10, s1 - WORD $0x1e26000b // fmov w11, s0 - BEQ LBB1_9 -LBB1_7: + WORD $0x1e26002a // fmov w10, s1 + WORD $0x1e26000b // fmov w11, s0 + BEQ uint32_done +uint32_scalar: WORD $0x8b09080c // add x12, x0, x9, lsl #2 WORD $0xcb090108 // sub x8, x8, x9 -LBB1_8: +uint32_scalar_loop: WORD $0xb8404589 // ldr w9, [x12], #4 WORD $0x6b09017f // cmp w11, w9 - WORD $0x1a89316b // csel w11, w11, w9, lo + WORD $0x1a89316b // csel w11, w11, w9, lo WORD $0x6b09015f // cmp w10, w9 - WORD $0x1a89814a // csel w10, w10, w9, hi - WORD $0xf1000508 // subs x8, x8, #1 - BNE LBB1_8 -LBB1_9: + WORD $0x1a89814a // csel w10, w10, w9, hi + WORD $0xf1000508 // subs x8, x8, #1 + BNE uint32_scalar_loop +uint32_done: WORD $0xb900006a // str w10, [x3] WORD $0xb900004b // str w11, [x2] WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 - // Put the stack pointer back where it was + // Put the stack pointer back where it was ADD $16, RSP RET // func _int64_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) TEXT ·_int64_max_min_neon(SB), $0-32 - MOVD values+0(FP), R0 - MOVD length+8(FP), R1 - MOVD minout+16(FP), R2 - MOVD maxout+24(FP), R3 + MOVD values+0(FP), R0 + MOVD length+8(FP), R1 + MOVD minout+16(FP), R2 + MOVD maxout+24(FP), R3 - // The Go ABI saves the frame pointer register one word below the - // caller's frame. Make room so we don't overwrite it. Needs to stay - // 16-byte aligned + // The Go ABI saves the frame pointer register one word below the + // caller's frame. Make room so we don't overwrite it. Needs to stay + // 16-byte aligned SUB $16, RSP WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]! WORD $0x7100043f // cmp w1, #1 WORD $0x910003fd // mov x29, sp - BLT LBB2_3 + BLT int64_early_exit WORD $0x2a0103e8 // mov w8, w1 WORD $0xd2f0000b // mov x11, #-9223372036854775808 WORD $0x71000c3f // cmp w1, #3 WORD $0x92f0000a // mov x10, #9223372036854775807 - BHI LBB2_4 + BHI int64_neon WORD $0xaa1f03e9 // mov x9, xzr - JMP LBB2_7 -LBB2_3: + JMP int64_scalar +int64_early_exit: WORD $0x92f0000a // mov x10, #9223372036854775807 WORD $0xd2f0000b // mov x11, #-9223372036854775808 WORD $0xf900006b // str x11, [x3] WORD $0xf900004a // str x10, [x2] WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 - // Put the stack pointer back where it was + // Put the stack pointer back where it was ADD $16, RSP RET -LBB2_4: +int64_neon: WORD $0x927e7509 // and x9, x8, #0xfffffffc WORD $0x4e080d61 // dup v1.2d, x11 WORD $0x4e080d40 // dup v0.2d, x10 @@ -209,54 +204,50 @@ LBB2_4: WORD $0xaa0903eb // mov x11, x9 WORD $0x4ea01c02 // mov v2.16b, v0.16b WORD $0x4ea11c23 // mov v3.16b, v1.16b -LBB2_5: +int64_loop: WORD $0xad7f9544 // ldp q4, q5, [x10, #-16] - WORD $0x4ea31c66 // mov v6.16b, v3.16b - WORD $0x4ea11c27 // mov v7.16b, v1.16b - WORD $0x4ea21c43 // mov v3.16b, v2.16b - WORD $0x4ea01c01 // mov v1.16b, v0.16b - WORD $0x4ee03480 // cmgt v0.2d, v4.2d, v0.2d - WORD $0x4ee234a2 // cmgt v2.2d, v5.2d, v2.2d - WORD $0x6e641c20 // bsl v0.16b, v1.16b, v4.16b - WORD $0x4ee434e1 // cmgt v1.2d, v7.2d, v4.2d - WORD $0x6e651c62 // bsl v2.16b, v3.16b, v5.16b - WORD $0x4ee534c3 // cmgt v3.2d, v6.2d, v5.2d - WORD $0xf100116b // subs x11, x11, #4 - WORD $0x6e641ce1 // bsl v1.16b, v7.16b, v4.16b - WORD $0x6e651cc3 // bsl v3.16b, v6.16b, v5.16b + WORD $0x4ee03486 // cmgt v6.2d, v4.2d, v0.2d + WORD $0x4ee234a7 // cmgt v7.2d, v5.2d, v2.2d + WORD $0x4ee13490 // cmgt v16.2d, v4.2d, v1.2d + WORD $0x4ee334b1 // cmgt v17.2d, v5.2d, v3.2d + WORD $0x6ee61c80 // bif v0.16b, v4.16b, v6.16b + WORD $0x6ee71ca2 // bif v2.16b, v5.16b, v7.16b + WORD $0x6eb01c81 // bit v1.16b, v4.16b, v16.16b + WORD $0x6eb11ca3 // bit v3.16b, v5.16b, v17.16b + WORD $0xf100116b // subs x11, x11, #4 WORD $0x9100814a // add x10, x10, #32 - BNE LBB2_5 + BNE int64_loop - WORD $0x4ee33424 // cmgt v4.2d, v1.2d, v3.2d - WORD $0x4ee03445 // cmgt v5.2d, v2.2d, v0.2d + WORD $0x4ee33424 // cmgt v4.2d, v1.2d, v3.2d + WORD $0x4ee03445 // cmgt v5.2d, v2.2d, v0.2d WORD $0x6e631c24 // bsl v4.16b, v1.16b, v3.16b WORD $0x6e621c05 // bsl v5.16b, v0.16b, v2.16b WORD $0x4e180480 // dup v0.2d, v4.d[1] WORD $0x4e1804a1 // dup v1.2d, v5.d[1] - WORD $0x4ee03482 // cmgt v2.2d, v4.2d, v0.2d - WORD $0x4ee53423 // cmgt v3.2d, v1.2d, v5.2d + WORD $0x4ee03482 // cmgt v2.2d, v4.2d, v0.2d + WORD $0x4ee53423 // cmgt v3.2d, v1.2d, v5.2d WORD $0x6e601c82 // bsl v2.16b, v4.16b, v0.16b WORD $0x6e611ca3 // bsl v3.16b, v5.16b, v1.16b WORD $0xeb08013f // cmp x9, x8 - WORD $0x9e66004b // fmov x11, d2 - WORD $0x9e66006a // fmov x10, d3 - BEQ LBB2_9 -LBB2_7: + WORD $0x9e66004b // fmov x11, d2 + WORD $0x9e66006a // fmov x10, d3 + BEQ int64_done +int64_scalar: WORD $0x8b090c0c // add x12, x0, x9, lsl #3 WORD $0xcb090108 // sub x8, x8, x9 -LBB2_8: +int64_scalar_loop: WORD $0xf8408589 // ldr x9, [x12], #8 WORD $0xeb09015f // cmp x10, x9 - WORD $0x9a89b14a // csel x10, x10, x9, lt + WORD $0x9a89b14a // csel x10, x10, x9, lt WORD $0xeb09017f // cmp x11, x9 - WORD $0x9a89c16b // csel x11, x11, x9, gt - WORD $0xf1000508 // subs x8, x8, #1 - BNE LBB2_8 -LBB2_9: + WORD $0x9a89c16b // csel x11, x11, x9, gt + WORD $0xf1000508 // subs x8, x8, #1 + BNE int64_scalar_loop +int64_done: WORD $0xf900006b // str x11, [x3] WORD $0xf900004a // str x10, [x2] WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 - // Put the stack pointer back where it was + // Put the stack pointer back where it was ADD $16, RSP RET @@ -264,93 +255,88 @@ LBB2_9: // func _uint64_max_min_neon(values unsafe.Pointer, length int, minout, maxout unsafe.Pointer) TEXT ·_uint64_max_min_neon(SB), $0-32 - MOVD values+0(FP), R0 - MOVD length+8(FP), R1 - MOVD minout+16(FP), R2 - MOVD maxout+24(FP), R3 + MOVD values+0(FP), R0 + MOVD length+8(FP), R1 + MOVD minout+16(FP), R2 + MOVD maxout+24(FP), R3 - // The Go ABI saves the frame pointer register one word below the - // caller's frame. Make room so we don't overwrite it. Needs to stay - // 16-byte aligned + // The Go ABI saves the frame pointer register one word below the + // caller's frame. Make room so we don't overwrite it. Needs to stay + // 16-byte aligned SUB $16, RSP WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]! WORD $0x7100043f // cmp w1, #1 WORD $0x910003fd // mov x29, sp - BLT LBB3_3 + BLT uint64_early_exit WORD $0x71000c3f // cmp w1, #3 WORD $0x2a0103e8 // mov w8, w1 - BHI LBB3_4 + BHI uint64_neon WORD $0xaa1f03e9 // mov x9, xzr WORD $0xaa1f03ea // mov x10, xzr WORD $0x9280000b // mov x11, #-1 - JMP LBB3_7 -LBB3_3: + JMP uint64_scalar +uint64_early_exit: WORD $0xaa1f03ea // mov x10, xzr WORD $0x9280000b // mov x11, #-1 WORD $0xf900006a // str x10, [x3] WORD $0xf900004b // str x11, [x2] WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 - // Put the stack pointer back where it was + // Put the stack pointer back where it was ADD $16, RSP RET -LBB3_4: +uint64_neon: WORD $0x927e7509 // and x9, x8, #0xfffffffc WORD $0x9100400a // add x10, x0, #16 - WORD $0x6f00e401 // movi v1.2d, #0000000000000000 - WORD $0x6f07e7e0 // movi v0.2d, #0xffffffffffffffff - WORD $0x6f07e7e2 // movi v2.2d, #0xffffffffffffffff + WORD $0x6f00e401 // movi v1.2d, #0000000000000000 + WORD $0x6f07e7e0 // movi v0.2d, #0xffffffffffffffff + WORD $0x6f07e7e2 // movi v2.2d, #0xffffffffffffffff WORD $0xaa0903eb // mov x11, x9 - WORD $0x6f00e403 // movi v3.2d, #0000000000000000 -LBB3_5: + WORD $0x6f00e403 // movi v3.2d, #0000000000000000 +uint64_loop: WORD $0xad7f9544 // ldp q4, q5, [x10, #-16] - WORD $0x4ea31c66 // mov v6.16b, v3.16b - WORD $0x4ea11c27 // mov v7.16b, v1.16b - WORD $0x4ea21c43 // mov v3.16b, v2.16b - WORD $0x4ea01c01 // mov v1.16b, v0.16b - WORD $0x6ee03480 // cmhi v0.2d, v4.2d, v0.2d - WORD $0x6ee234a2 // cmhi v2.2d, v5.2d, v2.2d - WORD $0x6e641c20 // bsl v0.16b, v1.16b, v4.16b - WORD $0x6ee434e1 // cmhi v1.2d, v7.2d, v4.2d - WORD $0x6e651c62 // bsl v2.16b, v3.16b, v5.16b - WORD $0x6ee534c3 // cmhi v3.2d, v6.2d, v5.2d - WORD $0xf100116b // subs x11, x11, #4 - WORD $0x6e641ce1 // bsl v1.16b, v7.16b, v4.16b - WORD $0x6e651cc3 // bsl v3.16b, v6.16b, v5.16b + WORD $0x6ee03486 // cmhi v6.2d, v4.2d, v0.2d + WORD $0x6ee234a7 // cmhi v7.2d, v5.2d, v2.2d + WORD $0x6ee13490 // cmhi v16.2d, v4.2d, v1.2d + WORD $0x6ee334b1 // cmhi v17.2d, v5.2d, v3.2d + WORD $0x6ee61c80 // bif v0.16b, v4.16b, v6.16b + WORD $0x6ee71ca2 // bif v2.16b, v5.16b, v7.16b + WORD $0x6eb01c81 // bit v1.16b, v4.16b, v16.16b + WORD $0x6eb11ca3 // bit v3.16b, v5.16b, v17.16b + WORD $0xf100116b // subs x11, x11, #4 WORD $0x9100814a // add x10, x10, #32 - BNE LBB3_5 + BNE uint64_loop - WORD $0x6ee33424 // cmhi v4.2d, v1.2d, v3.2d - WORD $0x6ee03445 // cmhi v5.2d, v2.2d, v0.2d + WORD $0x6ee33424 // cmhi v4.2d, v1.2d, v3.2d + WORD $0x6ee03445 // cmhi v5.2d, v2.2d, v0.2d WORD $0x6e631c24 // bsl v4.16b, v1.16b, v3.16b WORD $0x6e621c05 // bsl v5.16b, v0.16b, v2.16b WORD $0x4e180480 // dup v0.2d, v4.d[1] WORD $0x4e1804a1 // dup v1.2d, v5.d[1] - WORD $0x6ee03482 // cmhi v2.2d, v4.2d, v0.2d - WORD $0x6ee53423 // cmhi v3.2d, v1.2d, v5.2d + WORD $0x6ee03482 // cmhi v2.2d, v4.2d, v0.2d + WORD $0x6ee53423 // cmhi v3.2d, v1.2d, v5.2d WORD $0x6e601c82 // bsl v2.16b, v4.16b, v0.16b WORD $0x6e611ca3 // bsl v3.16b, v5.16b, v1.16b WORD $0xeb08013f // cmp x9, x8 - WORD $0x9e66004a // fmov x10, d2 - WORD $0x9e66006b // fmov x11, d3 - BEQ LBB3_9 -LBB3_7: + WORD $0x9e66004a // fmov x10, d2 + WORD $0x9e66006b // fmov x11, d3 + BEQ uint64_done +uint64_scalar: WORD $0x8b090c0c // add x12, x0, x9, lsl #3 WORD $0xcb090108 // sub x8, x8, x9 -LBB3_8: +uint64_scalar_loop: WORD $0xf8408589 // ldr x9, [x12], #8 WORD $0xeb09017f // cmp x11, x9 - WORD $0x9a89316b // csel x11, x11, x9, lo + WORD $0x9a89316b // csel x11, x11, x9, lo WORD $0xeb09015f // cmp x10, x9 - WORD $0x9a89814a // csel x10, x10, x9, hi - WORD $0xf1000508 // subs x8, x8, #1 - BNE LBB3_8 -LBB3_9: + WORD $0x9a89814a // csel x10, x10, x9, hi + WORD $0xf1000508 // subs x8, x8, #1 + BNE uint64_scalar_loop +uint64_done: WORD $0xf900006a // str x10, [x3] WORD $0xf900004b // str x11, [x2] WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 - // Put the stack pointer back where it was + // Put the stack pointer back where it was ADD $16, RSP RET - diff --git a/internal/utils/min_max_test.go b/internal/utils/min_max_test.go new file mode 100644 index 00000000..cc4d528a --- /dev/null +++ b/internal/utils/min_max_test.go @@ -0,0 +1,184 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +import ( + "fmt" + "math" + "math/rand" + "testing" +) + +func TestMinMaxInt32(t *testing.T) { + for _, size := range []int{0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 31, 63, 64, 100, 1024} { + t.Run(fmt.Sprintf("n=%d", size), func(t *testing.T) { + if size == 0 { + // skip empty — both impls may differ on sentinel values + return + } + values := make([]int32, size) + for i := range values { + values[i] = rand.Int31() - math.MaxInt32/2 + } + values[rand.Intn(size)] = math.MinInt32 + values[rand.Intn(size)] = math.MaxInt32 + + goMin, goMax := int32MinMax(values) + min, max := GetMinMaxInt32(values) + if min != goMin || max != goMax { + t.Errorf("n=%d: got min=%d max=%d, want min=%d max=%d", size, min, max, goMin, goMax) + } + }) + } +} + +func TestMinMaxUint32(t *testing.T) { + for _, size := range []int{0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 31, 63, 64, 100, 1024} { + t.Run(fmt.Sprintf("n=%d", size), func(t *testing.T) { + if size == 0 { + return + } + values := make([]uint32, size) + for i := range values { + values[i] = rand.Uint32() + } + values[rand.Intn(size)] = 0 + values[rand.Intn(size)] = math.MaxUint32 + + goMin, goMax := uint32MinMax(values) + min, max := GetMinMaxUint32(values) + if min != goMin || max != goMax { + t.Errorf("n=%d: got min=%d max=%d, want min=%d max=%d", size, min, max, goMin, goMax) + } + }) + } +} + +func TestMinMaxInt64(t *testing.T) { + for _, size := range []int{0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 31, 63, 64, 100, 1024} { + t.Run(fmt.Sprintf("n=%d", size), func(t *testing.T) { + if size == 0 { + return + } + values := make([]int64, size) + for i := range values { + values[i] = rand.Int63() - math.MaxInt64/2 + } + values[rand.Intn(size)] = math.MinInt64 + values[rand.Intn(size)] = math.MaxInt64 + + goMin, goMax := int64MinMax(values) + min, max := GetMinMaxInt64(values) + if min != goMin || max != goMax { + t.Errorf("n=%d: got min=%d max=%d, want min=%d max=%d", size, min, max, goMin, goMax) + } + }) + } +} + +func TestMinMaxUint64(t *testing.T) { + for _, size := range []int{0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 31, 63, 64, 100, 1024} { + t.Run(fmt.Sprintf("n=%d", size), func(t *testing.T) { + if size == 0 { + return + } + values := make([]uint64, size) + for i := range values { + values[i] = rand.Uint64() + } + values[rand.Intn(size)] = 0 + values[rand.Intn(size)] = math.MaxUint64 + + goMin, goMax := uint64MinMax(values) + min, max := GetMinMaxUint64(values) + if min != goMin || max != goMax { + t.Errorf("n=%d: got min=%d max=%d, want min=%d max=%d", size, min, max, goMin, goMax) + } + }) + } +} + +var ( + benchMinI32 int32 + benchMaxI32 int32 + benchMinU32 uint32 + benchMaxU32 uint32 + benchMinI64 int64 + benchMaxI64 int64 + benchMinU64 uint64 + benchMaxU64 uint64 +) + +func BenchmarkMinMaxInt32(b *testing.B) { + for _, size := range []int{64, 256, 1024, 8192, 65536} { + values := make([]int32, size) + for i := range values { + values[i] = rand.Int31() - math.MaxInt32/2 + } + b.Run(fmt.Sprintf("n=%d", size), func(b *testing.B) { + b.SetBytes(int64(size) * 4) + for i := 0; i < b.N; i++ { + benchMinI32, benchMaxI32 = GetMinMaxInt32(values) + } + }) + } +} + +func BenchmarkMinMaxUint32(b *testing.B) { + for _, size := range []int{64, 256, 1024, 8192, 65536} { + values := make([]uint32, size) + for i := range values { + values[i] = rand.Uint32() + } + b.Run(fmt.Sprintf("n=%d", size), func(b *testing.B) { + b.SetBytes(int64(size) * 4) + for i := 0; i < b.N; i++ { + benchMinU32, benchMaxU32 = GetMinMaxUint32(values) + } + }) + } +} + +func BenchmarkMinMaxInt64(b *testing.B) { + for _, size := range []int{64, 256, 1024, 8192, 65536} { + values := make([]int64, size) + for i := range values { + values[i] = rand.Int63() - math.MaxInt64/2 + } + b.Run(fmt.Sprintf("n=%d", size), func(b *testing.B) { + b.SetBytes(int64(size) * 8) + for i := 0; i < b.N; i++ { + benchMinI64, benchMaxI64 = GetMinMaxInt64(values) + } + }) + } +} + +func BenchmarkMinMaxUint64(b *testing.B) { + for _, size := range []int{64, 256, 1024, 8192, 65536} { + values := make([]uint64, size) + for i := range values { + values[i] = rand.Uint64() + } + b.Run(fmt.Sprintf("n=%d", size), func(b *testing.B) { + b.SetBytes(int64(size) * 8) + for i := 0; i < b.N; i++ { + benchMinU64, benchMaxU64 = GetMinMaxUint64(values) + } + }) + } +}