Skip to content

Commit

Permalink
Armv8A Rename Regs for Clang Compile: FP64 Part
Browse files Browse the repository at this point in the history
- x7, x8: Used to store the addresses of Alpha and Beta.
  As Alpha & Beta are not used in the k-loops, use x0 and x1 to load
  Alpha & Beta's addresses after the k-loops are completed, since A & B's
  addresses are no longer needed there.
  This "ldr [addr]; -> ldr val, [addr]" sequence should not cause much of
  a performance penalty, since it is done outside the k-loops and there are
  plenty of instructions between the loading of Alpha & Beta and their use.
- x9: Used to store cs_c. x9 is multiplied by 8 into x10 and is not used
  afterwards. Loading cs_c directly into x10 and scaling it by 8 in place
  frees x9 in a straightforward way.
- x11, x12: Not used at all. Simply remove from clobber list.
- x13: Like x9, it is loaded and scaled by 8 into x14, except that x13 is
  also used in a conditional branch, so "cmp x13, #1" needs to be
  changed to "cmp x14, #8" to completely free x13.
- x3, x4: Used to store next_a & next_b. Untouched in the k-loops. Load
  these addresses into x0 and x1 after Alpha & Beta are both loaded,
  since by then neither the addresses of A/B nor those of Alpha/Beta are needed.
  • Loading branch information
xrq-phys committed May 29, 2021
1 parent 7fabd89 commit 916e1fa
Showing 1 changed file with 21 additions and 23 deletions.
44 changes: 21 additions & 23 deletions kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
Original file line number Diff line number Diff line change
Expand Up @@ -1135,20 +1135,14 @@ __asm__ volatile
" ldr x1,%[baddr] \n\t" // Load address of B
" ldr x2,%[caddr] \n\t" // Load address of C
" \n\t"
" ldr x3,%[a_next] \n\t" // Move pointer
" ldr x4,%[b_next] \n\t" // Move pointer
" \n\t"
" ldr x5,%[k_iter] \n\t" // Init guard (k_iter)
" ldr x6,%[k_left] \n\t" // Init guard (k_iter)
" \n\t"
" ldr x7,%[alpha] \n\t" // Alpha address
" ldr x8,%[beta] \n\t" // Beta address
" \n\t"
" ldr x9,%[cs_c] \n\t" // Load cs_c
" lsl x10,x9,#3 \n\t" // cs_c * sizeof(double)
" ldr x10,%[cs_c] \n\t" // Load cs_c
" lsl x10,x10,#3 \n\t" // cs_c * sizeof(double)
" \n\t"
" ldr x13,%[rs_c] \n\t" // Load rs_c.
" lsl x14,x13,#3 \n\t" // rs_c * sizeof(double).
" ldr x14,%[rs_c] \n\t" // Load rs_c.
" lsl x14,x14,#3 \n\t" // rs_c * sizeof(double).
" \n\t"
" add x20,x2,x10 \n\t" //Load address Column 1 of C
" add x21,x20,x10 \n\t" //Load address Column 2 of C
Expand Down Expand Up @@ -1610,10 +1604,16 @@ BNE(DLOOPKLEFT) // if i!=0.
" \n\t"
LABEL(DPOSTACCUM)
" \n\t"
" ld1r {v6.2d},[x7] \n\t" // Load alpha.
" ld1r {v7.2d},[x8] \n\t" // Load beta
" ldr x0,%[alpha] \n\t" // Alpha address
" ldr x1,%[beta] \n\t" // Beta address
" \n\t"
" ld1r {v6.2d},[x0] \n\t" // Load alpha.
" ld1r {v7.2d},[x1] \n\t" // Load beta
" \n\t"
" cmp x13,#1 \n\t" // If rs_c != 1 (column-major)
" ldr x0,%[a_next] \n\t" // Next A address for later use.
" ldr x1,%[b_next] \n\t" // Next B address for later use.
" \n\t"
" cmp x14,#8 \n\t" // If rs_c != 1 (column-major)
BNE(DGENSTORED)
" \n\t"
LABEL(DCOLSTORED) // C is column-major.
Expand Down Expand Up @@ -1771,8 +1771,8 @@ BEQ(DBETAZEROCOLSTOREDS4) // Taking care of the beta==0
" \n\t"
LABEL(DBETAZEROCOLSTOREDS4)
" \n\t"
" prfm pldl2keep,[x3] \n\t"
" prfm pldl2keep,[x4] \n\t"
" prfm pldl2keep,[x0] \n\t"
" prfm pldl2keep,[x1] \n\t"
" \n\t"
" fmla v8.2d, v26.2d,v6.d[0] \n\t" // Scale by alpha
" fmla v9.2d, v27.2d,v6.d[0] \n\t" // Scale by alpha
Expand Down Expand Up @@ -2016,8 +2016,8 @@ BEQ(DBETAZEROGENSTOREDS4) // Taking care of the beta==0
" \n\t"
LABEL(DBETAZEROGENSTOREDS4)
" \n\t"
" prfm pldl2keep,[x3] \n\t"
" prfm pldl2keep,[x4] \n\t"
" prfm pldl2keep,[x0] \n\t"
" prfm pldl2keep,[x1] \n\t"
" \n\t"
" fmla v8.2d, v26.2d,v6.d[0] \n\t" // Scale by alpha
" fmla v9.2d, v27.2d,v6.d[0] \n\t" // Scale by alpha
Expand Down Expand Up @@ -2060,12 +2060,10 @@ LABEL(DEND) // Done!
[a_next] "m" (a_next), // 8
[b_next] "m" (b_next) // 9
:// Register clobber list
"x0","x1","x2","x3",
"x4","x5","x6",
"x7","x8","x9",
"x10","x11","x12","x13","x14","x16","x17",
"x20","x21","x22","x23","x24","x25","x26",
"x27",
"x0","x1","x2",
"x5","x6","x10",
"x14","x16","x17",
"x20","x21","x22","x23","x24","x25","x26","x27",
"v0","v1","v2",
"v3","v4","v5",
"v6","v7","v8",
Expand Down

0 comments on commit 916e1fa

Please sign in to comment.