Skip to content

Commit

Permalink
Japanese: editting
Browse files Browse the repository at this point in the history
  • Loading branch information
shmz committed Oct 23, 2018
1 parent 7f57e2c commit 172a422
Show file tree
Hide file tree
Showing 46 changed files with 1,947 additions and 2 deletions.
46 changes: 46 additions & 0 deletions patterns/12_FPU/1_simple/ARM/ARM64_GCC_O0_JPN.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
; f(a, b): computes a/3.14 + b*4.1 on double-precision values.
; In:  D0 = a, D1 = b.  Out: D0 = result.
; Non-optimized build: both arguments are first spilled to the stack, and
; every intermediate value travels between D- and X-registers through FMOV.
f:
sub sp, sp, #16 ; reserve 16 bytes of stack for the two 8-byte arguments
str d0, [sp,8] ; save "a" in Register Save Area
str d1, [sp] ; save "b" in Register Save Area
ldr x1, [sp,8]
; X1 = a
ldr x0, .LC25
; X0 = 3.14
fmov d0, x1
fmov d1, x0
; D0 = a, D1 = 3.14
fdiv d0, d0, d1
; D0 = D0/D1 = a/3.14

fmov x1, d0
; X1 = a/3.14 (kept in a general-purpose register while b*4.1 is computed)
ldr x2, [sp]
; X2 = b
ldr x0, .LC26
; X0 = 4.1
fmov d0, x2
; D0 = b
fmov d1, x0
; D1 = 4.1
fmul d0, d0, d1
; D0 = D0*D1 = b*4.1

fmov x0, d0
; X0 = D0 = b*4.1
fmov d0, x1
; D0 = a/3.14
fmov d1, x0
; D1 = X0 = b*4.1
fadd d0, d0, d1
; D0 = D0+D1 = a/3.14 + b*4.1

fmov x0, d0 ; \ redundant code: D0 is copied to X0 and straight back
fmov d0, x0 ; /
add sp, sp, 16 ; release the 16 bytes reserved at entry
ret
; constants in IEEE 754 format, each stored as two 32-bit words (low word first):
.LC25:
.word 1374389535 ; 3.14, low 32 bits (0x51EB851F)
.word 1074339512 ; 3.14, high 32 bits (0x40091EB8)
.LC26:
.word 1717986918 ; 4.1, low 32 bits (0x66666666)
.word 1074816614 ; 4.1, high 32 bits (0x40106666)
19 changes: 19 additions & 0 deletions patterns/12_FPU/1_simple/ARM/ARM64_GCC_O3_JPN.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
; f(a, b): computes a/3.14 + b*4.1 on double-precision values.
; The result is returned in D0. Everything stays in D-registers; no stack is used.
f:
; D0 = a, D1 = b
ldr d2, .LC25 ; 3.14
; D2 = 3.14
fdiv d0, d0, d2
; D0 = D0/D2 = a/3.14
ldr d2, .LC26 ; 4.1
; D2 = 4.1 (D2 is reused: 3.14 is no longer needed after the division)
fmadd d0, d1, d2, d0
; D0 = D1*D2+D0 = b*4.1+a/3.14 (multiplication and addition in one instruction)
ret

; constants in IEEE 754 format, each stored as two 32-bit words (low word first):
.LC25:
.word 1374389535 ; 3.14, low 32 bits (0x51EB851F)
.word 1074339512 ; 3.14, high 32 bits (0x40091EB8)
.LC26:
.word 1717986918 ; 4.1, low 32 bits (0x66666666)
.word 1074816614 ; 4.1, high 32 bits (0x40106666)
29 changes: 29 additions & 0 deletions patterns/12_FPU/1_simple/ARM/Keil_O3_thumb_JPN.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
; f(a, b): computes a/3.14 + b*4.1 without any FPU instructions;
; all arithmetic is delegated to the __aeabi_d* library routines.
; In:  R0:R1 = a, R2:R3 = b (each double occupies a pair of 32-bit registers).
; Out: R0:R1 = result.
f
PUSH {R3-R7,LR} ; save R4-R7 and the return address (R3 is pushed as well; presumably only to keep the stack aligned)
MOVS R7, R2 ; \ R7:R4 = b
MOVS R4, R3 ; /
MOVS R5, R0 ; \ R5:R6 = a
MOVS R6, R1 ; /
LDR R2, =0x66666666 ; 4.1, low 32 bits
LDR R3, =0x40106666 ; 4.1, high 32 bits
MOVS R0, R7 ; \ R0:R1 = b
MOVS R1, R4 ; /
BL __aeabi_dmul ; R0:R1 = b*4.1
MOVS R7, R0 ; \ R7:R4 = b*4.1, kept across the next call
MOVS R4, R1 ; /
LDR R2, =0x51EB851F ; 3.14, low 32 bits
LDR R3, =0x40091EB8 ; 3.14, high 32 bits
MOVS R0, R5 ; \ R0:R1 = a
MOVS R1, R6 ; /
BL __aeabi_ddiv ; R0:R1 = a/3.14
MOVS R2, R7 ; \ R2:R3 = b*4.1
MOVS R3, R4 ; /
BL __aeabi_dadd ; R0:R1 = a/3.14 + b*4.1
POP {R3-R7,PC} ; restore saved registers; loading the saved LR value into PC returns to the caller

; 4.1 in IEEE 754 form:
dword_364 DCD 0x66666666 ; DATA XREF: f+A
dword_368 DCD 0x40106666 ; DATA XREF: f+C
; 3.14 in IEEE 754 form:
dword_36C DCD 0x51EB851F ; DATA XREF: f+1A
dword_370 DCD 0x40091EB8 ; DATA XREF: f+1C
13 changes: 13 additions & 0 deletions patterns/12_FPU/1_simple/ARM/Xcode_ARM_O3_JPN.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
; f(a, b): computes a/3.14 + b*4.1 using VFP instructions on D-registers.
; In:  R0:R1 = a, R2:R3 = b.  Out: R0:R1 = result.
f
VLDR D16, =3.14 ; D16 = 3.14
VMOV D17, R0, R1 ; load "a"
VMOV D18, R2, R3 ; load "b"
VDIV.F64 D16, D17, D16 ; a/3.14
VLDR D17, =4.1 ; D17 = 4.1 ("a" is no longer needed, so D17 is reused)
VMUL.F64 D17, D18, D17 ; b*4.1
VADD.F64 D16, D17, D16 ; D16 = b*4.1 + a/3.14
VMOV R0, R1, D16 ; split the 64-bit result into R0 and R1 for the return
BX LR ; return to the caller

; constants loaded by the two VLDR instructions above:
dbl_2C98 DCFD 3.14 ; DATA XREF: f
dbl_2CA0 DCFD 4.1 ; DATA XREF: f+10
1 change: 1 addition & 0 deletions patterns/12_FPU/1_simple/ARM/main.tex
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
\RU{\input{patterns/12_FPU/1_simple/ARM/main_RU}}
\DE{\input{patterns/12_FPU/1_simple/ARM/main_DE}}
\FR{\input{patterns/12_FPU/1_simple/ARM/main_FR}}
\JPN{\input{patterns/12_FPU/1_simple/ARM/main_JPN}}

114 changes: 114 additions & 0 deletions patterns/12_FPU/1_simple/ARM/main_JPN.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
\subsubsection{ARM: \OptimizingXcodeIV (\ARMMode)}

Until ARM got standardized floating point support, several processor manufacturers added their own
instruction extensions.
Then, VFP (\IT{Vector Floating Point}) was standardized.

One important difference from x86 is that in ARM there
is no FPU register stack: you work directly with registers.

\lstinputlisting[label=ARM_leaf_example10,caption=\OptimizingXcodeIV (\ARMMode),style=customasmARM]{patterns/12_FPU/1_simple/ARM/Xcode_ARM_O3_JPN.asm}

\myindex{ARM!D-\registers{}}
\myindex{ARM!S-\registers{}}

So, we see some new registers used here, with the D prefix.

These are 64-bit registers, there are 32 of them, and they can be used both for floating-point numbers
(double) and for SIMD (it is called NEON here in ARM).

There are also 32 32-bit S-registers, intended to be used for single precision
floating point numbers (float).

It is easy to memorize: D-registers are for double precision numbers, while
S-registers---for single precision numbers.
More about it: \myref{ARM_VFP_registers}.

Both constants (3.14 and 4.1) are stored in memory in IEEE 754 format.

\myindex{ARM!\Instructions!VLDR}
\myindex{ARM!\Instructions!VMOV}
\INS{VLDR} and \INS{VMOV}, as it can be easily deduced, are analogous to the \INS{LDR} and \MOV instructions,
but they work with D-registers.

It has to be noted that these instructions, just like the D-registers, are intended not only for
floating point numbers,
but can be also used for SIMD (NEON) operations and this will also be shown soon.

The arguments are passed to the function in a common way, via the R-registers, however
each number that has double precision has a size of 64 bits, so two R-registers are needed to pass each one.

\INS{VMOV D17, R0, R1} at the start, composes two 32-bit values from \Reg{0} and \Reg{1} into one 64-bit value
and saves it to \GTT{D17}.

\INS{VMOV R0, R1, D16} is the inverse operation: what has been in \GTT{D16}
is split in two registers, \Reg{0} and \Reg{1}, because a double-precision number
that needs 64 bits for storage, is returned in \Reg{0} and \Reg{1}.

\myindex{ARM!\Instructions!VDIV}
\myindex{ARM!\Instructions!VMUL}
\myindex{ARM!\Instructions!VADD}
\INS{VDIV}, \INS{VMUL} and \INS{VADD}
are instructions for processing floating point numbers that compute the \gls{quotient},
\gls{product} and sum, respectively.

The code for Thumb-2 is the same.

\subsubsection{ARM: \OptimizingKeilVI (\ThumbMode)}

\lstinputlisting[style=customasmARM]{patterns/12_FPU/1_simple/ARM/Keil_O3_thumb_JPN.asm}

Keil generated code for a processor without FPU or NEON support.

The double-precision floating-point numbers are passed via generic R-registers,
and instead of FPU-instructions, service library functions are called\\
(like \GTT{\_\_aeabi\_dmul}, \GTT{\_\_aeabi\_ddiv}, \GTT{\_\_aeabi\_dadd})
which emulate multiplication, division and addition for floating-point numbers.

Of course, that is slower than an FPU coprocessor, but it's still better than nothing.

By the way, similar FPU-emulating libraries were very popular in the x86 world when coprocessors were rare
and expensive, and were installed only on expensive computers.

\myindex{ARM!soft float}
\myindex{ARM!armel}
\myindex{ARM!armhf}
\myindex{ARM!hard float}

The FPU-coprocessor emulation is called \IT{soft float} or \IT{armel} (\IT{emulation}) in the ARM world,
while using the coprocessor's FPU-instructions is called \IT{hard float} or \IT{armhf}.

\iffalse
% TODO разобраться...
\myindex{Raspberry Pi}

For example, the Linux kernel for Raspberry Pi is compiled in two variants.

In the \IT{soft float} case, arguments are passed via R-registers, and in the \IT{hard float} case---via D-registers.

And that is what stops you from using armhf-libraries from armel-code or vice versa,
and that is why all the code in Linux distributions must be compiled according to a single convention.
\fi

\subsubsection{ARM64: \Optimizing GCC (Linaro) 4.9}

Very compact code:

\lstinputlisting[caption=\Optimizing GCC (Linaro) 4.9,style=customasmARM]{patterns/12_FPU/1_simple/ARM/ARM64_GCC_O3_JPN.s}

\subsubsection{ARM64: \NonOptimizing GCC (Linaro) 4.9}

\lstinputlisting[caption=\NonOptimizing GCC (Linaro) 4.9,style=customasmARM]{patterns/12_FPU/1_simple/ARM/ARM64_GCC_O0_JPN.s}

\NonOptimizing GCC is more verbose.

There is a lot of unnecessary value shuffling, including some clearly redundant code
(the last two \INS{FMOV} instructions). Probably, GCC 4.9 is not yet good at generating ARM64 code.

What is worth noting is that ARM64 has 64-bit registers, and the D-registers are 64-bit ones as well.

So the compiler is free to save values of type \Tdouble in \ac{GPR}s instead of the local stack.
This isn't possible on 32-bit CPUs.

And again, as an exercise, you can try to optimize this function manually, without introducing
new instructions like \INS{FMADD}.
31 changes: 31 additions & 0 deletions patterns/12_FPU/1_simple/GCC_JPN.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
; f(a, b): computes a/3.14 + b*4.1 on the x87 FPU register stack.
; Both arguments are 8-byte values read from the caller's stack;
; the result is left in ST(0).
public f
f proc near

arg_0 = qword ptr 8 ; first argument, read at [ebp+8]
arg_8 = qword ptr 10h ; second argument, read at [ebp+10h]

push ebp ; save the caller's frame pointer
fld ds:dbl_8048608 ; 3.14

; stack state now: ST(0) = 3.14

mov ebp, esp ; set up the frame pointer (placed between the FPU instructions)
fdivr [ebp+arg_0] ; reverse divide: ST(0) = arg_0 / ST(0)

; stack state now: ST(0) = result of division

fld ds:dbl_8048610 ; 4.1

; stack state now: ST(0) = 4.1, ST(1) = result of division

fmul [ebp+arg_8] ; ST(0) = ST(0) * arg_8

; stack state now: ST(0) = result of multiplication, ST(1) = result of division

pop ebp ; restore the caller's frame pointer
faddp st(1), st ; ST(1) = ST(1) + ST(0), then pop, so the sum ends up in ST(0)

; stack state now: ST(0) = result of addition

retn
f endp
20 changes: 20 additions & 0 deletions patterns/12_FPU/1_simple/GCC_JPN.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
\myparagraph{GCC}

GCC 4.4.1 (with \Othree option) emits nearly the same code, with slight differences:

\lstinputlisting[caption=\Optimizing GCC 4.4.1,style=customasmx86]{patterns/12_FPU/1_simple/GCC_JPN.asm}

The difference is that, first of all, 3.14 is pushed to the stack (into \ST{0}), and then the value
in \GTT{arg\_0} is divided by the value in \ST{0}.

\myindex{x86!\Instructions!FDIVR}

\FDIVR stands for \IT{Reverse Divide}~---to divide with divisor and dividend swapped with each other.
There is no corresponding instruction for multiplication since it is
a commutative operation, so we just have \FMUL without its \GTT{-R} counterpart.

\myindex{x86!\Instructions!FADDP}

\FADDP adds the two values but also pops one value from the stack.
After that operation, \ST{0} holds the sum.

41 changes: 41 additions & 0 deletions patterns/12_FPU/1_simple/MIPS_JPN.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
\subsubsection{MIPS}

MIPS can support several coprocessors (up to 4),
the zeroth of which\footnote{Starting at 0.} is a special control coprocessor,
and the first coprocessor is the FPU.

As in ARM, the MIPS coprocessor is not a stack machine, it has 32 32-bit registers (\$F0-\$F31):
\myref{MIPS_FPU_registers}.

When one needs to work with 64-bit \Tdouble values, a pair of 32-bit F-registers is used.

\lstinputlisting[caption=\Optimizing GCC 4.4.5 (IDA),style=customasmMIPS]{patterns/12_FPU/1_simple/MIPS_O3_IDA_JPN.lst}

The new instructions here are:

\myindex{MIPS!\Instructions!LWC1}
\myindex{MIPS!\Instructions!DIV.D}
\myindex{MIPS!\Instructions!MUL.D}
\myindex{MIPS!\Instructions!ADD.D}
\begin{itemize}

\item \INS{LWC1} loads a 32-bit word into a register of the first coprocessor (hence \q{1} in instruction name).
\myindex{MIPS!\Pseudoinstructions!L.D}

A pair of \INS{LWC1} instructions may be combined into a \INS{L.D} pseudo instruction.

\item \INS{DIV.D}, \INS{MUL.D}, \INS{ADD.D} do division, multiplication, and addition respectively
(\q{.D} in the suffix stands for double precision, \q{.S} stands for single precision)

\end{itemize}

\myindex{MIPS!\Instructions!LUI}
\myindex{\CompilerAnomaly}
\label{MIPS_FPU_LUI}

There is also a weird compiler anomaly: the \INS{LUI} instructions that we've marked with a question mark.
It's hard for me to understand why the compiler loads a part of a 64-bit constant of \Tdouble type into the \$V0 register.
These instructions have no effect.
% TODO did you try checking out compiler source code?
If someone knows more about it, please drop an email to the author\footnote{\EMAIL}.

31 changes: 31 additions & 0 deletions patterns/12_FPU/1_simple/MIPS_O3_IDA_JPN.lst
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
; f(A, B): computes A/3.14 + B*4.1 on double-precision values.
; Each double occupies a pair of 32-bit FPU registers; the result is left in $f0-$f1.
f:
; $f12-$f13=A
; $f14-$f15=B
lui $v0, (dword_C4 >> 16) ; ? ($v0 is never read in this listing)
; load low 32-bit part of 3.14 constant to $f0:
lwc1 $f0, dword_BC
or $at, $zero ; load delay slot, NOP
; load high 32-bit part of 3.14 constant to $f1:
lwc1 $f1, $LC0
lui $v0, ($LC1 >> 16) ; ? ($v0 is never read in this listing)
; A in $f12-$f13, 3.14 constant in $f0-$f1, do division:
div.d $f0, $f12, $f0
; $f0-$f1=A/3.14
; load low 32-bit part of 4.1 to $f2:
lwc1 $f2, dword_C4
or $at, $zero ; load delay slot, NOP
; load high 32-bit part of 4.1 to $f3:
lwc1 $f3, $LC1
or $at, $zero ; load delay slot, NOP
; B in $f14-$f15, 4.1 constant in $f2-$f3, do multiplication:
mul.d $f2, $f14, $f2
; $f2-$f3=B*4.1
jr $ra
; sum 64-bit parts and leave result in $f0-$f1:
add.d $f0, $f2 ; branch delay slot: this addition is executed before the jump takes effect


.rodata.cst8:000000B8 $LC0: .word 0x40091EB8 # DATA XREF: f+C; high 32 bits of 3.14
.rodata.cst8:000000BC dword_BC: .word 0x51EB851F # DATA XREF: f+4; low 32 bits of 3.14
.rodata.cst8:000000C0 $LC1: .word 0x40106666 # DATA XREF: f+10; high 32 bits of 4.1
.rodata.cst8:000000C4 dword_C4: .word 0x66666666 # DATA XREF: f; low 32 bits of 4.1
38 changes: 38 additions & 0 deletions patterns/12_FPU/1_simple/MSVC_JPN.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
; _f(_a, _b): computes _a/3.14 + _b*4.1 on the x87 FPU register stack.
; Both arguments are 8-byte values read from the stack relative to EBP;
; the result is left in ST(0).

; constants in IEEE 754 double-precision format:
CONST SEGMENT
__real@4010666666666666 DQ 04010666666666666r ; 4.1
CONST ENDS
CONST SEGMENT
__real@40091eb851eb851f DQ 040091eb851eb851fr ; 3.14
CONST ENDS
_TEXT SEGMENT
_a$ = 8 ; size = 8; _a is read at [ebp+8]
_b$ = 16 ; size = 8; _b is read at [ebp+16]
_f PROC
push ebp ; save the caller's frame pointer
mov ebp, esp ; set up this function's frame pointer
fld QWORD PTR _a$[ebp] ; push _a onto the FPU stack

; current stack state: ST(0) = _a

fdiv QWORD PTR __real@40091eb851eb851f ; ST(0) = ST(0) / 3.14

; current stack state: ST(0) = result of _a divided by 3.14

fld QWORD PTR _b$[ebp] ; push _b; the quotient moves down to ST(1)

; current stack state: ST(0) = _b;
; ST(1) = result of _a divided by 3.14

fmul QWORD PTR __real@4010666666666666 ; ST(0) = ST(0) * 4.1

; current stack state:
; ST(0) = result of _b * 4.1;
; ST(1) = result of _a divided by 3.14

faddp ST(1), ST(0) ; ST(1) = ST(1) + ST(0), then pop, so the sum ends up in ST(0)

; current stack state: ST(0) = result of addition

pop ebp ; restore the caller's frame pointer
ret 0 ; return; the callee removes no argument bytes from the stack
_f ENDP
Loading

0 comments on commit 172a422

Please sign in to comment.