Japanese: editing

HoMeCracKeR · Oct 23, 2018 · 172a422 · 172a422
1 parent 7f57e2c
commit 172a422
Show file tree

Hide file tree

Showing 46 changed files with 1,947 additions and 2 deletions.
diff --git a/patterns/12_FPU/1_simple/ARM/ARM64_GCC_O0_JPN.s b/patterns/12_FPU/1_simple/ARM/ARM64_GCC_O0_JPN.s
@@ -0,0 +1,46 @@
+f:	; f(a, b) = a/3.14 + b*4.1, double precision; "a" arrives in D0, "b" in D1
+	sub	sp, sp, #16	; reserve 16 bytes on the stack for the two arguments
+	str	d0, [sp,8]	; save "a" in Register Save Area
+	str	d1, [sp]	; save "b" in Register Save Area
+	ldr	x1, [sp,8]
+; X1 = a
+	ldr	x0, .LC25
+; X0 = 3.14
+	fmov	d0, x1
+	fmov	d1, x0
+; D0 = a, D1 = 3.14
+	fdiv	d0, d0, d1
+; D0 = D0/D1 = a/3.14
+
+	fmov	x1, d0
+; X1 = a/3.14
+	ldr	x2, [sp]
+; X2 = b
+	ldr	x0, .LC26
+; X0 = 4.1
+	fmov	d0, x2
+; D0 = b
+	fmov	d1, x0
+; D1 = 4.1
+	fmul	d0, d0, d1
+; D0 = D0*D1 = b*4.1
+
+	fmov	x0, d0
+; X0 = D0 = b*4.1
+	fmov	d0, x1
+; D0 = a/3.14
+	fmov	d1, x0
+; D1 = X0 = b*4.1
+	fadd	d0, d0, d1
+; D0 = D0+D1 = a/3.14 + b*4.1
+
+	fmov	x0, d0 ; \ redundant code: D0 is copied to X0
+	fmov	d0, x0 ; / and then straight back to D0
+	add	sp, sp, 16	; release the 16 bytes reserved at entry
+	ret	; return; the sum is in D0
+.LC25:	; 3.14 as a 64-bit IEEE 754 value, stored as two 32-bit words
+	.word	1374389535	; 3.14, low 32 bits (0x51EB851F)
+	.word	1074339512	; 3.14, high 32 bits (0x40091EB8)
+.LC26:	; 4.1 as a 64-bit IEEE 754 value, stored as two 32-bit words
+	.word	1717986918	; 4.1, low 32 bits (0x66666666)
+	.word	1074816614	; 4.1, high 32 bits (0x40106666)
diff --git a/patterns/12_FPU/1_simple/ARM/ARM64_GCC_O3_JPN.s b/patterns/12_FPU/1_simple/ARM/ARM64_GCC_O3_JPN.s
@@ -0,0 +1,19 @@
+f:	; f(a, b) = a/3.14 + b*4.1, double precision
+; D0 = a, D1 = b
+	ldr	d2, .LC25	; 3.14
+; D2 = 3.14
+	fdiv	d0, d0, d2
+; D0 = D0/D2 = a/3.14
+	ldr	d2, .LC26	; 4.1
+; D2 = 4.1
+	fmadd	d0, d1, d2, d0	; multiply and add in a single instruction
+; D0 = D1*D2+D0 = b*4.1+a/3.14
+	ret	; return; the result is in D0
+
+; constants in IEEE 754 format:
+.LC25:
+	.word	1374389535	; 3.14, low 32 bits (0x51EB851F)
+	.word	1074339512	; 3.14, high 32 bits (0x40091EB8)
+.LC26:
+	.word	1717986918	; 4.1, low 32 bits (0x66666666)
+	.word	1074816614	; 4.1, high 32 bits (0x40106666)
diff --git a/patterns/12_FPU/1_simple/ARM/Keil_O3_thumb_JPN.asm b/patterns/12_FPU/1_simple/ARM/Keil_O3_thumb_JPN.asm
@@ -0,0 +1,29 @@
+f	; f(a, b) = a/3.14 + b*4.1 via library calls instead of FPU instructions; "a" arrives in R0,R1 and "b" in R2,R3
+                PUSH    {R3-R7,LR}	; save R3-R7 and the return address
+                MOVS    R7, R2	; keep "b" (R2,R3) in R7,R4 ...
+                MOVS    R4, R3
+                MOVS    R5, R0	; ... and "a" (R0,R1) in R5,R6
+                MOVS    R6, R1
+                LDR     R2, =0x66666666	; 4.1
+                LDR     R3, =0x40106666	; R2,R3 = 4.1 (second operand)
+                MOVS    R0, R7	; R0,R1 = b (first operand)
+                MOVS    R1, R4
+                BL      __aeabi_dmul	; R0,R1 = b*4.1
+                MOVS    R7, R0	; keep the product in R7,R4
+                MOVS    R4, R1
+                LDR     R2, =0x51EB851F	; 3.14
+                LDR     R3, =0x40091EB8	; R2,R3 = 3.14 (divisor)
+                MOVS    R0, R5	; R0,R1 = a (dividend)
+                MOVS    R1, R6
+                BL      __aeabi_ddiv	; R0,R1 = a/3.14
+                MOVS    R2, R7	; R2,R3 = b*4.1
+                MOVS    R3, R4
+                BL      __aeabi_dadd	; R0,R1 = a/3.14 + b*4.1
+                POP     {R3-R7,PC}	; restore R3-R7 and return by loading the saved LR into PC
+
+; 4.1 in IEEE 754 form:
+dword_364       DCD 0x66666666          ; DATA XREF: f+A
+dword_368       DCD 0x40106666          ; DATA XREF: f+C
+; 3.14 in IEEE 754 form:
+dword_36C       DCD 0x51EB851F          ; DATA XREF: f+1A
+dword_370       DCD 0x40091EB8          ; DATA XREF: f+1C
diff --git a/patterns/12_FPU/1_simple/ARM/Xcode_ARM_O3_JPN.asm b/patterns/12_FPU/1_simple/ARM/Xcode_ARM_O3_JPN.asm
@@ -0,0 +1,13 @@
+f	; f(a, b) = a/3.14 + b*4.1; "a" arrives in R0,R1 and "b" in R2,R3
+                VLDR            D16, =3.14	; D16 = 3.14
+                VMOV            D17, R0, R1 ; load "a"
+                VMOV            D18, R2, R3 ; load "b"
+                VDIV.F64        D16, D17, D16 ; a/3.14
+                VLDR            D17, =4.1	; D17 = 4.1
+                VMUL.F64        D17, D18, D17 ; b*4.1
+                VADD.F64        D16, D17, D16 ; D16 = b*4.1 + a/3.14
+                VMOV            R0, R1, D16 ; split the result into R0,R1 for return
+                BX              LR
+
+dbl_2C98        DCFD 3.14               ; DATA XREF: f
+dbl_2CA0        DCFD 4.1                ; DATA XREF: f+10
diff --git a/patterns/12_FPU/1_simple/ARM/main.tex b/patterns/12_FPU/1_simple/ARM/main.tex
@@ -2,4 +2,5 @@
 \RU{\input{patterns/12_FPU/1_simple/ARM/main_RU}}
 \DE{\input{patterns/12_FPU/1_simple/ARM/main_DE}}
 \FR{\input{patterns/12_FPU/1_simple/ARM/main_FR}}
+\JPN{\input{patterns/12_FPU/1_simple/ARM/main_JPN}}
 
diff --git a/patterns/12_FPU/1_simple/ARM/main_JPN.tex b/patterns/12_FPU/1_simple/ARM/main_JPN.tex
@@ -0,0 +1,114 @@
+\subsubsection{ARM: \OptimizingXcodeIV (\ARMMode)}
+
+Until ARM got standardized floating point support, several processor manufacturers added their own 
+instruction extensions.
+Then, VFP (\IT{Vector Floating Point}) was standardized.
+
+One important difference from x86 is that in ARM, there
+is no FPU register stack, you work just with registers.
+
+\lstinputlisting[label=ARM_leaf_example10,caption=\OptimizingXcodeIV (\ARMMode),style=customasmARM]{patterns/12_FPU/1_simple/ARM/Xcode_ARM_O3_JPN.asm}
+
+\myindex{ARM!D-\registers{}}
+\myindex{ARM!S-\registers{}}
+
+So, we see some new registers in use here, with the D prefix.
+
+These are 64-bit registers, there are 32 of them, and they can be used both for floating-point numbers 
+(double) but also for SIMD (it is called NEON here in ARM).
+
+There are also 32 32-bit S-registers, intended to be used for single precision 
+floating point numbers (float).
+
+It is easy to memorize: D-registers are for double precision numbers, while
+S-registers---for single precision numbers.
+More about it: \myref{ARM_VFP_registers}.
+
+Both constants (3.14 and 4.1) are stored in memory in IEEE 754 format.
+
+\myindex{ARM!\Instructions!VLDR}
+\myindex{ARM!\Instructions!VMOV}
+\INS{VLDR} and \INS{VMOV}, as it can be easily deduced, are analogous to the \INS{LDR} and \MOV instructions,
+but they work with D-registers.
+
+It has to be noted that these instructions, just like the D-registers, are intended not only for
+floating point numbers, 
+but can be also used for SIMD (NEON) operations and this will also be shown soon.
+
+The arguments are passed to the function in a common way, via the R-registers, however
+each number that has double precision has a size of 64 bits, so two R-registers are needed to pass each one.
+
+\INS{VMOV D17, R0, R1} at the start, composes two 32-bit values from \Reg{0} and \Reg{1} into one 64-bit value
+and saves it to \GTT{D17}.
+
+\INS{VMOV R0, R1, D16} is the inverse operation: what has been in \GTT{D16} 
+is split in two registers, \Reg{0} and \Reg{1}, because a double-precision number 
+that needs 64 bits for storage, is returned in \Reg{0} and \Reg{1}.
+
+\myindex{ARM!\Instructions!VDIV}
+\myindex{ARM!\Instructions!VMUL}
+\myindex{ARM!\Instructions!VADD}
+\INS{VDIV}, \INS{VMUL} and \INS{VADD}, 
+are instructions for processing floating point numbers that compute \gls{quotient}, 
+\gls{product} and sum, respectively.
+
+The code for Thumb-2 is the same.
+
+\subsubsection{ARM: \OptimizingKeilVI (\ThumbMode)}
+
+\lstinputlisting[style=customasmARM]{patterns/12_FPU/1_simple/ARM/Keil_O3_thumb_JPN.asm}
+
+Keil generated code for a processor without FPU or NEON support.
+
+The double-precision floating-point numbers are passed via generic R-registers,
+and instead of FPU-instructions, service library functions are called\\
+(like \GTT{\_\_aeabi\_dmul}, \GTT{\_\_aeabi\_ddiv}, \GTT{\_\_aeabi\_dadd})
+which emulate multiplication, division and addition for floating-point numbers.
+
+Of course, that is slower than an FPU coprocessor, but it's still better than nothing.
+
+By the way, similar FPU-emulating libraries were very popular in the x86 world when coprocessors were rare
+and expensive, and were installed only on expensive computers.
+
+\myindex{ARM!soft float}
+\myindex{ARM!armel}
+\myindex{ARM!armhf}
+\myindex{ARM!hard float}
+
+The FPU-coprocessor emulation is called \IT{soft float} or \IT{armel} (\IT{emulation}) in the ARM world, 
+while using the coprocessor's FPU-instructions is called \IT{hard float} or \IT{armhf}.
+
+\iffalse
+% TODO figure this out...
+\myindex{Raspberry Pi}
+
+For example, the Linux kernel for Raspberry Pi is compiled in two variants.
+
+In the \IT{soft float} case, arguments are passed via R-registers, and in the \IT{hard float} case---via D-registers.
+
+And that is what stops you from using armhf-libraries from armel-code or vice versa,
+and that is why all the code in Linux distributions must be compiled according to a single convention.
+\fi
+
+\subsubsection{ARM64: \Optimizing GCC (Linaro) 4.9}
+
+Very compact code:
+
+\lstinputlisting[caption=\Optimizing GCC (Linaro) 4.9,style=customasmARM]{patterns/12_FPU/1_simple/ARM/ARM64_GCC_O3_JPN.s}
+
+\subsubsection{ARM64: \NonOptimizing GCC (Linaro) 4.9}
+
+\lstinputlisting[caption=\NonOptimizing GCC (Linaro) 4.9,style=customasmARM]{patterns/12_FPU/1_simple/ARM/ARM64_GCC_O0_JPN.s}
+
+\NonOptimizing GCC is more verbose.
+
+There is a lot of unnecessary value shuffling, including some clearly redundant code 
+(the last two \INS{FMOV} instructions). Probably, GCC 4.9 is not yet good at generating ARM64 code.
+
+What is worth noting is that ARM64 has 64-bit registers, and the D-registers are 64-bit ones as well.
+
+So the compiler is free to save values of type \Tdouble in \ac{GPR}s instead of the local stack.
+This isn't possible on 32-bit CPUs.
+
+And again, as an exercise, you can try to optimize this function manually, without introducing
+new instructions like \INS{FMADD}.
diff --git a/patterns/12_FPU/1_simple/GCC_JPN.asm b/patterns/12_FPU/1_simple/GCC_JPN.asm
@@ -0,0 +1,31 @@
+                public f
+f               proc near	; f(a, b) = a/3.14 + b*4.1 on the x87 FPU; the result is left in ST(0)
+
+arg_0           = qword ptr  8	; first argument (the one divided by 3.14)
+arg_8           = qword ptr  10h	; second argument (the one multiplied by 4.1)
+
+                push    ebp
+                fld     ds:dbl_8048608 ; 3.14
+
+; stack state now: ST(0) = 3.14
+
+                mov     ebp, esp
+                fdivr   [ebp+arg_0]	; reverse divide: ST(0) = arg_0 / ST(0)
+
+; stack state now: ST(0) = result of division
+
+                fld     ds:dbl_8048610 ; 4.1
+
+; stack state now: ST(0) = 4.1, ST(1) = result of division
+
+                fmul    [ebp+arg_8]	; ST(0) = ST(0) * arg_8
+
+; stack state now: ST(0) = result of multiplication, ST(1) = result of division
+
+                pop     ebp
+                faddp   st(1), st	; add the two values and pop one from the FPU stack
+
+; stack state now: ST(0) = result of addition
+
+                retn
+f               endp
diff --git a/patterns/12_FPU/1_simple/GCC_JPN.tex b/patterns/12_FPU/1_simple/GCC_JPN.tex
@@ -0,0 +1,20 @@
+\myparagraph{GCC}
+
+GCC 4.4.1 (with \Othree option) emits almost the same code, with slight differences:
+
+\lstinputlisting[caption=\Optimizing GCC 4.4.1,style=customasmx86]{patterns/12_FPU/1_simple/GCC_JPN.asm}
+
+The difference is that, first of all, 3.14 is pushed to the stack (into \ST{0}), and then the value 
+in \GTT{arg\_0} is divided by the value in \ST{0}.
+
+\myindex{x86!\Instructions!FDIVR}
+
+\FDIVR stands for \IT{Reverse Divide}~---to divide with divisor and dividend swapped with each other. 
+There is no analogous instruction for multiplication since it is 
+a commutative operation, so we just have \FMUL without its \GTT{-R} counterpart.
+
+\myindex{x86!\Instructions!FADDP}
+
+\FADDP adds the two values but also pops one value from the stack. 
+After that operation, \ST{0} holds the sum.
+
diff --git a/patterns/12_FPU/1_simple/MIPS_JPN.tex b/patterns/12_FPU/1_simple/MIPS_JPN.tex
@@ -0,0 +1,41 @@
+\subsubsection{MIPS}
+
+MIPS can support several coprocessors (up to 4), 
+the zeroth of which\footnote{Starting at 0.} is a special control coprocessor,
+and the first coprocessor is the FPU.
+
+As in ARM, the MIPS coprocessor is not a stack machine, it has 32 32-bit registers (\$F0-\$F31):
+\myref{MIPS_FPU_registers}.
+
+When one needs to work with 64-bit \Tdouble values, a pair of 32-bit F-registers is used.
+
+\lstinputlisting[caption=\Optimizing GCC 4.4.5 (IDA),style=customasmMIPS]{patterns/12_FPU/1_simple/MIPS_O3_IDA_JPN.lst}
+
+The new instructions here are:
+
+\myindex{MIPS!\Instructions!LWC1}
+\myindex{MIPS!\Instructions!DIV.D}
+\myindex{MIPS!\Instructions!MUL.D}
+\myindex{MIPS!\Instructions!ADD.D}
+\begin{itemize}
+
+\item \INS{LWC1} loads a 32-bit word into a register of the first coprocessor (hence \q{1} in instruction name).
+\myindex{MIPS!\Pseudoinstructions!L.D}
+
+A pair of \INS{LWC1} instructions may be combined into a \INS{L.D} pseudo instruction.
+
+\item \INS{DIV.D}, \INS{MUL.D}, \INS{ADD.D} do division, multiplication, and addition respectively 
+(\q{.D} in the suffix stands for double precision, \q{.S} stands for single precision)
+
+\end{itemize}
+
+\myindex{MIPS!\Instructions!LUI}
+\myindex{\CompilerAnomaly}
+\label{MIPS_FPU_LUI}
+
+There is also a weird compiler anomaly: the \INS{LUI} instructions that we've marked with a question mark.
+It's hard for me to understand why the compiler loads a part of a 64-bit constant of \Tdouble type into the \$V0 register.
+These instructions have no effect.
+% TODO did you try checking out compiler source code?
+If someone knows more about it, please drop an email to the author\footnote{\EMAIL}.
+
diff --git a/patterns/12_FPU/1_simple/MIPS_O3_IDA_JPN.lst b/patterns/12_FPU/1_simple/MIPS_O3_IDA_JPN.lst
@@ -0,0 +1,31 @@
+f:                                            ; f(A, B) = A/3.14 + B*4.1, double precision; result is left in $f0-$f1
+; $f12-$f13=A
+; $f14-$f15=B
+                lui     $v0, (dword_C4 >> 16) ; ? ($v0 is never read in this listing)
+; load low 32-bit part of 3.14 constant to $f0:
+                lwc1    $f0, dword_BC
+                or      $at, $zero            ; load delay slot, NOP
+; load high 32-bit part of 3.14 constant to $f1:
+                lwc1    $f1, $LC0
+                lui     $v0, ($LC1 >> 16)     ; ? ($v0 is never read in this listing)
+; A in $f12-$f13, 3.14 constant in $f0-$f1, do division:
+                div.d   $f0, $f12, $f0
+; $f0-$f1=A/3.14
+; load low 32-bit part of 4.1 to $f2:
+                lwc1    $f2, dword_C4
+                or      $at, $zero            ; load delay slot, NOP
+; load high 32-bit part of 4.1 to $f3:
+                lwc1    $f3, $LC1
+                or      $at, $zero            ; load delay slot, NOP
+; B in $f14-$f15, 4.1 constant in $f2-$f3, do multiplication:
+                mul.d   $f2, $f14, $f2
+; $f2-$f3=B*4.1
+                jr      $ra
+; sum 64-bit parts and leave result in $f0-$f1:
+                add.d   $f0, $f2              ; branch delay slot (a real instruction here, not a NOP)
+
+
+.rodata.cst8:000000B8 $LC0:           .word 0x40091EB8         # DATA XREF: f+C
+.rodata.cst8:000000BC dword_BC:       .word 0x51EB851F         # DATA XREF: f+4
+.rodata.cst8:000000C0 $LC1:           .word 0x40106666         # DATA XREF: f+10
+.rodata.cst8:000000C4 dword_C4:       .word 0x66666666         # DATA XREF: f
diff --git a/patterns/12_FPU/1_simple/MSVC_JPN.asm b/patterns/12_FPU/1_simple/MSVC_JPN.asm
@@ -0,0 +1,38 @@
+CONST    SEGMENT
+__real@4010666666666666 DQ 04010666666666666r    ; 4.1
+CONST    ENDS
+CONST    SEGMENT
+__real@40091eb851eb851f DQ 040091eb851eb851fr    ; 3.14
+CONST    ENDS
+_TEXT    SEGMENT
+_a$ = 8           ; offset of _a from EBP; size = 8
+_b$ = 16          ; offset of _b from EBP; size = 8
+_f  PROC          ; f(a, b) = a/3.14 + b*4.1 on the x87 FPU
+    push   ebp
+    mov    ebp, esp    ; EBP is the base used to address _a$ and _b$ below
+    fld    QWORD PTR _a$[ebp]
+
+; current stack state: ST(0) = _a
+
+    fdiv   QWORD PTR __real@40091eb851eb851f    ; ST(0) = ST(0) / 3.14
+
+; current stack state: ST(0) = result of _a divided by 3.14
+
+    fld    QWORD PTR _b$[ebp]
+
+; current stack state: ST(0) = _b;
+; ST(1) = result of _a divided by 3.14
+
+    fmul   QWORD PTR __real@4010666666666666    ; ST(0) = ST(0) * 4.1
+
+; current stack state: 
+; ST(0) = result of _b * 4.1; 
+; ST(1) = result of _a divided by 3.14
+
+    faddp  ST(1), ST(0)    ; add the two values and pop one from the FPU stack
+
+; current stack state: ST(0) = result of addition
+
+    pop    ebp
+    ret    0           ; return; the result stays in ST(0)
+_f  ENDP