forked from wangyif2/RE-for-beginners
-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
46 changed files
with
1,947 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
f: | ||
; f(a, b): computes a/3.14 + b*4.1 on 64-bit floating-point values. | ||
; On entry D0 = a and D1 = b; the result is left in D0. | ||
; Reserve 16 bytes of stack: "a" is kept at [sp,8], "b" at [sp]. | ||
sub sp, sp, #16 | ||
str d0, [sp,8] ; save "a" in Register Save Area | ||
str d1, [sp] ; save "b" in Register Save Area | ||
ldr x1, [sp,8] | ||
; X1 = a | ||
ldr x0, .LC25 | ||
; X0 = 3.14 | ||
fmov d0, x1 | ||
fmov d1, x0 | ||
; D0 = a, D1 = 3.14 | ||
fdiv d0, d0, d1 | ||
; D0 = D0/D1 = a/3.14 | ||
|
||
; The quotient is parked in X1 while D0/D1 are reused for the multiplication. | ||
fmov x1, d0 | ||
; X1 = a/3.14 | ||
ldr x2, [sp] | ||
; X2 = b | ||
ldr x0, .LC26 | ||
; X0 = 4.1 | ||
fmov d0, x2 | ||
; D0 = b | ||
fmov d1, x0 | ||
; D1 = 4.1 | ||
fmul d0, d0, d1 | ||
; D0 = D0*D1 = b*4.1 | ||
|
||
fmov x0, d0 | ||
; X0 = D0 = b*4.1 | ||
fmov d0, x1 | ||
; D0 = a/3.14 | ||
fmov d1, x0 | ||
; D1 = X0 = b*4.1 | ||
fadd d0, d0, d1 | ||
; D0 = D0+D1 = a/3.14 + b*4.1 | ||
|
||
fmov x0, d0 ; \ redundant code: D0 is copied to X0 and straight back | ||
fmov d0, x0 ; / | ||
; Release the 16-byte save area reserved at function entry. | ||
add sp, sp, 16 | ||
ret | ||
; Each constant below is one 64-bit value stored as two 32-bit words, | ||
; low word first (1374389535 = 0x51EB851F, 1074339512 = 0x40091EB8). | ||
.LC25: | ||
.word 1374389535 ; 3.14 | ||
.word 1074339512 | ||
.LC26: | ||
.word 1717986918 ; 4.1 | ||
.word 1074816614 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
f: | ||
; f(a, b): computes a/3.14 + b*4.1; the result is left in D0. | ||
; D2 is the only scratch register: it holds 3.14 first, then 4.1. | ||
; D0 = a, D1 = b | ||
ldr d2, .LC25 ; 3.14 | ||
; D2 = 3.14 | ||
fdiv d0, d0, d2 | ||
; D0 = D0/D2 = a/3.14 | ||
ldr d2, .LC26 ; 4.1 | ||
; D2 = 4.1 | ||
; FMADD performs the multiplication and the final addition in one instruction. | ||
fmadd d0, d1, d2, d0 | ||
; D0 = D1*D2+D0 = b*4.1+a/3.14 | ||
ret | ||
|
||
; constants in IEEE 754 format: | ||
; (each one is two 32-bit words, low word first) | ||
.LC25: | ||
.word 1374389535 ; 3.14 | ||
.word 1074339512 | ||
.LC26: | ||
.word 1717986918 ; 4.1 | ||
.word 1074816614 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
f | ||
; f(a, b): computes a/3.14 + b*4.1 through library calls instead of FPU instructions. | ||
; Each 64-bit value occupies a register pair: a arrives in R0:R1, b in R2:R3. | ||
PUSH {R3-R7,LR} | ||
; Copy b (R2:R3) to R7:R4 and a (R0:R1) to R5:R6 before R0-R3 are reused as call arguments. | ||
MOVS R7, R2 | ||
MOVS R4, R3 | ||
MOVS R5, R0 | ||
MOVS R6, R1 | ||
; First call: R0:R1 = b, R2:R3 = 4.1 | ||
LDR R2, =0x66666666 ; 4.1 | ||
LDR R3, =0x40106666 | ||
MOVS R0, R7 | ||
MOVS R1, R4 | ||
BL __aeabi_dmul | ||
; The product b*4.1 comes back in R0:R1; keep it in R7:R4 across the next call. | ||
MOVS R7, R0 | ||
MOVS R4, R1 | ||
; Second call: R0:R1 = a, R2:R3 = 3.14 | ||
LDR R2, =0x51EB851F ; 3.14 | ||
LDR R3, =0x40091EB8 | ||
MOVS R0, R5 | ||
MOVS R1, R6 | ||
BL __aeabi_ddiv | ||
; Third call: R0:R1 = a/3.14 (result of the division), R2:R3 = b*4.1 | ||
MOVS R2, R7 | ||
MOVS R3, R4 | ||
BL __aeabi_dadd | ||
; The sum stays in R0:R1; popping the saved LR value into PC returns to the caller. | ||
POP {R3-R7,PC} | ||
|
||
; 4.1 in IEEE 754 form: | ||
dword_364 DCD 0x66666666 ; DATA XREF: f+A | ||
dword_368 DCD 0x40106666 ; DATA XREF: f+C | ||
; 3.14 in IEEE 754 form: | ||
dword_36C DCD 0x51EB851F ; DATA XREF: f+1A | ||
dword_370 DCD 0x40091EB8 ; DATA XREF: f+1C |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
f | ||
; f(a, b): computes a/3.14 + b*4.1 with VFP instructions. | ||
; a arrives in R0:R1 and b in R2:R3; the result is returned in R0:R1. | ||
VLDR D16, =3.14 | ||
VMOV D17, R0, R1 ; load "a" | ||
VMOV D18, R2, R3 ; load "b" | ||
VDIV.F64 D16, D17, D16 ; a/3.14 | ||
VLDR D17, =4.1 | ||
VMUL.F64 D17, D18, D17 ; b*4.1 | ||
VADD.F64 D16, D17, D16 ; + | ||
; Split the 64-bit sum in D16 back into the R0:R1 pair. | ||
VMOV R0, R1, D16 | ||
BX LR | ||
|
||
; The two constants loaded by the VLDR instructions above: | ||
dbl_2C98 DCFD 3.14 ; DATA XREF: f | ||
dbl_2CA0 DCFD 4.1 ; DATA XREF: f+10 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
\subsubsection{ARM: \OptimizingXcodeIV (\ARMMode)} | ||
|
||
Until ARM got standardized floating point support, several processor manufacturers added their own | ||
instruction extensions. | ||
Then, VFP (\IT{Vector Floating Point}) was standardized. | ||
|
||
One important difference from x86 is that the ARM FPU is not | ||
organized as a register stack; you work with its registers directly. | ||
|
||
\lstinputlisting[label=ARM_leaf_example10,caption=\OptimizingXcodeIV (\ARMMode),style=customasmARM]{patterns/12_FPU/1_simple/ARM/Xcode_ARM_O3_JPN.asm} | ||
|
||
\myindex{ARM!D-\registers{}} | ||
\myindex{ARM!S-\registers{}} | ||
|
||
So, we see some new registers used here, with the D prefix. | ||
|
||
These are 64-bit registers, there are 32 of them, and they can be used both for floating-point numbers | ||
(double) and for SIMD (it is called NEON here in ARM). | ||
|
||
There are also 32 32-bit S-registers, intended to be used for single precision | ||
floating point numbers (float). | ||
|
||
It is easy to memorize: D-registers are for double precision numbers, while | ||
S-registers---for single precision numbers. | ||
More about it: \myref{ARM_VFP_registers}. | ||
|
||
Both constants (3.14 and 4.1) are stored in memory in IEEE 754 format. | ||
|
||
\myindex{ARM!\Instructions!VLDR} | ||
\myindex{ARM!\Instructions!VMOV} | ||
\INS{VLDR} and \INS{VMOV}, as it can be easily deduced, are analogous to the \INS{LDR} and \MOV instructions, | ||
but they work with D-registers. | ||
|
||
It has to be noted that these instructions, just like the D-registers, are intended not only for | ||
floating point numbers, | ||
but can be also used for SIMD (NEON) operations and this will also be shown soon. | ||
|
||
The arguments are passed to the function in a common way, via the R-registers, however | ||
each number that has double precision has a size of 64 bits, so two R-registers are needed to pass each one. | ||
|
||
\INS{VMOV D17, R0, R1} at the start, composes two 32-bit values from \Reg{0} and \Reg{1} into one 64-bit value | ||
and saves it to \GTT{D17}. | ||
|
||
\INS{VMOV R0, R1, D16} is the inverse operation: what has been in \GTT{D16} | ||
is split in two registers, \Reg{0} and \Reg{1}, because a double-precision number | ||
that needs 64 bits for storage, is returned in \Reg{0} and \Reg{1}. | ||
|
||
\myindex{ARM!\Instructions!VDIV} | ||
\myindex{ARM!\Instructions!VMUL} | ||
\myindex{ARM!\Instructions!VADD} | ||
\INS{VDIV}, \INS{VMUL} and \INS{VADD}, | ||
are instructions for processing floating point numbers that compute \gls{quotient}, | ||
\gls{product} and sum, respectively. | ||
|
||
The code for Thumb-2 is the same. | ||
|
||
\subsubsection{ARM: \OptimizingKeilVI (\ThumbMode)} | ||
|
||
\lstinputlisting[style=customasmARM]{patterns/12_FPU/1_simple/ARM/Keil_O3_thumb_JPN.asm} | ||
|
||
Keil generated code for a processor without FPU or NEON support. | ||
|
||
The double-precision floating-point numbers are passed via generic R-registers, | ||
and instead of FPU-instructions, service library functions are called\\ | ||
(like \GTT{\_\_aeabi\_dmul}, \GTT{\_\_aeabi\_ddiv}, \GTT{\_\_aeabi\_dadd}) | ||
which emulate multiplication, division and addition for floating-point numbers. | ||
|
||
Of course, that is slower than an FPU coprocessor, but it's still better than nothing. | ||
|
||
By the way, similar FPU-emulating libraries were very popular in the x86 world when coprocessors were rare | ||
and expensive, and were installed only on expensive computers. | ||
|
||
\myindex{ARM!soft float} | ||
\myindex{ARM!armel} | ||
\myindex{ARM!armhf} | ||
\myindex{ARM!hard float} | ||
|
||
The FPU-coprocessor emulation is called \IT{soft float} or \IT{armel} (\IT{emulation}) in the ARM world, | ||
while using the coprocessor's FPU-instructions is called \IT{hard float} or \IT{armhf}. | ||
|
||
\iffalse | ||
% TODO разобраться... | ||
\myindex{Raspberry Pi} | ||
|
||
For example, the Linux kernel for Raspberry Pi is compiled in two variants. | ||
|
||
In the \IT{soft float} case, arguments are passed via R-registers, and in the \IT{hard float} case---via D-registers. | ||
|
||
And that is what stops you from using armhf-libraries from armel-code or vice versa, | ||
and that is why all the code in Linux distributions must be compiled according to a single convention. | ||
\fi | ||
|
||
\subsubsection{ARM64: \Optimizing GCC (Linaro) 4.9} | ||
|
||
Very compact code: | ||
|
||
\lstinputlisting[caption=\Optimizing GCC (Linaro) 4.9,style=customasmARM]{patterns/12_FPU/1_simple/ARM/ARM64_GCC_O3_JPN.s} | ||
|
||
\subsubsection{ARM64: \NonOptimizing GCC (Linaro) 4.9} | ||
|
||
\lstinputlisting[caption=\NonOptimizing GCC (Linaro) 4.9,style=customasmARM]{patterns/12_FPU/1_simple/ARM/ARM64_GCC_O0_JPN.s} | ||
|
||
\NonOptimizing GCC is more verbose. | ||
|
||
There is a lot of unnecessary value shuffling, including some clearly redundant code | ||
(the last two \INS{FMOV} instructions). Probably, GCC 4.9 is not yet good at generating ARM64 code. | ||
|
||
What is worth noting is that ARM64 has 64-bit registers, and the D-registers are 64-bit ones as well. | ||
|
||
So the compiler is free to save values of type \Tdouble in \ac{GPR}s instead of the local stack. | ||
This isn't possible on 32-bit CPUs. | ||
|
||
And again, as an exercise, you can try to optimize this function manually, without introducing | ||
new instructions like \INS{FMADD}. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
public f | ||
; f: computes arg_0/3.14 + arg_8*4.1 on the x87 FPU stack; the result is left in ST(0). | ||
f proc near | ||
|
||
; Offsets of the two 8-byte arguments relative to EBP: | ||
arg_0 = qword ptr 8 | ||
arg_8 = qword ptr 10h | ||
|
||
push ebp | ||
fld ds:dbl_8048608 ; 3.14 | ||
|
||
; stack state now: ST(0) = 3.14 | ||
|
||
mov ebp, esp | ||
; FDIVR divides with the operands swapped: ST(0) = arg_0 / ST(0) | ||
fdivr [ebp+arg_0] | ||
|
||
; stack state now: ST(0) = result of division | ||
|
||
fld ds:dbl_8048610 ; 4.1 | ||
|
||
; stack state now: ST(0) = 4.1, ST(1) = result of division | ||
|
||
; ST(0) = ST(0) * arg_8 | ||
fmul [ebp+arg_8] | ||
|
||
; stack state now: ST(0) = result of multiplication, ST(1) = result of division | ||
|
||
pop ebp | ||
; FADDP adds the two values and pops one of them, so the sum ends up in ST(0). | ||
faddp st(1), st | ||
|
||
; stack state now: ST(0) = result of addition | ||
|
||
retn | ||
f endp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
\myparagraph{GCC} | ||
|
||
GCC 4.4.1 (with \Othree option) emits almost the same code, with slight differences: | ||
|
||
\lstinputlisting[caption=\Optimizing GCC 4.4.1,style=customasmx86]{patterns/12_FPU/1_simple/GCC_EN.asm} | ||
|
||
The difference is that, first of all, 3.14 is pushed to the stack (into \ST{0}), and then the value | ||
in \GTT{arg\_0} is divided by the value in \ST{0}. | ||
|
||
\myindex{x86!\Instructions!FDIVR} | ||
|
||
\FDIVR stands for \IT{Reverse Divide}~---to divide with divisor and dividend swapped with each other. | ||
There is no analogous instruction for multiplication since it is | ||
a commutative operation, so we just have \FMUL without its \GTT{-R} counterpart. | ||
|
||
\myindex{x86!\Instructions!FADDP} | ||
|
||
\FADDP adds the two values but also pops one value from the stack. | ||
After that operation, \ST{0} holds the sum. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
\subsubsection{MIPS} | ||
|
||
MIPS can support several coprocessors (up to 4), | ||
the zeroth of which\footnote{Starting at 0.} is a special control coprocessor, | ||
and the first coprocessor is the FPU. | ||
|
||
As in ARM, the MIPS coprocessor is not a stack machine, it has 32 32-bit registers (\$F0-\$F31): | ||
\myref{MIPS_FPU_registers}. | ||
|
||
When one needs to work with 64-bit \Tdouble values, a pair of 32-bit F-registers is used. | ||
|
||
\lstinputlisting[caption=\Optimizing GCC 4.4.5 (IDA),style=customasmMIPS]{patterns/12_FPU/1_simple/MIPS_O3_IDA_JPN.lst} | ||
|
||
The new instructions here are: | ||
|
||
\myindex{MIPS!\Instructions!LWC1} | ||
\myindex{MIPS!\Instructions!DIV.D} | ||
\myindex{MIPS!\Instructions!MUL.D} | ||
\myindex{MIPS!\Instructions!ADD.D} | ||
\begin{itemize} | ||
|
||
\item \INS{LWC1} loads a 32-bit word into a register of the first coprocessor (hence \q{1} in instruction name). | ||
\myindex{MIPS!\Pseudoinstructions!L.D} | ||
|
||
A pair of \INS{LWC1} instructions may be combined into a \INS{L.D} pseudo instruction. | ||
|
||
\item \INS{DIV.D}, \INS{MUL.D}, \INS{ADD.D} do division, multiplication, and addition respectively | ||
(\q{.D} in the suffix stands for double precision, \q{.S} stands for single precision) | ||
|
||
\end{itemize} | ||
|
||
\myindex{MIPS!\Instructions!LUI} | ||
\myindex{\CompilerAnomaly} | ||
\label{MIPS_FPU_LUI} | ||
|
||
There is also a weird compiler anomaly: the \INS{LUI} instructions that we've marked with a question mark. | ||
It's hard for me to understand why the compiler loads a part of a 64-bit constant of \Tdouble type into the \$V0 register. | ||
These instructions have no effect. | ||
% TODO did you try checking out compiler source code? | ||
If someone knows more about it, please drop an email to author\footnote{\EMAIL}. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
f: | ||
; f(A, B): computes A/3.14 + B*4.1; each 64-bit value occupies a pair of 32-bit F-registers. | ||
; The result is left in $f0-$f1. | ||
; $f12-$f13=A | ||
; $f14-$f15=B | ||
lui $v0, (dword_C4 >> 16) ; ? | ||
; load low 32-bit part of 3.14 constant to $f0: | ||
lwc1 $f0, dword_BC | ||
or $at, $zero ; load delay slot, NOP | ||
; load high 32-bit part of 3.14 constant to $f1: | ||
lwc1 $f1, $LC0 | ||
lui $v0, ($LC1 >> 16) ; ? | ||
; A in $f12-$f13, 3.14 constant in $f0-$f1, do division: | ||
div.d $f0, $f12, $f0 | ||
; $f0-$f1=A/3.14 | ||
; load low 32-bit part of 4.1 to $f2: | ||
lwc1 $f2, dword_C4 | ||
or $at, $zero ; load delay slot, NOP | ||
; load high 32-bit part of 4.1 to $f3: | ||
lwc1 $f3, $LC1 | ||
or $at, $zero ; load delay slot, NOP | ||
; B in $f14-$f15, 4.1 constant in $f2-$f3, do multiplication: | ||
mul.d $f2, $f14, $f2 | ||
; $f2-$f3=B*4.1 | ||
jr $ra | ||
; sum 64-bit parts and leave result in $f0-$f1: | ||
add.d $f0, $f2 ; branch delay slot (filled with this ADD.D, not a NOP) | ||
|
||
|
||
; The two 64-bit constants, each stored as a high word followed by a low word: | ||
.rodata.cst8:000000B8 $LC0: .word 0x40091EB8 # DATA XREF: f+C | ||
.rodata.cst8:000000BC dword_BC: .word 0x51EB851F # DATA XREF: f+4 | ||
.rodata.cst8:000000C0 $LC1: .word 0x40106666 # DATA XREF: f+10 | ||
.rodata.cst8:000000C4 dword_C4: .word 0x66666666 # DATA XREF: f |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
CONST SEGMENT | ||
__real@4010666666666666 DQ 04010666666666666r ; 4.1 | ||
CONST ENDS | ||
CONST SEGMENT | ||
__real@40091eb851eb851f DQ 040091eb851eb851fr ; 3.14 | ||
CONST ENDS | ||
_TEXT SEGMENT | ||
; Offsets of the two 8-byte arguments relative to EBP: | ||
_a$ = 8 ; size = 8 | ||
_b$ = 16 ; size = 8 | ||
; _f(_a, _b): computes _a/3.14 + _b*4.1 on the x87 FPU stack; the result is left in ST(0). | ||
_f PROC | ||
push ebp | ||
mov ebp, esp | ||
fld QWORD PTR _a$[ebp] | ||
|
||
; current stack state: ST(0) = _a | ||
|
||
fdiv QWORD PTR __real@40091eb851eb851f | ||
|
||
; current stack state: ST(0) = result of _a divided by 3.14 | ||
|
||
fld QWORD PTR _b$[ebp] | ||
|
||
; current stack state: ST(0) = _b; | ||
; ST(1) = result of _a divided by 3.14 | ||
|
||
fmul QWORD PTR __real@4010666666666666 | ||
|
||
; current stack state: | ||
; ST(0) = result of _b * 4.1; | ||
; ST(1) = result of _a divided by 3.14 | ||
|
||
; Add ST(0) into ST(1), then pop, so the sum becomes the new ST(0). | ||
faddp ST(1), ST(0) | ||
|
||
; current stack state: ST(0) = result of addition | ||
|
||
pop ebp | ||
ret 0 | ||
_f ENDP |
Oops, something went wrong.