What the hell - Why should I learn assembly?

Understanding debugger output:

400d4e:  55                  push %rbp
400d4f:  48 89 e5            mov %rsp,%rbp
400d52:  bf 84 79 48 00      mov $0x487984,%edi
400d57:  e8 54 6b 00 00      callq 4078b0 <_IO_puts>
400d5c:  5d                  pop %rbp
400d5d:  c3                  retq

get full control over your hardware (using specific instructions)

system programming (e.g. kernel entry/exit)
We need to go deeper: Fibonacci

```c
int fib(int n)
{
    int fcur = 0, fnext = 1, tmp;
    while(--n>0) {
        tmp = fcur + fnext;
        fcur = fnext;
        fnext = tmp;
    }
    return fnext;
}

int main(int argc, char **argv)
{
    printf("Fib: %d\n", fib(atoi(argv[1])));
}
```
Fibonacci

```
int fib(int n)
{
    int fcur = 0, fnext = 1, tmp;
    while (--n > 0) {
        tmp = fcur + fnext;
        fcur = fnext;
        fnext = tmp;
    }
    return fnext;
}
```
Fibonacci

fib.c

```c
int fib(int n)
{
    int fcur = 0, fnext = 1, tmp;
    while (--n > 0) {
        tmp = fcur + fnext;
        fcur = fnext;
        fnext = tmp;
    }
    return fnext;
}
```

CFLAGS="-Wall -O2 -march=x86-64" make fib.o
Sections of object file

$ objdump -h fib.o

fib.o:     file format elf64-x86-64

Sections:
Idx     Name    Size       ...       File off   Algn
  0     .text   00000021   ...   00000040   2**4
        CONTENTS, ALLOC, LOAD, READONLY, CODE
  1     .data   00000000   ...   00000061   2**0
        CONTENTS, ALLOC, LOAD, DATA

...
Sections of object file

```
$ objdump -h fib.o

fib.o:     file format elf64-x86-64

Sections:
Idx     Name    Size       ...       File off   Algn
  0     .text   00000021   ...   00000040   2**4
        CONTENTS, ALLOC, LOAD, READONLY, CODE
  1     .data   00000000   ...   00000061   2**0
        CONTENTS, ALLOC, LOAD, DATA

...
```
Looking into text section

$ dd if=fib.o of=fib.o.hex bs=1 count=$((0x23)) skip=$((0x40))
35+0 records in
35+0 records out
35 bytes copied, 0.000799485 s, 43.8 kB/s
$ xxd fib.o.hex
00000000: 83ef 01b8 0100 0000 85ff 7e14 31d2 6690 ..........~.1.f.
00000010: 89c1 01d0 89ca 83ef 0175 f5c3 0f1f 4000 .........u....@.
00000020: c3
What sees a processor

83ef01b80100000085ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3
What sees a human

fib:

```
sub $0x1, %edi
mov $0x1, %eax
test %edi, %edi
jle 20 <fib+0x20>
xor %edx, %edx
xchg %ax, %ax
mov %eax, %ecx
add %edx, %eax
mov %ecx, %edx
sub $0x1, %edi
jne 10 <fib+0x10>
ret
nop 0x0(%rax)
ret
```
What sees a processor

83ef01b80100000085ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3
What sees a processor

83ef01b80100000085ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3

$ wget http://svn.inf.tu-dresden.de/repos/advsysprog/asm/opcodes.pdf
What sees a processor

83ef01b80100000085ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3


$ wget http://svn.inf.tu-dresden.de/repos/advsysprog/asm/opcodes.pdf

Table[0x8, 0x3] = Immediate Grp 1: Ev, Ib
What 0x83 stands for?

- Ev, Ib
  - E A ModR/M byte follows the opcode. The operand is either a GPR or an address.
  - v Word, doubleword or quadword
  - I Immediate data
  - b Byte
What 0x83 stands for?

Ev, Ib

E  A ModR/M byte follows the opcode. The operand is either a GPR or an address.

v  Word, doubleword or quadword

I  Immediate data

b  Byte

Need to look into next byte
ModR/M

83ef01b80100000085ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3
ModR/M

83ef01b80100000085ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3

<table>
<thead>
<tr>
<th>Mod</th>
<th>Reg/Opcode</th>
<th>R/M</th>
</tr>
</thead>
<tbody>
<tr>
<td>11</td>
<td>101</td>
<td>111</td>
</tr>
</tbody>
</table>
ModR/M

83ef01b80100000085ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3

Mod(11) + R/M(111) → edi
Opcode(101) → sub
Immediate data

```
sub imm8, %edi
```

Look into next byte

```
sub $0x1, %edi
```
Immediate data

```
sub imm8, %edi

83ef01b80100000085ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3
```

Look into next byte
Immediate data

**sub** imm8, %edi

83ef01b80100000085ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3

Look into next byte
Immediate data

```
sub imm8, %edi

83ef01 b80100000085ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3

Look into next byte

sub $0x1, %edi
```
Table[b, 8] = Mov: rAX/r8, Iv

Move immediate word into word register

`mov $0x1, %eax`
b80100000085ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3
Next instruction

Table[b, 8] = Mov: rAX/r8, Iv
Next instruction

b80100000085ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3

Table[b, 8] = Mov: rAX/r8, Iv
Move immediate word into word register
Next instruction

Table[b, 8] = Mov: rAX/r8, Iv
Move immediate word into word register

`mov $0x1, %eax`
Next instruction (ModR/M)

85ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3
Next instruction (ModR/M)

85ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3

Test Ev, Gv
Next instruction (ModR/M)

85ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3

Test Ev, Gv

<table>
<thead>
<tr>
<th>Mod (11)</th>
<th>Reg/Opcode (111)</th>
<th>R/M (111)</th>
</tr>
</thead>
<tbody>
<tr>
<td>1 1</td>
<td>1 1 1</td>
<td>1 1 1</td>
</tr>
</tbody>
</table>

Mod(11) + R/M(111) → edi
Reg(111) → edi

test %edi, %edi
Next instruction (ModR/M)

85ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3

Test Ev, Gv

<table>
<thead>
<tr>
<th>Mod</th>
<th>Reg/Opcode</th>
<th>R/M</th>
</tr>
</thead>
<tbody>
<tr>
<td>7 6</td>
<td>5 4 3</td>
<td>2 1 0</td>
</tr>
<tr>
<td>1 1</td>
<td>1 1 1</td>
<td>1 1 1</td>
</tr>
</tbody>
</table>

Mod(11) + R/M(111) → edi
Reg(111) → edi
Next instruction (ModR/M)

85ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3

Test Ev, Gv

<table>
<thead>
<tr>
<th>Mod</th>
<th>Reg/Opcode</th>
<th>R/M</th>
</tr>
</thead>
<tbody>
<tr>
<td>7 6</td>
<td>5 4 3</td>
<td>2 1 0</td>
</tr>
<tr>
<td>1 1</td>
<td>1 1 1</td>
<td>1 1 1</td>
</tr>
</tbody>
</table>

Mod(11) + R/M(111) → edi
Reg(111) → edi

test %edi, %edi
Continue decoding

85ff7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3
Continue decoding

7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3
7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3

Table[7, e] = jle

*Short jump* is followed by single byte immediate offset. *Near jump* has prefix 0x0f, e.g. 0x0f8e – near jle.
7e1431d2669089c101d089ca83ef0175f5c30f1f4000c3

Table[7, e] = jle

*Short jump* is followed by single byte immediate offset. *Near jump* has prefix 0x0f, e.g. 0x0f8e – near jle.
Continue decoding

Table[7, e] = jle

*Short jump* is followed by single byte immediate offset. *Near jump* has prefix 0x0f, e.g. 0x0f8e – near jle.

    jle  $0x14

Jump is relative to the address of next instruction.
Check

0000000000000000 <fib>:

 0: 83 ef 01           sub  $0x1,%edi
 3: b8 01 00 00 00     mov  $0x1,%eax
 8: 85 ff              test %edi,%edi
 a: 7e 14              jle  20 <fib+0x20>
    ...                ...
1c: 0f 1f 40 00        nopl 0x0(%rax)
20: c3                 ret
This chapter describes the instruction format for all Intel 64 and IA-32 processors. The instruction format for protected mode, real-address mode and virtual-8086 mode is described in Section 2.1. Increments provided for IA-32e mode and its sub-modes are described in Section 2.2.

2.1 INSTRUCTION FORMAT FOR PROTECTED MODE, REAL-ADDRESS MODE, AND VIRTUAL-8086 MODE

The Intel 64 and IA-32 architectures instruction encodings are subsets of the format shown in Figure 2-1. Instructions consist of optional instruction prefixes (in any order), primary opcode bytes (up to three bytes), an addressing-form specifier (if required) consisting of the ModR/M byte and sometimes the SIB (Scale-Index-Base) byte, a displacement (if required), and an immediate data field (if required).

2.1.1 Instruction Prefixes

Instruction prefixes are divided into four groups, each with a set of allowable prefix codes. For each instruction, it is only useful to include up to one prefix code from each of the four groups (Groups 1, 2, 3, 4). Groups 1 through 4 may be placed in any order relative to each other.

- **Group 1** — Lock and repeat prefixes:
  - LOCK prefix is encoded using F0H.
  - REPNE/REPNZ prefix is encoded using F2H. Repeat-Not-Zero prefix applies only to string and input/output instructions. (F2H is also used as a mandatory prefix for some instructions.)
  - REP or REPE/REPZ is encoded using F3H. The repeat prefix applies only to string and input/output instructions. F3H is also used as a mandatory prefix for POPCNT, LZCNT and ADOX instructions.

- **Group 2** — Bound prefix is encoded using F2H if the following conditions are true:
  - CPUID.(EAX=07H, ECX=0):EBX.MPX[bit 14] is set.

**Figure 2-1. Intel 64 and IA-32 Architectures Instruction Format**

<table>
<thead>
<tr>
<th>Instruction Prefixes</th>
<th>Opcode</th>
<th>ModR/M</th>
<th>SIB</th>
<th>Displacement</th>
<th>Immediate</th>
</tr>
</thead>
<tbody>
<tr>
<td>Prefixes of 1 byte each (optional)<sup>1, 2</sup></td>
<td>1-, 2-, or 3-byte opcode</td>
<td>1 byte (if required)</td>
<td>1 byte (if required)</td>
<td>Address displacement of 1, 2, or 4 bytes or none<sup>3</sup></td>
<td>Immediate data of 1, 2, or 4 bytes or none<sup>3</sup></td>
</tr>
</tbody>
</table>

1. The REX prefix is optional, but if used must be immediately before the opcode; see Section 2.2.1, “REX Prefixes” for additional information.
2. For VEX encoding information, see Section 2.3, “Intel® Advanced Vector Extensions (Intel® AVX)”.  
3. Some rare instructions can take an 8B immediate or 8B displacement.

1 byte of Mod R/M Reg/Opcode R/M Scale Index Base

Address displacement of 1, 2, or 4 bytes or none
2.2.1 REX Prefixes

REX prefixes are instruction-prefix bytes used in 64-bit mode. They do the following:

- Specify GPRs and SSE registers.
- Specify 64-bit operand size.
- Specify extended control registers.

Not all instructions require a REX prefix in 64-bit mode. A prefix is necessary only if an instruction references one of the extended registers or uses a 64-bit operand. If a REX prefix is used when it has no meaning, it is ignored.

Only one REX prefix is allowed per instruction. If used, the REX prefix byte must immediately precede the opcode byte or the escape opcode byte (0FH). When a REX prefix is used in conjunction with an instruction containing a mandatory prefix, the mandatory prefix must come before the REX so the REX prefix can be immediately preceding the opcode or the escape byte. For example, CVTDQ2PD with a REX prefix should have REX placed between F3 and 0F E6. Other placements are ignored. The instruction-size limit of 15 bytes still applies to instructions with a REX prefix. See Figure 2-3.

2.2.1.1 Encoding

Intel 64 and IA-32 instruction formats specify up to three registers by using 3-bit fields in the encoding, depending on the format:

- ModR/M: the reg and r/m fields of the ModR/M byte
- ModR/M with SIB: the reg field of the ModR/M byte, the base and index fields of the SIB (scale, index, base) byte
- Instructions without ModR/M: the reg field of the opcode

In 64-bit mode, these formats do not change. Bits needed to define fields in the 64-bit context are provided by the addition of REX prefixes.

2.2.1.2 More on REX Prefix Fields

REX prefixes are a set of 16 opcodes that span one row of the opcode map and occupy entries 40H to 4FH. These opcodes represent valid instructions (INC or DEC) in IA-32 operating modes and in compatibility mode. In 64-bit mode, the same opcodes represent the instruction prefix REX and are not treated as individual instructions.

The single-byte-opcode forms of the INC/DEC instructions are not available in 64-bit mode. INC/DEC functionality is still available using ModR/M forms of the same instructions (opcodes FF/0 and FF/1).

See Table 2-4 for a summary of the REX prefix format. Figure 2-4 through Figure 2-7 show examples of REX prefix fields in use. Some combinations of REX prefix fields are invalid. In such cases, the prefix is ignored. Some additional information follows:

- Setting REX.W can be used to determine the operand size but does not solely determine operand width. Like the 66H size prefix, 64-bit operand size override has no effect on byte-specific operations.
- For non-byte operations: if a 66H prefix is used with prefix (REX.W = 1), 66H is ignored.
- If a 66H override is used with REX and REX.W = 0, the operand size is 16 bits.

Figure 2-3. Prefix Ordering in 64-bit Mode
Try it yourself

Pick one of those:

- 8d 04 11
- 75 f5
- 66 90
Complete disassembly

0000000000000000 <fib>:

0: 83 ef 01   sub $0x1,%edi
3: b8 01 00 00 00  mov $0x1,%eax
8: 85 ff      test %edi,%edi
a: 7e 14      jle 20 <fib+0x20>
c: 31 d2      xor %edx,%edx
e: 66 90      xchg %ax,%ax
10: 89 c1     mov %eax,%ecx
12: 01 d0     add %edx,%eax
14: 89 ca     mov %ecx,%edx
16: 83 ef 01   sub $0x1,%edi
19: 75 f5     jne 10 <fib+0x10>
1b: c3        ret
1c: 0f 1f 40 00  nopl 0x0(%rax)
20: c3        ret
What do you think?

- What is the most common number of operands for x86 assembly?
What do you think?

- What is the most common number of operands for x86 assembly?
- Why there is no three operand assembly instruction?
What do you think?

- What is the most common number of operands for x86 assembly?
- Why there is no three operand assembly instruction?
- Fixed length instructions. What are advantages and disadvantages?
What do you think?

- What is the most common number of operands for x86 assembly?
- Why there is no three operand assembly instruction?
- Fixed length instructions. What are advantages and disadvantages?
- What is one operand instruction?
What do you think?

- What is the most common number of operands for x86 assembly?
- Why there is no three operand assembly instruction?
- Fixed length instructions. What are advantages and disadvantages?
- What is one operand instruction?
- What is zero operand instruction?
General Purpose Registers

- Data registers
- Flags register
- Instruction pointer

![Diagram of General Purpose Registers in 64-Bit Mode](image)

* Not addressable in REX prefix instruction forms
** Only addressable in REX prefix instruction forms

Figure 3-3. General Purpose Registers in 64-Bit Mode
Register Names

Did you know register names are there for a reason?

▶ (R/E)SP – stack pointer
▶ (R/E)BP – base pointer
▶ (R/E)IP – instruction pointer
Did you know register names are there for a reason?

- (R/E)SP – stack pointer
- (R/E)BP – base pointer
- (R/E)IP – instruction pointer
- (R/E)AX – accumulator
- (R/E)BX – base register
- (R/E)CX – counter register
- (R/E)DX – extended accumulator
- (R/E)SI – source index
- (R/E)DI – destination index
Move Instructions

mov
move data between registers or to/from memory

movl $1, %eax
movl $0xff, %ebx
movl (%ebx), %eax
movl 3(%ebx), %eax
## Assembler dialects

<table>
<thead>
<tr>
<th></th>
<th><strong>Intel</strong></th>
<th><strong>AT&amp;T</strong></th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>order</strong></td>
<td>instr dest, src</td>
<td>instr src, dest</td>
</tr>
<tr>
<td><strong>size</strong></td>
<td>implicit (by reg. name)</td>
<td>explicit (by instr)</td>
</tr>
<tr>
<td><strong>Sigils</strong></td>
<td>automatic</td>
<td>prefixes ($, %)</td>
</tr>
<tr>
<td><strong>mem access</strong></td>
<td>[base+index*scale+disp]</td>
<td>disp(base,index,scale)</td>
</tr>
<tr>
<td></td>
<td>[base + disp]</td>
<td>disp(base)</td>
</tr>
<tr>
<td><strong>Example</strong></td>
<td></td>
<td></td>
</tr>
<tr>
<td>mov</td>
<td>eax, 1</td>
<td>movl $1,%eax</td>
</tr>
<tr>
<td>mov</td>
<td>ebx, 0ffh</td>
<td>movl $0xff,%ebx</td>
</tr>
<tr>
<td>mov</td>
<td>eax, [ebx]</td>
<td>movl (%ebx),%eax</td>
</tr>
<tr>
<td>mov</td>
<td>eax, [ebx+3]</td>
<td>movl 3(%ebx),%eax</td>
</tr>
</tbody>
</table>
Arithmetic Instructions

add/sub
addition / subtraction

add $1, %eax
add %eax, %ebx
sub $1, %eax
sub %eax, %ebx
Logical Instructions

and/or/xor/test

logical operations

and  %eax,%ebx
or   %eax,%ebx
xor  %eax,%ebx
test %eax,%ebx
Stack Instructions

push/pop

push or pop register content to or from the stack

```assembly
push  %eax
pop   %eax
pusha
popa
```
Function-related Instructions

call

call a function

call 0xC0FFEE
call 0xBADA55
ret
Arguments are passed on the stack. Integer values and memory addresses are returned in the EAX register. Registers EAX, ECX, and EDX are caller-saved, and the rest are callee-saved.

https://en.wikipedia.org/wiki/X86_calling_conventions
### x86_64

<table>
<thead>
<tr>
<th>Platform</th>
<th>Par. Reg</th>
<th>Par. Stack</th>
<th>Cleanup</th>
</tr>
</thead>
<tbody>
<tr>
<td>Microsoft</td>
<td>RCX, RDX, R8, R9</td>
<td>RTL(C)</td>
<td>Caller</td>
</tr>
<tr>
<td>System V</td>
<td>RDI, RSI, RDX, RCX, R8, R9</td>
<td>RTL(C)</td>
<td>Caller</td>
</tr>
</tbody>
</table>

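**Par. Reg** lists, in order, the registers that carry the first integer/pointer parameters; **Par. Stack** gives the order in which the remaining parameters are placed on the stack (RTL = right-to-left, as in C); and **Cleanup** names who removes the stack parameters after the call.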
<table>
<thead>
<tr>
<th></th>
<th>Par. Reg</th>
<th>Par. Stack</th>
<th>Cleanup</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>Microsoft</strong></td>
<td>RCX, RDX, R8, R9</td>
<td>RTL(C)</td>
<td>Caller</td>
</tr>
<tr>
<td><strong>System V</strong></td>
<td>RDI, RSI, RDX, RCX</td>
<td>RTL(C)</td>
<td>Caller</td>
</tr>
<tr>
<td></td>
<td>R8, R9</td>
<td></td>
<td></td>
</tr>
</tbody>
</table>

<table>
<thead>
<tr>
<th></th>
<th>Return</th>
<th>Callee Saved</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>Microsoft</strong></td>
<td>RAX</td>
<td>RBX, RBP, RDI, RSI, R12 - R15</td>
</tr>
<tr>
<td><strong>System V</strong></td>
<td>RAX</td>
<td>RBX, RBP, R12-R15</td>
</tr>
</tbody>
</table>
Buffers on the stack

Stolen from DOS...
The Battlefield: x86/32

CPU

- EAX
- EBX
- ECX
- EDX
- ESI
- EDI
- EBP
- ESP

General-purpose registers

EIP

Instruction pointer

Segment, FPU, control, MMX, … registers

Address Space

0xFFFFFFFF

Kernel

0xBFFFFFFF

Stack

BSS

Data

Text

0x00000000

Exploitz
The Stack

- Stack frame per function
  - Set up by compiler-generated code
- Used to store
  - Function parameters
  - If not in registers – GCC:
    __attribute__((regparm(<num>)))
  - Local variables
  - Control information
    - Function return address

Address Space

- Kernel
- Stack
- BSS
- Data
- Text

Exploitz
int sum(int a, int b)
{
   return a+b;
}

int main()
{
   return sum(1,3);
}
%<reg> refers to register content

Offset notation: X(%reg) == memory
Location pointed to by reg + X

Constants prefixed with $ sign

(%<reg>) refers to memory location pointed to by <reg>

Exploitz

Assembly recap’d

sum:
  pushl %ebp
  movl %esp, %ebp
  movl 12(%ebp), %eax
  addl 8(%ebp), %eax
  popl %ebp
  ret

main:
  pushl %ebp
  movl %esp, %ebp
  subl $8, %esp
  movl $3, 4(%esp)
  movl $1, (%esp)
  call sum
  ret
So what happens on a call?

```assembly
sum:
  pushl %ebp
  movl %esp, %ebp
  movl 12(%ebp), %eax
  addl 8(%ebp), %eax
  leave
  ret

main:
  pushl %ebp
  movl %esp, %ebp
  subl $8, %esp
  movl $3, 4(%esp)
  movl $1, (%esp)
  call sum
  ret
```
So what happens on a call?

```
sum:
  pushl %ebp
  movl %esp, %ebp
  movl 12(%ebp), %eax
  addl 8(%ebp), %eax
  leave
  ret

main:
  pushl %ebp
  movl %esp, %ebp
  subl $8, %esp
  movl $3, 4(%esp)
  movl $1, (%esp)
  call sum
  ret
```
So what happens on a call?

```assembly
sum:
  pushl %ebp
  movl %esp, %ebp
  movl 12(%ebp), %eax
  addl 8(%ebp), %eax
  leave
  ret

main:
  pushl %ebp
  movl %esp, %ebp
  subl $8, %esp
  movl $3, 4(%esp)
  movl $1, (%esp)
  call sum
  ret
```

Stack
So what happens on a call?

```
sum:
  pushl %ebp
  movl %esp, %ebp
  movl 12(%ebp), %eax
  addl 8(%ebp), %eax
  leave
  ret
```

```
main:
  pushl %ebp
  movl %esp, %ebp
  subl $8, %esp
  movl $3, 4(%esp)
  movl $1, (%esp)
  call sum
  ret
```
So what happens on a call?

```assembly
sum:
  pushl %ebp
  movl %esp, %ebp
  movl 12(%ebp), %eax
  addl 8(%ebp), %eax
  leave
  ret

main:
  pushl %ebp
  movl %esp, %ebp
  subl $8, %esp
  movl $3, 4(%esp)
  movl $1, (%esp)
  call sum
  ret
```
So what happens on a call?

sum:
  pushl %ebp
  movl %esp, %ebp
  movl 12(%ebp), %eax
  addl 8(%ebp), %eax
  leave
  ret

main:
  pushl %ebp
  movl %esp, %ebp
  subl $8, %esp
  movl $3, 4(%esp)
  movl $1, (%esp)
  call sum
  ret
So what happens on a call?

Exploitz

Stack

EIP

ESP

EBP (main)

3

1

Return Addr

sum:
  pushl %ebp
  movl %esp, %ebp
  movl 12(%ebp), %eax
  addl 8(%ebp), %eax
  leave
  ret

main:
  pushl %ebp
  movl %esp, %ebp
  subl $8, %esp
  movl $3, 4(%esp)
  movl $1, (%esp)
  call sum
  ret
So what happens on a call?

```assembly
main:
  pushl %ebp
  movl %esp, %ebp
  subl $8, %esp
  movl $3, 4(%esp)
  movl $1, (%esp)
  call sum
  ret

sum:
  pushl %ebp
  movl %esp, %ebp
  movl 12(%ebp), %eax
  addl 8(%ebp), %eax
  leave
  ret
```

Stack:
- EBP (main)
  - 3
  - 1
- Return Addr
- EBP (sum)

Exploitz
So what happens on a call?

```
sum:
pushl %ebp
movl %esp, %ebp
movl 12(%ebp), %eax
addl 8(%ebp), %eax
leave
ret
```

```
main:
pushl %ebp
movl %esp, %ebp
subl $8, %esp
movl $3, 4(%esp)
movl $1, (%esp)
call sum
ret
```
So what happens on a call?

**sum:**
```
pushl %ebp
movl %esp, %ebp
movl 12(%ebp), %eax
addl 8(%ebp), %eax
leave
ret
```

**main:**
```
pushl %ebp
movl %esp, %ebp
subl $8, %esp
movl $3, 4(%esp)
movl $1, (%esp)
call sum
ret
```
So what happens on a call?

```
sum:
  pushl %ebp
  movl %esp, %ebp
  movl 12(%ebp), %eax
  addl 8(%ebp), %eax
  leave
  ret

main:
  pushl %ebp
  movl %esp, %ebp
  subl $8, %esp
  movl $3, 4(%esp)
  movl $1, (%esp)
  call sum
  ret
```

EIP
ESP
EBP (main)
3
1
Return Addr
EBP (sum)
EBP
EAX: 4
Exploitz
So what happens on a call?

```
sum:
  pushl %ebp
  movl %esp, %ebp
  movl 12(%ebp), %eax
  addl 8(%ebp), %eax
  leave
  ret

main:
  pushl %ebp
  movl %esp, %ebp
  subl $8, %esp
  movl $3, 4(%esp)
  movl $1, (%esp)
  call sum
  ret
```
So what happens on a call?

```
sum:
  pushl %ebp
  movl %esp, %ebp
  movl 12(%ebp), %eax
  addl 8(%ebp), %eax
  leave
  ret

main:
  pushl %ebp
  movl %esp, %ebp
  subl $8, %esp
  movl $3, 4(%esp)
  movl $1, (%esp)
  call sum
  ret
```
Now let's add a buffer

```c
int foo()
{
    char buf[20];
    return 0;
}

int main()
{
    return foo();
}
```

```assembly
foo:
    pushl %ebp
    movl %esp, %ebp
    subl $32, %esp
    movl $0, %eax
    leave
    ret

main:
    pushl %ebp
    movl %esp, %ebp
    call foo
    popl %ebp
    ret
```
Now let's add a buffer

foo:
   pushl %ebp
   movl %esp, %ebp
   subl $32, %esp
   movl $0, %eax
   ...
   leave
   ret

main:
   pushl %ebp
   movl %esp, %ebp
   call foo
   popl %ebp
   ret
Now let's add a buffer

```
foo:
   pushl %ebp
   movl %esp, %ebp
   subl $32, %esp
   movl $0, %eax
   leave
   ret
```

```
main:
   pushl %ebp
   movl %esp, %ebp
   call foo
   popl %ebp
   ret
```
Calling a libC function

```c
int foo(char *str)
{
    char buf[20];
    strcpy(buf, str);
    return 0;
}

int main(int argc, char *argv[])
{
    return foo(argv[1]);
}
```

foo:
  pushl %ebp
  movl %esp, %ebp
  subl $36, %esp
  movl 8(%ebp), %eax
  movl %eax, 4(%esp)
  leal -28(%ebp), %eax
  movl %eax, (%esp)
  call strcpy
  xorl %eax, %eax
  leave
  ret
Calling a libC function

foo:
  pushl %ebp
  movl %esp, %ebp
  subl $36, %esp
  movl 8(%ebp), %eax
  movl %eax, 4(%esp)
  leal -28(%ebp), %eax
  movl %eax, (%esp)
  call strcpy
  xorl %eax, %eax
  leave
  ret
Calling a libC function

```
foo:
pushl %ebp
movl %esp, %ebp
subl $36, %esp
movl 8(%ebp), %eax
movl %eax, 4(%esp)
leal -28(%ebp), %eax
movl %eax, (%esp)
call strcpy
xorl %eax, %eax
leave
ret
```
Calling a libC function

foo:
  pushl %ebp
  movl %esp, %ebp
  subl $36, %esp
  movl 8(%ebp), %eax
  movl %eax, 4(%esp)
  leal -28(%ebp), %eax
  movl %eax, (%esp)
  call strcpy
  xorl %eax, %eax
  leave
  ret

Stack
  EBP (main)
  string ptr
  Return Addr
  EBP(foo)

Exploitz
Calling a libC function

foo:
  pushl %ebp
  movl %esp, %ebp
  subl $36, %esp
  movl 8(%ebp), %eax
  movl %eax, 4(%esp)
  leal -28(%ebp), %eax
  movl %eax, (%esp)
  call strcpy
  xorl %eax, %eax
  leave
  ret
Calling a libC function

foo:
  pushl %ebp
  movl %esp, %ebp
  subl $36, %esp
  movl 8(%ebp), %eax
  movl %eax, 4(%esp)
  leal -28(%ebp), %eax
  movl %eax, (%esp)
  call strcpy
  xorl %eax, %eax
  leave
  ret
Calling a libC function

foo:
  pushl %ebp
  movl %esp, %ebp
  subl $36, %esp
  movl 8(%ebp), %eax
  movl %eax, 4(%esp)
  leal -28(%ebp), %eax
  movl %eax, (%esp)
  call strcpy
  xorl %eax, %eax
  leave
  ret
Calling a libC function

```
foo:
pushl %ebp
movl %esp, %ebp
subl $36, %esp
movl 8(%ebp), %eax
movl %eax, 4(%esp)
leal -28(%ebp), %eax
movl %eax, (%esp)
call strcpy
xorl %eax, %eax
leave
ret
```

string = "Hello world"
Calling Assembly from C

main.c:

```c
extern int get_random(void);

int main(int argc, char **argv)
{
}
```

main.S:
```
.global get_random
get_random:
    mov $4, %rax
    ret
```
Calling Assembly from C

main.c:

```c
extern int get_random(void);

int main(int argc, char ** argv) {
}
```

main.S:

```assembly
.global get_random

get_random:
    mov $4, %rax
    ret
```
Assignment: functions

Let’s write some code:

1. add two values
2. return the current instruction pointer (rip)
3. return the current stack pointer (rsp)
Functions in assembly

1. How do you create local variables?
2. How do you ensure that control flow of a function does not go into another function?
3. Can address on a stack be one or two bytes, like with jmp?
4. Is it possible to use pop and jmp instead of ret? How?
System calls

<table>
<thead>
<tr>
<th>Linux</th>
<th>Return</th>
<th>Syscall Number</th>
<th>Args</th>
</tr>
</thead>
<tbody>
<tr>
<td></td>
<td>RAX</td>
<td>RAX</td>
<td>RDI, RSI, RDX, R10, R8, R9</td>
</tr>
</tbody>
</table>

Max. 6 Arguments for syscalls.
Assignment: functions

Let’s write some code:

1. get the process id from the operating system
You will need the getpid() system call – number 39 (x86_64).
sti / cli
enable / disable interrupts

sti
cli
How would you implement a loop? Which instructions do you need?
cmp
compare two values

cmp $0, %eax
cmp %eax, %ebx
cmp
compare two values

    cmp $0, %eax
    cmp %eax, %ebx

Where to store the result?
Special purpose register that contains several bits to indicate the result of certain instructions – like cmp.

0  CF Carry Flag
2  PF Parity Flag
6  ZF Zero Flag
7  SF Sign Flag
8  TF Trap Flag (single step)
9  IF Interrupt Enable Flag

https://en.wikipedia.org/wiki/FLAGS_register
jmp
(Conditionally) jump to an address

jmp 0xC0FFEE
jmp %eax
ja 0xC0FFEE
jae 0xC0FFEE
jb [e] 0xC0FFEE
jg [e] 0xC0FFEE
jl [e] 0xC0FFEE
jne 0xC0FFEE
jz 0xC0FFEE

and lots of others, see the Intel manual:

Assignment: Hello world

A function which prints “Hello world!” N times.

1. Use directives `.data` and `.text`
2. Make a syscall from within the assembly function
3. Call your function from c code and test it
Assignment: Bitcount

Count the bits in a given integer.

1. write a function bitcount in x86_64 assembly
2. call your function from c code and test it
int i = 42;
asm volatile ("add %0, %0;"
  : "+r"(i)
  : // no other input, just i
  : // no clobber
 );

https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html
Register Constraints and Modifiers

```c
asm volatile ("add %0, %0;" : "+r"(i));
```

<table>
<thead>
<tr>
<th colspan="2">Constraints</th>
<th colspan="2">Modifiers</th>
</tr>
</thead>
<tbody>
<tr>
<td>r</td>
<td>any general purpose register</td>
<td>=</td>
<td>write only operand</td>
</tr>
<tr>
<td>a</td>
<td>al, ax, eax, rax</td>
<td>+</td>
<td>read / write</td>
</tr>
<tr>
<td>c</td>
<td>cl, cx, ecx, rcx</td>
<td></td>
<td></td>
</tr>
<tr>
<td>D</td>
<td>edi, rdi</td>
<td></td>
<td></td>
</tr>
<tr>
<td>m</td>
<td>memory operand</td>
<td></td>
<td></td>
</tr>
</tbody>
</table>
/*
 * Add two integers with a single x86 ADD instruction (GCC extended asm).
 *
 * a: first summand; also the read/write operand that receives the sum.
 * b: second summand; read-only input operand.
 *
 * Returns a + b.
 *
 * __asm__ is used instead of asm so the snippet also builds in strict
 * ISO modes (-std=c99 / -std=c11), where the plain asm keyword is disabled.
 */
int add(int a, int b) {
    __asm__ volatile("add %1, %0;"
                     : "+r"(a)      /* %0: in/out, accumulates the result */
                     : "r"(b)       /* %1: input only */
                     : "cc");       /* ADD modifies the flags register */
    return a;                       /* the sum lives in a, not in b */
}
Additional Registers

- SSE adds 16 new 128bit registers – xmm0 - xmm15.
- AVX widens these to 256bit registers – ymm0 - ymm15; AVX-512 extends the register file to 32 512bit registers – zmm0 - zmm31.
- Eases and accelerates vector computations.

For a full description see Intel Manual (Volume 1, Chapter 10).
SSE instructions

**movaps**  move four aligned packed single-precision floating-point values between XMM registers or memory

**addps**  add packed single-precision floating-point values

**rcpps**  compute reciprocals of packed single-precision floating-point values

**cmpps**  compare packed single-precision floating-point values
Let’s write some code:

1. add two vectors using SSE
2. multiply two vectors using SSE
Intel Software Developer Manual

X86 Calling Conventions
https://en.wikipedia.org/wiki/X86_calling_conventions

FLAGS register
https://en.wikipedia.org/wiki/FLAGS_register
Compiler Builtins

GCC (and others) come with special intrinsics that map to optimized code. Examples:

- Common libC functions (`__builtin_memcpy`)
- `__builtin_expect()`
- `__builtin_popcount()`
- `__builtin_prefetch()`
- `__builtin_bswap32()`
- `__builtin_return_address()`
- `__builtin_ia32_addps()`
Compiler Builtins

GCC (and others) come with special **intrinsics** that map to optimized code. Examples:

- Common libC functions (\_\_builtin\_memcpy)
- \_\_builtin\_expect()
- \_\_builtin\_popcount()
- \_\_builtin\_prefetch()
- \_\_builtin\_bswap32()
- \_\_builtin\_return\_address()
- \_\_builtin\_ia32\_addps()
/*
 * Return the address this function will return to, i.e. the instruction
 * following the call site (an approximation of the caller's instruction
 * pointer).
 *
 * noinline keeps a real call/return pair so that a return address exists.
 * __builtin_return_address() yields a void *, so the value is converted
 * explicitly; an implicit pointer-to-integer conversion is a constraint
 * violation that newer compilers reject.
 */
unsigned long long
__attribute__((noinline))
eip(void)
{
    return (unsigned long long)__builtin_return_address(0);
}
/* Return the number of 1-bits in x, using the compiler's popcount builtin. */
unsigned count_bits(unsigned x)
{
    const int ones = __builtin_popcount(x);

    return (unsigned)ones;
}
/* GCC vector type: four floats packed into 16 bytes. */
typedef float v4sf __attribute__((vector_size(16))); // Hah!

/*
 * Compute v3 * (v1 + v2) lane by lane with the x86 SSE builtins
 * (addps, then mulps) and print the four resulting lanes.
 */
void sse() {
    v4sf v1 = {1,2,3,4};
    v4sf v2 = {1,2,3,4};
    v4sf v3 = {2,2,2,2};

    /* Packed add first; its result feeds the packed multiply. */
    v4sf sum = __builtin_ia32_addps(v1, v2);
    v4sf res = __builtin_ia32_mulps(v3, sum);

    printf("res = [%f,%f,%f,%f]\n",
           res[0], res[1], res[2], res[3]);
}

/* GCC vector type: four floats packed into 16 bytes. */
typedef float v4sf
    __attribute__((vector_size(16)));
    // Hah!

/*
 * Compute v3 * (v1 + v2) lane by lane using ordinary C operators on the
 * vector type (no explicit builtins) and print the four resulting lanes.
 */
void sse() {
    v4sf v1 = {1, 2, 3, 4};
    v4sf v2 = {1, 2, 3, 4};
    v4sf v3 = {2, 2, 2, 2};
    v4sf res;

    res = v3 * (v1 + v2);

    /* "\n", not "\\n": the latter prints a literal backslash followed by 'n'. */
    printf("res = [%f, %f, %f, %f]\n", res[0],
            res[1], res[2], res[3]);
}

How much is my code?

You will always need to understand the cost of your code:

- Memory / resource consumption
  - Memory consumption in GiB?
  - Binary size
  - Energy consumption
How much is my code?

You will always need to understand the cost of your code:
  ▶ Memory / resource consumption
    ▶ Memory consumption in GiB?
    ▶ Binary size
    ▶ Energy consumption
  ▶ Implementation cost
    ▶ Source Lines of Code
    ▶ Cyclomatic Complexity
  ▶ Execution time
    ▶ Execution time in seconds
      → gettimeofday()
  ▶ Short running code
    → CPU cycles
How much is my code?

You will always need to understand the cost of your code:

▶ Memory / resource consumption
  ▶ Memory consumption in GiB?
  ▶ Binary size
  ▶ Energy consumption

▶ Implementation cost
  ▶ Source Lines of Code
  ▶ Cyclomatic Complexity

▶ Execution time
  ▶ Execution time in seconds → gettimeofday()
  ▶ Short running code → CPU cycles
CPU Time Stamp Counter

64 bit register counting the clocks since system startup.

- Pentium*, early Xeon CPUs: increment with every CPU cycle.
- Newer Xeons and Core*: increment at a constant rate.
- AMD up to K8: per CPU, increment with every CPU cycle

Spot the problem, anyone?
Reading the TSC

Instruction: `rdtsc` stores TSC in EAX (lower 32 bits) and EDX (higher 32 bits).
Reading the TSC

Instruction: rdtsc stores TSC in EAX (lower 32 bits) and EDX (higher 32 bits).

```c
unsigned long long rdtsc() {
    unsigned long long hi, lo;

    asm volatile("rdtsc\n"
                 "mov %%edx, %k0\n"
                 "mov %%eax, %k1\n"
                 : "=r" (hi), "=r" (lo));

    return (hi << 32) | lo;
}
```
Clobbering matters!

```c
/*
 * Read the CPU time stamp counter (x86-64, GCC extended asm).
 *
 * rdtsc leaves the low 32 bits in EAX and the high 32 bits in EDX; both
 * halves are copied into compiler-chosen registers and combined.
 * Returns the 64-bit counter value.
 */
unsigned long long rdtsc() {
    unsigned long long hi, lo;

    /*
     * - Each instruction ends in "\n\t"; adjacent string literals are
     *   concatenated, so without it the assembler would see "rdtscmov".
     * - In extended asm a literal register name needs "%%".
     * - %k0 / %k1 name the 32-bit part of the output register, matching the
     *   32-bit source; a 32-bit write zero-extends into the full register.
     * - EAX and EDX are listed as clobbers so that the compiler does not
     *   place hi or lo (or any live value) in them.
     * __asm__ is the spelling that also works with -std=c99 / -std=c11.
     */
    __asm__ volatile("rdtsc\n\t"
                     "mov %%edx, %k0\n\t"
                     "mov %%eax, %k1\n\t"
                     : "=r" (hi), "=r" (lo)
                     :
                     : "eax", "edx");

    return (hi << 32) | lo;
}
```
Catching out-of-order execution

Before a measurement:

```c
/*
 * Take the time stamp at the START of a measurement (x86-64).
 *
 * cpuid is executed before rdtsc as a serializing instruction, so that
 * earlier instructions cannot be reordered past the counter read.
 * Returns the 64-bit counter value.
 */
unsigned long long rdtsc_pre() {
    unsigned long long hi, lo;

    /*
     * Operand sections are outputs : inputs : clobbers. The input section
     * is empty here, so the clobber list must follow a second, empty ':'.
     * cpuid overwrites RAX, RBX, RCX and RDX; rdtsc overwrites RAX and RDX.
     * %k0 / %k1 select the 32-bit names of the output registers ("%%" is a
     * literal '%'); the 32-bit write zero-extends to 64 bits.
     * __asm__ is the spelling that also works with -std=c99 / -std=c11.
     */
    __asm__ volatile("cpuid\n\t"
                     "rdtsc\n\t"
                     "mov %%edx, %k0\n\t"
                     "mov %%eax, %k1\n\t"
                     : "=r" (hi), "=r" (lo)
                     :
                     : "rax", "rbx", "rcx", "rdx");

    return (hi << 32) | lo;
}
```

---

1 How to Benchmark Code Execution Times on Intel® IA-32 and IA-64 Instruction Set Architectures. Gabriele Paoloni
Catching out-of-order execution

After a measurement:

```c
/*
 * Take the time stamp at the END of a measurement (x86-64).
 *
 * rdtscp reads the counter only after preceding instructions have executed;
 * the counter halves are saved first, and the trailing cpuid then keeps
 * later instructions from being hoisted above the read.
 * Returns the 64-bit counter value.
 */
unsigned long long rdtsc_post() {
    unsigned long long hi, lo;

    /*
     * Operand sections are outputs : inputs : clobbers. The input section
     * is empty here, so the clobber list must follow a second, empty ':'.
     * rdtscp overwrites RAX, RDX and RCX; cpuid overwrites RAX, RBX, RCX
     * and RDX. %k0 / %k1 select the 32-bit names of the output registers
     * ("%%" is a literal '%'); the 32-bit write zero-extends to 64 bits.
     * __asm__ is the spelling that also works with -std=c99 / -std=c11.
     */
    __asm__ volatile("rdtscp\n\t"
                     "mov %%edx, %k0\n\t"
                     "mov %%eax, %k1\n\t"
                     "cpuid\n\t"
                     : "=r" (hi), "=r" (lo)
                     :
                     : "rax", "rbx", "rcx", "rdx");

    return (hi << 32) | lo;
}
```
Benchmarking Considerations

- RDTSC is not free.
Benchmarking Considerations

- RDTSC is not free.
- Interruption by other programs, migration.
  - Own OS: measure in kernel and disable IRQs.
  - Linux user space: difficult
    - Set CPU affinity
    - Collect 1000s of samples and ignore outliers
Assignment

Implement the following function in assembly:

/*
 * Takes the argument <buf> of length <size>,
 * reverses it and stores the result in the
 * location of the original <buf>.
 * Returns the number of bytes reversed.
 */

unsigned reverse_buf(char *buf, size_t size);
Counting Lines

Implement the following function in assembly:

```assembly
/*
 * Gets a file descriptor to an open file and
 * iterates over the file’s content to count the
 * number of lines in the file. (A.k.a. an ASM
 * equivalent of ’wc -l’ on the shell.)
 */

unsigned count_lines(int fd);
```
What we’ve learned

▶ Assembly instruction format
▶ Decoding rules
▶ Some assembly instructions
▶ Calling conventions
▶ How to program in assembly
Wait, there is more!

Where to apply assembly knowledge?

Side channel attacks
- CPU vulnerabilities
- Spectre and Meltdown
- https://meltdownattack.com/
- https://github.com/IAIK/meltdown

Meltdown

; rcx = kernel address
; rbx = probe array
xor %rax, %rax

retry:

movb (%rcx), %al
shl $0xc, %rax
jz retry
movq (%rbx, %rax), %rbx