nasm-know-hows

nasm assembly related stuff

View on GitHub

Sections (.text, .data, .bss) and XOR Optimization

Questions

  1. What is .text .bss .data sections with respect to C code?
  2. Why is xor rax, rax used instead of mov rax, 0?

Part 1: Understanding Sections

The Memory Layout

When your program runs, memory is organized into segments (sections):

┌─────────────────────────────────────┐  High Memory (0xFFFF...)
│         STACK                       │  ← Function calls, local vars
│         ↓ grows down                │
├─────────────────────────────────────┤
│                                     │
│         (unused space)              │
│                                     │
├─────────────────────────────────────┤
│         ↑ grows up                  │
│         HEAP                        │  ← malloc/new allocations
├─────────────────────────────────────┤
│         .bss (uninitialized data)   │  ← Global/static uninitialized
├─────────────────────────────────────┤
│         .data (initialized data)    │  ← Global/static initialized
├─────────────────────────────────────┤
│         .rodata (read-only data)    │  ← String literals, const
├─────────────────────────────────────┤
│         .text (code)                │  ← Your program instructions
└─────────────────────────────────────┘  Low Memory (0x0000...)

Section Breakdown

1. .text Section - THE CODE

What it is: Your actual executable instructions (machine code) Permissions: Read + Execute (NOT writable for security)

C Code Example:

#include <stdio.h>

int add(int a, int b) {          // ← Function code goes in .text
    return a + b;
}

int main() {                     // ← Function code goes in .text
    int x = add(5, 10);
    printf("Result: %d\n", x);
    return 0;
}

Corresponding Assembly (.text section):

section .text
    global add
    global main

add:
    push rbp
    mov rbp, rsp
    mov eax, edi              ; a
    add eax, esi              ; + b
    pop rbp
    ret

main:
    push rbp
    mov rbp, rsp
    mov edi, 5
    mov esi, 10
    call add
    ; ... more code ...
    pop rbp
    ret

Key Points:


2. .data Section - INITIALIZED GLOBAL/STATIC DATA

What it is: Global and static variables with initial values Permissions: Read + Write Storage: Takes up space in executable file

C Code Example:

// Global variables with initial values → .data
int global_count = 100;                    // .data (4 bytes)
char message[] = "Hello";                  // .data (6 bytes with \0)
double pi = 3.14159;                       // .data (8 bytes)

// Static variables with initial values → .data
static int file_counter = 0;               // .data

int main() {
    // Static local with initial value → .data
    static int func_calls = 1;             // .data
    
    global_count++;
    func_calls++;
    return 0;
}

Corresponding Assembly (.data section):

section .data
    global_count    dd 100              ; int = 100
    message         db "Hello", 0       ; char[] = "Hello\0"
    pi              dq 3.14159          ; double = 3.14159
    file_counter    dd 0                ; static int = 0
    func_calls      dd 1                ; static int = 1

Key Points:


3. .bss Section - UNINITIALIZED GLOBAL/STATIC DATA

What it is: Global and static variables without initial values (or initialized to zero) Permissions: Read + Write Storage: Does NOT take space in executable! (Clever optimization)

C Code Example:

// Global variables without initializers → .bss
int global_array[1000];                    // .bss (4000 bytes)
char buffer[256];                          // .bss (256 bytes)
double values[100];                        // .bss (800 bytes)

// Explicitly initialized to zero → .bss (compiler optimization)
int zero_count = 0;                        // .bss (not .data!)
static char null_term = '\0';              // .bss

// Static variables without initializers → .bss
static long uninitialized;                 // .bss

int main() {
    // Static local without initializer → .bss
    static int cache[50];                  // .bss
    
    // These are automatically zeroed:
    printf("%d\n", global_array[0]);       // Prints 0
    printf("%d\n", zero_count);            // Prints 0
    
    return 0;
}

Corresponding Assembly (.bss section):

section .bss
    global_array    resd 1000           ; Reserve 4000 bytes (not in file!)
    buffer          resb 256            ; Reserve 256 bytes
    values          resq 100            ; Reserve 800 bytes
    zero_count      resd 1              ; Reserve 4 bytes
    null_term       resb 1              ; Reserve 1 byte
    uninitialized   resq 1              ; Reserve 8 bytes
    cache           resd 50             ; Reserve 200 bytes

Key Points:

BSS = “Block Started by Symbol” (historical Unix term)


Side-by-Side Comparison

C Code:

// .data - initialized
int x = 42;
char msg[] = "Hi";

// .bss - uninitialized or zero
int y;
int z = 0;
char buf[100];

// .text - code
int add(int a, int b) {
    return a + b;
}

int main() {
    // .stack - local variables
    int local = 10;
    
    // .heap - dynamic allocation
    int *ptr = malloc(sizeof(int));
    *ptr = 20;
    free(ptr);
    
    return 0;
}

Assembly Equivalent:

section .data
    x   dd 42           ; Initialized data
    msg db "Hi", 0

section .bss
    y   resd 1          ; Uninitialized
    z   resd 1          ; Zero (optimized to .bss)
    buf resb 100        ; Buffer

section .text
    global add
    global main

add:
    lea eax, [rdi + rsi]    ; return a + b
    ret

main:
    push rbp
    mov rbp, rsp
    sub rsp, 16             ; Local var on stack
    
    mov dword [rbp-4], 10   ; local = 10
    
    mov rdi, 4              ; malloc(4)
    call malloc
    mov qword [rbp-16], rax ; ptr = result
    
    mov rbx, [rbp-16]
    mov dword [rbx], 20     ; *ptr = 20
    
    mov rdi, [rbp-16]
    call free               ; free(ptr)
    
    xor eax, eax
    leave
    ret

Why This Matters

File Size Example:

// Option 1: Using .data (BAD for large arrays)
char buffer[1000000] = {0};  // 1 MB in executable file!

// Option 2: Using .bss (GOOD!)
char buffer[1000000];        // 0 bytes in executable file!
                             // OS allocates 1 MB at runtime

Verify with size command:

$ size my_program
   text    data     bss     dec     hex filename
   1234    1000       8    2242     8c2 program1  # data version
   1234       8    1000    2242     8c2 program2  # bss version

Same runtime memory, but different file size!


Summary Table: Sections vs C Code

┌───────────┬──────────────────────────┬─────────────┬────────────┐
│ Section   │ Contains                 │ In File?    │ Writable?  │
├───────────┼──────────────────────────┼─────────────┼────────────┤
│ .text     │ Code (functions)         │ Yes         │ No         │
│ .data     │ Initialized globals      │ Yes         │ Yes        │
│ .bss      │ Uninitialized globals    │ No!         │ Yes        │
│ .rodata   │ const, string literals   │ Yes         │ No         │
│ stack     │ Local variables          │ No (runtime)│ Yes        │
│ heap      │ malloc/new allocations   │ No (runtime)│ Yes        │
└───────────┴──────────────────────────┴─────────────┴────────────┘

Part 2: XOR Optimization - Why xor rax, rax Instead of mov rax, 0?

Quick Answer

Both set RAX to zero, but xor rax, rax is smaller and faster!

mov rax, 0          ; 7 bytes, slower
xor rax, rax        ; 3 bytes, faster

The Math

XOR truth table:

A  |  B  | A XOR B
---+-----+--------
0  |  0  |   0
0  |  1  |   1
1  |  0  |   1
1  |  1  |   0      ← Same value XOR itself = 0

Applied to registers:

; If RAX = 0b10110101
; RAX XOR RAX:
  10110101
^ 10110101
----------
  00000000     Always zero!

Any value XORed with itself equals zero!


Detailed Comparison

1. Instruction Encoding Size

; 32-bit registers
mov eax, 0          ; 5 bytes: B8 00 00 00 00
xor eax, eax        ; 2 bytes: 31 C0

; 64-bit registers
mov rax, 0          ; 7 bytes: 48 C7 C0 00 00 00 00
xor rax, rax        ; 3 bytes: 48 31 C0

; Even shorter with 32-bit (zeros upper bits anyway)
xor eax, eax        ; 2 bytes: 31 C0
                    ; RAX = 0x0000000000000000

Savings: 4-5 bytes per instruction!


2. Performance

Modern CPUs have special handling for xor reg, reg:

MOV RAX, 0:
- Needs execution unit
- Takes ~1 cycle
- Uses resources

XOR RAX, RAX:
- Recognized by CPU as "zeroing idiom"
- Often eliminated at decode stage (0 cycles!)
- Doesn't use execution port
- Breaks dependency chains

Zero-latency optimization:


3. Dependency Breaking

; BAD - dependency chain
mov rax, [mem1]
add rax, rbx
mov rax, 0          ; Still depends on previous RAX
add rax, rcx        ; Waits for MOV

; GOOD - breaks dependency
mov rax, [mem1]
add rax, rbx
xor rax, rax        ; CPU knows: "RAX is zero, no dependency!"
add rax, rcx        ; Can start immediately

All Zeroing Methods Compared

; Method 1: MOV with immediate
mov rax, 0              ; 7 bytes, standard speed
mov eax, 0              ; 5 bytes, zeros upper 32 bits too

; Method 2: XOR (BEST)
xor rax, rax            ; 3 bytes, fastest
xor eax, eax            ; 2 bytes, fastest (zeros full RAX)

; Method 3: SUB
sub rax, rax            ; 3 bytes, fast (affects flags!)
sub eax, eax            ; 2 bytes, fast (affects flags!)

; Method 4: AND (weird but works)
and rax, 0              ; 7 bytes, slow
and eax, 0              ; 6 bytes, slow

; Method 5: IMUL (very weird!)
imul rax, 0             ; 4 bytes, slow (use XOR instead!)

Recommendation: Always use xor reg, reg


Practical Example

C Equivalent:

#include <stdint.h>

int main() {
    // In C, you just initialize variables to zero
    int32_t eax = 0;
    int32_t ebx = 0;
    int32_t ecx = 0;
    int32_t edx = 0;
    
    // The C compiler automatically uses XOR for optimization:
    // C code: int x = 0;
    // Compiles to: xor eax, eax
    
    return 0;  // This also compiles to: xor eax, eax
}

Assembly:

section .text
    global _start

_start:
    ; Zeroing multiple registers efficiently
    xor eax, eax        ; 2 bytes
    xor ebx, ebx        ; 2 bytes
    xor ecx, ecx        ; 2 bytes
    xor edx, edx        ; 2 bytes
    ; Total: 8 bytes
    
    ; vs MOV version:
    mov eax, 0          ; 5 bytes
    mov ebx, 0          ; 5 bytes
    mov ecx, 0          ; 5 bytes
    mov edx, 0          ; 5 bytes
    ; Total: 20 bytes (2.5x larger!)
    
    ; Common pattern: clear return value
    xor eax, eax        ; return 0
    ret
    
    ; Exit syscall
    mov rax, 60
    xor rdi, rdi        ; exit code = 0 (FAST!)
    syscall

Real-World Usage

1. Function Return Zero

; Every C function that returns 0:
int main() {
    return 0;
}

; Compiles to:
main:
    push rbp
    mov rbp, rsp
    xor eax, eax        ; return 0
    pop rbp
    ret

2. Loop Counters

; Initialize loop counter
xor rcx, rcx            ; rcx = 0
loop_start:
    ; ... do work ...
    inc rcx
    cmp rcx, 10
    jl loop_start

3. Clearing Before Syscalls

; printf() with floating point
xor eax, eax            ; AL = 0 (no vector registers used)
mov rdi, format
mov rsi, 42
call printf

When XOR Can Be Confusing

; ❌ WRONG - Thinking XOR works like MOV
mov rax, 5
xor rax, 3              ; RAX = 6 (NOT 3!)
                        ; 101 XOR 011 = 110

; ✅ Correct understanding
mov rax, 5
xor rax, rax            ; RAX = 0 (same value XORed)

Compiler Output Example

C Code:

int main() {
    int x = 0;
    int y = 0;
    return 0;
}

GCC Output (optimized):

$ gcc -O2 -S test.c -o test.s
main:
    xor eax, eax        ; return 0
    ret

Even the locals x and y are optimized away! Only the return uses XOR.


Summary Table: Zeroing Methods

┌──────────────┬───────┬───────┬─────────────┬────────────────┐
│ Instruction  │ Bytes │ Speed │ Flags       │ Recommendation │
├──────────────┼───────┼───────┼─────────────┼────────────────┤
│ xor eax,eax  │ 2     │ ★★★★★ │ Modified    │ ✅ BEST        │
│ xor rax,rax  │ 3     │ ★★★★★ │ Modified    │ ✅ BEST        │
│ mov eax, 0   │ 5     │ ★★★☆☆ │ Unchanged   │ ⚠️  Larger     │
│ mov rax, 0   │ 7     │ ★★★☆☆ │ Unchanged   │ ⚠️  Larger     │
│ sub eax,eax  │ 2     │ ★★★★☆ │ Modified    │ ⚠️  Use XOR    │
└──────────────┴───────┴───────┴─────────────┴────────────────┘

TL;DR

Sections:

XOR Trick:


Related Topics:


← Back to Q&A Index