# ARM Assembly (AArch64 & ARM Thumb)
AArch64 and ARM assembly skill for reading and writing ARM assembly code. Use when reading GCC/Clang output for AArch64 or ARM Thumb targets, writing inline asm in C/C++, understanding the ARM ABI (AAPCS64/AAPCS), or debugging register and stack state on ARM hardware or QEMU. Activates on queries about AArch64 assembly, ARM Thumb, NEON/SVE SIMD, ARM calling convention, inline asm for ARM, or reading ARM disassembly.
npx skill4agent add mohitmishra786/low-level-dev-skills assembly-arm

# AArch64 (native or cross-compile)
aarch64-linux-gnu-gcc -S -O2 foo.c -o foo.s
# 32-bit ARM Thumb
arm-linux-gnueabihf-gcc -S -O2 -mthumb foo.c -o foo.s
# From objdump
aarch64-linux-gnu-objdump -d -S prog
# From GDB on target
(gdb) disassemble /s main

| Register | Alias | Role |
|---|---|---|
| x0–x7 | w0–w7 (32-bit views) | Arguments 1–8 and return values |
| x8 | xr | Indirect result location (struct return) |
| x9–x15 | — | Caller-saved temporaries |
| x16–x17 | ip0/ip1 | Intra-procedure-call temporaries (used by linker) |
| x18 | pr | Platform register (reserved on some OS) |
| x19–x28 | — | Callee-saved |
| x29 | fp | Frame pointer (callee-saved) |
| x30 | lr | Link register (return address) |
| sp | — | Stack pointer (must be 16-byte aligned at call) |
| pc | — | Program counter (not directly accessible) |
| xzr/wzr | — | Zero register (reads as 0, writes discarded) |
| v0–v7 | q/d/s/h/b views (e.g. d0, s0, h0, b0) | FP/SIMD args and return |
| v8–v15 | — | Callee-saved SIMD (lower 64 bits only) |
| v16–v31 | — | Caller-saved temporaries |

| Instruction | Effect |
|---|---|
| `mov x0, x1` | Copy register |
| `mov x0, #42` | Load immediate |
| `movz x0, #0x1234, lsl #16` | Move zero-extended with shift |
| `movk x0, #0x5678` | Move with keep (partial update) |
| `ldr x0, [x1]` | Load 64-bit from address in x1 |
| `ldr x0, [x1, #8]` | Load from x1+8 |
| `str x0, [x1, #8]` | Store x0 to x1+8 |
| `ldp x0, x1, [sp]` | Load pair (two regs at once) |
| `stp x29, x30, [sp, #-16]!` | Store pair, pre-decrement sp |
| `add x0, x1, x2` | x0 = x1 + x2 |
| `add x0, x1, #8` | x0 = x1 + 8 |
| `sub x0, x1, x2` | x0 = x1 - x2 |
| `mul x0, x1, x2` | x0 = x1 * x2 |
| `sdiv x0, x1, x2` | Signed divide |
| `udiv x0, x1, x2` | Unsigned divide |
| `cmp x0, x1` | Set flags for x0 - x1 |
| `cbz x0, label` | Branch if x0 == 0 |
| `cbnz x0, label` | Branch if x0 != 0 |
| `bl func` | Branch with link (call) |
| `blr x0` | Branch with link to address in x0 |
| `ret` | Return (branch to x30) |
| `ret x0` | Return to address in x0 |
| `adrp x0, sym` | PC-relative page address |
| `add x0, x0, :lo12:sym` | Low 12 bits of symbol offset |
// Non-leaf function: must save fp (x29) and lr (x30) before any `bl`,
// since `bl` overwrites x30. Frame is 32 bytes: 16 for fp/lr, 16 for x19/x20.
stp     x29, x30, [sp, #-32]!   // save fp, lr; allocate 32 bytes (sp stays 16-aligned)
mov     x29, sp                 // set frame pointer
stp     x19, x20, [sp, #16]     // save callee-saved registers we will use
// ... body ...
ldp     x19, x20, [sp, #16]     // restore callee-saved registers
ldp     x29, x30, [sp], #32     // restore fp, lr; deallocate frame
ret

// Leaf function (no calls, no callee-saved regs needed).
// AArch64 has no red zone (except Apple's 128 B), so even a leaf function
// must adjust sp before storing locals below it; keep sp 16-byte aligned.
sub     sp, sp, #16             // allocate locals
// ... body ...
add     sp, sp, #16             // deallocate
ret

// Barrier
__asm__ volatile ("dmb ish" ::: "memory");
// Load acquire
// Load-acquire from *p: no memory access after this load may be reordered
// before it (one-way barrier, cheaper than a full dmb).
static inline int load_acquire(volatile int *p) {
    int val;
    // "Q": memory operand addressed by a single base register (required by ldar).
    // "memory" clobber: without it the compiler may hoist later loads/stores
    // above the asm, defeating the acquire ordering at the compiler level.
    __asm__ volatile ("ldar %w0, %1" : "=r"(val) : "Q"(*p) : "memory");
    return val;
}
// Store release
// Store-release to *p: no memory access before this store may be reordered
// after it (pairs with load_acquire for message-passing patterns).
static inline void store_release(volatile int *p, int val) {
    // "memory" clobber: without it the compiler may sink earlier loads/stores
    // below the asm, defeating the release ordering at the compiler level.
    __asm__ volatile ("stlr %w1, %0" : "=Q"(*p) : "r"(val) : "memory");
}
// Read system counter
static inline uint64_t read_cntvct(void) {
uint64_t val;
__asm__ volatile ("mrs %0, cntvct_el0" : "=r"(val));
return val;
}"Q""r""w"#include <arm_neon.h>
// Add 4 floats at once (128-bit NEON vectors)
float32x4_t a = vld1q_f32(arr_a);      // load 4 contiguous floats
float32x4_t b = vld1q_f32(arr_b);
float32x4_t c = vaddq_f32(a, b);       // lane-wise add
vst1q_f32(result, c);                  // store 4 floats

// Horizontal sum of the 4 lanes of c
float32x4_t sum = vpaddq_f32(c, c);    // pairwise: [c0+c1, c2+c3, c0+c1, c2+c3]
sum = vpaddq_f32(sum, sum);            // [c0+c1+c2+c3, ...]
float total = vgetq_lane_f32(sum, 0);  // extract lane 0
// AArch64 also provides vaddvq_f32(c) — a single across-vector reduce.

// Intrinsic naming convention: v<op><q>_<type>
//   q suffix = 128-bit vector (omit for 64-bit); type suffixes: _f32, _s32, _u8, ...
// Related skills: skills/low-level-programming/assembly-x86,
//   skills/compilers/cross-gcc, skills/debuggers/gdb