Loading...
Loading...
Write high-performance JIT compilers and interpreters in the style of Mike Pall, creator of LuaJIT. Emphasizes trace-based compilation, aggressive optimization, and understanding CPU microarchitecture. Use when building JITs, interpreters, or any code where every cycle counts.
npx skill4agent add copyleftdev/sk1llz pall-jit-mastery

"Measure, don't guess."
"The fastest code is code that doesn't run."
"Understand your hardware or it will humble you."
// LuaJIT's key insight: trace hot paths, not methods.
// A trace is a linear sequence of operations -- no internal control flow;
// divergence is handled by guards and side exits (see SnapShot below).
typedef struct Trace {
    uint32_t *mcode;    // Generated machine code
    IRIns *ir;          // IR instructions (see IRIns: 8-byte, index-linked)
    uint16_t nins;      // Number of IR instructions
    uint16_t nk;        // Number of constants
    SnapShot *snap;     // Side exit snapshots
    uint16_t nsnap;     // Number of snapshots in 'snap'
    // Link to next trace (for loops) -- lets a completed trace jump
    // directly into another trace instead of back to the interpreter.
    struct Trace *link;
} Trace;
// Recording: capture operations as they execute in the interpreter.
// Each executed bytecode is translated into trace IR via emitir(); the
// recorded operand becomes the new IR ref for the destination slot.
void record_instruction(JitState *J, BCIns ins) {
    switch (bc_op(ins)) {
    case BC_ADDVN: {
        // Braces required: a declaration may not directly follow a case
        // label (a label must precede a statement in C before C23).
        // Record: result = slot[A] + constant[D]
        TRef tr = emitir(IR_ADD, J->slots[bc_a(ins)],
                         lj_ir_knum(J, bc_d(ins)));
        J->slots[bc_a(ins)] = tr;
        break;
    }
    // ... other bytecodes
    default:
        // Unhandled opcode: record nothing.
        // NOTE(review): a real recorder would likely abort the trace here.
        break;
    }
}
// Key: traces are LINEAR
// No control flow in the trace itself
// Side exits handle divergence

// LuaJIT IR: compact, cache-friendly, no pointers
// One IR instruction. Operands are 16-bit IR references (indices into the
// IR array), never pointers -- see IRREF_BIAS below.
typedef struct IRIns {
    uint16_t op;    // Operation + type
    uint16_t op1;   // First operand (IR ref or slot)
    uint16_t op2;   // Second operand
    uint16_t prev;  // Previous instruction with same opcode (for CSE chains)
} IRIns; // 4 x 2 bytes = 8 bytes total, fits in a cache line nicely
// IR references are indices, not pointers
// Enables: compact storage, easy serialization, cache efficiency
#define IRREF_BIAS 0x8000
#define irref_isk(r) ((r) < IRREF_BIAS) // Is constant?
// Constants are stored separately and get biased refs BELOW IRREF_BIAS
// (they grow downward from the bias, acting like "negative" indices);
// instructions are stored linearly with refs at/above IRREF_BIAS.
// Example IR sequence for: x = a + b * c
// K001 NUM 3.14 -- constant
// 0001 SLOAD #1 -- load slot 1 (a)
// 0002 SLOAD #2 -- load slot 2 (b)
// 0003 SLOAD #3 -- load slot 3 (c)
// 0004 MUL 0002 0003 -- b * c
// 0005 ADD 0001 0004 -- a + (b * c)

// Traces assume types and values
// Guards verify assumptions, exit if wrong
// Emit a type guard for 'tr' unless its type is already known to match.
// A failing guard at runtime takes a side exit; the snapshot recorded here
// tells the exit handler how to rebuild the interpreter state.
void emit_guard(JitState *J, IRType expected, TRef tr) {
    IRIns *ins = &J->cur.ir[tref_ref(tr)];
    if (ins->t == expected)
        return; // type already proven on this trace -- no guard needed
    // Emit the guard, then snapshot the state for the side exit.
    emitir(IR_GUARD, tr, expected);
    snapshot_add(J);
}
// Side exit: restore interpreter state, continue there.
// A snapshot maps interpreter stack slots back to the IR values that
// hold their contents at this point in the trace.
typedef struct SnapShot {
    uint16_t ref;     // First IR ref in snapshot
    uint8_t nslots;   // Number of slots to restore
    uint8_t topslot;  // Top slot number
    uint32_t *map;    // Slot -> IR ref mapping
} SnapShot;
// When guard fails:
// 1. Look up snapshot for this guard
// 2. Restore Lua stack from IR values
// 3. Jump back to interpreter
// 4. Maybe record a new trace from exit point

// LuaJIT generates assembly directly
// Every instruction chosen deliberately.
// x86-64 code emission helpers.
// Code is emitted BACKWARDS: mcp is pre-decremented on each store, so the
// LAST byte stored becomes the FIRST byte in memory. The final encoding
// must be [REX] [opcode] [ModRM], so the store order is ModRM, opcode,
// and the REX prefix last. (The original stored REX first, which placed
// it AFTER the opcode in memory -- an invalid instruction.)
static void emit_rr(ASMState *as, x86Op op, Reg r1, Reg r2) {
    // ModRM: mod=11 (register-direct), reg=r1, rm=r2
    *--as->mcp = 0xc0 | ((r1 & 7) << 3) | (r2 & 7);
    // Opcode byte
    *--as->mcp = op;
    // REX prefix (0x40 | R<<2 | B), only when r8-r15 is involved
    if (r1 >= 8 || r2 >= 8) {
        *--as->mcp = 0x40 | ((r1 >> 3) << 2) | (r2 >> 3);
    }
}
// Register allocation: linear scan, but smarter.
// Allocating backwards from the trace end means every USE of a value has
// already been seen by the time its definition is reached.
void ra_allocate(ASMState *as) {
    // Walk the IR from the newest instruction down to the stop point.
    // NOTE(review): if IRRef is unsigned and stopins can be 0, the loop
    // condition never goes false -- confirm stopins > 0 invariant.
    IRRef ref = as->curins;
    for (; ref >= as->stopins; ref--) {
        IRIns *ins = &as->ir[ref];
        Reg dest = ra_dest(as, ins);  // assign the destination register first
        ra_left(as, ins, dest);       // then the left source operand
        ra_right(as, ins);            // then the right source operand
    }
}
// Key insight: backwards allocation sees all uses
// Can make better spill decisions

// Cache-friendly data structures are critical
// BAD: Linked list of tagged nodes -- every step is a dependent load
// (pointer chase), so the prefetcher cannot help.
struct Node {
    struct Node *next;  // next pointer: the cache-hostile part
    int type;           // runtime type tag for the union below
    union {
        double num;
        struct String *str;
        // ...
    } value;
};
// GOOD: Separate arrays by type (SoA -- structure of arrays).
// Types and values each stream sequentially through the cache.
struct ValueArray {
    uint8_t *types;  // Type tags: sequential access
    TValue *values;  // Values: sequential access
    size_t count;    // Number of elements in both arrays
};
// Iteration patterns matter enormously
// This is ~10x faster than pointer chasing:
for (size_t i = 0; i < arr->count; i++) {
    if (arr->types[i] == TYPE_NUMBER) {
        sum += arr->values[i].n;  // contiguous loads: prefetcher-friendly
    }
}

// Polymorphic inline cache for property access
// Avoids hash lookup in common case: memoizes the (shape -> slot offset)
// mapping observed at one particular access site.
typedef struct InlineCache {
    uint32_t shape_id;  // Expected object shape (last shape seen here)
    uint16_t offset;    // Cached property offset under that shape
    uint16_t _pad;      // Explicit padding: keeps the struct at 8 bytes
} InlineCache;
// Property read through an inline cache.
// Hit: one compare + one direct slot load. Miss: full shape lookup,
// then the cache is re-primed for the shape just seen.
TValue get_property_cached(Object *obj, String *key, InlineCache *ic) {
    // Fast path: the object's shape matches what this site saw last time.
    if (likely(obj->shape_id == ic->shape_id))
        return obj->slots[ic->offset]; // Direct access!
    // Slow path: full lookup, then memoize for the next access.
    // NOTE(review): behavior when 'key' is absent depends on what
    // shape_lookup returns on a miss -- confirm.
    uint16_t slot = shape_lookup(obj->shape, key);
    ic->shape_id = obj->shape_id;
    ic->offset = slot;
    return obj->slots[slot];
}
// Monomorphic: one shape, one offset
// Polymorphic: small set of shapes
// Megamorphic: too many shapes, fall back to hash

// CPUs predict branches; help them be right
// BAD: Unpredictable branches in hot loop
for (int i = 0; i < n; i++) {
    if (data[i] > threshold) { // ~50% taken = the predictor can't learn it
        sum += data[i];
    }
}
// GOOD: Branchless version (replaces a possible mispredict with an AND)
for (int i = 0; i < n; i++) {
    int mask = -(data[i] > threshold); // 0 or -1 (all bits set)
    sum += data[i] & mask;
}
// GOOD: Sort first if possible
qsort(data, n, sizeof(int), compare);
for (int i = 0; i < n && data[i] <= threshold; i++) {
    // All branches now predictable (data is monotone)
}
// Loop unrolling: reduce branch overhead
// NOTE(review): this only processes multiples of 4; a scalar tail loop
// for the remaining n % 4 elements is still needed.
for (int i = 0; i + 4 <= n; i += 4) {
    sum += data[i];
    sum += data[i + 1];
    sum += data[i + 2];
    sum += data[i + 3];
}

// Generate specialized code for each type combination
// LuaJIT specializes aggressively
// Generic add (slow): must dispatch on the dynamic types on every call.
TValue generic_add(TValue a, TValue b) {
    if (tvisnum(a) && tvisnum(b)) {
        return numV(numV(a) + numV(b)); // number + number
    } else if (tvisstr(a) || tvisstr(b)) {
        return concat(tostring(a), tostring(b)); // string coercion path
    }
    // ... metamethod lookup
    // NOTE(review): as written, control falls off the end of a non-void
    // function here -- undefined behavior if the caller uses the result.
    // The elided metamethod path presumably returns a value; confirm.
}
// Specialized add for numbers (fast path).
// Generated once a trace has proven both operands are numbers; the type
// guard that established this lives earlier in the trace.
double specialized_add_nn(double a, double b) {
    double result = a + b; // lowers to a single add instruction
    return result;
}
// Type guards ensure specialization is valid
// Side exit if types don't match expected

CPU Pipeline Awareness
══════════════════════════════════════════════════════════════
Latency (cycles) Operation
────────────────────────────────────────────────────────────
1 Register-to-register ALU
3-4 L1 cache hit
~12 L2 cache hit
~40 L3 cache hit
~200 Main memory
~10-20 Branch mispredict penalty
~100+ Page fault
Key insight: Memory is the bottleneck
Computation is nearly free by comparison
Optimize for memory access patterns first