# CPU Cache Optimization
CPU cache optimization skill for C/C++ and Rust. Use when diagnosing cache misses, improving data layout for cache efficiency, using perf stat cache counters, understanding false sharing, prefetching, or structuring AoS vs SoA data layouts. Activates on queries about cache misses, cache lines, false sharing, perf cache counters, data layout optimization, prefetch, AoS vs SoA, or L1/L2/L3 cache performance.
npx skill4agent add mohitmishra786/low-level-dev-skills cpu-cache-opt

# Basic cache counters
perf stat -e cache-references,cache-misses,cycles,instructions ./prog
# L1/L2/L3 miss breakdown
perf stat -e \
L1-dcache-load-misses,\
L1-dcache-loads,\
l2_rqsts.miss,\
LLC-load-misses,\
LLC-loads \
./prog
# Cache miss rate = L1-dcache-load-misses / L1-dcache-loads
# > 5% is concerning; > 20% is severe
# False sharing detection
perf stat -e \
machine_clears.memory_ordering,\
mem_load_l3_hit_retired.xsnp_hitm \
./prog

// Check cache line size
// sysconf is declared in <unistd.h>; returns -1 if the value is indeterminate
long cache_line = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
// Align data to cache line
// (C++ syntax; C11 spells this alignas via <stdalign.h> — or use the
// __attribute__ form shown below)
struct alignas(64) HotData {
int counter;
// ... 60 bytes of data that fit in one line
};
// C
typedef struct {
int x;
} __attribute__((aligned(64))) AlignedData;

// AoS (Array of Structures) — default layout
// One struct per particle: all fields of the same particle are adjacent in memory.
struct Particle {
float x, y, z; // position (12 bytes)
float vx, vy, vz; // velocity (12 bytes)
float mass; // (4 bytes)
int flags; // (4 bytes)
};
// sizeof(Particle) == 32, so one 64-byte cache line holds exactly two particles
Particle particles[N]; // Bad for loops that only need position
// Problem: accessing particles[i].x loads x,y,z,vx,vy,vz,mass,flags
// But we only need x,y,z (12 of 32 bytes) → 62.5% of loaded data is wasted
// SoA (Structure of Arrays) — cache-friendly for SIMD + sequential access
// Each member points to its own contiguous array of N elements (allocated
// separately); a position-only loop then streams pure x/y/z data.
struct ParticlesSoA {
float *x, *y, *z;
float *vx, *vy, *vz;
float *mass;
int *flags;
};
// Accessing x[i] for i=0..N loads 16 consecutive x values → 0% waste
// Also auto-vectorizes better

// BAD: random access (linked list traversal)
// Each node dereference jumps to an unpredictable heap address, defeating the
// hardware prefetcher — expect roughly one cache miss per node visited.
Node *node = head;
while (node) {
process(node->data);
node = node->next; // pointer chasing = cache miss per node
}
// BETTER: pool allocate nodes contiguously
// Or: rewrite as contiguous array with indices
// BAD: stride > cache line in matrix traversal
// (on a row-major array, matrix[j][i] with j as the inner index jumps one
// whole row per iteration — every access lands on a different cache line)
for (int i = 0; i < N; i++)
for (int j = 0; j < M; j++)
sum += matrix[j][i]; // column-major access on row-major array
// GOOD: row-major access
// (consecutive j hits consecutive addresses — one miss per 16 ints, not per access)
for (int i = 0; i < N; i++)
for (int j = 0; j < M; j++)
sum += matrix[i][j];
// BAD: large struct with hot + cold fields
// sizeof(Record) ≈ 392 bytes (4 + 128 + 4 + 256), so each element spans ~7
// cache lines and the two hot ints sit 132 bytes apart within one element.
struct Record {
int id; // hot: accessed every iteration
char name[128]; // cold: accessed rarely
int value; // hot
char desc[256]; // cold
};
// GOOD: separate hot and cold data
// RecordHot is 8 bytes → 8 hot records per 64-byte cache line
struct RecordHot { int id; int value; };
struct RecordCold { char name[128]; char desc[256]; };
RecordHot hot_data[N];
RecordCold cold_data[N];

// BAD: counters likely on same cache line (4-byte ints; line = 64 bytes)
int counter_a; // thread A's counter
int counter_b; // thread B's counter
// Both on the same cache line → every write invalidates the other thread's cache
// GOOD: pad to separate cache lines
// (alignas(64) already rounds sizeof up to a multiple of 64, so adjacent array
// elements land on distinct lines; the explicit padding documents the layout)
struct alignas(64) PaddedCounter {
int value;
char padding[60]; // Ensure next counter is on different cache line
};
PaddedCounter counters[NUM_THREADS];
// Thread i: counters[i].value++
// C++17 standard approach (hardware_destructive_interference_size is declared in <new>)
struct alignas(std::hardware_destructive_interference_size) PaddedCounter {
int value;
};

#include <immintrin.h> // or <xmmintrin.h>
// Prefetch for read (locality 0=non-temporal, 3=high temporal)
// __builtin_prefetch(addr, rw, locality) is a GCC/Clang builtin; rw: 0=read, 1=write
__builtin_prefetch(ptr, 0, 3); // prefetch for read, high locality
__builtin_prefetch(ptr, 1, 3); // prefetch for write, high locality
// SSE prefetch (x86)
_mm_prefetch((char*)ptr, _MM_HINT_T0); // L1
_mm_prefetch((char*)ptr, _MM_HINT_T1); // L2
_mm_prefetch((char*)ptr, _MM_HINT_T2); // L3
_mm_prefetch((char*)ptr, _MM_HINT_NTA); // non-temporal (streaming)
// Typical pattern: prefetch N iterations ahead
#define PREFETCH_DIST 8
for (int i = 0; i < N; i++) {
if (i + PREFETCH_DIST < N)
__builtin_prefetch(&data[i + PREFETCH_DIST], 0, 3);
process(data[i]);
}

// Loop blocking / tiling for matrix operations
// Process cache-fitting blocks instead of full rows/columns
// NOTE(review): with BLOCK=64 a single 64x64 float tile is 16 KB; the inner
// kernel touches tiles of A, B and C (~48 KB total), which exceeds a typical
// 32 KB L1d — benchmark, and consider BLOCK=32 if L1-resident tiles are the goal.
#define BLOCK 64 // tuned to L1 cache size
// C += A * B for row-major NxN matrices (C must be pre-initialized by the
// caller — typically zeroed). The && ii < N guards handle N not divisible by BLOCK.
void matrix_mult_blocked(float *C, float *A, float *B, int N) {
for (int i = 0; i < N; i += BLOCK)
for (int k = 0; k < N; k += BLOCK)
for (int j = 0; j < N; j += BLOCK)
// Inner block fits in L1 cache
for (int ii = i; ii < i + BLOCK && ii < N; ii++)
for (int kk = k; kk < k + BLOCK && kk < N; kk++)
for (int jj = j; jj < j + BLOCK && jj < N; jj++)
C[ii*N+jj] += A[ii*N+kk] * B[kk*N+jj];
}

Related skills: skills/profilers/linux-perf (perf stat, perf record), skills/profilers/valgrind, skills/low-level-programming/simd-intrinsics, skills/low-level-programming/memory-model