Loading...
Loading...
Compare original and translation side by side
__global____device____shared__<<<>>>__global____device____shared__<<<>>>| CUDA / What you'd guess | Mojo GPU |
|---|---|
| Plain |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
Raw | |
| Automatic — buffers freed when out of scope |
| CUDA / 你可能会想到的写法 | Mojo GPU 写法 |
|---|---|
| 普通 |
| |
| |
| |
| |
| 来自 |
| |
| |
| |
| |
| |
原生 | |
| 自动释放——缓冲区超出作用域时自动释放 |
undefinedundefinedundefinedundefinedMutAnyOrigindef my_kernel(
input: LayoutTensor[DType.float32, layout, MutAnyOrigin],
output: LayoutTensor[DType.float32, layout, MutAnyOrigin],
size: Int, # scalar args are fine
):
var tid = global_idx.x
if tid < UInt(size):
output[tid] = input[tid] * 2UInt(size)global_idx.xUInt...LayoutTensor[dtype, layout, ...]MutAnyOrigindef my_kernel(
input: LayoutTensor[DType.float32, layout, MutAnyOrigin],
output: LayoutTensor[DType.float32, layout, MutAnyOrigin],
size: Int, # 标量参数没问题
):
var tid = global_idx.x
if tid < UInt(size):
output[tid] = input[tid] * 2UInt(size)global_idx.xUInt...LayoutTensor[dtype, layout, ...]comptime layout_1d = Layout.row_major(1024) # 1D
comptime layout_2d = Layout.row_major(64, 64) # 2D (rows, cols)
comptime layout_3d = Layout.row_major(10, 5, 3) # 3D (e.g. H, W, C)comptime layout_1d = Layout.row_major(1024) # 1D布局
comptime layout_2d = Layout.row_major(64, 64) # 2D布局(行、列)
comptime layout_3d = Layout.row_major(10, 5, 3) # 3D布局(例如高、宽、通道)var buf = ctx.enqueue_create_buffer[DType.float32](comptime (layout.size()))
var tensor = LayoutTensor[DType.float32, layout](buf) # wraps device buffervar buf = ctx.enqueue_create_buffer[DType.float32](comptime (layout.size()))
var tensor = LayoutTensor[DType.float32, layout](buf) # 包装设备缓冲区tensor[tid] # 1D
tensor[row, col] # 2D
tensor[row, col, channel] # 3D
tensor.dim(0) # query dimension size
tensor.shape[0]() # also workstensor[tid] # 1D索引
tensor[row, col] # 2D索引
tensor[row, col, channel] # 3D索引
tensor.dim(0) # 查询维度大小
tensor.shape[0]() # 同样有效undefinedundefinedundefinedundefinedundefinedundefinedundefinedundefinedvar val = tensor[row, col].cast[DType.float32]() # cast elementvar val = tensor[row, col].cast[DType.float32]() # 转换元素类型rebindrebindtensor[idx]SIMD[dtype, layout_expr]layout_expr__iadd__undefinedtensor[idx]SIMD[dtype, layout_expr]layout_expr__iadd__undefined
`rebind` is a builtin (no import needed). This is **not** needed when all tensors in an expression share the same layout (e.g., the matmul example where `sa` and `sb` have identical tile layouts).
Also use `rebind` when reading/writing individual elements for scalar arithmetic or passing to helper functions — even with a single tensor:
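```mojo
var val = rebind[Scalar[dtype]](tensor[row, col])
tensor[row, col] = rebind[tensor.element_type](val * 2)
```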
`rebind`是内置函数(无需导入)。当表达式中所有张量共享相同布局时(例如矩阵乘法示例中`sa`和`sb`具有相同的子块布局),则不需要使用`rebind`。
在读取/写入单个元素进行标量运算或传递给辅助函数时,即使是单个张量也应使用`rebind`:
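```mojo
var val = rebind[Scalar[dtype]](tensor[row, col])
tensor[row, col] = rebind[tensor.element_type](val * 2)
```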
`tensor.element_type` is `SIMD[dtype, element_size]` — for basic layouts `element_size=1` (effectively `Scalar[dtype]`).
`tensor.element_type`是`SIMD[dtype, element_size]`——对于基础布局,`element_size=1`(实际上等同于`Scalar[dtype]`)。var ctx = DeviceContext()var ctx = DeviceContext()undefinedundefinedenqueue_functionctx.enqueue_function[my_kernel, my_kernel](
input_tensor,
output_tensor,
size, # scalar args passed directly
grid_dim=num_blocks, # 1D: scalar
block_dim=block_size, # 1D: scalar
)enqueue_functionctx.enqueue_function[my_kernel, my_kernel](
input_tensor,
output_tensor,
size, # 标量参数直接传递
grid_dim=num_blocks, # 1D:标量
block_dim=block_size, # 1D:标量
)
For parameterized kernels, bind parameters first:
```mojo
comptime kernel = sum_kernel[SIZE, BATCH_SIZE]
ctx.enqueue_function[kernel, kernel](out_buf, in_buf, grid_dim=N, block_dim=TPB)
```
对于参数化内核,需先绑定参数:
```mojo
comptime kernel = sum_kernel[SIZE, BATCH_SIZE]
ctx.enqueue_function[kernel, kernel](out_buf, in_buf, grid_dim=N, block_dim=TPB)LayoutTensor.stack_allocation()from std.gpu.memory import AddressSpace
comptime tile_layout = Layout.row_major(TILE_M, TILE_K)
var tile_shared = LayoutTensor[
DType.float32,
tile_layout,
MutAnyOrigin,
address_space=AddressSpace.SHARED,
].stack_allocation()LayoutTensor.stack_allocation()from std.gpu.memory import AddressSpace
comptime tile_layout = Layout.row_major(TILE_M, TILE_K)
var tile_shared = LayoutTensor[
DType.float32,
tile_layout,
MutAnyOrigin,
address_space=AddressSpace.SHARED,
].stack_allocation()undefinedundefinedundefinedundefined
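All of these built-in indices (for example `global_idx.x`, `thread_idx.x`, `block_idx.x`) return `UInt`. For bounds checks against an `Int`, convert it first and compare against `UInt(int_val)`, e.g. `if tid < UInt(size):`.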
所有函数都返回`UInt`类型。进行边界检查时请与`UInt(int_val)`比较。from std.gpu import barrier
from std.gpu.primitives import warp
from std.os.atomic import Atomic
barrier() # block-level sync
var warp_sum = warp.sum(my_value) # warp-wide sum reduction
var result = warp.reduce[warp.shuffle_down, reduce_fn](val) # custom warp reduce
_ = Atomic.fetch_add(output_ptr, value) # atomic addfrom std.gpu import barrier
from std.gpu.primitives import warp
from std.os.atomic import Atomic
barrier() # 块级同步
var warp_sum = warp.sum(my_value) # Warp级求和归约
var result = warp.reduce[warp.shuffle_down, reduce_fn](val) # 自定义Warp归约
_ = Atomic.fetch_add(output_ptr, value) # 原子加法from std.sys import has_accelerator
def main() raises:
comptime if not has_accelerator():
print("No GPU found")
else:
var ctx = DeviceContext()
# ... GPU codecomptime assert has_accelerator(), "Requires a GPU"from std.sys import has_accelerator
def main() raises:
comptime if not has_accelerator():
print("未找到GPU")
else:
var ctx = DeviceContext()
# ... GPU代码comptime assert has_accelerator(), "需要GPU支持"is_has_is_*has_*is_*has_*from std.sys.info import (
# Target checks — "am I being compiled FOR this GPU?"
# Use inside kernels or GPU-targeted code paths.
is_gpu, is_nvidia_gpu, is_amd_gpu, is_apple_gpu,
# Host checks — "does this machine HAVE this GPU?"
# Use from host code to decide whether to launch GPU work.
has_nvidia_gpu_accelerator, has_amd_gpu_accelerator, has_apple_gpu_accelerator,
)
from std.sys import has_accelerator # host check: any GPU presentis_*has_*from std.sys.info import (
# 目标检查——“我是否正在为该GPU编译?”
# 在内核或GPU目标代码路径中使用。
is_gpu, is_nvidia_gpu, is_amd_gpu, is_apple_gpu,
# 主机检查——“该机器是否拥有该GPU?”
# 在主机代码中使用,用于决定是否启动GPU任务。
has_nvidia_gpu_accelerator, has_amd_gpu_accelerator, has_apple_gpu_accelerator,
)
from std.sys import has_accelerator # 主机检查:是否存在任何GPU
Subarchitecture checks (inside GPU code only):
```mojo
from std.sys.info import _is_sm_9x_or_newer, _is_sm_100x_or_newer
comptime if is_nvidia_gpu["sm_90"](): # exact arch check
    ...
```
子架构检查(仅在GPU代码内部使用):
```mojo
from std.sys.info import _is_sm_9x_or_newer, _is_sm_100x_or_newer
comptime if is_nvidia_gpu["sm_90"](): # 精确架构检查
...comptimecomptime dtype = DType.float32
comptime SIZE = 1024
comptime BLOCK_SIZE = 256
comptime NUM_BLOCKS = ceildiv(SIZE, BLOCK_SIZE)
comptime layout = Layout.row_major(SIZE)comptime (layout.size())comptimecomptime dtype = DType.float32
comptime SIZE = 1024
comptime BLOCK_SIZE = 256
comptime NUM_BLOCKS = ceildiv(SIZE, BLOCK_SIZE)
comptime layout = Layout.row_major(SIZE)comptime (layout.size())from std.math import ceildiv
from std.sys import has_accelerator
from std.gpu import global_idx
from std.gpu.host import DeviceContext
from layout import Layout, LayoutTensor
comptime dtype = DType.float32
comptime N = 1024
comptime BLOCK = 256
comptime layout = Layout.row_major(N)
def add_kernel(
    a: LayoutTensor[dtype, layout, MutAnyOrigin],
    b: LayoutTensor[dtype, layout, MutAnyOrigin],
    c: LayoutTensor[dtype, layout, MutAnyOrigin],
    size: Int,
):
    """Element-wise add: writes `a[i] + b[i]` into `c[i]` for one index per thread.

    Args:
        a: First input tensor.
        b: Second input tensor.
        c: Output tensor that receives the sums.
        size: Number of valid elements; threads at or past this index do nothing.
    """
    var idx = global_idx.x
    # `global_idx.x` is a UInt, so the Int bound is converted before comparing.
    if idx >= UInt(size):
        return
    c[idx] = a[idx] + b[idx]
def main() raises:
comptime assert has_accelerator(), "Requires GPU"
var ctx = DeviceContext()
var a_buf = ctx.enqueue_create_buffer[dtype](N)
var b_buf = ctx.enqueue_create_buffer[dtype](N)
var c_buf = ctx.enqueue_create_buffer[dtype](N)
a_buf.enqueue_fill(1.0)
b_buf.enqueue_fill(2.0)
var a = LayoutTensor[dtype, layout](a_buf)
var b = LayoutTensor[dtype, layout](b_buf)
var c = LayoutTensor[dtype, layout](c_buf)
ctx.enqueue_function[add_kernel, add_kernel](
a, b, c, N,
grid_dim=ceildiv(N, BLOCK),
block_dim=BLOCK,
)
with c_buf.map_to_host() as host:
var result = LayoutTensor[dtype, layout](host)
print(result)from std.math import ceildiv
from std.sys import has_accelerator
from std.gpu import global_idx
from std.gpu.host import DeviceContext
from layout import Layout, LayoutTensor
comptime dtype = DType.float32
comptime N = 1024
comptime BLOCK = 256
comptime layout = Layout.row_major(N)
def add_kernel(
    a: LayoutTensor[dtype, layout, MutAnyOrigin],
    b: LayoutTensor[dtype, layout, MutAnyOrigin],
    c: LayoutTensor[dtype, layout, MutAnyOrigin],
    size: Int,
):
    """Element-wise add: writes `a[i] + b[i]` into `c[i]` for one index per thread.

    Args:
        a: First input tensor.
        b: Second input tensor.
        c: Output tensor that receives the sums.
        size: Number of valid elements; threads at or past this index do nothing.
    """
    var idx = global_idx.x
    # `global_idx.x` is a UInt, so the Int bound is converted before comparing.
    if idx >= UInt(size):
        return
    c[idx] = a[idx] + b[idx]
def main() raises:
comptime assert has_accelerator(), "需要GPU支持"
var ctx = DeviceContext()
var a_buf = ctx.enqueue_create_buffer[dtype](N)
var b_buf = ctx.enqueue_create_buffer[dtype](N)
var c_buf = ctx.enqueue_create_buffer[dtype](N)
a_buf.enqueue_fill(1.0)
b_buf.enqueue_fill(2.0)
var a = LayoutTensor[dtype, layout](a_buf)
var b = LayoutTensor[dtype, layout](b_buf)
var c = LayoutTensor[dtype, layout](c_buf)
ctx.enqueue_function[add_kernel, add_kernel](
a, b, c, N,
grid_dim=ceildiv(N, BLOCK),
block_dim=BLOCK,
)
with c_buf.map_to_host() as host:
var result = LayoutTensor[dtype, layout](host)
print(result)from std.math import ceildiv
from std.sys import has_accelerator
from std.gpu.sync import barrier
from std.gpu.host import DeviceContext
from std.gpu import thread_idx, block_idx
from std.gpu.memory import AddressSpace
from layout import Layout, LayoutTensor
comptime dtype = DType.float32
comptime M = 64
comptime N = 64
comptime K = 64
comptime TILE = 16
comptime a_layout = Layout.row_major(M, K)
comptime b_layout = Layout.row_major(K, N)
comptime c_layout = Layout.row_major(M, N)
comptime tile_a = Layout.row_major(TILE, TILE)
comptime tile_b = Layout.row_major(TILE, TILE)
def matmul_kernel(
    A: LayoutTensor[dtype, a_layout, MutAnyOrigin],
    B: LayoutTensor[dtype, b_layout, MutAnyOrigin],
    C: LayoutTensor[dtype, c_layout, MutAnyOrigin],
):
    """Tiled matrix multiply in which each thread accumulates one element of C.

    The K dimension is walked in steps of TILE. On each step the block stages
    one TILE x TILE tile of A and one of B in shared memory, then every thread
    adds the products for its own (row, col) position.

    Args:
        A: Left operand, laid out as `a_layout`.
        B: Right operand, laid out as `b_layout`.
        C: Output, laid out as `c_layout`.
    """
    var tx = thread_idx.x
    var ty = thread_idx.y
    var out_row = block_idx.y * TILE + ty
    var out_col = block_idx.x * TILE + tx

    # Staging tiles in the SHARED address space.
    var sa = LayoutTensor[
        dtype, tile_a, MutAnyOrigin, address_space=AddressSpace.SHARED
    ].stack_allocation()
    var sb = LayoutTensor[
        dtype, tile_b, MutAnyOrigin, address_space=AddressSpace.SHARED
    ].stack_allocation()

    var total: C.element_type = 0.0

    comptime for tile_start in range(0, K, TILE):
        var a_col = UInt(tile_start) + tx
        var b_row = UInt(tile_start) + ty

        # Out-of-range positions are zero-filled so they add nothing to the sum.
        if out_row < M and a_col < K:
            sa[ty, tx] = A[out_row, a_col]
        else:
            sa[ty, tx] = 0.0

        if b_row < K and out_col < N:
            sb[ty, tx] = B[b_row, out_col]
        else:
            sb[ty, tx] = 0.0

        # Both tiles must be fully written before any thread reads them.
        barrier()

        comptime for inner in range(TILE):
            total += sa[ty, inner] * sb[inner, tx]

        # Every thread must finish reading before the tiles are overwritten.
        barrier()

    if out_row < M and out_col < N:
        C[out_row, out_col] = total
def main() raises:
comptime assert has_accelerator(), "Requires GPU"
var ctx = DeviceContext()
# ... allocate buffers, init data, launch:
# ctx.enqueue_function[matmul_kernel, matmul_kernel](
# A, B, C,
# grid_dim=(ceildiv(N, TILE), ceildiv(M, TILE)),
# block_dim=(TILE, TILE),
# )from std.math import ceildiv
from std.sys import has_accelerator
from std.gpu.sync import barrier
from std.gpu.host import DeviceContext
from std.gpu import thread_idx, block_idx
from std.gpu.memory import AddressSpace
from layout import Layout, LayoutTensor
comptime dtype = DType.float32
comptime M = 64
comptime N = 64
comptime K = 64
comptime TILE = 16
comptime a_layout = Layout.row_major(M, K)
comptime b_layout = Layout.row_major(K, N)
comptime c_layout = Layout.row_major(M, N)
comptime tile_a = Layout.row_major(TILE, TILE)
comptime tile_b = Layout.row_major(TILE, TILE)
def matmul_kernel(
    A: LayoutTensor[dtype, a_layout, MutAnyOrigin],
    B: LayoutTensor[dtype, b_layout, MutAnyOrigin],
    C: LayoutTensor[dtype, c_layout, MutAnyOrigin],
):
    """Tiled matrix multiply in which each thread accumulates one element of C.

    The K dimension is walked in steps of TILE. On each step the block stages
    one TILE x TILE tile of A and one of B in shared memory, then every thread
    adds the products for its own (row, col) position.

    Args:
        A: Left operand, laid out as `a_layout`.
        B: Right operand, laid out as `b_layout`.
        C: Output, laid out as `c_layout`.
    """
    var tx = thread_idx.x
    var ty = thread_idx.y
    var out_row = block_idx.y * TILE + ty
    var out_col = block_idx.x * TILE + tx

    # Staging tiles in the SHARED address space.
    var sa = LayoutTensor[
        dtype, tile_a, MutAnyOrigin, address_space=AddressSpace.SHARED
    ].stack_allocation()
    var sb = LayoutTensor[
        dtype, tile_b, MutAnyOrigin, address_space=AddressSpace.SHARED
    ].stack_allocation()

    var total: C.element_type = 0.0

    comptime for tile_start in range(0, K, TILE):
        var a_col = UInt(tile_start) + tx
        var b_row = UInt(tile_start) + ty

        # Out-of-range positions are zero-filled so they add nothing to the sum.
        if out_row < M and a_col < K:
            sa[ty, tx] = A[out_row, a_col]
        else:
            sa[ty, tx] = 0.0

        if b_row < K and out_col < N:
            sb[ty, tx] = B[b_row, out_col]
        else:
            sb[ty, tx] = 0.0

        # Both tiles must be fully written before any thread reads them.
        barrier()

        comptime for inner in range(TILE):
            total += sa[ty, inner] * sb[inner, tx]

        # Every thread must finish reading before the tiles are overwritten.
        barrier()

    if out_row < M and out_col < N:
        C[out_row, out_col] = total
def main() raises:
comptime assert has_accelerator(), "需要GPU支持"
var ctx = DeviceContext()
# ... 分配缓冲区、初始化数据、启动内核:
# ctx.enqueue_function[matmul_kernel, matmul_kernel](
# A, B, C,
# grid_dim=(ceildiv(N, TILE), ceildiv(M, TILE)),
# block_dim=(TILE, TILE),
# )undefinedundefinedundefinedundefineddef block_reduce(
output: UnsafePointer[Int32, MutAnyOrigin],
input: UnsafePointer[Int32, MutAnyOrigin],
):
var sums = stack_allocation[512, Scalar[DType.int32],
address_space=AddressSpace.SHARED]()
var tid = thread_idx.x
sums[tid] = input[block_idx.x * block_dim.x + tid]
barrier()
# Tree reduction in shared memory
var active = block_dim.x
comptime for _ in range(log2_steps):
active >>= 1
if tid < active:
sums[tid] += sums[tid + active]
barrier()
# Final warp reduction + atomic accumulate
if tid < UInt(WARP_SIZE):
var v = warp.sum(sums[tid][0])
if tid == 0:
_ = Atomic.fetch_add(output, v)def block_reduce(
output: UnsafePointer[Int32, MutAnyOrigin],
input: UnsafePointer[Int32, MutAnyOrigin],
):
var sums = stack_allocation[512, Scalar[DType.int32],
address_space=AddressSpace.SHARED]()
var tid = thread_idx.x
sums[tid] = input[block_idx.x * block_dim.x + tid]
barrier()
# 共享内存中的树形归约
var active = block_dim.x
comptime for _ in range(log2_steps):
active >>= 1
if tid < active:
sums[tid] += sums[tid + active]
barrier()
# 最终Warp归约 + 原子累加
if tid < UInt(WARP_SIZE):
var v = warp.sum(sums[tid][0])
if tid == 0:
_ = Atomic.fetch_add(output, v)undefinedundefinedundefinedundefinedfrom std.benchmark import Bench, BenchConfig, Bencher, BenchId, BenchMetric, ThroughputMeasure
@parameter
@always_inline
def bench_fn(mut b: Bencher) capturing raises:
    """Benchmark body to pass to `Bench.bench_function`.

    Captures `ctx`, `kernel`, `args`, `G` and `B` from the enclosing scope.

    Args:
        b: Bencher that drives the launch closure through `iter_custom`.
    """

    @parameter
    @always_inline
    def launch_once(ctx: DeviceContext) raises:
        # Enqueue a single launch of the kernel on the context passed in.
        ctx.enqueue_function[kernel, kernel](args, grid_dim=G, block_dim=B)

    b.iter_custom[launch_once](ctx)
var bench = Bench(BenchConfig(max_iters=50000))
bench.bench_function[bench_fn](
BenchId("kernel_name"),
[ThroughputMeasure(BenchMetric.bytes, total_bytes)],
)from std.benchmark import Bench, BenchConfig, Bencher, BenchId, BenchMetric, ThroughputMeasure
@parameter
@always_inline
def bench_fn(mut b: Bencher) capturing raises:
    """Benchmark body to pass to `Bench.bench_function`.

    Captures `ctx`, `kernel`, `args`, `G` and `B` from the enclosing scope.

    Args:
        b: Bencher that drives the launch closure through `iter_custom`.
    """

    @parameter
    @always_inline
    def launch_once(ctx: DeviceContext) raises:
        # Enqueue a single launch of the kernel on the context passed in.
        ctx.enqueue_function[kernel, kernel](args, grid_dim=G, block_dim=B)

    b.iter_custom[launch_once](ctx)
var bench = Bench(BenchConfig(max_iters=50000))
bench.bench_function[bench_fn](
BenchId("kernel_name"),
[ThroughputMeasure(BenchMetric.bytes, total_bytes)],
)| Property | NVIDIA | AMD CDNA | AMD RDNA |
|---|---|---|---|
| Warp size | 32 | 64 | 32 |
| Shared memory | 48-228 KB/block | 64 KB/block | configurable |
| Tensor cores | SM70+ (WMMA) | Matrix cores | WMMA (RDNA3+) |
| TMA | SM90+ (Hopper) | N/A | N/A |
| Clusters | SM90+ | N/A | N/A |
| 属性 | NVIDIA | AMD CDNA | AMD RDNA |
|---|---|---|---|
| Warp大小 | 32 | 64 | 32 |
| 共享内存 | 48-228 KB/块 | 64 KB/块 | 可配置 |
| 张量核心 | SM70+(WMMA) | 矩阵核心 | WMMA(RDNA3+) |
| TMA | SM90+(Hopper) | 无 | 无 |
| 集群 | SM90+ | 无 | 无 |