Loading...
Loading...
将简单Vector类型Triton算子从GPU迁移到昇腾NPU。当用户需要迁移Triton代码到NPU、提到GPU到NPU迁移、Triton迁移、昇腾适配时使用。注意:无法自动迁移存在编译问题的算子。
npx skill4agent add ascend/agent-skills simple-vector-triton-gpu-to-npu# 1. 分析源代码
# 使用 templates/analysis_template.md 生成语义分析报告
# 2. 最小化迁移
# 只修改设备:device='cuda' -> device='npu'
# 3. 运行测试
python test_your_kernel.py
# 4. 根据错误调整
# 参考 reference/troubleshooting.md 解决问题# 安装依赖
pip uninstall triton # 卸载社区Triton
pip install triton-ascend
pip install torch-npu
# 验证安装
python -c "import torch_npu; print(torch_npu.npu.is_available())"# 第一步:只修改设备指定
# device='cuda' -> device='npu'
x = torch.rand(size, device='npu')
# 第二步:运行测试
try:
result = kernel_npu(**test_inputs)
print("✅ 基础运行成功")
except Exception as e:
print(f"❌ 运行失败: {e}")| GPU API | NPU API | 说明 |
|---|---|---|
| | 检查设备是否可用 |
| | 清空缓存 |
| | 同步设备 |
| | 获取内存信息 |
| | 设备指定 |
| 删除 | NPU暂不支持torch.compile训练 |
# 直接使用NPU API
import torch_npu
with torch_npu.npu.device(device_index):
kernel[grid](...)props = torch_npu.npu.get_device_properties(device)
sm_count = props.vector_core_num # Ascend910为48| GPU属性 | NPU属性 | 典型值(Ascend910) |
|---|---|---|
| multi_processor_count | vector_core_num | 48 |
| total_memory | total_memory | 62740MB |
| name | name | 'Ascend910_9392' |
| - | cube_core_num | 24 |
| - | L2_cache_size | '192MB' |
| 错误类型 | 错误信息关键词 | 解决方案 |
|---|---|---|
| 编译错误 | compilation failed | 检查Triton语法兼容性 |
| coreDim超限 | coreDim > UINT16_MAX | 增大BLOCK_SIZE或设置环境变量 |
| UB溢出 | ub overflow | 使用子块切分策略 |
| 精度问题 | NaN, Inf, 不匹配 | 检查逻辑运算符、mask使用 |
| 性能问题 | 运行缓慢 | 优化内存访问、使用Tiling |
def verify_accuracy(result, ref, dtype):
# 检查NaN/Inf
assert not torch.isnan(result).any(), "结果包含NaN"
assert not torch.isinf(result).any(), "结果包含Inf"
# 设置容差
if dtype in [torch.float16, torch.bfloat16]:
rtol, atol = 1e-3, 1e-3
elif dtype == torch.float32:
rtol, atol = 1e-4, 1e-4
else:
rtol, atol = 0, 0
torch.testing.assert_close(result, ref, rtol=rtol, atol=atol)@triton.jit
def add_kernel(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
pid = tl.program_id(axis=0)
offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
mask = offsets < n_elements
x = tl.load(x_ptr + offsets, mask=mask)
y = tl.load(y_ptr + offsets, mask=mask)
output = x + y
tl.store(output_ptr + offsets, output, mask=mask)
x = torch.rand(98432, device='cuda')
y = torch.rand(98432, device='cuda')@triton.jit
def add_kernel(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
pid = tl.program_id(axis=0)
offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
mask = offsets < n_elements
x = tl.load(x_ptr + offsets, mask=mask)
y = tl.load(y_ptr + offsets, mask=mask)
output = x + y
tl.store(output_ptr + offsets, output, mask=mask)
x = torch.rand(98432, device='npu')
y = torch.rand(98432, device='npu')care_padding=FalseotherNtl.loadtl.storetorch.cuda.*torch.npu.*torch_npu.npu.*vector_core_nummulti_processor_count@torch.compile| GPU API | NPU API | 说明 |
|---|---|---|
| | 检查设备是否可用 |
| | 清空缓存 |
| | 同步设备 |
| | 获取内存信息 |
| | 设备指定 |
| 删除 | NPU暂不支持torch.compile训练 |
care_padding=Falsecare_padding=False# Step 1: 初始迁移(不添加)
x = tl.load(x_ptr + offsets, mask=mask)
# Step 2: 功能验证通过后,可选的性能优化
# x = tl.load(x_ptr + offsets, mask=mask, care_padding=False)care_padding=False# ✅ 正确示例:load 和 store 使用各自的 mask
out_mask = rows_mask & cols_mask[None, :] # 输出边界检查
final_mask = out_mask & index_valid_mask[None, :] # 输入有效性检查
selected = tl.load(inp + inp_off, mask=final_mask, other=0.0)
tl.store(out + out_off, selected, mask=out_mask) # 正确!| 操作 | mask 含义 | 应检查的内容 |
|---|---|---|
| 哪些输入位置需要读取 | 索引有效性、输入边界 |
| 哪些输出位置需要写入 | 输出边界、行列范围 |
pip install triton-ascend torch-npudevice='cuda'device='npu'grid = (num_core,)grid = (NV, NK, N * H) # 3D逻辑网格
kernel[grid](...)import torch_npu
import triton.runtime.driver as driver
def get_npu_properties():
device = torch.npu.current_device()
return driver.active.utils.get_device_properties(device)
num_core = get_npu_properties()["num_vectorcore"]
grid = (num_core,) # 1D物理核心网格i_v, i_k, i_nh = tl.program_id(0).to(tl.int64), tl.program_id(1).to(tl.int64), tl.program_id(2).to(tl.int64)
i_n, i_h = i_nh // H, i_nh % Hcore_id = tl.program_id(0)
task_num = NV * NK * N * H
knh_step = NK * N * H
nh_step = N * H
for task_id in tl.range(core_id, task_num, num_core):
i_v = task_id // knh_step
i_k = task_id % knh_step // nh_step
i_nh = task_id % knh_step % nh_step
i_n = task_id % knh_step % nh_step // H
i_h = task_id % knh_step % nh_step % H
# ... 原有内核逻辑def kernel(...,
knh_step: tl.constexpr,
nh_step: tl.constexpr,
N: tl.constexpr,
task_num: tl.constexpr,
num_core: tl.constexpr,
...):grid = (dim1, dim2, dim3, ...)tl.program_id(0)tl.program_id(1)# 计算总任务数
task_num = dim1 * dim2 * dim3 * ... # 所有网格维度的乘积
# 计算每个维度的步长
# 3D网格(dim1, dim2, dim3)示例:
step_dim2_dim3 = dim2 * dim3
step_dim3 = dim3
# 在内核中:
# task_id = core_id + i * num_core
# dim1_idx = task_id // step_dim2_dim3
# dim2_idx = (task_id % step_dim2_dim3) // step_dim3
# dim3_idx = task_id % step_dim3# 之前:
i0 = tl.program_id(0)
i1 = tl.program_id(1)
i2 = tl.program_id(2)
# 之后:
core_id = tl.program_id(0)
for task_id in tl.range(core_id, task_num, num_core):
i0 = task_id // step_dim2_dim3
i1 = (task_id % step_dim2_dim3) // step_dim3
i2 = task_id % step_dim3
# ... 所有使用这些变量的代码都在循环内部pid_bpid_hpid_seqi0i1i2seq_lenTBIS_VARLENseq_offsetboseosnchunks# ❌ 错误:变量在循环内部定义,但在外部使用
core_id = tl.program_id(0)
for task_id in tl.range(core_id, task_num, num_core):
pid_b = task_id // h_step
pid_h = task_id % h_step
# ... 其他变量定义
# 错误:在循环外部使用循环内部定义的变量
nchunks = tl.cdiv(seq_len, CHUNK_SIZE) # seq_len未定义
ANGLE += pid_b * stride_angle_batch # pid_b未定义# ✅ 正确:所有使用循环内部变量的代码都在循环内部
core_id = tl.program_id(0)
for task_id in tl.range(core_id, task_num, num_core):
pid_b = task_id // h_step
pid_h = task_id % h_step
seq_len = ... # 在循环内部定义
# 所有使用这些变量的代码都在循环内部
nchunks = tl.cdiv(seq_len, CHUNK_SIZE)
angle_ptr = ANGLE + pid_b * stride_angle_batch # 使用局部变量
# ... 后续所有计算# 之前:
grid = (dim1, dim2, dim3)
kernel[grid](...)
# 之后:
num_core = get_npu_properties()["num_vectorcore"]
grid = (num_core,)
kernel[grid](
...,
knh_step=step_dim2_dim3,
nh_step=step_dim3,
N=dim1, # 或适当的映射
task_num=task_num,
num_core=num_core,
)@triton.jit
def kernel_gpu(x_ptr, output_ptr, N, M, BLOCK_SIZE: tl.constexpr):
pid_n = tl.program_id(0)
pid_m = tl.program_id(1)
# 计算偏移
x = x_ptr + pid_n * M + pid_m * BLOCK_SIZE
# ... 计算逻辑
# 启动内核
grid = (N, M // BLOCK_SIZE)
kernel_gpu[grid](x, output, N, M, BLOCK_SIZE=128)@triton.jit
def kernel_npu(x_ptr, output_ptr, N, M, BLOCK_SIZE: tl.constexpr,
m_step: tl.constexpr, task_num: tl.constexpr, num_core: tl.constexpr):
core_id = tl.program_id(0)
for task_id in tl.range(core_id, task_num, num_core):
pid_n = task_id // m_step
pid_m = task_id % m_step
# 计算偏移
x = x_ptr + pid_n * M + pid_m * BLOCK_SIZE
# ... 计算逻辑(所有代码都在循环内部)
# 启动内核
num_core = get_npu_properties()["num_vectorcore"]
m_step = M // BLOCK_SIZE
task_num = N * m_step
grid = (num_core,)
kernel_npu[grid](x, output, N, M, BLOCK_SIZE=128,
m_step=m_step, task_num=task_num, num_core=num_core)xxx_optimized.pyxxx_npu.py_npu# ❌ 不要添加_npu后缀
def _layer_norm_fwd_1pass_kernel_npu(...):
...# ✅ 保持原始函数名
def _layer_norm_fwd_1pass_kernel(...):
# NPU optimized implementation
...| 中文注释 | 英文注释 |
|---|---|
| |
| |
| |
| |
| |
| |
| |
def _layer_norm_fwd_1pass_kernel(...):
# GPU kernel implementation
row = tl.program_id(0)
group = tl.program_id(1)
...
def _layer_norm_fwd(...):
grid = (M, ngroups)
with torch.cuda.device(x.device.index):
_layer_norm_fwd_1pass_kernel[grid](...)# NPU support
import torch_npu
import triton.runtime.driver as driver
def get_npu_properties():
"""Get NPU device properties, including number of cores"""
device = torch.npu.current_device()
return driver.active.utils.get_device_properties(device)
def _layer_norm_fwd_1pass_kernel(...,
# NPU task dispatch parameters
ngroups_step: tl.constexpr,
task_num: tl.constexpr,
num_core: tl.constexpr,
):
# NPU optimization: Use 1D physical core grid with task dispatch
core_id = tl.program_id(0)
for task_id in tl.range(core_id, task_num, num_core):
# Reconstruct original 2D indices from task_id
row = task_id // ngroups_step
group = task_id % ngroups_step
# Calculate pointer offsets
X_ptr = X + row * stride_x_row + group * N
# ... kernel logic
def _layer_norm_fwd(...):
# NPU optimization: Use 1D physical core grid
npu_props = get_npu_properties()
num_core = npu_props["num_vectorcore"]
grid = (num_core,)
_layer_norm_fwd_1pass_kernel[grid](...)_npuwith torch.cuda.device(...)num_vectorcoremulti_processor_count(num_core,)ngroups_steptask_numnum_coretorch.cuda.is_available()torch.npu.is_available()torch.cuda.empty_cache()torch.npu.empty_cache()torch.cuda.synchronize()torch.npu.synchronize()torch.cuda.mem_get_info()torch.npu.mem_get_info()device="cuda"device="npu"@torch.compile