vLLM Ascend plugin for LLM inference serving on Huawei Ascend NPUs. Use it for offline batch inference, API server deployment, quantized-model inference (with msmodelslim-quantized models), tensor/pipeline parallelism for distributed serving, and OpenAI-compatible API endpoints. Supports Qwen, DeepSeek, GLM, and LLaMA models with Ascend-optimized kernels.
```bash
npx skill4agent add ascend-ai-coding/awesome-ascend-skills vllm-ascend
```

```python
import os
# Required for vLLM-Ascend: set multiprocessing method before importing vLLM
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
from vllm import LLM, SamplingParams
# Load model with Ascend NPU (device auto-detected when vllm-ascend is installed)
llm = LLM(
model="Qwen/Qwen2.5-7B-Instruct",
max_model_len=4096
)
# Prepare prompts and sampling params
prompts = [
"Hello, how are you?",
"Explain quantum computing in simple terms.",
]
sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=512)
# Generate outputs
outputs = llm.generate(prompts, sampling_params)
# Print results
for output in outputs:
    print(f"Prompt: {output.prompt}")
    print(f"Output: {output.outputs[0].text}\n")
```
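For chat-style prompts in the same offline session, recent vLLM releases also expose an `LLM.chat()` helper that applies the model's chat template before generating; a minimal sketch, reusing the `llm` and `sampling_params` objects from above:

```python
# Sketch: LLM.chat() formats the messages with the model's chat template
messages = [
    {"role": "user", "content": "Give me a haiku about the ocean."},
]
chat_outputs = llm.chat(messages, sampling_params)
print(chat_outputs[0].outputs[0].text)
```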
```bash
# Start the API server
vllm serve Qwen/Qwen2.5-7B-Instruct \
--max-model-len 4096 \
--max-num-seqs 256 \
--served-model-name "qwen2.5-7b"
# Or using Python
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-7B-Instruct \
--max-model-len 4096
```

```python
import requests
# Completions API
response = requests.post(
"http://localhost:8000/v1/completions",
json={
"model": "qwen2.5-7b",
"prompt": "Once upon a time",
"max_tokens": 100,
"temperature": 0.7
}
)
print(response.json())
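
# Streaming variant (a sketch): with "stream": True the server returns
# Server-Sent Events, one "data: ..." line per generated chunk
response = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "qwen2.5-7b",
        "prompt": "Once upon a time",
        "max_tokens": 100,
        "stream": True,
    },
    stream=True,
)
for line in response.iter_lines():
    if line:
        print(line.decode("utf-8"))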
# Chat Completions API
response = requests.post(
"http://localhost:8000/v1/chat/completions",
json={
"model": "qwen2.5-7b",
"messages": [
{"role": "user", "content": "Hello!"}
],
"max_tokens": 100
}
)
print(response.json())
```
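The same endpoints work with the official `openai` Python client (`pip install openai`); a minimal sketch, assuming the server was started without `--api-key` (vLLM then accepts any placeholder key):

```python
from openai import OpenAI

# Point the client at the local vLLM server instead of api.openai.com
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="qwen2.5-7b",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=100,
)
print(resp.choices[0].message.content)
```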
```bash
# Pull pre-built image
docker pull ascendai/vllm-ascend:latest
# Run with NPU access
docker run -it --rm \
--device /dev/davinci0 \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/Ascend/add-ons:/usr/local/Ascend/add-ons \
-e ASCEND_RT_VISIBLE_DEVICES=0 \
ascendai/vllm-ascend:latest
```
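To run tensor-parallel workloads inside the container, map one `/dev/davinciN` per NPU you plan to use; a sketch under that assumption (device IDs are illustrative, check `npu-smi info` on your host):

```bash
docker run -it --rm \
  --device /dev/davinci0 \
  --device /dev/davinci1 \
  --device /dev/davinci2 \
  --device /dev/davinci3 \
  --device /dev/davinci_manager \
  --device /dev/devmm_svm \
  --device /dev/hisi_hdc \
  -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
  -v /usr/local/Ascend/add-ons:/usr/local/Ascend/add-ons \
  -e ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 \
  ascendai/vllm-ascend:latest
```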
```bash
# Install vLLM with Ascend plugin
pip install vllm-ascend
# Or install from source
git clone https://github.com/vllm-project/vllm-ascend.git
cd vllm-ascend
pip install -e .
```

```bash
# Check vLLM Ascend installation
python -c "import vllm_ascend; print(vllm_ascend.__version__)"
# Check NPU availability
python -c "import torch; import torch_npu; print(torch_npu.npu.device_count())"
```
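Beyond the version checks, a quick tensor operation confirms that kernels actually run on the NPU; a minimal smoke test, assuming `torch_npu` adds the `.npu()` tensor method as in its documentation:

```python
import torch
import torch_npu  # registers the NPU backend with PyTorch

# Move a tensor to the first NPU, run a trivial op, copy back
x = torch.ones(2, 2).npu()
print((x + x).cpu())  # expect a 2x2 tensor of 2s
```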
```bash
# Basic server deployment
vllm serve <model_path> \
--served-model-name <name> \
--host 0.0.0.0 \
--port 8000
# Production deployment with optimizations
vllm serve /path/to/model \
--served-model-name "qwen2.5-72b" \
--max-model-len 8192 \
--max-num-seqs 256 \
--tensor-parallel-size 4 \
--gpu-memory-utilization 0.9 \
--dtype bfloat16 \
--api-key <your-api-key>
```

```python
import os
# Required: Set spawn method before importing vLLM
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
from vllm import LLM, SamplingParams
# Single NPU
llm = LLM(
model="Qwen/Qwen2.5-7B-Instruct",
max_model_len=4096,
dtype="bfloat16"
)
# Distributed inference (multi-NPU)
llm = LLM(
model="Qwen/Qwen2.5-72B-Instruct",
tensor_parallel_size=4,
max_model_len=8192
)
# Generate
params = SamplingParams(temperature=0.7, max_tokens=512)
outputs = llm.generate(["Hello world"], params)
```

```python
from vllm import LLMEngine, EngineArgs, SamplingParams
engine_args = EngineArgs(
model="Qwen/Qwen2.5-7B-Instruct",
max_model_len=4096
)
engine = LLMEngine.from_engine_args(engine_args)
# Add requests and step through generation
request_id = "req-001"
prompt = "Hello, world!"
params = SamplingParams(max_tokens=50)
engine.add_request(request_id, prompt, params)
while engine.has_unfinished_requests():
    outputs = engine.step()
    for output in outputs:
        if output.finished:
            print(f"{output.request_id}: {output.outputs[0].text}")
```

```bash
# W8A8 quantized model
vllm serve /path/to/quantized-model-w8a8 \
--quantization ascend \
--max-model-len 4096
# W4A8 quantized model
vllm serve /path/to/quantized-model-w4a8 \
--quantization ascend \
--max-model-len 4096
```

```python
from vllm import LLM, SamplingParams
llm = LLM(
model="/path/to/quantized-model",
quantization="ascend",
max_model_len=4096
)
params = SamplingParams(temperature=0.7, max_tokens=512)
outputs = llm.generate(["Hello"], params)
```

```bash
# 4-way tensor parallelism
vllm serve Qwen/Qwen2.5-72B-Instruct \
--tensor-parallel-size 4 \
--max-model-len 8192
```

```python
import os
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
from vllm import LLM
llm = LLM(
model="Qwen/Qwen2.5-72B-Instruct",
tensor_parallel_size=4,
max_model_len=8192
)
```

```python
from vllm import LLM
llm = LLM(
model="DeepSeek-V3",
pipeline_parallel_size=2,
tensor_parallel_size=4
)
```

```bash
# Node 0 (Rank 0)
vllm serve <model> \
--tensor-parallel-size 8 \
--pipeline-parallel-size 2 \
--distributed-init-method "tcp://192.168.1.10:29500" \
--distributed-rank 0
# Node 1 (Rank 1)
vllm serve <model> \
--tensor-parallel-size 8 \
--pipeline-parallel-size 2 \
--distributed-init-method "tcp://192.168.1.10:29500" \
--distributed-rank 1
```

| Parameter | Default | Description | Tuning Advice |
|---|---|---|---|
| `--max-model-len` | Model max | Maximum sequence length | Reduce if OOM |
| `--max-num-seqs` | 256 | Max concurrent sequences | Increase for throughput |
| `--gpu-memory-utilization` | 0.9 | GPU memory fraction | Lower if OOM during warmup |
| `--dtype` | auto | Data type | bfloat16 for speed, float16 for compatibility |
| `--tensor-parallel-size` | 1 | Tensor parallelism degree | Use for large models |
| `--pipeline-parallel-size` | 1 | Pipeline parallelism degree | Use for very large models |
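The same knobs are available as constructor arguments in the offline `LLM` API; a minimal sketch mapping each flag to its keyword (values are illustrative, not recommendations):

```python
import os

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

from vllm import LLM

# Each keyword mirrors the CLI flag of the same name
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",
    max_model_len=4096,          # --max-model-len
    max_num_seqs=256,            # --max-num-seqs
    gpu_memory_utilization=0.9,  # --gpu-memory-utilization
    dtype="bfloat16",            # --dtype
    tensor_parallel_size=1,      # --tensor-parallel-size
)
```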
```bash
# Small model (7B), single NPU
vllm serve <model> --max-model-len 4096 --max-num-seqs 256
# Medium model (32B), single NPU
vllm serve <model> --max-model-len 8192 --max-num-seqs 128
# Large model (72B), multi-NPU
vllm serve <model> --tensor-parallel-size 4 --max-model-len 8192
# Maximum throughput
vllm serve <model> --max-num-seqs 512 --gpu-memory-utilization 0.95
```

```bash
# Check CANN version compatibility
npu-smi info
# Ensure CANN >= 8.0.RC1
# Try different dtype
vllm serve <model> --dtype float16
```

```bash
# Reduce max model length
vllm serve <model> --max-model-len 2048
# Lower memory utilization
vllm serve <model> --gpu-memory-utilization 0.8
# Reduce concurrent sequences
vllm serve <model> --max-num-seqs 128
```

```bash
# Check model path
ls /path/to/model
# Verify tokenizer
python -c "from transformers import AutoTokenizer; tok = AutoTokenizer.from_pretrained('/path/to/model'); print('OK')"
# Use trust_remote_code for custom models
vllm serve <model> --trust-remote-code
```

```bash
# Enable bfloat16 for faster compute
vllm serve <model> --dtype bfloat16
# Adjust block size
vllm serve <model> --block-size 256
# Enable prefix caching
vllm serve <model> --enable-prefix-caching
```

```bash
# Check server is running
curl http://localhost:8000/health
# Verify port is not in use
lsof -i :8000
# Use explicit host/port
vllm serve <model> --host 0.0.0.0 --port 8000
```
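When scripting deployments, it helps to block until the server actually answers; a small sketch that polls the `/health` endpoint shown above:

```bash
# Poll /health until the server responds (gives up after 60 seconds)
for i in $(seq 1 60); do
  if curl -sf http://localhost:8000/health > /dev/null; then
    echo "server ready"
    break
  fi
  sleep 1
done
```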
```bash
# Required: Set multiprocessing method for vLLM-Ascend
export VLLM_WORKER_MULTIPROC_METHOD=spawn
# Set Ascend device IDs
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
# Debug logging
export VLLM_LOGGING_LEVEL=DEBUG
# Disable lazy initialization (for debugging)
export VLLM_ASCEND_LAZY_INIT=0
```

Bundled scripts:

- scripts/benchmark_throughput.py
- scripts/benchmark_latency.py
- scripts/start_server.sh