One-Click Deployment

Create the launch script

Manually create `qwen3-8b-start.sh`:

```bash
#!/bin/bash

# Deployment parameters
CONTAINER_NAME="qwen3-8b"
IMAGE_NAME="quay.io/ascend/vllm-ascend:v0.14.0rc1"
MODEL_PATH="/models/Qwen/Qwen3-8B"
MODEL_NAME="Qwen3-8B"
TP_SIZE=1                     # tensor-parallel degree (single NPU)
MAX_MODEL_LEN=40960           # maximum context length
GPU_MEMORY_UTILIZATION=0.9
HOST="0.0.0.0"
PORT=8000

# Mount the Ascend driver, tools, and model directory into the container,
# expose NPU 0, and launch the vLLM OpenAI-compatible server.
docker run -itd --rm \
  --name "$CONTAINER_NAME" \
  --shm-size=50G \
  -p "$PORT:$PORT" \
  --privileged \
  --device /dev/davinci0 \
  --device /dev/davinci_manager \
  --device /dev/devmm_svm \
  --device /dev/hisi_hdc \
  -v /usr/local/dcmi:/usr/local/dcmi \
  -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
  -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
  -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
  -v /etc/ascend_install.info:/etc/ascend_install.info \
  -v /nvme1/.cache:/root/.cache \
  -v "$MODEL_PATH:$MODEL_PATH" \
  -e VLLM_USE_MODELSCOPE=True \
  -e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
  -e ASCEND_RT_VISIBLE_DEVICES=0 \
  -e TZ=Asia/Shanghai \
  "$IMAGE_NAME" \
  vllm serve "$MODEL_PATH" \
    --served-model-name "$MODEL_NAME" \
    --tensor-parallel-size "$TP_SIZE" \
    --max-model-len "$MAX_MODEL_LEN" \
    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
    --host "$HOST" \
    --port "$PORT" \
    --disable-log-requests
```
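Before starting the container, it can help to confirm that the Ascend driver and device are visible on the host. `npu-smi info` ships with the standard Ascend driver tools:

```bash
# List Ascend NPUs, driver version, and current utilization on the host
npu-smi info
```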

Start the service

```bash
bash qwen3-8b-start.sh
```
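Once the container is running, you can follow the vLLM startup logs and, after the model finishes loading, query the OpenAI-compatible endpoints. The commands below are a minimal sketch, assuming the default container name and port from the script above:

```bash
# Follow container logs until vLLM reports the server is up
docker logs -f qwen3-8b

# Verify the served model is registered
curl http://localhost:8000/v1/models

# Send a minimal chat completion request
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "Qwen3-8B",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 64
      }'
```

Because the container was started with `--rm`, `docker stop qwen3-8b` both stops and removes it.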