启动推理服务
==================

天数
^^^^

.. code-block:: shell

   CUDA_VISIBLE_DEVICES=4,5,6,7 VLLM_ENFORCE_CUDA_GRAPH=1 python3 -m vllm.entrypoints.openai.api_server --model /nvme0n1/llm_models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B --host 0.0.0.0 --port 1025 --tensor-parallel-size 4 --max-model-len 8192 --gpu-memory-utilization 0.9 --disable-log-requests

Ascend
^^^^^^

.. code-block:: shell

   ASCEND_RT_VISIBLE_DEVICES=4,5,6,7 vllm serve /nvme0n1/llm_models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B --served-model-name DeepSeek-R1-Distill-Qwen-32B --tensor-parallel-size 4 --max-model-len 8192 --gpu-memory-utilization 0.9 --host 0.0.0.0 --port 1025 --disable-log-requests

NVIDIA
^^^^^^

.. code-block:: shell

   CUDA_VISIBLE_DEVICES=4,5,6,7 vllm serve --model /nvme0n1/llm_models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B --host 0.0.0.0 --port 1025 --tensor-parallel-size 4 --max-model-len 8192 --disable-log-requests