使用 NVIDIA 运行时创建容器（docker run 方式）

# Launch vLLM's OpenAI-compatible server (port 8000) with the HF cache
# mounted and the HF token passed through from the host environment.
# NOTE: the original used `--gpus 0`, which Docker parses as a GPU *count*
# of zero — the container would get no GPU at all. `--gpus device=0`
# selects GPU index 0 (use `--gpus all` to expose every GPU).
docker run --runtime nvidia --gpus device=0 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
    -p 8000:8000 \
    --ipc=host \
    vllm/vllm-openai:latest \
    --model Qwen/Qwen3-0.6B

使用 docker-compose 部署（NVIDIA GPU）

services:
  # vLLM OpenAI-compatible server for Qwen2.5-7B-Instruct.
  vllm:
    container_name: vllm-qwen2.5-7b
    restart: always
    image: vllm/vllm-openai:v0.8.1
    # Share the host IPC namespace (vLLM needs large shared memory for
    # tensor-parallel / PyTorch workers).
    ipc: host
    volumes:
      # Host model directory mounted read into the container.
      - /mnt/data/llm_models:/llm_models
    environment:
      # Restrict the container's view to GPU index 0, even though the
      # device reservation below requests all GPUs.
      - CUDA_VISIBLE_DEVICES=0
    # Arguments appended to the image's vLLM server entrypoint.
    command:
      - --model
      - /llm_models/Qwen/Qwen2.5-7B-Instruct
      - --served-model-name
      - Qwen2.5-7B-Instruct
      - --port
      - "10013"
    ports:
      # Quoted: a plain `10013:10013` is a digits-and-colon scalar that
      # YAML can mis-type; Compose docs recommend always quoting mappings.
      - "10013:10013"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]