| version | GPU  | driver | CUDA | vLLM  | torch       | image                            |
|---------|------|--------|------|-------|-------------|----------------------------------|
| 1       | H100 | 550    | 12.4 |       |             |                                  |
| 2       | H200 | 565    | 12.7 | 0.7.3 | 2.5.1+cu124 |                                  |
| 3       | H200 | 570    | 12.8 | 0.7.3 | 2.5.1+cu124 | nvcr.io/nvidia/pytorch:25.01-py3 |
install
https://developer.nvidia.com/cuda-toolkit-archive
local
```bash
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin
sudo mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
wget https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda-repo-ubuntu2204-12-8-local_12.8.1-570.124.06-1_amd64.deb
sudo dpkg -i cuda-repo-ubuntu2204-12-8-local_12.8.1-570.124.06-1_amd64.deb
sudo cp /var/cuda-repo-ubuntu2204-12-8-local/cuda-*-keyring.gpg /usr/share/keyrings/
sudo apt-get update
sudo apt-get -y install cuda-toolkit-12-8
```
online
```bash
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get -y install cuda-toolkit-12-8
```
post-install
```bash
# configure the environment: add the CUDA toolkit to PATH
export PATH=/usr/local/cuda/bin:$PATH

# start the NVIDIA persistence daemon
sudo systemctl start nvidia-persistenced
```
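With PATH updated, the toolkit install can be verified; nvcc ships with the toolkit, so it should report the release installed above:

```bash
nvcc --version   # should report "release 12.8"
```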
driver
```bash
sudo apt-get install -y cuda-drivers
```
nvidia-fabricmanager
```bash
sudo apt install -y nvidia-fabricmanager-570
sudo systemctl start nvidia-fabricmanager
```
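To have the fabric manager start on every boot and confirm it is healthy, standard systemd commands suffice (the service name matches the package above):

```bash
sudo systemctl enable nvidia-fabricmanager
systemctl status nvidia-fabricmanager --no-pager
```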
ENV
Resolves: `Error 802: system not yet initialized`
```bash
# order GPU IDs to match their IDs on the PCIe bus
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
# perform the availability check through NVML (NVIDIA Management Library),
# the API layer that nvidia-smi itself uses
export PYTORCH_NVML_BASED_CUDA_CHECK=1
# explicitly expose the IDs of the available GPUs
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
```
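To make these variables survive new shells, one option is appending them to ~/.bashrc (a sketch; adapt to your shell and deployment):

```bash
cat >> ~/.bashrc <<'EOF'
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
export PYTORCH_NVML_BASED_CUDA_CHECK=1
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
EOF
source ~/.bashrc
```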
cuda kernel modules
Check with:
```bash
lsmod | grep nvidia
```
| Module         | Description                      |
|----------------|----------------------------------|
| nvidia_uvm     | NVIDIA's Unified Memory driver   |
| nvidia_drm     | Direct Rendering Manager support |
| nvidia_modeset | Kernel mode-setting support      |
| nvidia         | Main NVIDIA driver module        |
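A quick way to confirm all four modules are loaded (a small sketch using only lsmod; the module names are the ones listed above):

```bash
for m in nvidia nvidia_uvm nvidia_drm nvidia_modeset; do
  lsmod | grep -q "^${m} " && echo "${m}: loaded" || echo "${m}: MISSING"
done
```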
command nvidia-smi
Enable Persistence Mode
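Persistence mode can be enabled directly through nvidia-smi (the `-pm` flag is standard; running the persistence daemon, as above, achieves the same effect more robustly):

```bash
# enable persistence mode on all GPUs
sudo nvidia-smi -pm 1
```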
check Confidential Compute ready state
```bash
nvidia-smi conf-compute -grs
# Confidential Compute GPUs Ready state: not-ready
# Confidential Compute GPUs Ready state: ready

# if the state above is not-ready, run:
nvidia-smi conf-compute -srs 1
```
cuda 12.8
```
$ nvidia-smi
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.124.06             Driver Version: 570.124.06     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA H200                    On  |   00000000:19:00.0 Off |                    0 |
| N/A   23C    P0             76W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA H200                    On  |   00000000:3B:00.0 Off |                    0 |
| N/A   21C    P0             75W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   2  NVIDIA H200                    On  |   00000000:4C:00.0 Off |                    0 |
| N/A   23C    P0             76W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   3  NVIDIA H200                    On  |   00000000:5D:00.0 Off |                    0 |
| N/A   24C    P0             77W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   4  NVIDIA H200                    On  |   00000000:9B:00.0 Off |                    0 |
| N/A   24C    P0             75W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   5  NVIDIA H200                    On  |   00000000:BB:00.0 Off |                    0 |
| N/A   23C    P0             77W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   6  NVIDIA H200                    On  |   00000000:CB:00.0 Off |                    0 |
| N/A   24C    P0             76W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   7  NVIDIA H200                    On  |   00000000:DB:00.0 Off |                    0 |
| N/A   24C    P0             76W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+
```
cuda 12.7
```
$ nvidia-smi
Fri Mar 14 10:23:56 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.57.01              Driver Version: 565.57.01      CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA H200                    On  |   00000000:19:00.0 Off |                    0 |
| N/A   26C    P0            111W /  700W |  134402MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA H200                    On  |   00000000:3B:00.0 Off |                    0 |
| N/A   25C    P0            116W /  700W |  132320MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   2  NVIDIA H200                    On  |   00000000:4C:00.0 Off |                    0 |
| N/A   25C    P0            112W /  700W |  132320MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   3  NVIDIA H200                    On  |   00000000:5D:00.0 Off |                    0 |
| N/A   27C    P0            115W /  700W |  132320MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   4  NVIDIA H200                    On  |   00000000:9B:00.0 Off |                    0 |
| N/A   27C    P0            115W /  700W |  132320MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   5  NVIDIA H200                    On  |   00000000:BB:00.0 Off |                    0 |
| N/A   26C    P0            114W /  700W |  132320MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   6  NVIDIA H200                    On  |   00000000:CB:00.0 Off |                    0 |
| N/A   26C    P0            113W /  700W |  132320MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   7  NVIDIA H200                    On  |   00000000:DB:00.0 Off |                    0 |
| N/A   24C    P0            114W /  700W |  131840MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
+-----------------------------------------------------------------------------------------+
```
vllm install offline
On your local machine, create a virtual environment:
```bash
python3 -m venv vllm_env
source vllm_env/bin/activate
```
1️⃣ On your local machine:
```bash
pip download --dest=./vllm_deps vllm
```
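Note that `pip download` fetches wheels for the current machine. If the local machine differs from the remote server in OS, architecture, or Python version, the target can be pinned explicitly (standard pip flags; the values below are examples, so adjust them to the remote host):

```bash
pip download --dest=./vllm_deps \
  --only-binary=:all: \
  --platform manylinux2014_x86_64 \
  --python-version 3.10 \
  vllm
```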
2️⃣ Transfer dependencies to the remote server:
```bash
scp -r vllm_deps user@remote_server:/path/to/destination/
```
3️⃣ On the remote server:
```bash
cd /path/to/destination/vllm_deps
pip install --no-index --find-links=. vllm*
```
• `--no-index` tells pip not to use the internet.
• `--find-links=.` tells pip to look for packages in the current directory.
• `vllm*` ensures pip finds the correct package in that folder.
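A quick sanity check after the offline install (uses only standard pip and Python):

```bash
pip show vllm | head -n 2
python -c "import vllm; print(vllm.__version__)"
```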
runtime
```bash
vllm serve /mnt/dingofs-test/DeepSeek-R1 \
  --host 0.0.0.0 --port 8000 \
  --served-model-name deepseek-r1 \
  --tensor-parallel-size 8 \
  --gpu-memory-utilization 0.85 \
  --max-model-len 128000 \
  --max-num-batched-tokens 32000 \
  --max-num-seqs 1024 \
  --trust-remote-code \
  --enable-reasoning --reasoning-parser deepseek_r1
```
```
Sat Mar 15 20:33:04 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.124.06             Driver Version: 570.124.06     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA H200                    On  |   00000000:19:00.0 Off |                    0 |
| N/A   26C    P0            115W /  700W |   84474MiB / 143771MiB |      1%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA H200                    On  |   00000000:3B:00.0 Off |                    0 |
| N/A   24C    P0            113W /  700W |   84522MiB / 143771MiB |      1%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   2  NVIDIA H200                    On  |   00000000:4C:00.0 Off |                    0 |
| N/A   26C    P0            114W /  700W |   84522MiB / 143771MiB |      1%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   3  NVIDIA H200                    On  |   00000000:5D:00.0 Off |                    0 |
| N/A   27C    P0            117W /  700W |   84522MiB / 143771MiB |      1%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   4  NVIDIA H200                    On  |   00000000:9B:00.0 Off |                    0 |
| N/A   27C    P0            114W /  700W |   84522MiB / 143771MiB |      1%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   5  NVIDIA H200                    On  |   00000000:BB:00.0 Off |                    0 |
| N/A   26C    P0            116W /  700W |   84522MiB / 143771MiB |      1%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   6  NVIDIA H200                    On  |   00000000:CB:00.0 Off |                    0 |
| N/A   27C    P0            115W /  700W |   84522MiB / 143771MiB |      1%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   7  NVIDIA H200                    On  |   00000000:DB:00.0 Off |                    0 |
| N/A   26C    P0            114W /  700W |   84282MiB / 143771MiB |      1%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A    915920      C   ...niconda3/envs/vllm/bin/python       84464MiB      |
|    1   N/A  N/A    916338      C   ...niconda3/envs/vllm/bin/python       84512MiB      |
|    2   N/A  N/A    916339      C   ...niconda3/envs/vllm/bin/python       84512MiB      |
|    3   N/A  N/A    916340      C   ...niconda3/envs/vllm/bin/python       84512MiB      |
|    4   N/A  N/A    916341      C   ...niconda3/envs/vllm/bin/python       84512MiB      |
|    5   N/A  N/A    916342      C   ...niconda3/envs/vllm/bin/python       84512MiB      |
|    6   N/A  N/A    916343      C   ...niconda3/envs/vllm/bin/python       84512MiB      |
|    7   N/A  N/A    916344      C   ...niconda3/envs/vllm/bin/python       84272MiB      |
+-----------------------------------------------------------------------------------------+
```
log
```
INFO 03-15 20:36:48 worker.py:267] Memory profiling takes 7.63 seconds
INFO 03-15 20:36:48 worker.py:267] the current vLLM instance can use total_gpu_memory (139.81GiB) x gpu_memory_utilization (0.85) = 118.84GiB
INFO 03-15 20:36:48 worker.py:267] model weights take 83.88GiB; non_torch_memory takes 7.16GiB; PyTorch activation peak memory takes 6.37GiB; the rest of the memory reserved for KV Cache is 21.43GiB.
INFO 03-15 20:36:48 executor_base.py:111] # cuda blocks: 18418, # CPU blocks: 3437
INFO 03-15 20:36:48 executor_base.py:116] Maximum concurrency for 128000 tokens per request: 2.30x
```
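The accounting adds up: 139.81 GiB x 0.85 = 118.84 GiB of budget, and 118.84 - 83.88 - 7.16 - 6.37 = 21.43 GiB left for the KV cache. While the server is running, cache usage can also be watched through vLLM's Prometheus endpoint (assuming the default /metrics route on the serve port):

```bash
curl -s http://localhost:8000/metrics | grep -i cache
```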
chat
```bash
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "deepseek-r1",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "introduce yourself"}
    ]
  }'
```
sglang install offline
```bash
# prepare env
python3 -m venv sglang_env
source sglang_env/bin/activate
# optional: use uv
pip install --upgrade pip
# pip install uv

# download deps
mkdir -p ./sglang_deps
pip download "sglang[all]>=0.4.4.post1" \
  --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python \
  -d ./sglang_deps

# scp deps to remote
scp -r sglang_deps user@remote_server:/path/to/destination/

# install sglang on remote
cd /path/to/remote/sglang_deps
pip install --no-index --find-links=. "sglang[all]>=0.4.4.post1"
```
runtime
```bash
python3 -m sglang.launch_server --model /mnt/3fs/DeepSeek-R1 --tp 8 --trust-remote-code --port 30000
```
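Once the server is up, listing models through the OpenAI-compatible API is a quick liveness check (assuming sglang's default OpenAI-compatible routes on port 30000):

```bash
curl -s http://localhost:30000/v1/models
```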
chat
```bash
curl http://localhost:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "deepseek-r1",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "introduce yourself"}
    ]
  }'
```
torch install offline
```bash
mkdir ~/torch_deps
pip download --dest=~/torch_deps torch==2.5.1 --extra-index-url https://download.pytorch.org/whl/cu124
scp -r ~/torch_deps user@remote_server:/path/to/remote/directory
cd /path/to/remote/directory
pip install --no-index --find-links=./ torch
```
check
```bash
python -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.device_count())"
```
or
```python
import torch

print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.__version__)
print(torch.version.cuda)
```
best practices: CUDA_LAUNCH_BLOCKING=1
CUDA_LAUNCH_BLOCKING=1 tells CUDA: "Wait (block) for each GPU kernel to finish before moving to the next line of Python code."
Normally, CUDA operations are asynchronous: an error may not surface at the line that caused it, because the failing kernel may only complete (and fail) later. This makes debugging frustrating.
Never use it in production or for performance benchmarking; it is for debugging only.
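Typical usage is to set the variable for a single run rather than exporting it globally (train.py is a placeholder for whatever script is being debugged):

```bash
# placeholder script name; applies to any CUDA-backed Python program
CUDA_LAUNCH_BLOCKING=1 python train.py
```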