|
20 | 20 |
|
21 | 21 | __all__ = ['SpeedMonitor']
|
22 | 22 |
|
| 23 | +_HSERIES_SXM = { |
| 24 | + 'fp64': 67e12, |
| 25 | + 'fp32': 67e12, |
| 26 | + 'tf32': 989e12 / 2, |
| 27 | + 'fp16': 1.979e15 / 2, |
| 28 | + 'amp_fp16': 1.979e15 / 2, |
| 29 | + 'bf16': 1.979e15 / 2, |
| 30 | + 'amp_bf16': 1.979e15 / 2, |
| 31 | + 'fp8': 3.958e15 / 2, |
| 32 | + 'amp_fp8': 3.958e15 / 2, |
| 33 | + 'int8': 3.958e15 / 2, |
| 34 | +} |
| 35 | + |
23 | 36 | GPU_AVAILABLE_FLOPS = {
|
| 37 | + # source: https://resources.nvidia.com/en-us-data-center-overview-mc/en-us-data-center-overview/hpc-datasheet-sc23-h200 |
| 38 | + 'h200-sxm': _HSERIES_SXM, |
24 | 39 | # source: https://resources.nvidia.com/en-us-tensor-core/nvidia-tensor-core-gpu-datasheet
|
25 | 40 | # nvidia publishes spec sheet with a 2x sparsity factor
|
26 |
| - 'h100-sxm': { |
27 |
| - 'fp64': 67e12, |
28 |
| - 'fp32': 67e12, |
29 |
| - 'tf32': 989e12 / 2, |
30 |
| - 'fp16': 1.979e15 / 2, |
31 |
| - 'amp_fp16': 1.979e15 / 2, |
32 |
| - 'bf16': 1.979e15 / 2, |
33 |
| - 'amp_bf16': 1.979e15 / 2, |
34 |
| - 'fp8': 3.958e15 / 2, |
35 |
| - 'amp_fp8': 3.958e15 / 2, |
36 |
| - 'int8': 3.958e15 / 2, |
37 |
| - }, |
| 41 | + 'h100-sxm': _HSERIES_SXM, |
38 | 42 | 'h100-pcie': {
|
39 | 43 | 'fp64': 51e12,
|
40 | 44 | 'fp32': 51e12,
|
@@ -107,7 +111,11 @@ def get_gpu_flops_available(state: State):
|
107 | 111 | if torch.cuda.is_available():
|
108 | 112 | # torch.cuda.get_device_name() ex output: 'NVIDIA A100-SXM4-40GB'
|
109 | 113 | device_name = torch.cuda.get_device_name().lower()
|
110 |
| - if 'h100' in device_name and 'hbm3' in device_name: |
| 114 | + if 'h200' in device_name: |
| 115 | + # Assume SXM: the device name does not distinguish between H200 variants, so telling them apart |
| 116 | + # would require checking another property, such as power or bandwidth. |
| 117 | + device_name = 'h200-sxm' |
| 118 | + elif 'h100' in device_name and 'hbm3' in device_name: |
111 | 119 | device_name = 'h100-sxm'
|
112 | 120 | elif 'h100' in device_name and ('pcie' in device_name or 'hbm2e' in device_name):
|
113 | 121 | device_name = 'h100-pcie'
|
|
0 commit comments