Commit 882b9d0

Add DGX benchmarks (#77)

* Add DGX benchmarks
* update tensor core setting
* update

1 parent 8896759 commit 882b9d0

File tree

- ch02/01_main-chapter-code/ch02_main.ipynb
- ch03/01_main-chapter-code/ch03_main.ipynb
- ch03/02_math500-verifier-scripts/README.md
- reasoning_from_scratch/ch02.py

4 files changed: +101 -50 lines changed

ch02/01_main-chapter-code/ch02_main.ipynb

Lines changed: 39 additions & 19 deletions
@@ -517,19 +517,33 @@
     }
    ],
    "source": [
-    "def get_device():\n",
+    "from packaging import version\n",
+    "\n",
+    "def get_device(enable_tensor_cores=True):\n",
     "    if torch.cuda.is_available():\n",
     "        device = torch.device(\"cuda\")\n",
     "        print(\"Using NVIDIA CUDA GPU\")\n",
+    "\n",
+    "        if enable_tensor_cores:\n",
+    "            if version.parse(torch.__version__) >= version.parse(\"2.9.0\"):\n",
+    "                torch.backends.cuda.matmul.fp32_precision = \"tf32\"\n",
+    "                torch.backends.cudnn.conv.fp32_precision = \"tf32\"\n",
+    "            else:\n",
+    "                torch.backends.cuda.matmul.allow_tf32 = True\n",
+    "                torch.backends.cudnn.allow_tf32 = True\n",
+    "\n",
     "    elif torch.backends.mps.is_available():\n",
     "        device = torch.device(\"mps\")\n",
     "        print(\"Using Apple Silicon GPU (MPS)\")\n",
+    "\n",
     "    elif torch.xpu.is_available():\n",
     "        device = torch.device(\"xpu\")\n",
-    "        print(\"Intel GPU\")\n",
+    "        print(\"Using Intel GPU\")\n",
+    "\n",
     "    else:\n",
     "        device = torch.device(\"cpu\")\n",
     "        print(\"Using CPU\")\n",
+    "\n",
     "    return device\n",
     "\n",
     "device = get_device()"
@@ -1490,29 +1504,35 @@
    "id": "e403cdac-c633-49a6-a713-7735efc46a60",
    "metadata": {},
    "source": [
-    "| Model | Mode | Hardware | Tokens/sec | GPU Memory (VRAM) |\n",
-    "|------------|-------------------|-----------------|---------------|-------------------|\n",
-    "| Qwen3Model | Regular | Mac Mini M4 CPU | 6 | - |\n",
-    "| Qwen3Model | Regular compiled | Mac Mini M4 CPU | 6 | - |\n",
-    "| Qwen3Model | KV cache | Mac Mini M4 CPU | 28 | - |\n",
-    "| Qwen3Model | KV cache compiled | Mac Mini M4 CPU | 68 | - |\n",
-    "| | | | | |\n",
-    "| Qwen3Model | Regular | Mac Mini M4 GPU | 17 | - |\n",
-    "| Qwen3Model | Regular compiled | Mac Mini M4 GPU | InductorError | - |\n",
-    "| Qwen3Model | KV cache | Mac Mini M4 GPU | 18 | - |\n",
-    "| Qwen3Model | KV cache compiled | Mac Mini M4 GPU | InductorError | - |\n",
-    "| | | | | |\n",
-    "| Qwen3Model | Regular | NVIDIA H100 GPU | 51 | 1.55 GB |\n",
-    "| Qwen3Model | Regular compiled | NVIDIA H100 GPU | 164 | 1.81 GB |\n",
-    "| Qwen3Model | KV cache | NVIDIA H100 GPU | 48 | 1.52 GB |\n",
-    "| Qwen3Model | KV cache compiled | NVIDIA H100 GPU | 141 | 1.81 GB |"
+    "| Model | Mode | Hardware | Tokens/sec | GPU Memory (VRAM) |\n",
+    "|------------|-------------------|----------------------|---------------|-------------------|\n",
+    "| Qwen3Model | Regular | Mac Mini M4 CPU | 6 | - |\n",
+    "| Qwen3Model | Regular compiled | Mac Mini M4 CPU | 6 | - |\n",
+    "| Qwen3Model | KV cache | Mac Mini M4 CPU | 28 | - |\n",
+    "| Qwen3Model | KV cache compiled | Mac Mini M4 CPU | 68 | - |\n",
+    "| | | | | |\n",
+    "| Qwen3Model | Regular | Mac Mini M4 GPU | 17 | - |\n",
+    "| Qwen3Model | Regular compiled | Mac Mini M4 GPU | InductorError | - |\n",
+    "| Qwen3Model | KV cache | Mac Mini M4 GPU | 18 | - |\n",
+    "| Qwen3Model | KV cache compiled | Mac Mini M4 GPU | InductorError | - |\n",
+    "| | | | | |\n",
+    "| Qwen3Model | Regular | NVIDIA H100 GPU | 51 | 1.55 GB |\n",
+    "| Qwen3Model | Regular compiled | NVIDIA H100 GPU | 164 | 1.81 GB |\n",
+    "| Qwen3Model | KV cache | NVIDIA H100 GPU | 48 | 1.52 GB |\n",
+    "| Qwen3Model | KV cache compiled | NVIDIA H100 GPU | 141 | 1.81 GB |\n",
+    "| | | | | |\n",
+    "| Qwen3Model | Regular | NVIDIA DGX Spark GPU | 72 | 1.53 GB |\n",
+    "| Qwen3Model | Regular compiled | NVIDIA DGX Spark GPU | 118 | 1.49 GB |\n",
+    "| Qwen3Model | KV cache | NVIDIA DGX Spark GPU | 69 | 1.47 GB |\n",
+    "| Qwen3Model | KV cache compiled | NVIDIA DGX Spark GPU | 107 | 1.47 GB |"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "4d05c6a6-3170-4251-9bdf-07489d1104c6",
    "metadata": {},
    "source": [
+    "- The NVIDIA DGX Spark above uses a GB10 (Blackwell) GPU\n",
     "- Note that we ran all the examples with a single prompt (i.e., a batch size of 1); if you are curious about batched inference, see appendix E"
    ]
   },
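
The tokens/sec figures in the table above are throughput numbers: generated tokens divided by wall-clock decoding time. A generic, hypothetical sketch (not the notebook's benchmark code) of how such a figure can be computed:

```python
# Generic throughput sketch (hypothetical; not the notebook's benchmark code):
# tokens/sec = number of generated tokens / wall-clock generation time.
import time


def tokens_per_sec(generate_fn, num_tokens):
    start = time.perf_counter()
    generate_fn(num_tokens)  # stand-in for the model's generate call
    elapsed = time.perf_counter() - start
    return num_tokens / elapsed


# Toy generator that sleeps to simulate per-token decoding latency
print(round(tokens_per_sec(lambda n: time.sleep(0.001 * n), 200), 1))
```
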
@@ -1550,7 +1570,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.12.3"
   }
  },
  "nbformat": 4,

ch03/01_main-chapter-code/ch03_main.ipynb

Lines changed: 20 additions & 22 deletions
@@ -156,10 +156,6 @@
     "\n",
     "device = get_device()\n",
     "\n",
-    "# Lower precision from \"highest\" (default)\n",
-    "# which enables Tensor Cores if applicable\n",
-    "torch.set_float32_matmul_precision(\"high\")\n",
-    "\n",
     "# If you have compatibility issues, try to\n",
     "# uncomment the line below and rerun the notebook\n",
     "# device = \"cpu\"\n",
@@ -1841,25 +1837,25 @@
   },
   {
    "cell_type": "markdown",
-   "id": "8543773b-2d8d-4917-b0cb-25ec0da7e2b9",
+   "id": "408afa29-fadf-4af1-adbf-74e4f87c3aba",
    "metadata": {},
    "source": [
-    "| Mode | Device | Accuracy | MATH-500 size |\n",
-    "|-----------|--------|----------|---------------|\n",
-    "| Base | CPU | 30% | 10 |\n",
-    "| Base | CUDA | 30% | 10 |\n",
-    "| Base | MPS | 20% | 10 |\n",
-    "| Base | XPU | 30% | 10 |\n",
-    "| Reasoning | CPU | 90% | 10 |\n",
-    "| Reasoning | CUDA | 90% | 10 |\n",
-    "| Reasoning | MPS | 80% | 10 |\n",
-    "| Reasoning | XPU | 70% | 10 |\n",
+    "| Mode | Device | Accuracy | MATH-500 size | Time |\n",
+    "|-----------|--------|----------|---------------|-----------------------|\n",
+    "| Base | CPU | 30% | 10 | 0.7 min (Mac Mini M4) |\n",
+    "| Base | MPS | 20% | 10 | 0.4 min (Mac Mini M4) |\n",
+    "| Base | CUDA | 30% | 10 | 0.2 min (DGX Spark) |\n",
+    "| Base | XPU | 30% | 10 | 1.2 min (Intel) |\n",
+    "| Reasoning | CPU | 90% | 10 | 9.5 min (Mac Mini M4) |\n",
+    "| Reasoning | MPS | 80% | 10 | 3.8 min (Mac Mini M4) |\n",
+    "| Reasoning | CUDA | 90% | 10 | 3.7 min (DGX Spark) |\n",
+    "| Reasoning | XPU | 70% | 10 | 8.5 min (Intel) |\n",
     "\n",
     "\n",
-    "| Mode | Device | Accuracy | MATH-500 size |\n",
-    "|-----------|--------|----------|---------------|\n",
-    "| Base | CUDA | 15.3% | 500 |\n",
-    "| Reasoning | CUDA | 50.8% | 500 |\n"
+    "| Mode | Device | Accuracy | MATH-500 size | Time |\n",
+    "|-----------|--------|----------|------------------|------------------------|\n",
+    "| Base | CUDA | 15.6% | 500 | 10.0 min (DGX Spark) |\n",
+    "| Reasoning | CUDA | 50.8% | 500 | 182.2 min (DGX Spark) |"
    ]
   },
   {
@@ -1869,8 +1865,10 @@
    "source": [
     "- For reference, above are the different accuracy values\n",
     "- Note that \"GPU\" here refers to an NVIDIA (\"cuda\") GPU; MPS refers to an Apple Silicon M4 chip\n",
-    "- It takes about 0.7 min to evaluate the base model (on a M4 Mac Mini) and about 7 min to evaluate the reasoning model, since it produces much longer responses\n",
-    "- While Qwen3-Base is a pre-trained base model and the Qwen3 recommends using it without chat template, changing `tokenizer = Qwen3Tokenizer(tokenizer_file_path=tokenizer_path)` to `tokenizer = Qwen3Tokenizer(tokenizer_file_path=tokenizer_path, apply_chat_template=True)` boosts the MATH-500 performance substantially (80%); note that it is not clear whether the MATH-500 test set was part of the training data; in the age of LLMs, we can assume that any data available on the internet has been part of the training data (also see the discussion [here](https://github.com/rasbt/LLMs-from-scratch/pull/828#issuecomment-3324829736))"
+    "- The reasoning model is much slower because it produces much longer responses\n",
+    "- While Qwen3-Base is a pre-trained base model and Qwen3 recommends using it without a chat template, changing `tokenizer = Qwen3Tokenizer(tokenizer_file_path=tokenizer_path)` to `tokenizer = Qwen3Tokenizer(tokenizer_file_path=tokenizer_path, apply_chat_template=True)` boosts the MATH-500 performance substantially (80%); note that it is not clear whether the MATH-500 test set was part of the training data; in the age of LLMs, we can assume that any data available on the internet has been part of the training data (also see the discussion [here](https://github.com/rasbt/LLMs-from-scratch/pull/828#issuecomment-3324829736))\n",
+    "- The run on the 500 MATH-500 examples corresponds to changing `math_data=math_data[:10],` to `math_data=math_data,` in the `evaluate_math500_stream` function call here\n",
+    "- The bonus materials contain a script to run the evaluation in batched mode for higher throughput (see [../02_math500-verifier-scripts/README.md](../02_math500-verifier-scripts/README.md)); on an H100, with a batch size of 128, the base model can be evaluated in 3.3 min, and the reasoning model can be evaluated in 14.6 min"
    ]
   },
   {
@@ -1928,7 +1926,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.12.3"
   }
  },
  "nbformat": 4,

ch03/02_math500-verifier-scripts/README.md

Lines changed: 25 additions & 7 deletions
@@ -80,6 +80,7 @@ Extra options:
 
 
 
+
 **Implementation note:**
 By default, batched generation halts for sequences that emit a stop token. With `--disable_efficient_mode`, all sequences continue until the longest finishes. This affects compute efficiency only, not qualitative results, since tokens after the stop token are discarded.
 
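
To make the implementation note above concrete, here is a small, self-contained toy sketch (not the repository's batched-generation code; `generate_step` and the stop-token handling are stand-ins) contrasting the two modes:

```python
# Toy illustration of the efficiency note above: in efficient mode a sequence
# stops contributing decode steps once it emits the stop token; with
# --disable_efficient_mode, every sequence is stepped until the longest one
# finishes, and tokens after the stop token are discarded either way.
STOP = 0


def generate_step(seq_id, step):
    # Stand-in for one decoding step: sequence `seq_id` emits its stop token
    # at step seq_id + 2 and ordinary tokens (1) before that.
    return STOP if step >= seq_id + 2 else 1


def batched_generate(batch_size=4, max_steps=8, efficient=True):
    outputs = [[] for _ in range(batch_size)]
    finished = [False] * batch_size
    decode_steps = 0
    for step in range(max_steps):
        for i in range(batch_size):
            if efficient and finished[i]:
                continue  # efficient mode: halt sequences that already stopped
            token = generate_step(i, step)
            decode_steps += 1
            outputs[i].append(token)
            if token == STOP:
                finished[i] = True
        if all(finished):
            break  # the longest sequence has finished
    # Tokens emitted after a stop token are discarded in both modes
    outputs = [seq[: seq.index(STOP) + 1] if STOP in seq else seq for seq in outputs]
    return outputs, decode_steps


out_eff, steps_eff = batched_generate(efficient=True)
out_all, steps_all = batched_generate(efficient=False)
print(out_eff == out_all)         # True: identical results
print(steps_eff, "<", steps_all)  # efficient mode does less decoding work
```
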
@@ -98,11 +99,28 @@ Some PyTorch ops used in efficient batched inference are not yet supported on MPS
 
 
 
-| Device / Dataset size | Base model | Reasoning model | Notes |
-| ------------------------------------------------------------ | ---------- | --------------- | ------------------------------------------------------------ |
-| **M4 Mac Mini** (10 examples) | ~0.7 min | ~7 min | `evaluate_math500.py`<br>Sequential mode |
-| **H100 GPU** (500 examples, single-script) | ~13.3 min | ~185.4 min | `evaluate_math500.py`<br>Sequential mode |
-| **H100 GPU** (500 examples, batched, `--batch_size 128`) | ~3.3 min | ~14.6 min | `evaluate_math500_batch.py`<br>Efficient batched mode |
-| **H100 GPU** (500 examples, batched, `--batch_size 128 --disable_efficient_mode`) | ~21.3 min | ~21.3 min | `evaluate_math500_batch.py`<br>Simpler but less efficient batched mode |
+- `evaluate_math500.py --dataset_size 500`
+
+
+| Device / Dataset size | Base model | Reasoning model |
+| ------------------------------------------- | ---------- | --------------- |
+| **Mac Mini M4 CPU** (500 examples, sequential) | 43.6 min | Didn't run (too hot) |
+| **Mac Mini M4 GPU** (500 examples, sequential) | 37.5 min | Didn't run (too hot) |
+| **DGX Spark** (500 examples, sequential) | 10.0 min | 182.2 min |
+| **H100 GPU** (500 examples, sequential) | 13.3 min | 185.4 min |
+
+<br>
+<br>
+
+- `evaluate_math500_batched.py --dataset_size 500 --batch_size 128`
+
+| Device / Dataset size | Base model | Reasoning model |
+| ------------------------------------------------------------ | ---------- | --------------- |
+| **Mac Mini M4 CPU** (500 examples, batched, `--batch_size 128`) | 167.2 min | Didn't run (too hot) |
+| **Mac Mini M4 GPU** (500 examples, batched, `--batch_size 128`) | Error* | Error |
+| **DGX Spark** (500 examples, batched, `--batch_size 128`) | 16.3 min | 119.3 min |
+| **H100 GPU** (500 examples, batched, `--batch_size 128`) | 3.3 min | 14.6 min |
+
+
 
-The accuracy of the base model is 15.6% (78/500); the accuracy of the reasoning model is 50.8% (254/500).
+- The accuracy of the base model is 15.6% (78/500); the accuracy of the reasoning model is 50.8% (254/500).

reasoning_from_scratch/ch02.py

Lines changed: 17 additions & 2 deletions
@@ -3,22 +3,37 @@
 # Code repository: https://github.com/rasbt/reasoning-from-scratch
 
 from .qwen3 import KVCache
+from packaging import version
 import torch
 
 
-def get_device():
+
+
+def get_device(enable_tensor_cores=True):
     if torch.cuda.is_available():
         device = torch.device("cuda")
         print("Using NVIDIA CUDA GPU")
+
+        if enable_tensor_cores:
+            if version.parse(torch.__version__) >= version.parse("2.9.0"):
+                torch.backends.cuda.matmul.fp32_precision = "tf32"
+                torch.backends.cudnn.conv.fp32_precision = "tf32"
+            else:
+                torch.backends.cuda.matmul.allow_tf32 = True
+                torch.backends.cudnn.allow_tf32 = True
+
     elif torch.backends.mps.is_available():
         device = torch.device("mps")
         print("Using Apple Silicon GPU (MPS)")
+
     elif torch.xpu.is_available():
         device = torch.device("xpu")
-        print("Intel GPU")
+        print("Using Intel GPU")
+
     else:
         device = torch.device("cpu")
         print("Using CPU")
+
     return device
 
 
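A minimal usage sketch for the updated function (assuming the `reasoning_from_scratch` package from this repository is installed; the import path mirrors the file shown above):

```python
# Minimal usage sketch (assumes the reasoning_from_scratch package is installed).
# enable_tensor_cores only has an effect when a CUDA device is detected; on MPS,
# XPU, and CPU the function simply returns the chosen device.
from reasoning_from_scratch.ch02 import get_device

device = get_device(enable_tensor_cores=True)
print(device)  # e.g. device(type='cuda'), device(type='mps'), or device(type='cpu')
```
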