Commit 882b9d0

Add DGX benchmarks (#77)

* Add DGX benchmarks
* update tensor core setting
* update

1 parent 8896759 commit 882b9d0

File tree

- ch02/01_main-chapter-code/ch02_main.ipynb
- ch03/01_main-chapter-code/ch03_main.ipynb
- ch03/02_math500-verifier-scripts/README.md
- reasoning_from_scratch/ch02.py

4 files changed: +101 -50 lines changed

ch02/01_main-chapter-code/ch02_main.ipynb

Lines changed: 39 additions & 19 deletions
@@ -517,19 +517,33 @@
     }
    ],
    "source": [
-    "def get_device():\n",
+    "from packaging import version\n",
+    "\n",
+    "def get_device(enable_tensor_cores=True):\n",
     "    if torch.cuda.is_available():\n",
     "        device = torch.device(\"cuda\")\n",
     "        print(\"Using NVIDIA CUDA GPU\")\n",
+    "\n",
+    "        if enable_tensor_cores:\n",
+    "            if version.parse(torch.__version__) >= version.parse(\"2.9.0\"):\n",
+    "                torch.backends.cuda.matmul.fp32_precision = \"tf32\"\n",
+    "                torch.backends.cudnn.conv.fp32_precision = \"tf32\"\n",
+    "            else:\n",
+    "                torch.backends.cuda.matmul.allow_tf32 = True\n",
+    "                torch.backends.cudnn.allow_tf32 = True\n",
+    "\n",
     "    elif torch.backends.mps.is_available():\n",
     "        device = torch.device(\"mps\")\n",
     "        print(\"Using Apple Silicon GPU (MPS)\")\n",
+    "\n",
     "    elif torch.xpu.is_available():\n",
     "        device = torch.device(\"xpu\")\n",
-    "        print(\"Intel GPU\")\n",
+    "        print(\"Using Intel GPU\")\n",
+    "\n",
     "    else:\n",
     "        device = torch.device(\"cpu\")\n",
     "        print(\"Using CPU\")\n",
+    "\n",
     "    return device\n",
     "\n",
     "device = get_device()"
@@ -1490,29 +1504,35 @@
    "id": "e403cdac-c633-49a6-a713-7735efc46a60",
    "metadata": {},
    "source": [
-    "| Model | Mode | Hardware | Tokens/sec | GPU Memory (VRAM) |\n",
-    "|------------|-------------------|-----------------|---------------|-------------------|\n",
-    "| Qwen3Model | Regular | Mac Mini M4 CPU | 6 | - |\n",
-    "| Qwen3Model | Regular compiled | Mac Mini M4 CPU | 6 | - |\n",
-    "| Qwen3Model | KV cache | Mac Mini M4 CPU | 28 | - |\n",
-    "| Qwen3Model | KV cache compiled | Mac Mini M4 CPU | 68 | - |\n",
-    "| | | | | |\n",
-    "| Qwen3Model | Regular | Mac Mini M4 GPU | 17 | - |\n",
-    "| Qwen3Model | Regular compiled | Mac Mini M4 GPU | InductorError | - |\n",
-    "| Qwen3Model | KV cache | Mac Mini M4 GPU | 18 | - |\n",
-    "| Qwen3Model | KV cache compiled | Mac Mini M4 GPU | InductorError | - |\n",
-    "| | | | | |\n",
-    "| Qwen3Model | Regular | NVIDIA H100 GPU | 51 | 1.55 GB |\n",
-    "| Qwen3Model | Regular compiled | NVIDIA H100 GPU | 164 | 1.81 GB |\n",
-    "| Qwen3Model | KV cache | NVIDIA H100 GPU | 48 | 1.52 GB |\n",
-    "| Qwen3Model | KV cache compiled | NVIDIA H100 GPU | 141 | 1.81 GB |"
+    "| Model | Mode | Hardware | Tokens/sec | GPU Memory (VRAM) |\n",
+    "|------------|-------------------|----------------------|---------------|-------------------|\n",
+    "| Qwen3Model | Regular | Mac Mini M4 CPU | 6 | - |\n",
+    "| Qwen3Model | Regular compiled | Mac Mini M4 CPU | 6 | - |\n",
+    "| Qwen3Model | KV cache | Mac Mini M4 CPU | 28 | - |\n",
+    "| Qwen3Model | KV cache compiled | Mac Mini M4 CPU | 68 | - |\n",
+    "| | | | | |\n",
+    "| Qwen3Model | Regular | Mac Mini M4 GPU | 17 | - |\n",
+    "| Qwen3Model | Regular compiled | Mac Mini M4 GPU | InductorError | - |\n",
+    "| Qwen3Model | KV cache | Mac Mini M4 GPU | 18 | - |\n",
+    "| Qwen3Model | KV cache compiled | Mac Mini M4 GPU | InductorError | - |\n",
+    "| | | | | |\n",
+    "| Qwen3Model | Regular | NVIDIA H100 GPU | 51 | 1.55 GB |\n",
+    "| Qwen3Model | Regular compiled | NVIDIA H100 GPU | 164 | 1.81 GB |\n",
+    "| Qwen3Model | KV cache | NVIDIA H100 GPU | 48 | 1.52 GB |\n",
+    "| Qwen3Model | KV cache compiled | NVIDIA H100 GPU | 141 | 1.81 GB |\n",
+    "| | | | | |\n",
+    "| Qwen3Model | Regular | NVIDIA DGX Spark GPU | 72 | 1.53 GB |\n",
+    "| Qwen3Model | Regular compiled | NVIDIA DGX Spark GPU | 118 | 1.49 GB |\n",
+    "| Qwen3Model | KV cache | NVIDIA DGX Spark GPU | 69 | 1.47 GB |\n",
+    "| Qwen3Model | KV cache compiled | NVIDIA DGX Spark GPU | 107 | 1.47 GB |"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "4d05c6a6-3170-4251-9bdf-07489d1104c6",
    "metadata": {},
    "source": [
+    "- The NVIDIA DGX Spark above uses a GB10 (Blackwell) GPU\n",
     "- Note that we ran all the examples with a single prompt (i.e., a batch size of 1); if you are curious about batched inference, see appendix E"
    ]
   },
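
The tokens/sec figures in the table above are throughput numbers: generated tokens divided by wall-clock decoding time. A generic, hypothetical sketch (not the notebook's benchmark code) of how such a figure can be computed:

```python
# Generic throughput sketch (hypothetical; not the notebook's benchmark code):
# tokens/sec = number of generated tokens / wall-clock generation time.
import time


def tokens_per_sec(generate_fn, num_tokens):
    start = time.perf_counter()
    generate_fn(num_tokens)  # stand-in for the model's generate call
    elapsed = time.perf_counter() - start
    return num_tokens / elapsed


# Toy generator that sleeps to simulate per-token decoding latency
print(round(tokens_per_sec(lambda n: time.sleep(0.001 * n), 200), 1))
```
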
@@ -1550,7 +1570,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.12.3"
   }
  },
  "nbformat": 4,

ch03/01_main-chapter-code/ch03_main.ipynb

Lines changed: 20 additions & 22 deletions
@@ -156,10 +156,6 @@
     "\n",
     "device = get_device()\n",
     "\n",
-    "# Lower precision from \"highest\" (default)\n",
-    "# which enables Tensor Cores if applicable\n",
-    "torch.set_float32_matmul_precision(\"high\")\n",
-    "\n",
     "# If you have compatibility issues, try to\n",
     "# uncomment the line below and rerun the notebook\n",
     "# device = \"cpu\"\n",
@@ -1841,25 +1837,25 @@
   },
   {
    "cell_type": "markdown",
-   "id": "8543773b-2d8d-4917-b0cb-25ec0da7e2b9",
+   "id": "408afa29-fadf-4af1-adbf-74e4f87c3aba",
    "metadata": {},
    "source": [
-    "| Mode | Device | Accuracy | MATH-500 size |\n",
-    "|-----------|--------|----------|---------------|\n",
-    "| Base | CPU | 30% | 10 |\n",
-    "| Base | CUDA | 30% | 10 |\n",
-    "| Base | MPS | 20% | 10 |\n",
-    "| Base | XPU | 30% | 10 |\n",
-    "| Reasoning | CPU | 90% | 10 |\n",
-    "| Reasoning | CUDA | 90% | 10 |\n",
-    "| Reasoning | MPS | 80% | 10 |\n",
-    "| Reasoning | XPU | 70% | 10 |\n",
+    "| Mode | Device | Accuracy | MATH-500 size | Time |\n",
+    "|-----------|--------|----------|---------------|-----------------------|\n",
+    "| Base | CPU | 30% | 10 | 0.7 min (Mac Mini M4) |\n",
+    "| Base | MPS | 20% | 10 | 0.4 min (Mac Mini M4) |\n",
+    "| Base | CUDA | 30% | 10 | 0.2 min (DGX Spark) |\n",
+    "| Base | XPU | 30% | 10 | 1.2 min (Intel) |\n",
+    "| Reasoning | CPU | 90% | 10 | 9.5 min (Mac Mini M4) |\n",
+    "| Reasoning | MPS | 80% | 10 | 3.8 min (Mac Mini M4) |\n",
+    "| Reasoning | CUDA | 90% | 10 | 3.7 min (DGX Spark) |\n",
+    "| Reasoning | XPU | 70% | 10 | 8.5 min (Intel) |\n",
     "\n",
     "\n",
-    "| Mode | Device | Accuracy | MATH-500 size |\n",
-    "|-----------|--------|----------|---------------|\n",
-    "| Base | CUDA | 15.3% | 500 |\n",
-    "| Reasoning | CUDA | 50.8% | 500 |\n"
+    "| Mode | Device | Accuracy | MATH-500 size | Time |\n",
+    "|-----------|--------|----------|------------------|------------------------|\n",
+    "| Base | CUDA | 15.6% | 500 | 10.0 min (DGX Spark) |\n",
+    "| Reasoning | CUDA | 50.8% | 500 | 182.2 min (DGX Spark) |"
    ]
   },
   {
@@ -1869,8 +1865,10 @@
    "source": [
     "- For reference, above are the different accuracy values\n",
     "- Note that \"GPU\" here refers to an NVIDIA (\"cuda\") GPU; MPS refers to an Apple Silicon M4 chip\n",
-    "- It takes about 0.7 min to evaluate the base model (on a M4 Mac Mini) and about 7 min to evaluate the reasoning model, since it produces much longer responses\n",
-    "- While Qwen3-Base is a pre-trained base model and the Qwen3 recommends using it without chat template, changing `tokenizer = Qwen3Tokenizer(tokenizer_file_path=tokenizer_path)` to `tokenizer = Qwen3Tokenizer(tokenizer_file_path=tokenizer_path, apply_chat_template=True)` boosts the MATH-500 performance substantially (80%); note that it is not clear whether the MATH-500 test set was part of the training data; in the age of LLMs, we can assume that any data available on the internet has been part of the training data (also see the discussion [here](https://github.com/rasbt/LLMs-from-scratch/pull/828#issuecomment-3324829736))"
+    "- The reasoning model is much slower because it produces much longer responses\n",
+    "- While Qwen3-Base is a pre-trained base model and Qwen3 recommends using it without a chat template, changing `tokenizer = Qwen3Tokenizer(tokenizer_file_path=tokenizer_path)` to `tokenizer = Qwen3Tokenizer(tokenizer_file_path=tokenizer_path, apply_chat_template=True)` boosts the MATH-500 performance substantially (80%); note that it is not clear whether the MATH-500 test set was part of the training data; in the age of LLMs, we can assume that any data available on the internet has been part of the training data (also see the discussion [here](https://github.com/rasbt/LLMs-from-scratch/pull/828#issuecomment-3324829736))\n",
+    "- The run on the 500 MATH-500 examples corresponds to changing `math_data=math_data[:10],` to `math_data=math_data,` in the `evaluate_math500_stream` function call here\n",
+    "- The bonus materials contain a script to run the evaluation in batched mode for higher throughput (see [../02_math500-verifier-scripts/README.md](../02_math500-verifier-scripts/README.md)); on an H100, with a batch size of 128, the base model can be evaluated in 3.3 min, and the reasoning model can be evaluated in 14.6 min"
    ]
   },
   {
@@ -1928,7 +1926,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.12.3"
   }
  },
  "nbformat": 4,

ch03/02_math500-verifier-scripts/README.md

Lines changed: 25 additions & 7 deletions
@@ -80,6 +80,7 @@ Extra options:
 
 
 
+
 **Implementation note:**
 By default, batched generation halts for sequences that emit a stop token. With `--disable_efficient_mode`, all sequences continue until the longest finishes. This affects compute efficiency only, not qualitative results, since tokens after the stop token are discarded.
 
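
To make the implementation note above concrete, here is a small, self-contained toy sketch (not the repository's batched-generation code; `generate_step` and the stop-token handling are stand-ins) contrasting the two modes:

```python
# Toy illustration of the efficiency note above: in efficient mode a sequence
# stops contributing decode steps once it emits the stop token; with
# --disable_efficient_mode, every sequence is stepped until the longest one
# finishes, and tokens after the stop token are discarded either way.
STOP = 0


def generate_step(seq_id, step):
    # Stand-in for one decoding step: sequence `seq_id` emits its stop token
    # at step seq_id + 2 and ordinary tokens (1) before that.
    return STOP if step >= seq_id + 2 else 1


def batched_generate(batch_size=4, max_steps=8, efficient=True):
    outputs = [[] for _ in range(batch_size)]
    finished = [False] * batch_size
    decode_steps = 0
    for step in range(max_steps):
        for i in range(batch_size):
            if efficient and finished[i]:
                continue  # efficient mode: halt sequences that already stopped
            token = generate_step(i, step)
            decode_steps += 1
            outputs[i].append(token)
            if token == STOP:
                finished[i] = True
        if all(finished):
            break  # the longest sequence has finished
    # Tokens emitted after a stop token are discarded in both modes
    outputs = [seq[: seq.index(STOP) + 1] if STOP in seq else seq for seq in outputs]
    return outputs, decode_steps


out_eff, steps_eff = batched_generate(efficient=True)
out_all, steps_all = batched_generate(efficient=False)
print(out_eff == out_all)         # True: identical results
print(steps_eff, "<", steps_all)  # efficient mode does less decoding work
```
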
@@ -98,11 +99,28 @@ Some PyTorch ops used in efficient batched inference are not yet supported on MPS
 
 
 
-| Device / Dataset size | Base model | Reasoning model | Notes |
-| ------------------------------------------------------------ | ---------- | --------------- | ------------------------------------------------------------ |
-| **M4 Mac Mini** (10 examples) | ~0.7 min | ~7 min | `evaluate_math500.py`<br>Sequential mode |
-| **H100 GPU** (500 examples, single-script) | ~13.3 min | ~185.4 min | `evaluate_math500.py`<br>Sequential mode |
-| **H100 GPU** (500 examples, batched, `--batch_size 128`) | ~3.3 min | ~14.6 min | `evaluate_math500_batch.py`<br>Efficient batched mode |
-| **H100 GPU** (500 examples, batched, `--batch_size 128 --disable_efficient_mode`) | ~21.3 min | ~21.3 min | `evaluate_math500_batch.py`<br>Simpler but less efficient batched mode |
+- `evaluate_math500.py --dataset_size 500`
+
+
+| Device / Dataset size | Base model | Reasoning model |
+| ------------------------------------------- | ---------- | --------------- |
+| **Mac Mini M4 CPU** (500 examples, sequential) | 43.6 min | Didn't run (too hot) |
+| **Mac Mini M4 GPU** (500 examples, sequential) | 37.5 min | Didn't run (too hot) |
+| **DGX Spark** (500 examples, sequential) | 10.0 min | 182.2 min |
+| **H100 GPU** (500 examples, sequential) | 13.3 min | 185.4 min |
+
+<br>
+<br>
+
+- `evaluate_math500_batched.py --dataset_size 500 --batch_size 128`
+
+| Device / Dataset size | Base model | Reasoning model |
+| ------------------------------------------------------------ | ---------- | --------------- |
+| **Mac Mini M4 CPU** (500 examples, batched, `--batch_size 128`) | 167.2 min | Didn't run (too hot) |
+| **Mac Mini M4 GPU** (500 examples, batched, `--batch_size 128`) | Error* | Error |
+| **DGX Spark** (500 examples, batched, `--batch_size 128`) | 16.3 min | 119.3 min |
+| **H100 GPU** (500 examples, batched, `--batch_size 128`) | 3.3 min | 14.6 min |
+
+
 
-The accuracy of the base model is 15.6% (78/500); the accuracy of the reasoning model is 50.8% (254/500).
+- The accuracy of the base model is 15.6% (78/500); the accuracy of the reasoning model is 50.8% (254/500).

reasoning_from_scratch/ch02.py

Lines changed: 17 additions & 2 deletions
@@ -3,22 +3,37 @@
 # Code repository: https://github.com/rasbt/reasoning-from-scratch
 
 from .qwen3 import KVCache
+from packaging import version
 import torch
 
 
-def get_device():
+
+
+def get_device(enable_tensor_cores=True):
     if torch.cuda.is_available():
         device = torch.device("cuda")
         print("Using NVIDIA CUDA GPU")
+
+        if enable_tensor_cores:
+            if version.parse(torch.__version__) >= version.parse("2.9.0"):
+                torch.backends.cuda.matmul.fp32_precision = "tf32"
+                torch.backends.cudnn.conv.fp32_precision = "tf32"
+            else:
+                torch.backends.cuda.matmul.allow_tf32 = True
+                torch.backends.cudnn.allow_tf32 = True
+
     elif torch.backends.mps.is_available():
         device = torch.device("mps")
         print("Using Apple Silicon GPU (MPS)")
+
     elif torch.xpu.is_available():
         device = torch.device("xpu")
-        print("Intel GPU")
+        print("Using Intel GPU")
+
     else:
         device = torch.device("cpu")
         print("Using CPU")
+
     return device
 
 
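A minimal usage sketch for the updated function (assuming the `reasoning_from_scratch` package from this repository is installed; the import path mirrors the file shown above):

```python
# Minimal usage sketch (assumes the reasoning_from_scratch package is installed).
# enable_tensor_cores only has an effect when a CUDA device is detected; on MPS,
# XPU, and CPU the function simply returns the chosen device.
from reasoning_from_scratch.ch02 import get_device

device = get_device(enable_tensor_cores=True)
print(device)  # e.g. device(type='cuda'), device(type='mps'), or device(type='cpu')
```
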