|
517 | 517 | } |
518 | 518 | ], |
519 | 519 | "source": [ |
520 | | - "def get_device():\n", |
| 520 | + "from packaging import version\n", |
| 521 | + "\n", |
| 522 | + "def get_device(enable_tensor_cores=True):\n", |
521 | 523 | " if torch.cuda.is_available():\n", |
522 | 524 | " device = torch.device(\"cuda\")\n", |
523 | 525 | " print(\"Using NVIDIA CUDA GPU\")\n", |
| 526 | + " \n", |
| 527 | + " if enable_tensor_cores:\n", |
| 528 | + " if version.parse(torch.__version__) >= version.parse(\"2.9.0\"):\n", |
| 529 | + " torch.backends.cuda.matmul.fp32_precision = \"tf32\"\n", |
| 530 | + " torch.backends.cudnn.conv.fp32_precision = \"tf32\"\n", |
| 531 | + " else:\n", |
| 532 | + " torch.backends.cuda.matmul.allow_tf32 = True\n", |
| 533 | + " torch.backends.cudnn.allow_tf32 = True\n", |
| 534 | + "\n", |
524 | 535 | " elif torch.backends.mps.is_available():\n", |
525 | 536 | " device = torch.device(\"mps\")\n", |
526 | 537 | " print(\"Using Apple Silicon GPU (MPS)\")\n", |
| 538 | + "\n", |
527 | 539 | " elif torch.xpu.is_available():\n", |
528 | 540 | " device = torch.device(\"xpu\")\n", |
529 | | - " print(\"Intel GPU\")\n", |
| 541 | + " print(\"Using Intel GPU\")\n", |
| 542 | + "\n", |
530 | 543 | " else:\n", |
531 | 544 | " device = torch.device(\"cpu\")\n", |
532 | 545 | " print(\"Using CPU\")\n", |
| 546 | + "\n", |
533 | 547 | " return device\n", |
534 | 548 | "\n", |
535 | 549 | "device = get_device()" |
|
1490 | 1504 | "id": "e403cdac-c633-49a6-a713-7735efc46a60", |
1491 | 1505 | "metadata": {}, |
1492 | 1506 | "source": [ |
1493 | | - "| Model | Mode | Hardware | Tokens/sec | GPU Memory (VRAM) |\n", |
1494 | | - "|------------|-------------------|-----------------|---------------|-------------------|\n", |
1495 | | - "| Qwen3Model | Regular | Mac Mini M4 CPU | 6 | - |\n", |
1496 | | - "| Qwen3Model | Regular compiled | Mac Mini M4 CPU | 6 | - |\n", |
1497 | | - "| Qwen3Model | KV cache | Mac Mini M4 CPU | 28 | - |\n", |
1498 | | - "| Qwen3Model | KV cache compiled | Mac Mini M4 CPU | 68 | - |\n", |
1499 | | - "| | | | | |\n", |
1500 | | - "| Qwen3Model | Regular | Mac Mini M4 GPU | 17 | - |\n", |
1501 | | - "| Qwen3Model | Regular compiled | Mac Mini M4 GPU | InductorError | - |\n", |
1502 | | - "| Qwen3Model | KV cache | Mac Mini M4 GPU | 18 | - |\n", |
1503 | | - "| Qwen3Model | KV cache compiled | Mac Mini M4 GPU | InductorError | - |\n", |
1504 | | - "| | | | | |\n", |
1505 | | - "| Qwen3Model | Regular | NVIDIA H100 GPU | 51 | 1.55 GB |\n", |
1506 | | - "| Qwen3Model | Regular compiled | NVIDIA H100 GPU | 164 | 1.81 GB |\n", |
1507 | | - "| Qwen3Model | KV cache | NVIDIA H100 GPU | 48 | 1.52 GB |\n", |
1508 | | - "| Qwen3Model | KV cache compiled | NVIDIA H100 GPU | 141 | 1.81 GB |" |
| 1507 | + "| Model | Mode | Hardware | Tokens/sec | GPU Memory (VRAM) |\n", |
| 1508 | + "|------------|-------------------|----------------------|---------------|-------------------|\n", |
| 1509 | + "| Qwen3Model | Regular | Mac Mini M4 CPU | 6 | - |\n", |
| 1510 | + "| Qwen3Model | Regular compiled | Mac Mini M4 CPU | 6 | - |\n", |
| 1511 | + "| Qwen3Model | KV cache | Mac Mini M4 CPU | 28 | - |\n", |
| 1512 | + "| Qwen3Model | KV cache compiled | Mac Mini M4 CPU | 68 | - |\n", |
| 1513 | + "| | | | | |\n", |
| 1514 | + "| Qwen3Model | Regular | Mac Mini M4 GPU | 17 | - |\n", |
| 1515 | + "| Qwen3Model | Regular compiled | Mac Mini M4 GPU | InductorError | - |\n", |
| 1516 | + "| Qwen3Model | KV cache | Mac Mini M4 GPU | 18 | - |\n", |
| 1517 | + "| Qwen3Model | KV cache compiled | Mac Mini M4 GPU | InductorError | - |\n", |
| 1518 | + "| | | | | |\n", |
| 1519 | + "| Qwen3Model | Regular | NVIDIA H100 GPU | 51 | 1.55 GB |\n", |
| 1520 | + "| Qwen3Model | Regular compiled | NVIDIA H100 GPU | 164 | 1.81 GB |\n", |
| 1521 | + "| Qwen3Model | KV cache | NVIDIA H100 GPU | 48 | 1.52 GB |\n", |
| 1522 | + "| Qwen3Model | KV cache compiled | NVIDIA H100 GPU | 141 | 1.81 GB |\n", |
| 1523 | + "| | | | | |\n", |
| 1524 | + "| Qwen3Model | Regular | NVIDIA DGX Spark GPU | 72 | 1.53 GB |\n", |
| 1525 | + "| Qwen3Model | Regular compiled | NVIDIA DGX Spark GPU | 118 | 1.49 GB |\n", |
| 1526 | + "| Qwen3Model | KV cache | NVIDIA DGX Spark GPU | 69 | 1.47 GB |\n", |
| 1527 | + "| Qwen3Model | KV cache compiled | NVIDIA DGX Spark GPU | 107 | 1.47 GB |" |
1509 | 1528 | ] |
1510 | 1529 | }, |
1511 | 1530 | { |
1512 | 1531 | "cell_type": "markdown", |
1513 | 1532 | "id": "4d05c6a6-3170-4251-9bdf-07489d1104c6", |
1514 | 1533 | "metadata": {}, |
1515 | 1534 | "source": [ |
| 1535 | + "- The NVIDIA DGX Spark above uses a GB10 (Blackwell) GPU\n", |
1516 | 1536 | "- Note that we ran all the examples with a single prompt (i.e., a batch size of 1); if you are curious about batched inference, see appendix E" |
1517 | 1537 | ] |
1518 | 1538 | }, |
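As a rough illustration of how tokens/sec and peak VRAM figures like those in the table above could be collected, a sketch along these lines would work. The names `model`, `generate_fn`, and `input_ids` are placeholders for the notebook's Qwen3Model setup, not a fixed API.

```python
import time
import torch

# Illustrative benchmark helper: measures decoding throughput and, on CUDA,
# peak allocated GPU memory for a single generation call (batch size 1).
def benchmark_generation(model, generate_fn, input_ids, max_new_tokens=150):
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()
    start = time.perf_counter()
    output_ids = generate_fn(model, input_ids, max_new_tokens=max_new_tokens)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    num_new_tokens = output_ids.shape[-1] - input_ids.shape[-1]
    tokens_per_sec = num_new_tokens / elapsed
    if torch.cuda.is_available():
        peak_vram_gb = torch.cuda.max_memory_allocated() / 1024**3
        print(f"{tokens_per_sec:.0f} tokens/sec, {peak_vram_gb:.2f} GB peak VRAM")
    else:
        print(f"{tokens_per_sec:.0f} tokens/sec")
    return output_ids
```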
|
1550 | 1570 | "name": "python", |
1551 | 1571 | "nbconvert_exporter": "python", |
1552 | 1572 | "pygments_lexer": "ipython3", |
1553 | | - "version": "3.10.16" |
| 1573 | + "version": "3.12.3" |
1554 | 1574 | } |
1555 | 1575 | }, |
1556 | 1576 | "nbformat": 4, |
|