Skip to content

Commit 581377b

Browse files
committed
Update @satyaog's code from PR #115 to harmonize GPU names
1 parent c42bd4d commit 581377b

File tree

4 files changed

+205
-8
lines changed

4 files changed

+205
-8
lines changed

config/sarc-dev.json

Lines changed: 84 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,42 @@
2828
"duc_storage_command": null,
2929
"diskusage_report_command": "beegfs-ctl --cfgFile=/etc/beegfs/home.d/beegfs-client.conf --getquota --uid $USER --csv",
3030
"prometheus_url": "http://prometheus01.server.mila.quebec:9090/",
31-
"start_date": "2022-04-01"
31+
"start_date": "2022-04-01",
32+
"gpus_per_nodes": {
33+
"__DEFAULTS__": {
34+
"rtx8000": "RTX8000"
35+
},
36+
"cn-b[001-005]": {
37+
"v100": "V100-SXM2-32GB"
38+
},
39+
"cn-d[001-002]": {
40+
"a100": "A100-SXM4-40GB"
41+
},
42+
"cn-d[003-004]": {
43+
"a100l": "A100-SXM4-80GB"
44+
},
45+
"cn-e[002-003]": {
46+
"v100": "V100-SXM2-32GB"
47+
},
48+
"cn-g[001-029]": {
49+
"a100l": "A100-SXM4-80GB"
50+
},
51+
"cn-i001": {
52+
"a100l": "A100-PCIe-80GB"
53+
},
54+
"cn-j001": {
55+
"a6000": "A6000"
56+
},
57+
"cn-k[001-004]": {
58+
"a100": "A100-SXM4-40GB"
59+
},
60+
"cn-l[001-091]": {
61+
"l40s": "L40S"
62+
},
63+
"cn-n[001-002]": {
64+
"h100": "H100-SXM5-80GB"
65+
}
66+
}
3267
},
3368
"narval": {
3469
"host": "narval.computecanada.ca",
@@ -42,7 +77,16 @@
4277
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
4378
"start_date": "2022-04-01",
4479
"rgu_start_date": "2023-11-28",
45-
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json"
80+
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json",
81+
"gpus_per_nodes": {
82+
"__DEFAULTS__": {
83+
"a100": "A100-SXM4-40GB",
84+
"a100_1g.5gb": "__MIG_FLAG__a100",
85+
"a100_2g.10gb": "__MIG_FLAG__a100",
86+
"a100_3g.20gb": "__MIG_FLAG__a100",
87+
"a100_4g.20gb": "__MIG_FLAG__a100"
88+
}
89+
}
4690
},
4791
"beluga": {
4892
"host": "beluga.computecanada.ca",
@@ -56,7 +100,12 @@
56100
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
57101
"start_date": "2022-04-01",
58102
"rgu_start_date": "2024-04-03",
59-
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json"
103+
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json",
104+
"gpus_per_nodes": {
105+
"__DEFAULTS__": {
106+
"tesla_v100-sxm2-16gb": "V100-SXM2-16GB"
107+
}
108+
}
60109
},
61110
"graham": {
62111
"host": "graham.computecanada.ca",
@@ -70,7 +119,30 @@
70119
"prometheus_headers_file": null,
71120
"start_date": "2022-04-01",
72121
"rgu_start_date": "2024-04-03",
73-
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json"
122+
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json",
123+
"gpus_per_nodes": {
124+
"gra[828-987]": {
125+
"p100": "P100-PCIe-12GB"
126+
},
127+
"gra[1147-1153]": {
128+
"v100": "V100-PCIe-16GB"
129+
},
130+
"gra[1154-1189]": {
131+
"t4": "T4"
132+
},
133+
"gra[1337-1338]": {
134+
"v100": "V100-SXM2-32GB"
135+
},
136+
"gra1342": {
137+
"a100": "A100-SXM4-80GB"
138+
},
139+
"gra[1361-1362]": {
140+
"a100": "A100-PCIe-80GB"
141+
},
142+
"gra[1363-1373]": {
143+
"a5000": "A5000"
144+
}
145+
}
74146
},
75147
"cedar": {
76148
"host": "cedar.computecanada.ca",
@@ -84,7 +156,14 @@
84156
"prometheus_headers_file": null,
85157
"start_date": "2022-04-01",
86158
"rgu_start_date": "2024-04-03",
87-
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json"
159+
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json",
160+
"gpus_per_nodes": {
161+
"__DEFAULTS__": {
162+
"p100": "P100-PCIe-12GB",
163+
"p100l": "P100-PCIe-16GB",
164+
"v100l": "V100-PCIe-32GB"
165+
}
166+
}
88167
}
89168
}
90169
}

sarc/config.py

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,14 @@
77
from datetime import date, datetime
88
from functools import cached_property
99
from pathlib import Path
10-
from typing import Any, Union
10+
from typing import Any, Dict, Union
1111

1212
import pydantic
1313
import tzlocal
1414
from bson import ObjectId
15+
from hostlist import expand_hostlist
1516
from pydantic import BaseModel as _BaseModel
16-
from pydantic import Extra, validator
17+
from pydantic import Extra, Field, validator
1718

1819
MTL = zoneinfo.ZoneInfo("America/Montreal")
1920
PST = zoneinfo.ZoneInfo("America/Vancouver")
@@ -70,6 +71,10 @@ def replace(self, **replacements):
7071
return type(self)(**new_arguments)
7172

7273

74+
MIG_FLAG = "__MIG__"
75+
DEFAULTS_FLAG = "__DEFAULTS__"
76+
77+
7378
class ClusterConfig(BaseModel):
7479
host: str = "localhost"
7580
timezone: Union[str, zoneinfo.ZoneInfo] # | does not work with Pydantic's eval
@@ -87,13 +92,48 @@ class ClusterConfig(BaseModel):
8792
gpu_to_rgu_billing: Path = None
8893
slurm_conf_host_path: str = "/etc/slurm/slurm.conf"
8994

95+
# Dictionary mapping a node name -> gpu type -> IGUANE gpu name
96+
gpus_per_nodes: Dict[str, Dict[str, str]] = Field(default_factory=dict)
97+
9098
@validator("timezone")
9199
def _timezone(cls, value):
92100
if isinstance(value, str):
93101
return zoneinfo.ZoneInfo(value)
94102
else:
95103
return value
96104

105+
@validator("gpus_per_nodes")
106+
def _expand_gpus_per_nodes(cls, value: dict):
107+
# Convert node list to node names with `expand_hostlist`
108+
return {
109+
node: gpu_to_desc
110+
for node_list, gpu_to_desc in value.items()
111+
for node in expand_hostlist(node_list)
112+
}
113+
114+
def harmonize_gpu(self, nodename: str, gpu_type: str) -> Union[str, None]:
    """Return the harmonized (IGUANE) GPU name for a node's gpu type.

    Parameters
    ----------
    nodename: name of the node reporting the GPU.
    gpu_type: raw gpu type, possibly a Slurm GRES-style string such as
        "gpu:a100:4" (matching is case-insensitive; spaces become dashes).

    Returns
    -------
    The harmonized GPU name, or None when no mapping is known for this
    node/gpu-type pair. MIG slices resolve to
    "<parent harmonized name> : <mig profile>".
    """
    # Normalize: lowercase, spaces -> dashes, then drop the optional
    # leading "gpu" token (and trailing count) of a GRES "gpu:<type>:<n>".
    parts = gpu_type.lower().replace(" ", "-").split(":")
    if parts[0] == "gpu":
        parts.pop(0)
    gpu_type = parts[0]

    # A node-specific mapping takes precedence; otherwise fall back to
    # the cluster-wide "__DEFAULTS__" mapping (if any).
    gpu_map = self.gpus_per_nodes.get(
        nodename, self.gpus_per_nodes.get(DEFAULTS_FLAG, {})
    )

    harmonized_gpu = gpu_map.get(gpu_type)

    if harmonized_gpu and harmonized_gpu.startswith(MIG_FLAG):
        # MIG entry: resolve the parent GPU's harmonized name, then tag
        # it with the MIG profile (e.g. "A100-SXM4-40GB : 4g.40gb").
        parent = self.harmonize_gpu(nodename, harmonized_gpu[len(MIG_FLAG) :])
        # Guard against a dangling MIG reference in the config: without
        # this check a missing parent would yield the string "None : x".
        harmonized_gpu = f"{parent} : {gpu_type}" if parent else None

    return harmonized_gpu
136+
97137
@cached_property
98138
def ssh(self):
99139
from fabric import Config as FabricConfig

tests/sarc-test.json

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,15 @@
4141
"duc_inodes_command": null,
4242
"duc_storage_command": null,
4343
"diskusage_report_command": null,
44-
"prometheus_url": null
44+
"prometheus_url": null,
45+
"gpus_per_nodes" : {
46+
"cn-c018": {
47+
"asupergpu": "Nec Plus Plus ULTRA GPU 2000"
48+
},
49+
"cn-c[019-030]": {
50+
"asupergpu": "Nec Plus ULTRA GPU 2000"
51+
}
52+
}
4553
},
4654
"fromage": {
4755
"host": "fromage",
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import pytest
2+
3+
from sarc.config import DEFAULTS_FLAG, MIG_FLAG, ClusterConfig, config
4+
5+
# Shared fixture: hostlist patterns -> {slurm gpu type: harmonized name}.
# "node_mig20" exercises the MIG indirection via MIG_FLAG, and the
# DEFAULTS_FLAG entry is the fallback for nodes without an explicit key.
GPUS_PER_NODES = {
    "node[0-9]": {"gpu1": "DESCRIPTIVE GPU 1"},
    "node[9-19]": {"gpu2": "DESCRIPTIVE GPU 2"},
    "node_mig20": {"gpu3": "DESCRIPTIVE GPU 3", "4g.40gb": MIG_FLAG + "gpu3"},
    DEFAULTS_FLAG: {"gpu_default": "DESCRIPTIVE GPU DEFAULT"},
}
11+
12+
13+
@pytest.mark.parametrize(
    "node,gpu_type,expected,gpus_per_nodes",
    [
        # Empty mapping: nothing can be resolved.
        ("DoesNotExist", "DoesNotExist", None, {}),
        # Node-specific lookups (gpu type matching is case-insensitive).
        ("node1", "GPU1", "DESCRIPTIVE GPU 1", GPUS_PER_NODES),
        ("node11", "GPU2", "DESCRIPTIVE GPU 2", GPUS_PER_NODES),
        # Unknown node falls back to the DEFAULTS_FLAG mapping.
        ("DoesNotExist", "GPU_DEFAULT", "DESCRIPTIVE GPU DEFAULT", GPUS_PER_NODES),
        # Known node but unknown gpu type yields None.
        ("node1", "DoesNotExist", None, GPUS_PER_NODES),
        # MIG profile resolves to "<parent name> : <profile>".
        ("node_mig20", "4g.40gb", "DESCRIPTIVE GPU 3 : 4g.40gb", GPUS_PER_NODES),
    ],
)
def test_harmonize_gpu(node, gpu_type, expected, gpus_per_nodes):
    """harmonize_gpu resolves node/gpu-type pairs against gpus_per_nodes."""
    cluster = ClusterConfig(timezone="America/Montreal", gpus_per_nodes=gpus_per_nodes)
    assert cluster.harmonize_gpu(node, gpu_type) == expected
57+
58+
59+
@pytest.mark.usefixtures("standard_config")
@pytest.mark.parametrize(
    "node,gpu_type,expected",
    [
        ("cn-c018", "asupergpu", "Nec Plus Plus ULTRA GPU 2000"),
        ("cn-c019", "asupergpu", "Nec Plus ULTRA GPU 2000"),
        ("cn-c024", "asupergpu", "Nec Plus ULTRA GPU 2000"),
    ],
)
def test_clusterconfig_harmonize_gpu(node, gpu_type, expected):
    """GPU names resolve through the cluster loaded from the test config."""
    clusters = config().clusters
    result = clusters["raisin_no_prometheus"].harmonize_gpu(node, gpu_type)
    assert result == expected

0 commit comments

Comments
 (0)