Skip to content

Commit 581377b

Browse files
committed
Update @satyaog's code from PR #115 to harmonize GPU names
1 parent c42bd4d commit 581377b

File tree

4 files changed

+205
-8
lines changed

4 files changed

+205
-8
lines changed

config/sarc-dev.json

Lines changed: 84 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,42 @@
2828
"duc_storage_command": null,
2929
"diskusage_report_command": "beegfs-ctl --cfgFile=/etc/beegfs/home.d/beegfs-client.conf --getquota --uid $USER --csv",
3030
"prometheus_url": "http://prometheus01.server.mila.quebec:9090/",
31-
"start_date": "2022-04-01"
31+
"start_date": "2022-04-01",
32+
"gpus_per_nodes": {
33+
"__DEFAULTS__": {
34+
"rtx8000": "RTX8000"
35+
},
36+
"cn-b[001-005]": {
37+
"v100": "V100-SXM2-32GB"
38+
},
39+
"cn-d[001-002]": {
40+
"a100": "A100-SXM4-40GB"
41+
},
42+
"cn-d[003-004]": {
43+
"a100l": "A100-SXM4-80GB"
44+
},
45+
"cn-e[002-003]": {
46+
"v100": "V100-SXM2-32GB"
47+
},
48+
"cn-g[001-029]": {
49+
"a100l": "A100-SXM4-80GB"
50+
},
51+
"cn-i001": {
52+
"a100l": "A100-PCIe-80GB"
53+
},
54+
"cn-j001": {
55+
"a6000": "A6000"
56+
},
57+
"cn-k[001-004]": {
58+
"a100": "A100-SXM4-40GB"
59+
},
60+
"cn-l[001-091]": {
61+
"l40s": "L40S"
62+
},
63+
"cn-n[001-002]": {
64+
"h100": "H100-SXM5-80GB"
65+
}
66+
}
3267
},
3368
"narval": {
3469
"host": "narval.computecanada.ca",
@@ -42,7 +77,16 @@
4277
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
4378
"start_date": "2022-04-01",
4479
"rgu_start_date": "2023-11-28",
45-
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json"
80+
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json",
81+
"gpus_per_nodes": {
82+
"__DEFAULTS__": {
83+
"a100": "A100-SXM4-40GB",
84+
"a100_1g.5gb": "__MIG_FLAG__a100",
85+
"a100_2g.10gb": "__MIG_FLAG__a100",
86+
"a100_3g.20gb": "__MIG_FLAG__a100",
87+
"a100_4g.20gb": "__MIG_FLAG__a100"
88+
}
89+
}
4690
},
4791
"beluga": {
4892
"host": "beluga.computecanada.ca",
@@ -56,7 +100,12 @@
56100
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
57101
"start_date": "2022-04-01",
58102
"rgu_start_date": "2024-04-03",
59-
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json"
103+
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json",
104+
"gpus_per_nodes": {
105+
"__DEFAULTS__": {
106+
"tesla_v100-sxm2-16gb": "V100-SXM2-16GB"
107+
}
108+
}
60109
},
61110
"graham": {
62111
"host": "graham.computecanada.ca",
@@ -70,7 +119,30 @@
70119
"prometheus_headers_file": null,
71120
"start_date": "2022-04-01",
72121
"rgu_start_date": "2024-04-03",
73-
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json"
122+
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json",
123+
"gpus_per_nodes": {
124+
"gra[828-987]": {
125+
"p100": "P100-PCIe-12GB"
126+
},
127+
"gra[1147-1153]": {
128+
"v100": "V100-PCIe-16GB"
129+
},
130+
"gra[1154-1189]": {
131+
"t4": "T4"
132+
},
133+
"gra[1337-1338]": {
134+
"v100": "V100-SXM2-32GB"
135+
},
136+
"gra1342": {
137+
"a100": "A100-SXM4-80GB"
138+
},
139+
"gra[1361-1362]": {
140+
"a100": "A100-PCIe-80GB"
141+
},
142+
"gra[1363-1373]": {
143+
"a5000": "A5000"
144+
}
145+
}
74146
},
75147
"cedar": {
76148
"host": "cedar.computecanada.ca",
@@ -84,7 +156,14 @@
84156
"prometheus_headers_file": null,
85157
"start_date": "2022-04-01",
86158
"rgu_start_date": "2024-04-03",
87-
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json"
159+
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json",
160+
"gpus_per_nodes": {
161+
"__DEFAULTS__": {
162+
"p100": "P100-PCIe-12GB",
163+
"p100l": "P100-PCIe-16GB",
164+
"v100l": "V100-PCIe-32GB"
165+
}
166+
}
88167
}
89168
}
90169
}

sarc/config.py

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,14 @@
77
from datetime import date, datetime
88
from functools import cached_property
99
from pathlib import Path
10-
from typing import Any, Union
10+
from typing import Any, Dict, Union
1111

1212
import pydantic
1313
import tzlocal
1414
from bson import ObjectId
15+
from hostlist import expand_hostlist
1516
from pydantic import BaseModel as _BaseModel
16-
from pydantic import Extra, validator
17+
from pydantic import Extra, Field, validator
1718

1819
MTL = zoneinfo.ZoneInfo("America/Montreal")
1920
PST = zoneinfo.ZoneInfo("America/Vancouver")
@@ -70,6 +71,10 @@ def replace(self, **replacements):
7071
return type(self)(**new_arguments)
7172

7273

74+
MIG_FLAG = "__MIG__"
75+
DEFAULTS_FLAG = "__DEFAULTS__"
76+
77+
7378
class ClusterConfig(BaseModel):
7479
host: str = "localhost"
7580
timezone: Union[str, zoneinfo.ZoneInfo] # | does not work with Pydantic's eval
@@ -87,13 +92,48 @@ class ClusterConfig(BaseModel):
8792
gpu_to_rgu_billing: Path = None
8893
slurm_conf_host_path: str = "/etc/slurm/slurm.conf"
8994

95+
# Dictionary mapping a node name -> gpu type -> IGUANE gpu name
96+
gpus_per_nodes: Dict[str, Dict[str, str]] = Field(default_factory=dict)
97+
9098
@validator("timezone")
9199
def _timezone(cls, value):
92100
if isinstance(value, str):
93101
return zoneinfo.ZoneInfo(value)
94102
else:
95103
return value
96104

105+
@validator("gpus_per_nodes")
106+
def _expand_gpus_per_nodes(cls, value: dict):
107+
# Convert node list to node names with `expand_hostlist`
108+
return {
109+
node: gpu_to_desc
110+
for node_list, gpu_to_desc in value.items()
111+
for node in expand_hostlist(node_list)
112+
}
113+
114+
def harmonize_gpu(self, nodename: str, gpu_type: str) -> Union[str, None]:
    """Return the harmonized (IGUANE) GPU name for a node's gpu type.

    Parameters
    ----------
    nodename: name of the node reporting the GPU.
    gpu_type: raw gpu type, possibly a Slurm GRES-style string such as
        "gpu:a100:4" (matching is case-insensitive; spaces become dashes).

    Returns
    -------
    The harmonized GPU name, or None when no mapping is known for this
    node/gpu-type pair. MIG slices resolve to
    "<parent harmonized name> : <mig profile>".
    """
    # Normalize: lowercase, spaces -> dashes, then drop the optional
    # leading "gpu" token (and trailing count) of a GRES "gpu:<type>:<n>".
    parts = gpu_type.lower().replace(" ", "-").split(":")
    if parts[0] == "gpu":
        parts.pop(0)
    gpu_type = parts[0]

    # A node-specific mapping takes precedence; otherwise fall back to
    # the cluster-wide "__DEFAULTS__" mapping (if any).
    gpu_map = self.gpus_per_nodes.get(
        nodename, self.gpus_per_nodes.get(DEFAULTS_FLAG, {})
    )

    harmonized_gpu = gpu_map.get(gpu_type)

    if harmonized_gpu and harmonized_gpu.startswith(MIG_FLAG):
        # MIG entry: resolve the parent GPU's harmonized name, then tag
        # it with the MIG profile (e.g. "A100-SXM4-40GB : 4g.40gb").
        parent = self.harmonize_gpu(nodename, harmonized_gpu[len(MIG_FLAG) :])
        # Guard against a dangling MIG reference in the config: without
        # this check a missing parent would yield the string "None : x".
        harmonized_gpu = f"{parent} : {gpu_type}" if parent else None

    return harmonized_gpu
136+
97137
@cached_property
98138
def ssh(self):
99139
from fabric import Config as FabricConfig

tests/sarc-test.json

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,15 @@
4141
"duc_inodes_command": null,
4242
"duc_storage_command": null,
4343
"diskusage_report_command": null,
44-
"prometheus_url": null
44+
"prometheus_url": null,
45+
"gpus_per_nodes" : {
46+
"cn-c018": {
47+
"asupergpu": "Nec Plus Plus ULTRA GPU 2000"
48+
},
49+
"cn-c[019-030]": {
50+
"asupergpu": "Nec Plus ULTRA GPU 2000"
51+
}
52+
}
4553
},
4654
"fromage": {
4755
"host": "fromage",
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import pytest
2+
3+
from sarc.config import DEFAULTS_FLAG, MIG_FLAG, ClusterConfig, config
4+
5+
# Shared fixture: hostlist patterns -> {slurm gpu type: harmonized name}.
# "node_mig20" exercises the MIG indirection via MIG_FLAG, and the
# DEFAULTS_FLAG entry is the fallback for nodes without an explicit key.
GPUS_PER_NODES = {
    "node[0-9]": {"gpu1": "DESCRIPTIVE GPU 1"},
    "node[9-19]": {"gpu2": "DESCRIPTIVE GPU 2"},
    "node_mig20": {"gpu3": "DESCRIPTIVE GPU 3", "4g.40gb": MIG_FLAG + "gpu3"},
    DEFAULTS_FLAG: {"gpu_default": "DESCRIPTIVE GPU DEFAULT"},
}
11+
12+
13+
@pytest.mark.parametrize(
    "node,gpu_type,expected,gpus_per_nodes",
    [
        # Empty mapping: nothing can be resolved.
        ("DoesNotExist", "DoesNotExist", None, {}),
        # Node-specific lookups (gpu type matching is case-insensitive).
        ("node1", "GPU1", "DESCRIPTIVE GPU 1", GPUS_PER_NODES),
        ("node11", "GPU2", "DESCRIPTIVE GPU 2", GPUS_PER_NODES),
        # Unknown node falls back to the DEFAULTS_FLAG mapping.
        ("DoesNotExist", "GPU_DEFAULT", "DESCRIPTIVE GPU DEFAULT", GPUS_PER_NODES),
        # Known node but unknown gpu type yields None.
        ("node1", "DoesNotExist", None, GPUS_PER_NODES),
        # MIG profile resolves to "<parent name> : <profile>".
        ("node_mig20", "4g.40gb", "DESCRIPTIVE GPU 3 : 4g.40gb", GPUS_PER_NODES),
    ],
)
def test_harmonize_gpu(node, gpu_type, expected, gpus_per_nodes):
    """harmonize_gpu resolves node/gpu-type pairs against gpus_per_nodes."""
    cluster = ClusterConfig(timezone="America/Montreal", gpus_per_nodes=gpus_per_nodes)
    assert cluster.harmonize_gpu(node, gpu_type) == expected
57+
58+
59+
@pytest.mark.usefixtures("standard_config")
@pytest.mark.parametrize(
    "node,gpu_type,expected",
    [
        ("cn-c018", "asupergpu", "Nec Plus Plus ULTRA GPU 2000"),
        ("cn-c019", "asupergpu", "Nec Plus ULTRA GPU 2000"),
        ("cn-c024", "asupergpu", "Nec Plus ULTRA GPU 2000"),
    ],
)
def test_clusterconfig_harmonize_gpu(node, gpu_type, expected):
    """GPU names resolve through the cluster loaded from the test config."""
    clusters = config().clusters
    result = clusters["raisin_no_prometheus"].harmonize_gpu(node, gpu_type)
    assert result == expected

0 commit comments

Comments
 (0)