
Commit 11dbc63

Add per-cluster node GPU maps
1 parent 262d37c commit 11dbc63


9 files changed, +274 -57 lines changed


config/sarc-dev.json

Lines changed: 88 additions & 5 deletions
@@ -28,7 +28,37 @@
       "duc_storage_command": null,
       "diskusage_report_command": "beegfs-ctl --cfgFile=/etc/beegfs/home.d/beegfs-client.conf --getquota --uid $USER --csv",
       "prometheus_url": "http://monitoring.server.mila.quebec:9090/",
-      "start_date": "2022-04-01"
+      "start_date": "2022-04-01",
+      "gpus_per_nodes": {
+        "cn-a{{[001-011]}}": {
+          "rtx8000": "Quadro RTX 8000"
+        },
+        "cn-b{{[001-005]}}": {
+          "v100": "Tesla V100-SXM2-32GB"
+        },
+        "cn-c{{[001-040]}}": {
+          "rtx8000": "Quadro RTX 8000"
+        },
+        "cn-g{{[001-029]}}": {
+          "a100": "NVIDIA A100 80GB PCIe",
+          "[0-9]+g\\.[0-9]+gb": "__MIG_FLAG__a100"
+        },
+        "cn-i001": {
+          "a100": "NVIDIA A100 80GB PCIe"
+        },
+        "cn-j001": {
+          "a6000": "NVIDIA RTX A6000"
+        },
+        "cn-d{{[001-002]}}": {
+          "a100": "NVIDIA A100-SXM4-40GB"
+        },
+        "cn-d{{[003-004]}}": {
+          "a100": "NVIDIA A100-SXM4-80GB"
+        },
+        "cn-e{{[002-003]}}": {
+          "v100": "Tesla V100-SXM2-32GB"
+        }
+      }
     },
     "narval": {
       "host": "narval.computecanada.ca",
@@ -40,7 +70,12 @@
       "diskusage_report_command": "diskusage_report --project --all_users",
       "prometheus_url": "https://mila-thanos.calculquebec.ca",
       "prometheus_headers_file": "secrets/drac_prometheus/headers.json",
-      "start_date": "2022-04-01"
+      "start_date": "2022-04-01",
+      "gpus_per_nodes": {
+        "__DEFAULTS__": {
+          "a100": "NVIDIA A100-SXM4-40GB"
+        }
+      }
     },
     "beluga": {
       "host": "beluga.computecanada.ca",
@@ -52,7 +87,12 @@
       "diskusage_report_command": "diskusage_report --project --all_users",
       "prometheus_url": "https://mila-thanos.calculquebec.ca",
       "prometheus_headers_file": "secrets/drac_prometheus/headers.json",
-      "start_date": "2022-04-01"
+      "start_date": "2022-04-01",
+      "gpus_per_nodes": {
+        "__DEFAULTS__": {
+          "v100": "Tesla V100-SXM2-16GB"
+        }
+      }
     },
     "graham": {
       "host": "graham.computecanada.ca",
@@ -65,7 +105,30 @@
       "prometheus_url": null,
       "prometheus_headers_file": null,
       "start_date": "2022-04-01",
-      "nodes_info_file": "secrets/nodes_graham.txt"
+      "nodes_info_file": "secrets/nodes_graham.txt",
+      "gpus_per_nodes": {
+        "gra{{[828-987]}}": {
+          "p100": "NVIDIA P100-12G PCIe"
+        },
+        "gra{{[1147-1153]}}": {
+          "v100": "NVIDIA V100-16G PCIe"
+        },
+        "gra{{[1154-1189]}}": {
+          "t4": "NVIDIA T4-16G PCIe"
+        },
+        "gra{{[1337-1338]}}": {
+          "v100": "NVIDIA V100-32G PCIe"
+        },
+        "gra1342": {
+          "a100": "NVIDIA A100 PCIe"
+        },
+        "gra{{[1361-1362]}}": {
+          "a100": "NVIDIA A100 PCIe"
+        },
+        "gra{{[1363-1373]}}": {
+          "a5000": "NVIDIA A5000-24G PCIe"
+        }
+      }
     },
     "cedar": {
       "host": "cedar.computecanada.ca",
@@ -78,7 +141,27 @@
       "prometheus_url": null,
       "prometheus_headers_file": null,
       "start_date": "2022-04-01",
-      "nodes_info_file": "secrets/nodes_cedar.txt"
+      "nodes_info_file": "secrets/nodes_cedar.txt",
+      "gpus_per_nodes": {
+        "cdr{{[26-386]}}": {
+          "p100": "NVIDIA P100-12G PCIe"
+        },
+        "cdr{{[876-904]}}": {
+          "p100l": "NVIDIA P100-16G PCIe"
+        },
+        "cdr{{[905-906]}}": {
+          "p100": "NVIDIA P100-12G PCIe"
+        },
+        "cdr{{[908-911]}}": {
+          "p100l": "NVIDIA P100-16G PCIe"
+        },
+        "cdr{{[912-922]}}": {
+          "p100": "NVIDIA P100-12G PCIe"
+        },
+        "cdr{{[2468-2678]}}": {
+          "v100l": "NVIDIA V100-32G PCIe"
+        }
+      }
     }
   }
 }

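A note on reading these new blocks (the parsing logic itself is added in sarc/jobs/node_gpu_mapping.py further down): each key under "gpus_per_nodes" is a node-name pattern, where {{[001-011]}} denotes a zero-padded numeric range; each value maps a short GPU name, as it appears in the node's GRES string, to a descriptive product name; "__DEFAULTS__" covers nodes matched by no pattern; and the "__MIG_FLAG__a100" entry on the cn-g nodes appears intended to resolve MIG slice names such as 3g.40gb through the parent a100 entry. A minimal, purely illustrative Python sketch of the same shape:

# Hypothetical example; mirrors the structure used above, not an entry from the commit.
gpus_per_nodes = {
    # Nodes cn-a001 through cn-a011 carry Quadro RTX 8000 cards.
    "cn-a{{[001-011]}}": {"rtx8000": "Quadro RTX 8000"},
    # Fallback for any node not matched by another pattern.
    "__DEFAULTS__": {"a100": "NVIDIA A100-SXM4-40GB"},
}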
sarc/config.py

Lines changed: 2 additions & 3 deletions
@@ -83,8 +83,7 @@ class ClusterConfig(BaseModel):
     duc_storage_command: str = None
     diskusage_report_command: str = None
     start_date: str = "2022-04-01"
-    gpus: list = []
-    harmonize_gpu_map: dict = {}
+    gpus_per_nodes: dict = {}
 
     @validator("timezone")
     def _timezone(cls, value):
@@ -138,7 +137,7 @@ def node_to_gpu(self):
         from .jobs.node_gpu_mapping import NodeToGPUMapping
 
         return NodeToGPUMapping(
-            self.name, self.nodes_info_file, self.harmonize_gpu_map, self.gpus
+            self.name, self.nodes_info_file, self.gpus_per_nodes
         )
 
 
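With the new field in place, ClusterConfig simply carries the raw dict and node_to_gpu forwards it to NodeToGPUMapping. A minimal usage sketch, assuming SARC is configured against the test file tests/sarc-test.json shown further down:

from sarc.config import config

# Sketch of the new wiring: gpus_per_nodes replaces the removed `gpus` list and
# `harmonize_gpu_map` dict, and node_to_gpu builds the mapping from it.
cluster_config = config().clusters["raisin_no_prometheus"]
print(cluster_config.gpus_per_nodes["cn-c018"])  # {'asupergpu': 'Nec Plus Plus ULTRA GPU 2000'}
mapping = cluster_config.node_to_gpu             # NodeToGPUMapping built from that dict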
sarc/jobs/node_gpu_mapping.py

Lines changed: 67 additions & 12 deletions
@@ -12,22 +12,65 @@
 from hostlist import expand_hostlist
 
 
+MIG_FLAG = "__MIG__"
+DEFAULTS_FLAG = "__DEFAULTS__"
+
+
+def _find_pattern(string:str) -> tuple:
+    try:
+        begin = string.index("{{")
+        end = string.index("}}")
+    except ValueError:
+        return None, None
+    return string[begin:end+2], string[begin+2:end]
+
+
+def _expand_list(list_pattern:str) -> str:
+    start, stop = list_pattern[1:-1].split("-")
+    start, stop = int(start), int(stop)
+    regex = "|".join([f"0*{i}" for i in range(start, stop + 1)])
+    return f"({regex})"
+
+
+EXPAND_PATTERNS = {
+    re.compile(r"^\[.*\]$"): _expand_list
+}
+
+
+def expand_patterns(string:str) -> re.Pattern:
+    full_pattern, pattern = _find_pattern(string)
+    while pattern:
+        for pattern_regex, _expand in EXPAND_PATTERNS.items():
+            if pattern_regex.match(pattern):
+                regex = _expand(pattern)
+                string = string.replace(full_pattern, regex)
+                break
+        else:
+            raise ValueError(f"Unknown pattern {full_pattern}")
+
+        full_pattern, pattern = _find_pattern(string)
+
+    return re.compile(string)
+
+
 class NodeToGPUMapping:
     """Helper class to generate JSON file, load it in memory, and query GPU type for a nodename."""
 
-    def __init__(self, cluster_name, nodes_info_file, harmonize_gpu_map, gpus):
+    def __init__(self, cluster_name, nodes_info_file, gpus_per_nodes:dict):
         """Initialize with cluster name and TXT file path to parse."""
 
         # Mapping is empty by default.
         self.mapping = {}
         self.json_path = None
-        self.harmonize_gpu_map = {
-            **{
-                re.compile(regex): gpu_type
-                for regex, gpu_type in harmonize_gpu_map.items()
-            },
-            **{re.compile(f".*{gpu}.*"): gpu for gpu in gpus},
-        }
+        self.harmonize_gpu_map = {DEFAULTS_FLAG: {}}
+        for node_pattern, node_gpus in gpus_per_nodes.items():
+            if node_pattern != DEFAULTS_FLAG:
+                node_pattern = expand_patterns(f"^{node_pattern}$")
+            self.harmonize_gpu_map[node_pattern] = {
+                re.compile(f".*{gpu}.*"): descriptive_gpu
+                for gpu, descriptive_gpu in node_gpus.items()
+            }
+        self.default_gpu_map = self.harmonize_gpu_map.pop(DEFAULTS_FLAG)
 
         # Mapping is filled only if TXT file is available.
         if nodes_info_file and os.path.exists(nodes_info_file):
@@ -54,25 +97,37 @@ def __init__(self, cluster_name, nodes_info_file, harmonize_gpu_map, gpus):
         with open(self.json_path, encoding="utf-8") as file:
             self.mapping = json.load(file)
 
-    def _harmonize_gpu(self, gpu_type: str):
+    def _harmonize_gpu(self, nodename:str, gpu_type: str):
         gpu_type = gpu_type.lower().replace(" ", "-").split(":")
         if gpu_type[0] == "gpu":
             gpu_type.pop(0)
         gpu_type = gpu_type[0]
-        for regex, harmonized_gpu in self.harmonize_gpu_map.items():
+
+        for node_regex, gpu_map in self.harmonize_gpu_map.items():
+            if node_regex.match(nodename):
+                break
+        else:
+            gpu_map = self.default_gpu_map
+
+        for regex, harmonized_gpu in gpu_map.items():
             if regex.match(gpu_type):
                 break
         else:
            harmonized_gpu = None
+
+        if harmonized_gpu and harmonized_gpu.startswith(MIG_FLAG):
+            harmonized_gpu = self._harmonize_gpu(nodename, harmonized_gpu[len(MIG_FLAG):])
+            harmonized_gpu = f"{harmonized_gpu} : {gpu_type}"
+
         return harmonized_gpu
 
     def __getitem__(self, nodename):
         """Return GPU type for nodename, or None if not found."""
         gpu_type = self.mapping.get(nodename, None)
-        return self._harmonize_gpu(gpu_type)
+        return self._harmonize_gpu(nodename, gpu_type)
 
     @staticmethod
-    def _parse_nodenames(path: str, output: dict):
+    def _parse_nodenames(self, path: str, output: dict):
         """
         Parse node-to-GPU mapping from a path and save parsed nodes into output dict.
 
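To make the new pattern handling concrete, here is a small sketch (not part of the commit) of how expand_patterns and _harmonize_gpu behave, assuming the module is importable as shown:

from sarc.jobs.node_gpu_mapping import NodeToGPUMapping, expand_patterns

# "{{[001-011]}}" expands into an alternation of zero-padded numbers, so the
# compiled regex matches cn-a001 through cn-a011 and nothing else.
node_regex = expand_patterns("^cn-a{{[001-011]}}$")
assert node_regex.match("cn-a003")
assert not node_regex.match("cn-a012")

# Without a nodes_info_file the node mapping stays empty, but the harmonization
# step can be exercised directly: a GRES string such as "gpu:rtx8000:2" is
# normalized to "rtx8000" and resolved through the matching node's GPU map.
mapping = NodeToGPUMapping("mila", None, {"cn-a{{[001-011]}}": {"rtx8000": "Quadro RTX 8000"}})
assert mapping._harmonize_gpu("cn-a003", "gpu:rtx8000:2") == "Quadro RTX 8000"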
tests/conftest.py

Lines changed: 4 additions & 11 deletions
@@ -1,15 +1,15 @@
+import json
 import os
-import shutil
 import sys
 import tempfile
 import zoneinfo
 from pathlib import Path
-from unittest.mock import MagicMock, mock_open
+from unittest.mock import MagicMock, mock_open, patch
 
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import SimpleSpanProcessor
 from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
-from opentelemetry.trace import set_tracer_provider
+from opentelemetry.trace import get_tracer_provider, set_tracer_provider
 
 _tracer_provider = TracerProvider()
 _exporter = InMemorySpanExporter()
@@ -24,6 +24,7 @@
 from sarc.config import (
     ClusterConfig,
     Config,
+    MongoConfig,
     ScraperConfig,
     config,
     parse_config,
@@ -72,14 +73,6 @@ def disabled_cache():
     yield
 
 
-# Make sure the cache dir is empty before running the tests
-@pytest.fixture(scope="session", autouse=True)
-def clean_up_test_cache_before_run(standard_config_object):
-    if standard_config_object.cache.exists():
-        shutil.rmtree(str(standard_config_object.cache))
-    yield
-
-
 @pytest.fixture
 def tzlocal_is_mtl(monkeypatch):
     monkeypatch.setattr("sarc.config.TZLOCAL", zoneinfo.ZoneInfo("America/Montreal"))

tests/functional/jobs/test_func_sacct.py

Lines changed: 1 addition & 1 deletion
@@ -445,7 +445,7 @@ def test_get_gpu_type_without_prometheus(
     job = jobs[0]
     print(job)
     print(job.nodes)
-    assert job.allocated.gpu_type == "asupergpu"
+    assert job.allocated.gpu_type == "Nec Plus ULTRA GPU 2000"
 
     file_regression.check(
         f"Found {len(jobs)} job(s):\n"

tests/functional/jobs/test_func_sacct/test_get_gpu_type_without_prometheus_json_jobs0_test_config0_.txt

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ Found 1 job(s):
         "node": 1,
         "billing": 1,
         "gres_gpu": 1,
-        "gpu_type": "asupergpu"
+        "gpu_type": "Nec Plus ULTRA GPU 2000"
     },
     "stored_statistics": null
 }

tests/functional/test_clusterconfig.py

Lines changed: 4 additions & 4 deletions
@@ -8,9 +8,9 @@ def test_clusterconfig_node_to_gpu():
     cluster_config = config().clusters["raisin_no_prometheus"]
     mapping = cluster_config.node_to_gpu
 
-    result = mapping["cn-c018"]
-    assert result in cluster_config.gpus
+    nodename = "cn-c018"
+    result = mapping[nodename]
     assert (
-        mapping._harmonize_gpu(f"{cluster_config.gpus[0]}_suffix")
-        == cluster_config.gpus[0]
+        result
+        == cluster_config.gpus_per_nodes[nodename]["asupergpu"]
     )

tests/sarc-test.json

Lines changed: 7 additions & 3 deletions
@@ -41,9 +41,13 @@
       "diskusage_report_command": null,
       "prometheus_url": null,
       "nodes_info_file": "tests/not-so-secrets/raisin_no_prometheus/nodes_raisin_no_prometheus.txt",
-      "gpus": ["asupergpu"],
-      "harmonize_gpu_map": {
-        ".*asupergpu_suffix.*": "asupergpu"
+      "gpus_per_nodes": {
+        "cn-c018": {
+          "asupergpu": "Nec Plus Plus ULTRA GPU 2000"
+        },
+        "cn-c{{[019-030]}}": {
+          "asupergpu": "Nec Plus ULTRA GPU 2000"
+        }
       }
     },
     "fromage": {

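One behaviour worth noting, assuming the node names above also appear in nodes_raisin_no_prometheus.txt with an asupergpu GRES: patterns are checked in the order they are declared, and the dedicated cn-c018 entry gives that node a different descriptive name from the cn-c019..cn-c030 range. A hypothetical sketch of the intended lookups:

from sarc.config import config

mapping = config().clusters["raisin_no_prometheus"].node_to_gpu
# cn-c018 resolves through its dedicated entry; cn-c019..cn-c030 fall through
# to the range pattern.
print(mapping["cn-c018"])  # expected: "Nec Plus Plus ULTRA GPU 2000"
print(mapping["cn-c019"])  # expected: "Nec Plus ULTRA GPU 2000"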