Skip to content

Commit 262d37c

Browse files
committed
Harmonize gres across clusters
1 parent 167a4de commit 262d37c

File tree

8 files changed

+101
-11
lines changed

8 files changed

+101
-11
lines changed

sarc/config.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ class ClusterConfig(BaseModel):
8383
duc_storage_command: str = None
8484
diskusage_report_command: str = None
8585
start_date: str = "2022-04-01"
86+
gpus: list = []
87+
harmonize_gpu_map: dict = {}
8688

8789
@validator("timezone")
8890
def _timezone(cls, value):
@@ -135,7 +137,9 @@ def node_to_gpu(self):
135137
"""
136138
from .jobs.node_gpu_mapping import NodeToGPUMapping
137139

138-
return NodeToGPUMapping(self.name, self.nodes_info_file)
140+
return NodeToGPUMapping(
141+
self.name, self.nodes_info_file, self.harmonize_gpu_map, self.gpus
142+
)
139143

140144

141145
class MongoConfig(BaseModel):

sarc/jobs/node_gpu_mapping.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,26 @@
88
import json
99
import os
1010

11+
import regex as re
1112
from hostlist import expand_hostlist
1213

1314

1415
class NodeToGPUMapping:
1516
"""Helper class to generate JSON file, load it in memory, and query GPU type for a nodename."""
1617

17-
def __init__(self, cluster_name, nodes_info_file):
18+
def __init__(self, cluster_name, nodes_info_file, harmonize_gpu_map, gpus):
1819
"""Initialize with cluster name and TXT file path to parse."""
1920

2021
# Mapping is empty by default.
2122
self.mapping = {}
2223
self.json_path = None
24+
self.harmonize_gpu_map = {
25+
**{
26+
re.compile(regex): gpu_type
27+
for regex, gpu_type in harmonize_gpu_map.items()
28+
},
29+
**{re.compile(f".*{gpu}.*"): gpu for gpu in gpus},
30+
}
2331

2432
# Mapping is filled only if TXT file is available.
2533
if nodes_info_file and os.path.exists(nodes_info_file):
@@ -36,7 +44,7 @@ def __init__(self, cluster_name, nodes_info_file):
3644
not os.path.exists(self.json_path)
3745
or os.stat(self.json_path).st_mtime < info_file_stat.st_mtime
3846
):
39-
# Pase TXT file into self.mapping.
47+
# Parse TXT file into self.mapping.
4048
self._parse_nodenames(nodes_info_file, self.mapping)
4149
# Save self.mapping into JSON file.
4250
with open(self.json_path, "w", encoding="utf-8") as file:
@@ -46,9 +54,22 @@ def __init__(self, cluster_name, nodes_info_file):
4654
with open(self.json_path, encoding="utf-8") as file:
4755
self.mapping = json.load(file)
4856

57+
def _harmonize_gpu(self, gpu_type: str):
    """Map a raw GPU description to its harmonized GPU name.

    The raw value is lower-cased, spaces are replaced by dashes, and only
    the first ``:``-separated field is kept (after dropping a leading
    ``gpu`` field), so ``"gpu:asupergpu:4"`` is reduced to ``"asupergpu"``
    before being matched against the compiled patterns.

    Args:
        gpu_type: Raw GPU description, or None when no GPU is known.

    Returns:
        The harmonized name attached to the first pattern of
        ``self.harmonize_gpu_map`` that matches, or None when the input is
        None, has no usable field, or matches no pattern.
    """
    # A node that is absent from the mapping yields None; there is nothing
    # to normalize, and calling .lower() on it would raise.
    if gpu_type is None:
        return None

    fields = gpu_type.lower().replace(" ", "-").split(":")
    if fields[0] == "gpu":
        fields.pop(0)
    # A bare "gpu" leaves no field to inspect once the prefix is dropped.
    if not fields:
        return None
    candidate = fields[0]

    # Patterns are tried in insertion order; the first match wins.
    for pattern, harmonized_gpu in self.harmonize_gpu_map.items():
        if pattern.match(candidate):
            return harmonized_gpu
    return None
68+
4969
def __getitem__(self, nodename):
    """Return the harmonized GPU type for nodename, or None if not found."""
    gpu_type = self.mapping.get(nodename)
    # Unknown node: skip harmonization, which operates on a string.
    if gpu_type is None:
        return None
    return self._harmonize_gpu(gpu_type)
5273

5374
@staticmethod
5475
def _parse_nodenames(path: str, output: dict):

tests/conftest.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
1-
import json
21
import os
2+
import shutil
33
import sys
44
import tempfile
55
import zoneinfo
66
from pathlib import Path
7-
from unittest.mock import MagicMock, mock_open, patch
7+
from unittest.mock import MagicMock, mock_open
88

99
from opentelemetry.sdk.trace import TracerProvider
1010
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
1111
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
12-
from opentelemetry.trace import get_tracer_provider, set_tracer_provider
12+
from opentelemetry.trace import set_tracer_provider
1313

1414
_tracer_provider = TracerProvider()
1515
_exporter = InMemorySpanExporter()
@@ -24,7 +24,6 @@
2424
from sarc.config import (
2525
ClusterConfig,
2626
Config,
27-
MongoConfig,
2827
ScraperConfig,
2928
config,
3029
parse_config,
@@ -73,6 +72,14 @@ def disabled_cache():
7372
yield
7473

7574

75+
# Make sure the cache dir is empty before running the tests
76+
@pytest.fixture(scope="session", autouse=True)
def clean_up_test_cache_before_run(standard_config_object):
    """Delete a pre-existing cache directory before the test session starts."""
    cache_dir = standard_config_object.cache
    if cache_dir.exists():
        # rmtree is given a plain string path, as in the original call.
        shutil.rmtree(str(cache_dir))
    yield
81+
82+
7683
@pytest.fixture
7784
def tzlocal_is_mtl(monkeypatch):
7885
monkeypatch.setattr("sarc.config.TZLOCAL", zoneinfo.ZoneInfo("America/Montreal"))

tests/functional/jobs/test_func_sacct.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,7 @@ def test_get_gpu_type_without_prometheus(
445445
job = jobs[0]
446446
print(job)
447447
print(job.nodes)
448-
assert job.allocated.gpu_type == "gpu:asupergpu:4"
448+
assert job.allocated.gpu_type == "asupergpu"
449449

450450
file_regression.check(
451451
f"Found {len(jobs)} job(s):\n"

tests/functional/jobs/test_func_sacct/test_get_gpu_type_without_prometheus_json_jobs0_test_config0_.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ Found 1 job(s):
4242
"node": 1,
4343
"billing": 1,
4444
"gres_gpu": 1,
45-
"gpu_type": "gpu:asupergpu:4"
45+
"gpu_type": "asupergpu"
4646
},
4747
"stored_statistics": null
4848
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import pytest
2+
3+
from sarc.config import config
4+
5+
6+
@pytest.mark.usefixtures("standard_config")
def test_clusterconfig_node_to_gpu():
    """Check that a cluster's node-to-GPU mapping returns harmonized names."""
    cluster = config().clusters["raisin_no_prometheus"]
    node_gpus = cluster.node_to_gpu

    # A node lookup must resolve to one of the GPUs declared for the cluster.
    found = node_gpus["cn-c018"]
    assert found in cluster.gpus

    # A raw name carrying a suffix must map back to the declared GPU name.
    raw_name = f"{cluster.gpus[0]}_suffix"
    assert node_gpus._harmonize_gpu(raw_name) == cluster.gpus[0]
)

tests/sarc-test.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,11 @@
4040
"duc_storage_command": null,
4141
"diskusage_report_command": null,
4242
"prometheus_url": null,
43-
"nodes_info_file": "tests/not-so-secrets/raisin_no_prometheus/nodes_raisin_no_prometheus.txt"
43+
"nodes_info_file": "tests/not-so-secrets/raisin_no_prometheus/nodes_raisin_no_prometheus.txt",
44+
"gpus": ["asupergpu"],
45+
"harmonize_gpu_map": {
46+
".*asupergpu_suffix.*": "asupergpu"
47+
}
4448
},
4549
"fromage": {
4650
"host": "fromage",
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import pytest
2+
3+
from sarc.jobs.node_gpu_mapping import NodeToGPUMapping
4+
5+
6+
@pytest.mark.parametrize(
    "gpu_type,expected,harmonize_gpu_map,gpus",
    [
        # No GPU declared and no explicit pattern: nothing can match.
        ("DoesNotExist", None, {}, []),
        # Only the field before ":" is matched, after lower-casing.
        ("prefix GPU1:suffix", "gpu1", {}, ["gpu1", "gpu2"]),
        # Spaces are turned into dashes before matching.
        ("prefix GPU2 suffix", "gpu2", {}, ["gpu1", "gpu2"]),
        # An explicit pattern maps a suffixed name onto a declared GPU.
        (
            "prefix GPU1_suffix",
            "gpu1",
            {".*gpu1_suffix.*": "gpu1"},
            ["gpu1", "gpu2"],
        ),
    ],
)
def test_node_to_gpu_mapping(gpu_type, expected, harmonize_gpu_map, gpus):
    """Check that raw GPU descriptions are harmonized to the expected name."""
    node_gpus = NodeToGPUMapping("cluster", None, harmonize_gpu_map, gpus)
    harmonized = node_gpus._harmonize_gpu(gpu_type)

    assert harmonized == expected

0 commit comments

Comments
 (0)