Skip to content

Commit 701e3ff

Browse files
bkanango authored and JeniferC99 committed
[SWDEV-542718] Correct socket_affinity (#760)
* [SWDEV-542718] Correct socket_affinity

  Updated socket affinity to show the bitmask and the expanded CPU list.

  Signed-off-by: Bindhiya Kanangot Balakrishnan <[email protected]>

* Update per-device local_cpulist for socket_affinity

  Signed-off-by: Bindhiya Kanangot Balakrishnan <[email protected]>

* Added amdsmi_get_cpu_affinity_from_local_cpulist API. Updated the wrapper.

  Signed-off-by: Bindhiya Kanangot Balakrishnan <[email protected]>

* Revert "Added amdsmi_get_cpu_affinity_from_local_cpulist API."

  This reverts commit 9a2ef934b1787f8aa09d3e4efe02f897b4295215.

* Moved the changes to the C API. In the case of SOCKET_SCOPE, use local_cpulist first. If it is unavailable or not readable, fall back to NUMA.

  Signed-off-by: Bindhiya Kanangot Balakrishnan <[email protected]>

* Addressed review comments

  Signed-off-by: Bindhiya Kanangot Balakrishnan <[email protected]>

---------

Signed-off-by: Bindhiya Kanangot Balakrishnan <[email protected]>
1 parent 3cdaf4d commit 701e3ff

File tree

4 files changed

+51
-23
lines changed

4 files changed

+51
-23
lines changed

amdsmi_cli/amdsmi_commands.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -892,16 +892,24 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None
892892
logging.debug("Failed to get cpu affinity for gpu %s | %s", gpu_id, e.get_error_info())
893893

894894
try:
895-
cpusockets = amdsmi_interface.amdsmi_get_cpu_affinity_with_scope(args.gpu, amdsmi_interface.AmdSmiAffinityScope.SOCKET_SCOPE)
896-
cpusockets = {f'socket_{i}': socket for i, socket in enumerate(set(cpusockets))}
895+
socket_set = amdsmi_interface.amdsmi_get_cpu_affinity_with_scope(args.gpu, amdsmi_interface.AmdSmiAffinityScope.SOCKET_SCOPE)
896+
socket_set = [f"{cpus:016X}" for cpus in socket_set]
897+
socket_set = {f'cpu_list_{i}': f"{cpus}" for i, cpus in enumerate(socket_set)}
898+
socket_bitmask_ranges = self.helpers.get_bitmask_ranges(socket_set)
899+
socket_affinity = {}
900+
for key in socket_set:
901+
socket_affinity[key] = {
902+
"bitmask": socket_set[key],
903+
"cpu_cores_affinity": socket_bitmask_ranges.get(key, "N/A")
904+
}
897905
except amdsmi_exception.AmdSmiLibraryException as e:
898-
cpusockets = {}
906+
socket_affinity = "N/A"
899907
logging.debug("Failed to get socket affinity for gpu %s | %s", gpu_id, e.get_error_info())
900908

901909
static_dict['numa'] = { 'node' : numa_node_number,
902910
'affinity' : numa_affinity,
903911
'cpu_affinity' : cpu_affinity,
904-
'socket_affinity' : cpusockets if cpusockets else "N/A"}
912+
'socket_affinity' : socket_affinity}
905913
if args.vram:
906914
vram_info_dict = {"type" : "N/A",
907915
"vendor" : "N/A",

include/amd_smi/impl/amd_smi_gpu_device.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ class AMDSmiGPUDevice: public AMDSmiProcessor {
7272
// New methods for -e feature
7373
std::string bdf_to_string() const; // -e feature
7474
std::vector<uint64_t> get_bitmask_from_numa_node(int32_t node_id, uint32_t size) const;
75+
std::vector<uint64_t> get_bitmask_from_local_cpulist(uint32_t drm_card, uint32_t size) const;
7576

7677
private:
7778
uint32_t gpu_id_;

src/amd_smi/amd_smi.cc

Lines changed: 4 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5187,27 +5187,12 @@ amdsmi_status_t amdsmi_get_cpu_affinity_with_scope(amdsmi_processor_handle proce
51875187

51885188
case AMDSMI_AFFINITY_SCOPE_SOCKET:
51895189
{
5190-
std::vector<uint32_t> sockets = amd::smi::AMDSmiSystem::getInstance().get_cpu_sockets_from_numa_node(node_id);
5191-
5192-
if(sockets[0] == std::numeric_limits<int32_t>::max()){
5190+
uint32_t drm_card = gpu_device->get_card_id();
5191+
std::vector<uint64_t> bitmask = gpu_device->get_bitmask_from_local_cpulist(drm_card, cpu_set_size);
5192+
if(bitmask[0] == std::numeric_limits<int32_t>::max()){
51935193
return AMDSMI_STATUS_REFCOUNT_OVERFLOW;
51945194
} else {
5195-
for (uint32_t idx : sockets) {
5196-
cpu_set[idx] = idx;
5197-
}
5198-
5199-
std::sort(cpu_set, cpu_set + cpu_set_size);
5200-
5201-
// Discard duplicates
5202-
uint32_t temp_size = 0;
5203-
for (uint32_t i = 0; i < cpu_set_size; ++i) {
5204-
if (i == 0 || cpu_set[i] != cpu_set[i - 1]) {
5205-
cpu_set[temp_size++] = cpu_set[i];
5206-
}
5207-
}
5208-
5209-
// Update the size to the temp size after discarding duplicates
5210-
cpu_set_size = temp_size;
5195+
std::memcpy(cpu_set, bitmask.data(), cpu_set_size * sizeof(uint64_t));
52115196
}
52125197
break;
52135198
}

src/amd_smi/amd_smi_gpu_device.cc

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,4 +310,38 @@ std::vector<uint64_t> AMDSmiGPUDevice::get_bitmask_from_numa_node(int32_t node_i
310310
return bitmask;
311311
}
312312

313+
// Builds a CPU bitmask from the sysfs `local_cpulist` file of a DRM card.
//
// Reads /sys/class/drm/card<drm_card>/device/local_cpulist, which is parsed as
// a comma-separated list of single CPU indices ("5") and inclusive ranges
// ("0-7"), and sets bit (cpu % 64) of word (cpu / 64) for every listed CPU.
//
// @param drm_card  Index N of the /sys/class/drm/cardN entry to read.
// @param size      Number of 64-bit words in the returned bitmask.
// @return A vector of exactly `size` words.
//         - All zero when the file cannot be opened or lists no CPUs.
//         - bitmask[0] == INT32_MAX when `drm_card` is not a valid card index;
//           the caller in amdsmi_get_cpu_affinity_with_scope tests for this.
//         - Empty when `size` is 0.
//
// CPU indices that do not fit into `size` words and tokens that are not plain
// decimal numbers are ignored instead of writing out of bounds or throwing.
//
// NOTE(review): a card whose local CPUs are exactly 0-30 yields
// bitmask[0] == 0x7FFFFFFF, which is indistinguishable from the INT32_MAX
// error sentinel. Fixing that requires changing the error-reporting contract
// with the caller, so it is only flagged here.
std::vector<uint64_t> AMDSmiGPUDevice::get_bitmask_from_local_cpulist(uint32_t drm_card, uint32_t size) const {
    std::vector<uint64_t> bitmask(size, 0);

    // Nothing can be stored (not even the error sentinel) in an empty mask.
    if (bitmask.empty()) {
        return bitmask;
    }

    // `drm_card` is unsigned, so the former `drm_card < 0` test could never
    // fire. A negative card id converted to uint32_t lands above INT32_MAX,
    // which is what is rejected here.
    if (drm_card > static_cast<uint32_t>(std::numeric_limits<int32_t>::max())) {
        bitmask[0] = std::numeric_limits<int32_t>::max();
        return bitmask;
    }

    const std::string path = "/sys/class/drm/card" + std::to_string(drm_card) + "/device/local_cpulist";
    std::ifstream file(path);
    if (!file.is_open()) {
        return bitmask;
    }

    // Parses a non-negative decimal CPU index, tolerating surrounding blanks.
    // Returns false (leaving `cpu` untouched) for empty, non-numeric or
    // implausibly large tokens, so no exception can escape as with std::stoi.
    const auto parse_cpu_index = [](const std::string& token, uint64_t& cpu) -> bool {
        const char* const blanks = " \t\r\n";
        const size_t first = token.find_first_not_of(blanks);
        if (first == std::string::npos) {
            return false;
        }
        const size_t last = token.find_last_not_of(blanks);

        uint64_t value = 0;
        for (size_t pos = first; pos <= last; ++pos) {
            const char ch = token[pos];
            if (ch < '0' || ch > '9') {
                return false;
            }
            value = value * 10 + static_cast<uint64_t>(ch - '0');
            // Capping at UINT32_MAX keeps `value * 10` far from overflowing.
            if (value > std::numeric_limits<uint32_t>::max()) {
                return false;
            }
        }
        cpu = value;
        return true;
    };

    // Total number of CPU bits the mask can hold; at least 64 at this point.
    const uint64_t cpu_capacity = static_cast<uint64_t>(size) * 64;

    std::string line;
    while (std::getline(file, line)) {
        std::istringstream entries(line);
        std::string entry;
        while (std::getline(entries, entry, ',')) {
            uint64_t first_cpu = 0;
            uint64_t last_cpu = 0;

            const size_t hyphen = entry.find('-');
            if (hyphen == std::string::npos) {
                // Single CPU index.
                if (!parse_cpu_index(entry, first_cpu)) {
                    continue;
                }
                last_cpu = first_cpu;
            } else if (!parse_cpu_index(entry.substr(0, hyphen), first_cpu) ||
                       !parse_cpu_index(entry.substr(hyphen + 1), last_cpu)) {
                // Malformed "start-end" range.
                continue;
            }

            // Clamp to the mask capacity so bitmask[cpu / 64] stays in bounds;
            // a range starting beyond the capacity simply sets nothing.
            if (last_cpu >= cpu_capacity) {
                last_cpu = cpu_capacity - 1;
            }
            for (uint64_t cpu = first_cpu; cpu <= last_cpu; ++cpu) {
                bitmask[cpu / 64] |= (1ULL << (cpu % 64));
            }
        }
    }
    return bitmask;
}
346+
313347
} // namespace amd::smi

0 commit comments

Comments
 (0)