Skip to content

Commit 5df6c76

Browse files
adam360xJeniferC99
authored andcommitted
[SWDEV-556149] Fix group checking (#746)
Signed-off-by: Arif, Maisam <[email protected]> Signed-off-by: Pryor, Adam <[email protected]> Change-Id: I9ed7676b3a90f1ce86f5fea3f278c5e385c8be47
1 parent f4dcba2 commit 5df6c76

File tree

4 files changed

+98
-45
lines changed

4 files changed

+98
-45
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,8 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
192192
- **Changed sourcing of BDF to from drm to kfd**.
193193
- Non sudo privliged users were unable to see the BDF due to logical errors.
194194

195+
- **Optimized the way `amd-smi process` validates which proccesses are running on a GPU**.
196+
195197
### Resolved Issues
196198

197199
- **Fixed a CPER record count mismatch issue when using the `amd-smi ras --cper --file-limit`**.

amdsmi_cli/amdsmi_commands.py

Lines changed: 5 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -204,19 +204,7 @@ def list(self, args, multiple_devices=False, gpu=None):
204204
if args.gpu == None:
205205
args.gpu = self.device_handles
206206

207-
# Perform one-time group check. If it fails, record that fact
208-
# but do NOT abort—just mark that UUID should be "N/A" later.
209-
_group_check_done = False
210-
_group_in_groups = False
211-
if not _group_check_done:
212-
try:
213-
self.helpers.check_required_groups()
214-
_group_in_groups = True
215-
except Exception as e:
216-
_group_in_groups = False
217-
# print the helper's error message exactly once:
218-
print(f"{e}")
219-
_group_check_done = True
207+
_group_in_groups = self.helpers.check_required_groups()
220208

221209
# Handle multiple GPUs
222210
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.list)
@@ -234,14 +222,9 @@ def list(self, args, multiple_devices=False, gpu=None):
234222
except amdsmi_exception.AmdSmiLibraryException as e:
235223
bdf = "N/A"
236224

237-
# Only fetch UUID if group check passed; otherwise force "N/A"
238-
if _group_in_groups:
239-
try:
240-
uuid = amdsmi_interface.amdsmi_get_gpu_device_uuid(args.gpu)
241-
except amdsmi_exception.AmdSmiLibraryException:
242-
uuid = "N/A"
243-
else:
244-
# user not in render/video → UUID is N/A
225+
try:
226+
uuid = amdsmi_interface.amdsmi_get_gpu_device_uuid(args.gpu)
227+
except amdsmi_exception.AmdSmiLibraryException:
245228
uuid = "N/A"
246229

247230
try:
@@ -6447,7 +6430,7 @@ def xgmi(self, args, multiple_devices=False, gpu=None, metric=None, xgmi_source_
64476430
gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(gpu)
64486431
xgmi_values.append({"gpu" : gpu_id,
64496432
"bdf" : gpu_bdf})
6450-
# Populate header with just numerical GPU ids
6433+
# Add this device's GPU ID to the header
64516434
self.logger.table_header += f"GPU{gpu_id}".rjust(13)
64526435

64536436
# Cache processor handles for each BDF

amdsmi_cli/amdsmi_helpers.py

Lines changed: 91 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@
2929
import re
3030
import sys
3131
import time
32+
import glob
33+
import errno
34+
import pwd
3235

3336
from enum import Enum
3437
from pathlib import Path
@@ -115,7 +118,7 @@ def assign_previous_set_success_check(self, status):
115118
This is used to determine if the last set was successful or not.
116119
"""
117120
self._previous_set_success_check = status
118-
121+
119122
def get_previous_set_success_check(self):
120123
"""Returns the previous set success check.
121124
This is used to determine if the last set was successful or not.
@@ -822,7 +825,7 @@ def get_power_caps(self):
822825
except amdsmi_interface.AmdSmiLibraryException as e:
823826
logging.debug(f"AMDSMIHelpers.get_power_caps - Unable to get power cap info for device {dev}: {str(e)}")
824827
continue
825-
828+
826829
# If we never found a real min or max, set them to N/A
827830
if power_cap_min == amdsmi_interface.MaxUIntegerTypes.UINT64_T:
828831
power_cap_min = "N/A"
@@ -931,7 +934,7 @@ def confirm_changing_memory_partition_gpu_reload_warning(self, auto_respond=Fals
931934
memory (NPS) partition mode.
932935
933936
Please use `sudo amd-smi reset -r` AFTER successfully
934-
changing the memory (NPS) partition mode. A successful driver reload
937+
changing the memory (NPS) partition mode. A successful driver reload
935938
is REQUIRED in order to complete updating ALL GPUs in the hive to
936939
the requested partition mode.
937940
@@ -1133,34 +1136,101 @@ def showProgressbar(self, title="", timeInSeconds=13, add_newline=False):
11331136
for i in self.progressbar(range(timeInSeconds), title, 40, add_newline=add_newline):
11341137
time.sleep(1)
11351138

1139+
def _user_name(self, uid: int) -> str:
1140+
try:
1141+
return pwd.getpwuid(uid).pw_name
1142+
except Exception:
1143+
# In containers, the UID may not resolve to a name
1144+
return str(uid)
1145+
1146+
def _group_name(self, gid: int) -> str:
1147+
try:
1148+
return grp.getgrgid(gid).gr_name
1149+
except Exception:
1150+
# In containers, the GID may not resolve to a name
1151+
return str(gid)
1152+
1153+
# Attempt to grab file info
1154+
def _stat_info(self, path: str) -> dict:
1155+
try:
1156+
st = os.stat(path)
1157+
return {
1158+
"uid": st.st_uid,
1159+
"gid": st.st_gid,
1160+
"user": self._user_name(st.st_uid),
1161+
"group": self._group_name(st.st_gid),
1162+
}
1163+
except Exception as e:
1164+
return {"error": str(e)}
1165+
1166+
def _try_open(self, path: str):
1167+
try:
1168+
fd = os.open(path, os.O_RDONLY) # Only read access is needed for permission check
1169+
os.close(fd)
1170+
return True, None, None
1171+
except OSError as e:
1172+
return False, e.errno, e.strerror
11361173

1174+
# Check kfd and dri for EACCES/EPERM
11371175
def check_required_groups(self):
11381176
"""
1139-
Check if the current user is a member of the required groups.
1140-
If not, log a warning.
1177+
Check if the current user can access kfd and dri
1178+
Specifically, only care for EACCES/EPERM
11411179
"""
11421180

11431181
# Skip check if running as root.
11441182
if os.geteuid() == 0:
11451183
return
11461184

1147-
required_groups = {'video', 'render'}
1185+
paths_to_check = []
1186+
if os.path.exists("/dev/kfd"):
1187+
paths_to_check.append("/dev/kfd")
11481188

1149-
user_groups = set()
1150-
for gid in set(os.getgroups()) | {os.getgid()}:
1151-
try:
1152-
user_groups.add(grp.getgrgid(gid).gr_name)
1153-
except Exception as e:
1154-
# Expected in containers when the name for this GID isn't defined
1155-
pass
1189+
# Render group correspond to /dev/dri/renderD*
1190+
paths_to_check += [p for p in sorted(glob.glob("/dev/dri/renderD*"))]
1191+
1192+
# Video group corresponds to /dev/dri/card*
1193+
paths_to_check += [p for p in sorted(glob.glob("/dev/dri/card*"))]
11561194

1157-
missing_groups = required_groups - user_groups
1158-
if missing_groups:
1159-
msg = (
1160-
"WARNING: User is missing the following required groups: %s. "
1161-
"Please add user to these groups."
1162-
) % ", ".join(sorted(missing_groups))
1163-
raise RuntimeError(msg)
1195+
if not paths_to_check:
1196+
return
1197+
1198+
denied = []
1199+
1200+
for path in paths_to_check:
1201+
ok, err, msg = self._try_open(path)
1202+
if ok:
1203+
continue
1204+
# if permission denied or operation not permitted
1205+
if err in (errno.EACCES, errno.EPERM):
1206+
denied.append((path, err, msg, self._stat_info(path)))
1207+
1208+
if denied:
1209+
lines = []
1210+
lines.append("Permission needed to access required GPU device node(s):")
1211+
for path, err, msg, si in denied:
1212+
if "error" in si:
1213+
lines.append(f" - {path}: {os.strerror(err)}; stat failed: {si['error']}")
1214+
else:
1215+
lines.append(
1216+
" - {p}: {err}; owner={user}({uid}):{group}({gid});".format(
1217+
p=path,
1218+
err=os.strerror(err),
1219+
user=si["user"],
1220+
uid=si["uid"],
1221+
group=si["group"],
1222+
gid=si["gid"],
1223+
)
1224+
)
1225+
1226+
lines.append("")
1227+
lines.append("You can try:")
1228+
lines.append(" • Add your user to the group that owns these devices:")
1229+
lines.append(" sudo usermod -aG <group> \"$USER\"\n")
1230+
print("\n".join(lines))
1231+
return False
1232+
1233+
return True
11641234

11651235
def _severity_as_string(self, error_severity, notify_type, for_filename):
11661236
if error_severity == "non_fatal_uncorrected":
@@ -1563,7 +1633,7 @@ def average_flattened_ints(data, context="data"):
15631633
if not isinstance(data, (list, tuple)):
15641634
logging.debug(f"Invalid data type for {context}: expected list/tuple, got {type(data)}")
15651635
return "N/A"
1566-
1636+
15671637
# Flatten nested lists and filter integers
15681638
flat = [v for value in data for v in (value if isinstance(value, list) else [value]) if isinstance(v, int)]
15691639
return round(sum(flat) / len(flat)) if flat else "N/A"

src/amd_smi/amd_smi_drm.cc

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,6 @@ amdsmi_status_t AMDSmiDrm::init() {
113113
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
114114
auto devices = smi.devices();
115115

116-
bool has_valid_fds = false;
117116
for (uint32_t i=0; i < devices.size(); i++) {
118117
auto rocm_smi_device = devices[i];
119118
drmDevicePtr device;
@@ -139,7 +138,6 @@ amdsmi_status_t AMDSmiDrm::init() {
139138
drm_free_device(&device);
140139
}
141140
drm_free_version(version);
142-
has_valid_fds = true;
143141
}
144142

145143
uint64_t bdf_rocm = 0;

0 commit comments

Comments
 (0)