|
29 | 29 | import re |
30 | 30 | import sys |
31 | 31 | import time |
| 32 | +import glob |
| 33 | +import errno |
| 34 | +import pwd |
32 | 35 |
|
33 | 36 | from enum import Enum |
34 | 37 | from pathlib import Path |
@@ -115,7 +118,7 @@ def assign_previous_set_success_check(self, status): |
115 | 118 | This is used to determine if the last set was successful or not. |
116 | 119 | """ |
117 | 120 | self._previous_set_success_check = status |
118 | | - |
| 121 | + |
119 | 122 | def get_previous_set_success_check(self): |
120 | 123 | """Returns the previous set success check. |
121 | 124 | This is used to determine if the last set was successful or not. |
@@ -822,7 +825,7 @@ def get_power_caps(self): |
822 | 825 | except amdsmi_interface.AmdSmiLibraryException as e: |
823 | 826 | logging.debug(f"AMDSMIHelpers.get_power_caps - Unable to get power cap info for device {dev}: {str(e)}") |
824 | 827 | continue |
825 | | - |
| 828 | + |
826 | 829 | # If we never found a real min or max, set them to N/A |
827 | 830 | if power_cap_min == amdsmi_interface.MaxUIntegerTypes.UINT64_T: |
828 | 831 | power_cap_min = "N/A" |
@@ -931,7 +934,7 @@ def confirm_changing_memory_partition_gpu_reload_warning(self, auto_respond=Fals |
931 | 934 | memory (NPS) partition mode. |
932 | 935 |
|
933 | 936 | Please use `sudo amd-smi reset -r` AFTER successfully |
934 | | - changing the memory (NPS) partition mode. A successful driver reload |
| 937 | + changing the memory (NPS) partition mode. A successful driver reload |
935 | 938 | is REQUIRED in order to complete updating ALL GPUs in the hive to |
936 | 939 | the requested partition mode. |
937 | 940 |
|
@@ -1133,34 +1136,101 @@ def showProgressbar(self, title="", timeInSeconds=13, add_newline=False): |
1133 | 1136 | for i in self.progressbar(range(timeInSeconds), title, 40, add_newline=add_newline): |
1134 | 1137 | time.sleep(1) |
1135 | 1138 |
|
| 1139 | + def _user_name(self, uid: int) -> str: |
| 1140 | + try: |
| 1141 | + return pwd.getpwuid(uid).pw_name |
| 1142 | + except Exception: |
| 1143 | + # In containers, the UID may not resolve to a name |
| 1144 | + return str(uid) |
| 1145 | + |
| 1146 | + def _group_name(self, gid: int) -> str: |
| 1147 | + try: |
| 1148 | + return grp.getgrgid(gid).gr_name |
| 1149 | + except Exception: |
| 1150 | + # In containers, the GID may not resolve to a name |
| 1151 | + return str(gid) |
| 1152 | + |
| 1153 | + # Attempt to grab file info |
| 1154 | + def _stat_info(self, path: str) -> dict: |
| 1155 | + try: |
| 1156 | + st = os.stat(path) |
| 1157 | + return { |
| 1158 | + "uid": st.st_uid, |
| 1159 | + "gid": st.st_gid, |
| 1160 | + "user": self._user_name(st.st_uid), |
| 1161 | + "group": self._group_name(st.st_gid), |
| 1162 | + } |
| 1163 | + except Exception as e: |
| 1164 | + return {"error": str(e)} |
| 1165 | + |
| 1166 | + def _try_open(self, path: str): |
| 1167 | + try: |
| 1168 | + fd = os.open(path, os.O_RDONLY) # Only read access is needed for permission check |
| 1169 | + os.close(fd) |
| 1170 | + return True, None, None |
| 1171 | + except OSError as e: |
| 1172 | + return False, e.errno, e.strerror |
1136 | 1173 |
|
| 1174 | + # Check kfd and dri for EACCES/EPERM |
1137 | 1175 | def check_required_groups(self): |
1138 | 1176 | """ |
1139 | | - Check if the current user is a member of the required groups. |
1140 | | - If not, log a warning. |
| 1177 | + Check whether the current user can open /dev/kfd and the /dev/dri device nodes. |
| 1178 | + Only EACCES/EPERM failures are reported; other open errors are ignored. |
1141 | 1179 | """ |
1142 | 1180 |
|
1143 | 1181 | # Skip check if running as root. |
1144 | 1182 | if os.geteuid() == 0: |
1145 | 1183 | return |
1146 | 1184 |
|
1147 | | - required_groups = {'video', 'render'} |
| 1185 | + paths_to_check = [] |
| 1186 | + if os.path.exists("/dev/kfd"): |
| 1187 | + paths_to_check.append("/dev/kfd") |
1148 | 1188 |
|
1149 | | - user_groups = set() |
1150 | | - for gid in set(os.getgroups()) | {os.getgid()}: |
1151 | | - try: |
1152 | | - user_groups.add(grp.getgrgid(gid).gr_name) |
1153 | | - except Exception as e: |
1154 | | - # Expected in containers when the name for this GID isn't defined |
1155 | | - pass |
| 1189 | + # Render group corresponds to /dev/dri/renderD* |
| 1190 | + paths_to_check += [p for p in sorted(glob.glob("/dev/dri/renderD*"))] |
| 1191 | + |
| 1192 | + # Video group corresponds to /dev/dri/card* |
| 1193 | + paths_to_check += [p for p in sorted(glob.glob("/dev/dri/card*"))] |
1156 | 1194 |
|
1157 | | - missing_groups = required_groups - user_groups |
1158 | | - if missing_groups: |
1159 | | - msg = ( |
1160 | | - "WARNING: User is missing the following required groups: %s. " |
1161 | | - "Please add user to these groups." |
1162 | | - ) % ", ".join(sorted(missing_groups)) |
1163 | | - raise RuntimeError(msg) |
| 1195 | + if not paths_to_check: |
| 1196 | + return |
| 1197 | + |
| 1198 | + denied = [] |
| 1199 | + |
| 1200 | + for path in paths_to_check: |
| 1201 | + ok, err, msg = self._try_open(path) |
| 1202 | + if ok: |
| 1203 | + continue |
| 1204 | + # if permission denied or operation not permitted |
| 1205 | + if err in (errno.EACCES, errno.EPERM): |
| 1206 | + denied.append((path, err, msg, self._stat_info(path))) |
| 1207 | + |
| 1208 | + if denied: |
| 1209 | + lines = [] |
| 1210 | + lines.append("Permission needed to access required GPU device node(s):") |
| 1211 | + for path, err, msg, si in denied: |
| 1212 | + if "error" in si: |
| 1213 | + lines.append(f" - {path}: {os.strerror(err)}; stat failed: {si['error']}") |
| 1214 | + else: |
| 1215 | + lines.append( |
| 1216 | + " - {p}: {err}; owner={user}({uid}):{group}({gid});".format( |
| 1217 | + p=path, |
| 1218 | + err=os.strerror(err), |
| 1219 | + user=si["user"], |
| 1220 | + uid=si["uid"], |
| 1221 | + group=si["group"], |
| 1222 | + gid=si["gid"], |
| 1223 | + ) |
| 1224 | + ) |
| 1225 | + |
| 1226 | + lines.append("") |
| 1227 | + lines.append("You can try:") |
| 1228 | + lines.append(" • Add your user to the group that owns these devices:") |
| 1229 | + lines.append(" sudo usermod -aG <group> \"$USER\"\n") |
| 1230 | + print("\n".join(lines)) |
| 1231 | + return False |
| 1232 | + |
| 1233 | + return True |
1164 | 1234 |
|
1165 | 1235 | def _severity_as_string(self, error_severity, notify_type, for_filename): |
1166 | 1236 | if error_severity == "non_fatal_uncorrected": |
@@ -1563,7 +1633,7 @@ def average_flattened_ints(data, context="data"): |
1563 | 1633 | if not isinstance(data, (list, tuple)): |
1564 | 1634 | logging.debug(f"Invalid data type for {context}: expected list/tuple, got {type(data)}") |
1565 | 1635 | return "N/A" |
1566 | | - |
| 1636 | + |
1567 | 1637 | # Flatten nested lists and filter integers |
1568 | 1638 | flat = [v for value in data for v in (value if isinstance(value, list) else [value]) if isinstance(v, int)] |
1569 | 1639 | return round(sum(flat) / len(flat)) if flat else "N/A" |
0 commit comments