[SWDEV-556149] Fix group checking (#746)

adam360x · JeniferC99 · commit 5df6c7654391 · 2025-10-15T09:03:38.000-07:00
Signed-off-by: Arif, Maisam <Maisam.Arif@amd.com>
Signed-off-by: Pryor, Adam <Adam.Pryor@amd.com>
Change-Id: I9ed7676b3a90f1ce86f5fea3f278c5e385c8be47
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -192,6 +192,8 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
 - **Changed sourcing of BDF to from drm to kfd**.  
   - Non sudo privliged users were unable to see the BDF due to logical errors.
 
+- **Optimized the way `amd-smi process` validates which processes are running on a GPU**.  
+
 ### Resolved Issues
 
 - **Fixed a CPER record count mismatch issue when using the `amd-smi ras --cper --file-limit`**.  
diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py
@@ -204,19 +204,7 @@ def list(self, args, multiple_devices=False, gpu=None):
         if args.gpu == None:
             args.gpu = self.device_handles
 
-        # Perform one-time group check. If it fails, record that fact
-        # but do NOT abort—just mark that UUID should be "N/A" later.
-        _group_check_done = False
-        _group_in_groups = False
-        if not _group_check_done:
-           try:
-               self.helpers.check_required_groups()
-               _group_in_groups = True
-           except Exception as e:
-               _group_in_groups = False
-               # print the helper's error message exactly once:
-               print(f"{e}")
-           _group_check_done = True
+        _group_in_groups = self.helpers.check_required_groups()
 
         # Handle multiple GPUs
         handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.list)
@@ -234,14 +222,9 @@ def list(self, args, multiple_devices=False, gpu=None):
         except amdsmi_exception.AmdSmiLibraryException as e:
             bdf = "N/A"
         
-        # Only fetch UUID if group check passed; otherwise force "N/A"
-        if _group_in_groups:
-            try:
-                uuid = amdsmi_interface.amdsmi_get_gpu_device_uuid(args.gpu)
-            except amdsmi_exception.AmdSmiLibraryException:
-                uuid = "N/A"
-        else:
-            # user not in render/video → UUID is N/A
+        try:
+            uuid = amdsmi_interface.amdsmi_get_gpu_device_uuid(args.gpu)
+        except amdsmi_exception.AmdSmiLibraryException:
             uuid = "N/A"
 
         try:
@@ -6447,7 +6430,7 @@ def xgmi(self, args, multiple_devices=False, gpu=None, metric=None, xgmi_source_
             gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(gpu)
             xgmi_values.append({"gpu" : gpu_id,
                                 "bdf" : gpu_bdf})
-            # Populate header with just numerical GPU ids
+            # Add this device's GPU ID to the header
             self.logger.table_header += f"GPU{gpu_id}".rjust(13)
 
         # Cache processor handles for each BDF
diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py
@@ -29,6 +29,9 @@
 import re
 import sys
 import time
+import glob
+import errno
+import pwd
 
 from enum import Enum
 from pathlib import Path
@@ -115,7 +118,7 @@ def assign_previous_set_success_check(self, status):
         This is used to determine if the last set was successful or not.
         """
         self._previous_set_success_check = status
-    
+
     def get_previous_set_success_check(self):
         """Returns the previous set success check.
         This is used to determine if the last set was successful or not.
@@ -822,7 +825,7 @@ def get_power_caps(self):
             except amdsmi_interface.AmdSmiLibraryException as e:
                 logging.debug(f"AMDSMIHelpers.get_power_caps - Unable to get power cap info for device {dev}: {str(e)}")
                 continue
-        
+
         # If we never found a real min or max, set them to N/A
         if power_cap_min == amdsmi_interface.MaxUIntegerTypes.UINT64_T:
             power_cap_min = "N/A"
@@ -931,7 +934,7 @@ def confirm_changing_memory_partition_gpu_reload_warning(self, auto_respond=Fals
             memory (NPS) partition mode.
 
             Please use `sudo amd-smi reset -r` AFTER successfully
-            changing the memory (NPS) partition mode. A successful driver reload 
+            changing the memory (NPS) partition mode. A successful driver reload
             is REQUIRED in order to complete updating ALL GPUs in the hive to
             the requested partition mode.
 
@@ -1133,34 +1136,101 @@ def showProgressbar(self, title="", timeInSeconds=13, add_newline=False):
         for i in self.progressbar(range(timeInSeconds), title, 40, add_newline=add_newline):
             time.sleep(1)
 
+    def _user_name(self, uid: int) -> str:
+        try:
+            return pwd.getpwuid(uid).pw_name
+        except Exception:
+            # In containers, the UID may not resolve to a name
+            return str(uid)
+
+    def _group_name(self, gid: int) -> str:
+        try:
+            return grp.getgrgid(gid).gr_name
+        except Exception:
+            # In containers, the GID may not resolve to a name
+            return str(gid)
+
+    # Attempt to grab file info
+    def _stat_info(self, path: str) -> dict:
+        try:
+            st = os.stat(path)
+            return {
+                "uid": st.st_uid,
+                "gid": st.st_gid,
+                "user": self._user_name(st.st_uid),
+                "group": self._group_name(st.st_gid),
+            }
+        except Exception as e:
+            return {"error": str(e)}
+
+    def _try_open(self, path: str):
+        try:
+            fd = os.open(path, os.O_RDONLY) # Only read access is needed for permission check
+            os.close(fd)
+            return True, None, None
+        except OSError as e:
+            return False, e.errno, e.strerror
 
+    # Check kfd and dri for EACCES/EPERM
     def check_required_groups(self):
         """
-        Check if the current user is a member of the required groups.
-        If not, log a warning.
+        Check if the current user can access kfd and dri
+        Specifically, only care for EACCES/EPERM
         """
 
         # Skip check if running as root.
         if os.geteuid() == 0:
             return
 
-        required_groups = {'video', 'render'}
+        paths_to_check = []
+        if os.path.exists("/dev/kfd"):
+            paths_to_check.append("/dev/kfd")
 
-        user_groups = set()
-        for gid in set(os.getgroups()) | {os.getgid()}:
-            try:
-                user_groups.add(grp.getgrgid(gid).gr_name)
-            except Exception as e:
-                # Expected in containers when the name for this GID isn't defined
-                pass
+        # Render group corresponds to /dev/dri/renderD*
+        paths_to_check += [p for p in sorted(glob.glob("/dev/dri/renderD*"))]
+
+        # Video group corresponds to /dev/dri/card*
+        paths_to_check += [p for p in sorted(glob.glob("/dev/dri/card*"))]
 
-        missing_groups = required_groups - user_groups
-        if missing_groups:
-            msg = (
-                "WARNING: User is missing the following required groups: %s. "
-                "Please add user to these groups."
-            ) % ", ".join(sorted(missing_groups))
-            raise RuntimeError(msg)
+        if not paths_to_check:
+            return
+
+        denied = []
+
+        for path in paths_to_check:
+            ok, err, msg = self._try_open(path)
+            if ok:
+                continue
+            # if permission denied or operation not permitted
+            if err in (errno.EACCES, errno.EPERM):
+                denied.append((path, err, msg, self._stat_info(path)))
+
+        if denied:
+            lines = []
+            lines.append("Permission needed to access required GPU device node(s):")
+            for path, err, msg, si in denied:
+                if "error" in si:
+                    lines.append(f"  - {path}: {os.strerror(err)}; stat failed: {si['error']}")
+                else:
+                    lines.append(
+                        "  - {p}: {err}; owner={user}({uid}):{group}({gid});".format(
+                            p=path,
+                            err=os.strerror(err),
+                            user=si["user"],
+                            uid=si["uid"],
+                            group=si["group"],
+                            gid=si["gid"],
+                        )
+                    )
+
+            lines.append("")
+            lines.append("You can try:")
+            lines.append("  • Add your user to the group that owns these devices:")
+            lines.append("      sudo usermod -aG <group> \"$USER\"\n")
+            print("\n".join(lines))
+            return False
+
+        return True
 
     def _severity_as_string(self, error_severity, notify_type, for_filename):
         if error_severity == "non_fatal_uncorrected":
@@ -1563,7 +1633,7 @@ def average_flattened_ints(data, context="data"):
         if not isinstance(data, (list, tuple)):
             logging.debug(f"Invalid data type for {context}: expected list/tuple, got {type(data)}")
             return "N/A"
-    
+
         # Flatten nested lists and filter integers
         flat = [v for value in data for v in (value if isinstance(value, list) else [value]) if isinstance(v, int)]
         return round(sum(flat) / len(flat)) if flat else "N/A"
diff --git a/src/amd_smi/amd_smi_drm.cc b/src/amd_smi/amd_smi_drm.cc
@@ -113,7 +113,6 @@ amdsmi_status_t AMDSmiDrm::init() {
     amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
     auto devices = smi.devices();
 
-    bool has_valid_fds = false;
     for (uint32_t i=0; i < devices.size(); i++) {
         auto rocm_smi_device = devices[i];
         drmDevicePtr device;
@@ -139,7 +138,6 @@ amdsmi_status_t AMDSmiDrm::init() {
                 drm_free_device(&device);
             }
             drm_free_version(version);
-            has_valid_fds = true;
         }
 
         uint64_t bdf_rocm = 0;

Original file line number	Diff line number	Diff line change
`@@ -113,7 +113,6 @@ amdsmi_status_t AMDSmiDrm::init() {`
`113`	`113`	`amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();`
`114`	`114`	`auto devices = smi.devices();`
`115`	`115`
`116`		`- bool has_valid_fds = false;`
`117`	`116`	`for (uint32_t i=0; i < devices.size(); i++) {`
`118`	`117`	`auto rocm_smi_device = devices[i];`
`119`	`118`	`drmDevicePtr device;`
`@@ -139,7 +138,6 @@ amdsmi_status_t AMDSmiDrm::init() {`
`139`	`138`	`drm_free_device(&device);`
`140`	`139`	`}`
`141`	`140`	`drm_free_version(version);`
`142`		`- has_valid_fds = true;`
`143`	`141`	`}`
`144`	`142`
`145`	`143`	`uint64_t bdf_rocm = 0;`