@@ -83,6 +83,43 @@ void TestGpuPartitionMetricsRead::Run(void) {
8383 std::cout << " \n\n " ;
8484 std::cout << " \t **GPU PARTITION METRICS: Using static struct (Backwards Compatibility):\n " ;
8585 }
86+
87+ // Test if xcp_metrics causes kernel crash
88+ pid_t test_pid = fork ();
89+ if (test_pid == 0 ) {
90+ // Child: try reading xcp_metrics
91+ amdsmi_gpu_metrics_t test_smu = {};
92+ amdsmi_get_gpu_partition_metrics_info (processor_handles_[i], &test_smu);
93+ _exit (0 );
94+ }
95+ if (test_pid < 0 ) {
96+ FAIL () << " Fork failed" ;
97+ }
98+
99+ // Parent: wait for child (3 second timeout: 30 iterations × 100ms)
100+ constexpr int MAX_WAIT_RETRIES = 30 ;
101+ constexpr int WAIT_INTERVAL_US = 100000 ; // 100ms in microseconds
102+ int status;
103+ bool child_exited = false ;
104+ for (int retry = 0 ; retry < MAX_WAIT_RETRIES; retry++) {
105+ if (waitpid (test_pid, &status, WNOHANG) > 0 ) {
106+ child_exited = true ;
107+ if (WIFSIGNALED (status)) {
108+ // Child process terminated by signal - fail the test
109+ FAIL () << " FAILED: Child process terminated by signal (signal " << WTERMSIG (status) << " )" ;
110+ }
111+ break ;
112+ }
113+ usleep (WAIT_INTERVAL_US);
114+ }
115+
116+ // Handle timeout - child still running after 3 seconds
117+ if (!child_exited) {
118+ kill (test_pid, SIGKILL);
119+ waitpid (test_pid, &status, 0 ); // Clean up zombie process
120+ FAIL () << " FAILED: Timeout waiting for child process (hung for 3+ seconds)" ;
121+ }
122+
86123 amdsmi_gpu_metrics_t smu = {};
87124 err = amdsmi_get_gpu_partition_metrics_info (processor_handles_[i], &smu);
88125 const char *status_string;
0 commit comments