Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions projects/amdsmi/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,19 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr

### Added

- **Added GPU board and base board temperature support to the `amd-smi monitor` CLI**.
- Added `--gpu-board-temps` option to `amd-smi monitor` command for GPU board temperature sensors
- Added `--base-board-temps` option to `amd-smi monitor` command for base board temperature sensors

- **Added Node Power Management (NPM) support**.
- Added new Node Power Management APIs and CLI for node monitoring
- Added C API functions:
- `amdsmi_get_node_handle()`: Get handle for node devices
- `amdsmi_get_npm_info()`: Retrieve Node Power Management information
- Added Python API wrappers for new node device functions
- Added `amd-smi node` CLI command for Node Power Management operations
- Currently supported for OAM_ID 0 only.

- **Added the following C APIs to amdsmi_interface.py**.
- amdsmi_get_cpu_handle()
- amdsmi_get_esmi_err_msg()
Expand Down
130 changes: 50 additions & 80 deletions projects/amdsmi/amdsmi_cli/amdsmi_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -2311,41 +2311,7 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No

if "gpu_board" in current_platform_args:
if args.gpu_board:
gpu_board_temp_dict = {}
gpu_board_temp_types = [
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_RETIMER_X,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_IBC,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_IBC_2,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_VDD18_VR,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_04_HBM_B_VR,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_04_HBM_D_VR,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD0,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD1,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD2,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD3,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOC_A,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOC_C,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOCIO_A,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOCIO_C,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDD_085_HBM,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_11_HBM_B,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_11_HBM_D,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDD_USR,
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDIO_11_E32
]
for type in gpu_board_temp_types:
type_name = type.name.replace("GPUBOARD_", "")
try:
gpu_board_temp_holder = amdsmi_interface.amdsmi_get_temp_metric(args.gpu, type, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT)
if gpu_board_temp_holder != "N/A":
gpu_board_temp_dict[f'{type_name}'] = self.helpers.unit_format(self.logger,
gpu_board_temp_holder,
'\N{DEGREE SIGN}C')
else:
gpu_board_temp_dict[f'{type_name}'] = "N/A"
except amdsmi_exception.AmdSmiLibraryException as e:
gpu_board_temp_dict[f'{type_name}'] = "N/A"
logging.debug("Failed to get gpu_board %s for gpu %s | %s", type_name, gpu_id, e.get_error_info())
gpu_board_temp_dict = self.helpers.get_gpu_board_temperatures(args.gpu, gpu_id, self.logger)
# if every value is N/A, then we don't want to display the values unless explicitly told to
# all args_list being True indicates that this gpu_board is not explicitly called itself
args_list = [getattr(args, arg) for arg in current_platform_args]
Expand All @@ -2355,46 +2321,7 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No
values_dict['gpu_board'] = {'temperature':gpu_board_temp_dict}
if "base_board" in current_platform_args:
if args.base_board:
base_board_temp_dict = {}
base_board_temp_types = [
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FRONT,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_BACK,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_OAM7,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_IBC,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_UFPGA,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_OAM1,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_0_1_HSC,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_2_3_HSC,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_4_5_HSC,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_6_7_HSC,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA_0V72_VR,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA_3V3_VR,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_0_1_2_3_1V2_VR,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_4_5_6_7_1V2_VR,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_0_1_0V9_VR,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_4_5_0V9_VR,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_2_3_0V9_VR,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_6_7_0V9_VR,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_0_1_2_3_3V3_VR,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_4_5_6_7_3V3_VR,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_IBC_HSC,
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_IBC
]
for type in base_board_temp_types:
type_name = type.name.replace("BASEBOARD_", "")
try:
base_board_temp_holder = amdsmi_interface.amdsmi_get_temp_metric(args.gpu, type, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT)
if base_board_temp_holder != "N/A":

base_board_temp_dict[f'{type_name}'] = self.helpers.unit_format(self.logger,
base_board_temp_holder,
'\N{DEGREE SIGN}C')
else:
base_board_temp_dict[f'{type_name}'] = "N/A"
except amdsmi_exception.AmdSmiLibraryException as e:
base_board_temp_dict[f'{type_name}'] = "N/A"
logging.debug("Failed to get base_board %s for gpu %s | %s", type_name, gpu_id, e.get_error_info())
base_board_temp_dict = self.helpers.get_base_board_temperatures(args.gpu, gpu_id, self.logger)
# if every value is N/A, then we don't want to display the values unless explicitly told to
# all args_list being True indicates that this base_board is not explicitly called itself
args_list = [getattr(args, arg) for arg in current_platform_args]
Expand Down Expand Up @@ -5680,8 +5607,9 @@ def reset(self, args, multiple_devices=False, gpu=None, gpureset=None,

def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
watch=None, watch_time=None, iterations=None, power_usage=None,
temperature=None, gfx_util=None, mem_util=None, encoder=None,
decoder=None, ecc=None, vram_usage=None, pcie=None, process=None,
temperature=None, base_board_temps=None, gpu_board_temps=None,
gfx_util=None, mem_util=None, encoder=None, decoder=None,
ecc=None, vram_usage=None, pcie=None, process=None,
violation=None):
""" Populate a table with each GPU as an index to rows of targeted data

Expand All @@ -5694,6 +5622,8 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
iterations (int, optional): Value override for args.iterations. Defaults to None.
power_usage (bool, optional): Value override for args.power_usage. Defaults to None.
temperature (bool, optional): Value override for args.temperature. Defaults to None.
base_board_temps (bool, optional): Value override for args.base_board_temps. Defaults to None.
gpu_board_temps (bool, optional): Value override for args.gpu_board_temps. Defaults to None.
gfx (bool, optional): Value override for args.gfx. Defaults to None.
mem_util (bool, optional): Value override for args.mem. Defaults to None.
encoder (bool, optional): Value override for args.encoder. Defaults to None.
Expand Down Expand Up @@ -5726,6 +5656,10 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
args.power_usage = power_usage
if temperature:
args.temperature = temperature
if base_board_temps:
args.base_board_temps = base_board_temps
if gpu_board_temps:
args.gpu_board_temps = gpu_board_temps
if gfx_util:
args.gfx = gfx_util
if mem_util:
Expand Down Expand Up @@ -5758,9 +5692,10 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,

# If all arguments are False, the print all values
# Don't include process in this logic as it's an optional edge case
if not any([args.power_usage, args.temperature, args.gfx, args.mem,
args.encoder, args.decoder, args.ecc, args.vram_usage,
args.pcie, args.violation]):
if not any([args.power_usage, args.temperature, args.base_board_temps,
args.gpu_board_temps, args.gfx, args.mem, args.encoder,
args.decoder, args.ecc, args.vram_usage, args.pcie,
args.violation]):
args.power_usage = args.temperature = args.gfx = args.mem = \
args.encoder = args.decoder = args.vram_usage = True
# set extra args for default output filtering
Expand Down Expand Up @@ -5942,6 +5877,41 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
self.logger.table_header += 'GPU_T'.rjust(8)
self.logger.table_header += 'MEM_T'.rjust(8)


if args.gpu_board_temps:
try:
gpu_board_temp_dict = self.helpers.get_gpu_board_temperatures(args.gpu, gpu_id, self.logger)

temp_unit_json = 'C'
# Add GPU board sensor headers
if gpu_board_temp_dict:
for temp_sensor in sorted(gpu_board_temp_dict.keys()):
self.logger.table_header += f"{temp_sensor}".rjust(max(len(temp_sensor)+2, 7))
for temp_type, temp_value in gpu_board_temp_dict.items():
if self.logger.is_json_format() and isinstance(temp_value, dict):
temp_value['unit'] = temp_unit_json
monitor_values[temp_type] = temp_value
except Exception as e:
logging.debug("Failed to get GPU board temperatures on gpu %s | %s", gpu_id, e)


if args.base_board_temps:
try:
base_board_temp_dict = self.helpers.get_base_board_temperatures(args.gpu, gpu_id, self.logger)

temp_unit_json = 'C'
# Add base board sensor headers
if base_board_temp_dict:
for temp_sensor in sorted(base_board_temp_dict.keys()):
self.logger.table_header += f"{temp_sensor}".rjust(max(len(temp_sensor)+2, 7))
for temp_type, temp_value in base_board_temp_dict.items():
if self.logger.is_json_format() and isinstance(temp_value, dict):
temp_value['unit'] = temp_unit_json
monitor_values[temp_type] = temp_value
except Exception as e:
logging.debug("Failed to get base board temperatures on gpu %s | %s", gpu_id, e)


if args.gfx:
try:
gfx_clk = gpu_metrics_info['current_gfxclk']
Expand Down
106 changes: 106 additions & 0 deletions projects/amdsmi/amdsmi_cli/amdsmi_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1932,3 +1932,109 @@ def _get_metric_version_and_partition_info(self, gpu_metrics_info, is_partition_
'num_partition': num_partition,
'num_xcp': num_xcp
}

def get_gpu_board_temperatures(self, device_handle, gpu_id, logger):
    """Read the current temperature of every GPU board sensor.

    Args:
        device_handle: Handle forwarded to ``amdsmi_get_temp_metric``.
        gpu_id: GPU identifier; used only in debug log messages.
        logger: Logger instance forwarded to ``self.unit_format``.

    Returns:
        dict: One entry per GPU board sensor, keyed by the sensor's enum
        member name with ``GPUBOARD_`` removed. Each value is whatever
        ``self.unit_format`` returns for the reading, or the string
        ``"N/A"`` when the library reports ``"N/A"`` or raises
        ``AmdSmiLibraryException``. Every sensor is always present in
        the result; deciding what to do when all values are ``"N/A"``
        is left to the caller.
    """
    sensor_enum = amdsmi_interface.AmdSmiTemperatureType
    board_sensors = (
        sensor_enum.GPUBOARD_NODE_RETIMER_X,
        sensor_enum.GPUBOARD_NODE_OAM_X_IBC,
        sensor_enum.GPUBOARD_NODE_OAM_X_IBC_2,
        sensor_enum.GPUBOARD_NODE_OAM_X_VDD18_VR,
        sensor_enum.GPUBOARD_NODE_OAM_X_04_HBM_B_VR,
        sensor_enum.GPUBOARD_NODE_OAM_X_04_HBM_D_VR,
        sensor_enum.GPUBOARD_VDDCR_VDD0,
        sensor_enum.GPUBOARD_VDDCR_VDD1,
        sensor_enum.GPUBOARD_VDDCR_VDD2,
        sensor_enum.GPUBOARD_VDDCR_VDD3,
        sensor_enum.GPUBOARD_VDDCR_SOC_A,
        sensor_enum.GPUBOARD_VDDCR_SOC_C,
        sensor_enum.GPUBOARD_VDDCR_SOCIO_A,
        sensor_enum.GPUBOARD_VDDCR_SOCIO_C,
        sensor_enum.GPUBOARD_VDD_085_HBM,
        sensor_enum.GPUBOARD_VDDCR_11_HBM_B,
        sensor_enum.GPUBOARD_VDDCR_11_HBM_D,
        sensor_enum.GPUBOARD_VDD_USR,
        sensor_enum.GPUBOARD_VDDIO_11_E32,
    )

    readings = {}
    for sensor in board_sensors:
        # Strip the board prefix so the key is the bare sensor label.
        label = sensor.name.replace("GPUBOARD_", "")
        try:
            raw_temp = amdsmi_interface.amdsmi_get_temp_metric(
                device_handle,
                sensor,
                amdsmi_interface.AmdSmiTemperatureMetric.CURRENT,
            )
            readings[label] = (
                self.unit_format(logger, raw_temp, '\N{DEGREE SIGN}C')
                if raw_temp != "N/A"
                else "N/A"
            )
        except amdsmi_exception.AmdSmiLibraryException as err:
            # A failed sensor read is reported as N/A rather than aborting
            # the remaining sensors.
            readings[label] = "N/A"
            logging.debug("Failed to get gpu_board %s for gpu %s | %s",
                          label, gpu_id, err.get_error_info())

    return readings

def get_base_board_temperatures(self, device_handle, gpu_id, logger):
    """Read the current temperature of every base board sensor.

    Args:
        device_handle: Handle forwarded to ``amdsmi_get_temp_metric``.
        gpu_id: GPU identifier; used only in debug log messages.
        logger: Logger instance forwarded to ``self.unit_format``.

    Returns:
        dict: One entry per base board sensor, keyed by the sensor's enum
        member name with ``BASEBOARD_`` removed. Each value is whatever
        ``self.unit_format`` returns for the reading, or the string
        ``"N/A"`` when the library reports ``"N/A"`` or raises
        ``AmdSmiLibraryException``. Every sensor is always present in
        the result; deciding what to do when all values are ``"N/A"``
        is left to the caller.
    """
    sensor_enum = amdsmi_interface.AmdSmiTemperatureType
    board_sensors = (
        sensor_enum.BASEBOARD_UBB_FPGA,
        sensor_enum.BASEBOARD_UBB_FRONT,
        sensor_enum.BASEBOARD_UBB_BACK,
        sensor_enum.BASEBOARD_UBB_OAM7,
        sensor_enum.BASEBOARD_UBB_IBC,
        sensor_enum.BASEBOARD_UBB_UFPGA,
        sensor_enum.BASEBOARD_UBB_OAM1,
        sensor_enum.BASEBOARD_OAM_0_1_HSC,
        sensor_enum.BASEBOARD_OAM_2_3_HSC,
        sensor_enum.BASEBOARD_OAM_4_5_HSC,
        sensor_enum.BASEBOARD_OAM_6_7_HSC,
        sensor_enum.BASEBOARD_UBB_FPGA_0V72_VR,
        sensor_enum.BASEBOARD_UBB_FPGA_3V3_VR,
        sensor_enum.BASEBOARD_RETIMER_0_1_2_3_1V2_VR,
        sensor_enum.BASEBOARD_RETIMER_4_5_6_7_1V2_VR,
        sensor_enum.BASEBOARD_RETIMER_0_1_0V9_VR,
        sensor_enum.BASEBOARD_RETIMER_4_5_0V9_VR,
        sensor_enum.BASEBOARD_RETIMER_2_3_0V9_VR,
        sensor_enum.BASEBOARD_RETIMER_6_7_0V9_VR,
        sensor_enum.BASEBOARD_OAM_0_1_2_3_3V3_VR,
        sensor_enum.BASEBOARD_OAM_4_5_6_7_3V3_VR,
        sensor_enum.BASEBOARD_IBC_HSC,
        sensor_enum.BASEBOARD_IBC,
    )

    readings = {}
    for sensor in board_sensors:
        # Strip the board prefix so the key is the bare sensor label.
        label = sensor.name.replace("BASEBOARD_", "")
        try:
            raw_temp = amdsmi_interface.amdsmi_get_temp_metric(
                device_handle,
                sensor,
                amdsmi_interface.AmdSmiTemperatureMetric.CURRENT,
            )
            readings[label] = (
                self.unit_format(logger, raw_temp, '\N{DEGREE SIGN}C')
                if raw_temp != "N/A"
                else "N/A"
            )
        except amdsmi_exception.AmdSmiLibraryException as err:
            # A failed sensor read is reported as N/A rather than aborting
            # the remaining sensors.
            readings[label] = "N/A"
            logging.debug("Failed to get base_board %s for gpu %s | %s",
                          label, gpu_id, err.get_error_info())

    return readings
3 changes: 3 additions & 0 deletions projects/amdsmi/amdsmi_cli/amdsmi_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,9 @@ def _convert_json_to_tabular(self, json_object: Dict[str, any], dynamic=False):
# Remove excess two values after a new line in table_values
table_values = table_values[:table_values.rfind('\n')]
table_values += '\n'
# Board temperature key patterns
elif any(pattern in key for pattern in ['IBC', 'OAM', 'RETIMER', 'UBB', 'HSC', 'VR', 'VDDCR', 'NODE', 'VDD', 'HBM']):
table_values += string_value.rjust(max((len(key)+2), 7))
# Default spacing
else:
table_values += string_value.rjust(10)
Expand Down
Loading
Loading