Skip to content

Commit ef79588

Browse files
committed
[SWDEV-560235] Add gpu and base board temperatures to monitor
Added gpu_board and base_board temperatures as non-default monitor temperature options. Signed-off-by: Bindhiya Kanangot Balakrishnan <[email protected]>
1 parent de27daa commit ef79588

File tree

4 files changed

+68
-5
lines changed

4 files changed

+68
-5
lines changed

projects/amdsmi/CHANGELOG.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,19 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
88

99
### Added
1010

11+
- **Added GPU and base board temperature `amd-smi monitor` CLI support**.
12+
- Added `--gpu-board-temps` option to `amd-smi monitor` command for GPU board temperature sensors
13+
- Added `--base-board-temps` option to `amd-smi monitor` command for base board temperature sensors
14+
15+
- **Added Node Power Management (NPM) support**.
16+
- Added new Node Power Management APIs and CLI for node monitoring
17+
- Added C API functions:
18+
- `amdsmi_get_node_handle()`: Get handle for node devices
19+
- `amdsmi_get_npm_info()`: Retrieve Node Power Management information
20+
- Added Python API wrappers for new node device functions
21+
- Added `amd-smi node` CLI command for Node Power Management operations
22+
- Currently supported for OAM_ID 0 only.
23+
1124
- **Added the following C APIs to amdsmi_interface.py**.
1225
- amdsmi_get_cpu_handle()
1326
- amdsmi_get_esmi_err_msg()

projects/amdsmi/amdsmi_cli/amdsmi_commands.py

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5607,8 +5607,9 @@ def reset(self, args, multiple_devices=False, gpu=None, gpureset=None,
56075607

56085608
def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
56095609
watch=None, watch_time=None, iterations=None, power_usage=None,
5610-
temperature=None, gfx_util=None, mem_util=None, encoder=None,
5611-
decoder=None, ecc=None, vram_usage=None, pcie=None, process=None,
5610+
temperature=None, base_board_temps=None, gpu_board_temps=None,
5611+
gfx_util=None, mem_util=None, encoder=None, decoder=None,
5612+
ecc=None, vram_usage=None, pcie=None, process=None,
56125613
violation=None):
56135614
""" Populate a table with each GPU as an index to rows of targeted data
56145615
@@ -5621,6 +5622,8 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
56215622
iterations (int, optional): Value override for args.iterations. Defaults to None.
56225623
power_usage (bool, optional): Value override for args.power_usage. Defaults to None.
56235624
temperature (bool, optional): Value override for args.temperature. Defaults to None.
5625+
base_board_temps (bool, optional): Value override for args.base_board_temps. Defaults to None.
5626+
gpu_board_temps (bool, optional): Value override for args.gpu_board_temps. Defaults to None.
56245627
gfx_util (bool, optional): Value override for args.gfx. Defaults to None.
56255628
mem_util (bool, optional): Value override for args.mem. Defaults to None.
56265629
encoder (bool, optional): Value override for args.encoder. Defaults to None.
@@ -5653,6 +5656,10 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
56535656
args.power_usage = power_usage
56545657
if temperature:
56555658
args.temperature = temperature
5659+
if base_board_temps:
5660+
args.base_board_temps = base_board_temps
5661+
if gpu_board_temps:
5662+
args.gpu_board_temps = gpu_board_temps
56565663
if gfx_util:
56575664
args.gfx = gfx_util
56585665
if mem_util:
@@ -5685,9 +5692,10 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
56855692

56865693
# If all arguments are False, then print all values
56875694
# Don't include process in this logic as it's an optional edge case
5688-
if not any([args.power_usage, args.temperature, args.gfx, args.mem,
5689-
args.encoder, args.decoder, args.ecc, args.vram_usage,
5690-
args.pcie, args.violation]):
5695+
if not any([args.power_usage, args.temperature, args.base_board_temps,
5696+
args.gpu_board_temps, args.gfx, args.mem, args.encoder,
5697+
args.decoder, args.ecc, args.vram_usage, args.pcie,
5698+
args.violation]):
56915699
args.power_usage = args.temperature = args.gfx = args.mem = \
56925700
args.encoder = args.decoder = args.vram_usage = True
56935701
# set extra args for default output filtering
@@ -5869,6 +5877,41 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
58695877
self.logger.table_header += 'GPU_T'.rjust(8)
58705878
self.logger.table_header += 'MEM_T'.rjust(8)
58715879

5880+
5881+
if args.gpu_board_temps:
5882+
try:
5883+
gpu_board_temp_dict = self.helpers.get_gpu_board_temperatures(args.gpu, gpu_id, self.logger)
5884+
5885+
temp_unit_json = 'C'
5886+
# Add GPU board sensor headers
5887+
if gpu_board_temp_dict:
5888+
for temp_sensor in sorted(gpu_board_temp_dict.keys()):
5889+
self.logger.table_header += f"{temp_sensor}".rjust(max(len(temp_sensor)+2, 7))
5890+
for temp_type, temp_value in gpu_board_temp_dict.items():
5891+
if self.logger.is_json_format() and isinstance(temp_value, dict):
5892+
temp_value['unit'] = temp_unit_json
5893+
monitor_values[temp_type] = temp_value
5894+
except Exception as e:
5895+
logging.debug("Failed to get GPU board temperatures on gpu %s | %s", gpu_id, e)
5896+
5897+
5898+
if args.base_board_temps:
5899+
try:
5900+
base_board_temp_dict = self.helpers.get_base_board_temperatures(args.gpu, gpu_id, self.logger)
5901+
5902+
temp_unit_json = 'C'
5903+
# Add base board sensor headers
5904+
if base_board_temp_dict:
5905+
for temp_sensor in sorted(base_board_temp_dict.keys()):
5906+
self.logger.table_header += f"{temp_sensor}".rjust(max(len(temp_sensor)+2, 7))
5907+
for temp_type, temp_value in base_board_temp_dict.items():
5908+
if self.logger.is_json_format() and isinstance(temp_value, dict):
5909+
temp_value['unit'] = temp_unit_json
5910+
monitor_values[temp_type] = temp_value
5911+
except Exception as e:
5912+
logging.debug("Failed to get base board temperatures on gpu %s | %s", gpu_id, e)
5913+
5914+
58725915
if args.gfx:
58735916
try:
58745917
gfx_clk = gpu_metrics_info['current_gfxclk']

projects/amdsmi/amdsmi_cli/amdsmi_logger.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,9 @@ def _convert_json_to_tabular(self, json_object: Dict[str, any], dynamic=False):
263263
# Remove excess two values after a new line in table_values
264264
table_values = table_values[:table_values.rfind('\n')]
265265
table_values += '\n'
266+
# Board temperature key patterns
267+
elif any(pattern in key for pattern in ['IBC', 'OAM', 'RETIMER', 'UBB', 'HSC', 'VR', 'VDDCR', 'NODE', 'VDD', 'HBM']):
268+
table_values += string_value.rjust(max((len(key)+2), 7))
266269
# Default spacing
267270
else:
268271
table_values += string_value.rjust(10)

projects/amdsmi/amdsmi_cli/amdsmi_parser.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1412,6 +1412,8 @@ def _add_monitor_parser(self, subparsers: argparse._SubParsersAction, func):
14121412
# Help text for Arguments only on Guest and BM platforms
14131413
power_usage_help = "Monitor power usage and power cap in Watts"
14141414
temperature_help = "Monitor temperature in Celsius"
1415+
base_board_temps_help = "Monitor base board temperatures in Celsius"
1416+
gpu_board_temps_help = "Monitor GPU board temperatures in Celsius"
14151417
gfx_util_help = "Monitor graphics utilization (%%) and clock (MHz)"
14161418
mem_util_help = "Monitor memory utilization (%%) and clock (MHz)"
14171419
encoder_util_help = "Monitor encoder utilization (%%) and clock (MHz)"
@@ -1431,6 +1433,8 @@ def _add_monitor_parser(self, subparsers: argparse._SubParsersAction, func):
14311433
# Add monitor arguments
14321434
monitor_parser.add_argument('-p', '--power-usage', action='store_true', required=False, help=power_usage_help)
14331435
monitor_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help)
1436+
monitor_parser.add_argument('-b', '--base-board-temps', action='store_true', required=False, help=base_board_temps_help)
1437+
monitor_parser.add_argument('-o', '--gpu-board-temps', action='store_true', required=False, help=gpu_board_temps_help)
14341438
monitor_parser.add_argument('-u', '--gfx', action='store_true', required=False, help=gfx_util_help)
14351439
monitor_parser.add_argument('-m', '--mem', action='store_true', required=False, help=mem_util_help)
14361440
monitor_parser.add_argument('-n', '--encoder', action='store_true', required=False, help=encoder_util_help)

0 commit comments

Comments
 (0)