@@ -5607,8 +5607,9 @@ def reset(self, args, multiple_devices=False, gpu=None, gpureset=None,
56075607
56085608 def monitor (self , args , multiple_devices = False , watching_output = False , gpu = None ,
56095609 watch = None , watch_time = None , iterations = None , power_usage = None ,
5610- temperature = None , gfx_util = None , mem_util = None , encoder = None ,
5611- decoder = None , ecc = None , vram_usage = None , pcie = None , process = None ,
5610+ temperature = None , base_board_temps = None , gpu_board_temps = None ,
5611+ gfx_util = None , mem_util = None , encoder = None , decoder = None ,
5612+ ecc = None , vram_usage = None , pcie = None , process = None ,
56125613 violation = None ):
56135614 """ Populate a table with each GPU as an index to rows of targeted data
56145615
@@ -5621,6 +5622,8 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
56215622 iterations (int, optional): Value override for args.iterations. Defaults to None.
56225623 power_usage (bool, optional): Value override for args.power_usage. Defaults to None.
56235624 temperature (bool, optional): Value override for args.temperature. Defaults to None.
5625+ base_board_temps (bool, optional): Value override for args.base_board_temps. Defaults to None.
5626+ gpu_board_temps (bool, optional): Value override for args.gpu_board_temps. Defaults to None.
56245627 gfx_util (bool, optional): Value override for args.gfx. Defaults to None.
56255628 mem_util (bool, optional): Value override for args.mem. Defaults to None.
56265629 encoder (bool, optional): Value override for args.encoder. Defaults to None.
@@ -5653,6 +5656,10 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
56535656 args .power_usage = power_usage
56545657 if temperature :
56555658 args .temperature = temperature
5659+ if base_board_temps :
5660+ args .base_board_temps = base_board_temps
5661+ if gpu_board_temps :
5662+ args .gpu_board_temps = gpu_board_temps
56565663 if gfx_util :
56575664 args .gfx = gfx_util
56585665 if mem_util :
@@ -5685,9 +5692,10 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
56855692
56865693 # If all arguments are False, then print all values
56875694 # Don't include process in this logic as it's an optional edge case
5688- if not any ([args .power_usage , args .temperature , args .gfx , args .mem ,
5689- args .encoder , args .decoder , args .ecc , args .vram_usage ,
5690- args .pcie , args .violation ]):
5695+ if not any ([args .power_usage , args .temperature , args .base_board_temps ,
5696+ args .gpu_board_temps , args .gfx , args .mem , args .encoder ,
5697+ args .decoder , args .ecc , args .vram_usage , args .pcie ,
5698+ args .violation ]):
56915699 args .power_usage = args .temperature = args .gfx = args .mem = \
56925700 args .encoder = args .decoder = args .vram_usage = True
56935701 # set extra args for default output filtering
@@ -5869,6 +5877,41 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
58695877 self .logger .table_header += 'GPU_T' .rjust (8 )
58705878 self .logger .table_header += 'MEM_T' .rjust (8 )
58715879
5880+
5881+ if args .gpu_board_temps :
5882+ try :
5883+ gpu_board_temp_dict = self .helpers .get_gpu_board_temperatures (args .gpu , gpu_id , self .logger )
5884+
5885+ temp_unit_json = 'C'
5886+ # Add GPU board sensor headers
5887+ if gpu_board_temp_dict :
5888+ for temp_sensor in sorted (gpu_board_temp_dict .keys ()):
5889+ self .logger .table_header += f"{ temp_sensor } " .rjust (max (len (temp_sensor )+ 2 , 7 ))
5890+ for temp_type , temp_value in gpu_board_temp_dict .items ():
5891+ if self .logger .is_json_format () and isinstance (temp_value , dict ):
5892+ temp_value ['unit' ] = temp_unit_json
5893+ monitor_values [temp_type ] = temp_value
5894+ except Exception as e :
5895+ logging .debug ("Failed to get GPU board temperatures on gpu %s | %s" , gpu_id , e )
5896+
5897+
5898+ if args .base_board_temps :
5899+ try :
5900+ base_board_temp_dict = self .helpers .get_base_board_temperatures (args .gpu , gpu_id , self .logger )
5901+
5902+ temp_unit_json = 'C'
5903+ # Add base board sensor headers
5904+ if base_board_temp_dict :
5905+ for temp_sensor in sorted (base_board_temp_dict .keys ()):
5906+ self .logger .table_header += f"{ temp_sensor } " .rjust (max (len (temp_sensor )+ 2 , 7 ))
5907+ for temp_type , temp_value in base_board_temp_dict .items ():
5908+ if self .logger .is_json_format () and isinstance (temp_value , dict ):
5909+ temp_value ['unit' ] = temp_unit_json
5910+ monitor_values [temp_type ] = temp_value
5911+ except Exception as e :
5912+ logging .debug ("Failed to get base board temperatures on gpu %s | %s" , gpu_id , e )
5913+
5914+
58725915 if args .gfx :
58735916 try :
58745917 gfx_clk = gpu_metrics_info ['current_gfxclk' ]
0 commit comments