Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions python/paddle/distributed/auto_tuner/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1425,9 +1425,9 @@ def read_metric_log(
target_metric + r":* *(\d+(\.\d*)?)|(\d+(\.\d*)?) *" + target_metric
)
re_out_of_memory_pattern = (
r"Out of memory error on"
if paddle.device.is_compiled_with_cuda()
else r"out of memory"
r"out of memory"
if paddle.device.is_compiled_with_custom_device('npu')
else r"Out of memory error on"
)
out_of_memory_flag = 0
metric_list = []
Expand Down
8 changes: 6 additions & 2 deletions python/paddle/distributed/launch/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1003,13 +1003,13 @@ def launch() -> None:
mem_allnodes = [i[0].decode() for i in result]

for mem in mem_allnodes:
if mem is None:
if mem is None or cur_cfg["max_mem_usage"] is None:
continue
if mem == "OOM":
cur_cfg["max_mem_usage"] = mem
break
cur_cfg["max_mem_usage"] = max(
int(mem), int(cur_cfg["max_mem_usage"])
int(float(mem)), int(float(cur_cfg["max_mem_usage"]))
)

# if need accurate peak memory
Expand Down Expand Up @@ -1238,6 +1238,10 @@ def launch() -> None:
processes = os.popen(
"fuser -v /dev/davinci* |awk '{for(i=1;i<=NF;i++) print $i;}'"
).readlines()
elif paddle.is_compiled_with_xpu():
processes = os.popen(
"fuser -v /dev/xpu* |awk '{for(i=1;i<=NF;i++) print $i;}'"
).readlines()
else:
processes = os.popen(
"fuser -v /dev/nvidia* |awk '{for(i=1;i<=NF;i++) print $i;}'"
Expand Down
48 changes: 47 additions & 1 deletion python/paddle/distributed/launch/utils/nvsmi.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,47 @@ def query_npu_smi(query=None, index=None, dtype=None):
return ret


def query_xpu_smi(query=None, index=None, dtype=None):
    """Query per-device XPU stats by parsing the ``xpu-smi`` table output.

    Args:
        query (list[str] | None): attribute names to set on each ``Info``
            object (dots are rewritten to underscores). The values are taken
            positionally from: device index, utilization, memory total,
            memory used, memory free, current timestamp.
        index: unused; kept for signature parity with the sibling
            ``query_smi`` / ``query_rocm_smi`` / ``query_npu_smi`` helpers.
        dtype (list[type] | None): per-field converters applied to each value;
            falls back to ``str`` for every field when missing or mismatched
            in length.

    Returns:
        list: one ``Info`` per parsed device row; empty when ``xpu-smi`` is
        not installed or no query fields were requested.
    """
    if not has_xpu_smi():
        return []
    # Fix: the original raised TypeError at len(query) when called with the
    # declared default query=None; with no fields requested there is nothing
    # to report, so return an empty result instead.
    if not query:
        return []

    cmd = ["xpu-smi"]

    if not isinstance(dtype, list) or len(dtype) != len(query):
        dtype = [str] * len(query)

    output = subprocess.check_output(cmd, timeout=3)
    lines = output.decode("utf-8").split(os.linesep)
    ret = []
    i = 0  # running device index, assigned per accepted row

    for line in lines:
        if not line:
            continue
        result = re.split(r',|/|\s+|\|', line)
        length = len(result)
        # Device rows of the xpu-smi table split into exactly 23 tokens;
        # header/banner rows contain a literal "XPU" token — skip the rest.
        if length not in [23] or "XPU" in result:
            continue
        result = [item for item in result if item]
        info = Info()
        # NOTE(review): assumes positions 7/8/9 of the filtered row hold
        # mem-used / mem-total / utilization — confirm against real
        # xpu-smi output; a malformed row would raise IndexError here.
        utilization_xpu = float(re.findall(r'\d+\.\d+|\d+', result[9])[0])
        mem_total = float(re.findall(r'\d+\.\d+|\d+', result[8])[0])
        mem_used = float(re.findall(r'\d+\.\d+|\d+', result[7])[0])
        result = [
            i,
            utilization_xpu,
            mem_total,
            mem_used,
            (mem_total - mem_used),
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        ]
        i += 1
        for k, v, d in zip(query, result, dtype):
            setattr(info, k.replace(".", "_"), d(v))
        ret.append(info)
    return ret


def get_gpu_info(index=None):
q = "index,uuid,driver_version,name,gpu_serial,display_active,display_mode".split(
","
Expand All @@ -175,9 +216,10 @@ def get_gpu_util(index=None):
)
if paddle.device.is_compiled_with_rocm():
return query_rocm_smi(q, index=index, dtype=d)

elif paddle.device.is_compiled_with_custom_device('npu'):
return query_npu_smi(q, index=index, dtype=d)
elif paddle.is_compiled_with_xpu():
return query_xpu_smi(q, index=index, dtype=d)
return query_smi(q, index=index, dtype=d)


Expand Down Expand Up @@ -205,6 +247,10 @@ def has_npu_smi():
return shutil.which("npu-smi")


def has_xpu_smi():
    """Locate the ``xpu-smi`` executable on PATH.

    Returns the full path string when found, or ``None`` when the tool is
    not installed (falsy, so callers may use it directly in conditions).
    """
    xpu_smi_path = shutil.which("xpu-smi")
    return xpu_smi_path


if __name__ == '__main__':
print(get_gpu_info(0))
print(get_gpu_util(0))
Expand Down