Skip to content

Commit a7b86ea

Browse files
tianhaodongbd authored and Jeff114514 committed
[xpu] support autotuner for xpu (PaddlePaddle#67215)
1 parent bd269cf commit a7b86ea

File tree

3 files changed

+56
-6
lines changed

3 files changed

+56
-6
lines changed

python/paddle/distributed/auto_tuner/utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1425,9 +1425,9 @@ def read_metric_log(
14251425
target_metric + r":* *(\d+(\.\d*)?)|(\d+(\.\d*)?) *" + target_metric
14261426
)
14271427
re_out_of_memory_pattern = (
1428-
r"Out of memory error on"
1429-
if paddle.device.is_compiled_with_cuda()
1430-
else r"out of memory"
1428+
r"out of memory"
1429+
if paddle.device.is_compiled_with_custom_device('npu')
1430+
else r"Out of memory error on"
14311431
)
14321432
out_of_memory_flag = 0
14331433
metric_list = []

python/paddle/distributed/launch/main.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -997,13 +997,13 @@ def launch() -> None:
997997
mem_allnodes = [i[0].decode() for i in result]
998998

999999
for mem in mem_allnodes:
1000-
if mem is None:
1000+
if mem is None or cur_cfg["max_mem_usage"] is None:
10011001
continue
10021002
if mem == "OOM":
10031003
cur_cfg["max_mem_usage"] = mem
10041004
break
10051005
cur_cfg["max_mem_usage"] = max(
1006-
int(mem), int(cur_cfg["max_mem_usage"])
1006+
int(float(mem)), int(float(cur_cfg["max_mem_usage"]))
10071007
)
10081008

10091009
# if need accurate peak memory
@@ -1232,6 +1232,10 @@ def launch() -> None:
12321232
processes = os.popen(
12331233
"fuser -v /dev/davinci* |awk '{for(i=1;i<=NF;i++) print $i;}'"
12341234
).readlines()
1235+
elif paddle.is_compiled_with_xpu():
1236+
processes = os.popen(
1237+
"fuser -v /dev/xpu* |awk '{for(i=1;i<=NF;i++) print $i;}'"
1238+
).readlines()
12351239
else:
12361240
processes = os.popen(
12371241
"fuser -v /dev/nvidia* |awk '{for(i=1;i<=NF;i++) print $i;}'"

python/paddle/distributed/launch/utils/nvsmi.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,47 @@ def query_npu_smi(query=None, index=None, dtype=None):
149149
return ret
150150

151151

152+
def query_xpu_smi(query=None, index=None, dtype=None):
    """Collect per-device statistics by parsing the output of ``xpu-smi``.

    Runs ``xpu-smi`` (3 second timeout) and, for every output row that
    splits into exactly 23 raw tokens and does not contain the token
    ``"XPU"``, builds the value row::

        [device counter, utilization, mem_total, mem_used,
         mem_total - mem_used, local timestamp string]

    The values are then assigned, in that order, to the attributes named by
    ``query`` on a fresh ``Info`` object.

    Args:
        query: list of attribute names; ``"."`` in a name is replaced by
            ``"_"``. Names are paired positionally with the value row above,
            so at most six names are consumed. ``None`` is treated as an
            empty list.
        index: accepted for signature parity with the other ``query_*_smi``
            helpers; it is not used to filter devices here.
        dtype: list of callables used to convert each value. It is replaced
            by ``str`` for every field when it is not a list of the same
            length as ``query``.

    Returns:
        A list with one ``Info`` per parsed device row, or ``[]`` when
        ``xpu-smi`` is not found on ``PATH``.

    Raises:
        subprocess.CalledProcessError, subprocess.TimeoutExpired: propagated
            from ``subprocess.check_output``.
    """
    if not has_xpu_smi():
        return []

    # The default ``query=None`` previously crashed on ``len(query)``.
    if query is None:
        query = []
    if not isinstance(dtype, list) or len(dtype) != len(query):
        dtype = [str] * len(query)

    output = subprocess.check_output(["xpu-smi"], timeout=3)
    number = re.compile(r'\d+\.\d+|\d+')
    ret = []
    device_id = 0  # counts accepted rows only; used as the device index

    for line in output.decode("utf-8").splitlines():
        if not line:
            continue
        tokens = re.split(r',|/|\s+|\|', line)
        # Only rows with exactly 23 raw tokens are treated as device rows;
        # rows carrying an "XPU" token are presumably table headers.
        if len(tokens) != 23 or "XPU" in tokens:
            continue
        fields = [tok for tok in tokens if tok]
        # Skip rows that passed the token-count check but do not carry the
        # three numeric fields, instead of raising IndexError.
        if len(fields) < 10:
            continue
        matches = [number.search(fields[pos]) for pos in (9, 8, 7)]
        if not all(matches):
            continue
        utilization_xpu, mem_total, mem_used = (
            float(m.group()) for m in matches
        )

        values = [
            device_id,
            utilization_xpu,
            mem_total,
            mem_used,
            (mem_total - mem_used),
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        ]
        device_id += 1

        info = Info()
        for k, v, d in zip(query, values, dtype):
            setattr(info, k.replace(".", "_"), d(v))
        ret.append(info)
    return ret
191+
192+
152193
def get_gpu_info(index=None):
153194
q = "index,uuid,driver_version,name,gpu_serial,display_active,display_mode".split(
154195
","
@@ -175,9 +216,10 @@ def get_gpu_util(index=None):
175216
)
176217
if paddle.device.is_compiled_with_rocm():
177218
return query_rocm_smi(q, index=index, dtype=d)
178-
179219
elif paddle.device.is_compiled_with_custom_device('npu'):
180220
return query_npu_smi(q, index=index, dtype=d)
221+
elif paddle.is_compiled_with_xpu():
222+
return query_xpu_smi(q, index=index, dtype=d)
181223
return query_smi(q, index=index, dtype=d)
182224

183225

@@ -205,6 +247,10 @@ def has_npu_smi():
205247
return shutil.which("npu-smi")
206248

207249

250+
def has_xpu_smi():
251+
return shutil.which("xpu-smi")
252+
253+
208254
if __name__ == '__main__':
209255
print(get_gpu_info(0))
210256
print(get_gpu_util(0))

0 commit comments

Comments
 (0)