Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added swanlab/bin/apple_gpu_stats
Binary file not shown.
59 changes: 57 additions & 2 deletions swanlab/data/system/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,14 @@ def __get_git_branch_and_commit():
return None, None


def __get_gpu_info():
def __get_nvidia_gpu_info():
"""获取 GPU 信息"""
info = {"cores": None, "type": [], "memory": []}
try:
pynvml.nvmlInit()
except:
return info
return None

try:
# 获取 NVIDIA GPU 数量
info["cores"] = pynvml.nvmlDeviceGetCount()
Expand All @@ -114,6 +115,60 @@ def __get_gpu_info():
return info


def __get_apple_gpu_info():
    """Probe Apple hardware information through ``system_profiler``.

    Runs ``system_profiler SPHardwareDataType -json`` and reads the first
    ``SPHardwareDataType`` entry of its JSON output.

    Returns:
        A dict ``{"cores": ..., "type": [...], "memory": [...]}`` where
        ``type`` holds the reported ``chip_type``, ``memory`` holds the
        reported ``physical_memory`` with its "gb" unit suffix removed, and
        ``cores`` is the reported ``number_processors`` value as-is.
        ``None`` when the probe fails for any reason (command unavailable,
        output is not the expected JSON, an expected key is missing, or
        ``ujson`` cannot be imported).
    """
    try:
        # Imported inside the guarded region so that a missing module makes
        # this best-effort probe report "no info" instead of raising.
        import ujson

        # Ask system_profiler for the hardware overview in JSON form.
        result = subprocess.run(["system_profiler", "SPHardwareDataType", "-json"], capture_output=True, text=True)
        # Parse the payload once and reuse the first hardware entry.
        hardware = ujson.loads(result.stdout)["SPHardwareDataType"][0]
        gpu_name = hardware["chip_type"]
        # Presumably formatted like "16 GB"; drop the unit and the whitespace
        # that removing it leaves behind.
        memory = str(hardware["physical_memory"]).lower().replace("gb", "").strip()
        number_processors = hardware["number_processors"]
    except Exception:
        # Deliberately broad: any failure means "no Apple info available".
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return None

    info = {"cores": number_processors, "type": [gpu_name], "memory": [memory]}

    # TODO: uncomment once hardware monitoring is supported on Apple devices; the apple_gpu_stats binary comes from https://github.com/wandb/wandb/blob/main/wandb/bin/apple_gpu_stats
    #
    # MAX_POWER_WATTS = 16.5
    #
    # import pathlib
    # binary_path = (pathlib.Path(sys.modules["swanlab"].__path__[0]) / "bin" / "apple_gpu_stats").resolve()
    # try:
    #     command = [str(binary_path), "--json"]
    #     output = (subprocess.check_output(command, universal_newlines=True).strip().split("\n"))[0]
    #     raw_stats = ujson.loads(output)
    #     stats = {
    #         "gpu": raw_stats["utilization"],
    #         "memoryAllocated": raw_stats["mem_used"],
    #         "temp": raw_stats["temperature"],
    #         "powerWatts": raw_stats["power"],
    #         "powerPercent": (raw_stats["power"] / MAX_POWER_WATTS) * 100,
    #     }
    # except:
    #     swanlog.debug(f"Apple GPU stats failed to obtain.")

    return info


def __get_gpu_info():
    """Return GPU information from the first vendor probe that succeeds.

    The NVIDIA probe is tried first, then the Apple probe; a probe signals
    failure by returning ``None``. When both fail, an empty placeholder with
    the same keys is returned.
    """
    # Probes are called lazily and in order; later ones are skipped as soon
    # as an earlier one yields a result.
    for probe in (__get_nvidia_gpu_info, __get_apple_gpu_info):
        detected = probe()
        if detected is not None:
            return detected

    return {"cores": None, "type": [], "memory": []}


def __get_command():
"""获取执行训练时的完整命令行信息
比如在运行`python main.py -i 123`时,full_command为`main.py -i 123`
Expand Down
6 changes: 3 additions & 3 deletions test/create_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
swanlab.config.debug = "这是一串" + "很长" * 100 + "的字符串"
# 模拟训练
for epoch in range(2, swanlab.config.epoches):
acc = 1 - 2 ** -epoch - random.random() / epoch - offset
loss = 2 ** -epoch + random.random() / epoch + offset
loss2 = 3 ** -epoch + random.random() / epoch + offset * 3
acc = 1 - 2**-epoch - random.random() / epoch - offset
loss = 2**-epoch + random.random() / epoch + offset
loss2 = 3**-epoch + random.random() / epoch + offset * 3
print(f"epoch={epoch}, accuracy={acc}, loss={loss}")
if epoch % 10 == 0:
# 测试audio
Expand Down