Skip to content

Commit 40be44c

Browse files
authored
Merge pull request #740 from SwanHubX/feat/webhook
feat/webhook
2 parents 2b609ea + 40fcaa6 commit 40be44c

File tree

17 files changed

+478
-629
lines changed

17 files changed

+478
-629
lines changed

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
swankit==0.1.1b3
2-
swanboard==0.1.4b2
2+
swanboard==0.1.6
33
cos-python-sdk-v5
44
urllib3>=1.26.0
55
requests>=2.25.0

swanlab/data/callback_local.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,20 @@
77
@Description:
88
基本回调函数注册表,此时不考虑云端情况
99
"""
10-
from swankit.core import SwanLabSharedSettings
11-
from swanlab.log import swanlog
12-
from swanlab.data.run.main import get_run, SwanLabRunState
13-
from swanlab.data.run.callback import SwanLabRunCallback
14-
from swankit.callback import RuntimeInfo, MetricInfo
15-
from swankit.log import FONT
16-
from swanlab.env import SwanLabEnv
17-
from datetime import datetime
18-
import traceback
1910
import json
2011
import os
2112
import sys
13+
import traceback
14+
from datetime import datetime
15+
16+
from swankit.callback import RuntimeInfo, MetricInfo
17+
from swankit.core import SwanLabSharedSettings
18+
from swankit.log import FONT
19+
20+
from swanlab.data.run.callback import SwanLabRunCallback
21+
from swanlab.data.run.main import get_run, SwanLabRunState
22+
from swanlab.env import SwanLabEnv
23+
from swanlab.log import swanlog
2224

2325

2426
class LocalRunCallback(SwanLabRunCallback):
@@ -55,6 +57,7 @@ def _init_logdir(logdir: str = None) -> str:
5557
根据传入的logdir,初始化日志文件夹
5658
FIXME shit code
5759
"""
60+
env_key = SwanLabEnv.SWANLOG_FOLDER.value
5861
# 如果传入了logdir,则将logdir设置为环境变量,代表日志文件存放的路径
5962
if logdir is not None:
6063
try:
@@ -73,7 +76,6 @@ def _init_logdir(logdir: str = None) -> str:
7376
raise ValueError("logdir must be a str.")
7477
except IOError:
7578
raise IOError("logdir must be a path and have Write permission.")
76-
os.environ[SwanLabEnv.SWANLOG_FOLDER.value] = logdir
7779
# 如果没有传入logdir,则使用默认的logdir, 即当前工作目录下的swanlog文件夹,但是需要保证目录存在
7880
else:
7981
logdir = os.environ.get(SwanLabEnv.SWANLOG_FOLDER.value) or os.path.join(os.getcwd(), "swanlog")
@@ -84,6 +86,8 @@ def _init_logdir(logdir: str = None) -> str:
8486
raise IOError
8587
except IOError:
8688
raise IOError("logdir must have Write permission.")
89+
# 同步环境变量
90+
os.environ[env_key] = logdir
8791
# 如果logdir是空的,创建.gitignore文件,写入*
8892
if not os.listdir(logdir):
8993
with open(os.path.join(logdir, ".gitignore"), "w", encoding="utf-8") as f:

swanlab/data/run/helper.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,11 @@
88
回调函数操作员,批量处理回调函数的调用
99
"""
1010
from typing import List, Union, Dict, Any, Tuple
11-
from swankit.callback import SwanKitCallback, MetricInfo, ColumnInfo, OperateErrorInfo, RuntimeInfo
11+
12+
from swankit.callback import SwanKitCallback, MetricInfo, ColumnInfo, RuntimeInfo
1213
from swankit.core import SwanLabSharedSettings
13-
import swanlab.error as E
14-
from swankit.log import FONT
14+
15+
from swanlab.data.run.webhook import try_send_webhook
1516

1617
OperatorReturnType = Dict[str, Any]
1718

@@ -83,15 +84,8 @@ def before_init_experiment(
8384
return self.__run_all("before_init_experiment", run_id, exp_name, description, num, colors)
8485

8586
def on_run(self):
86-
try:
87-
return self.__run_all("on_run")
88-
except E.ApiError as e:
89-
FONT.brush("", 50)
90-
if e.resp.status_code == 409:
91-
error = OperateErrorInfo("The experiment name already exists, please change the experiment name")
92-
return self.__run_all("on_run_error_from_operator", error)
93-
else:
94-
raise e
87+
self.__run_all("on_run")
88+
try_send_webhook()
9589

9690
def on_runtime_info_update(self, r: RuntimeInfo):
9791
return self.__run_all("on_runtime_info_update", r)

swanlab/data/run/main.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222
from .config import SwanLabConfig
2323
from .exp import SwanLabExp
2424
from .helper import SwanLabRunOperator, RuntimeInfo
25+
from .metadata import get_requirements, get_metadata
2526
from .public import SwanLabPublicConfig
26-
from .system import get_system_info, get_requirements
2727
from ..formater import check_key_format, check_exp_name_format, check_desc_format
2828

2929
MAX_LIST_LENGTH = 108
@@ -134,7 +134,7 @@ def _(state: SwanLabRunState):
134134
self.__operator.on_runtime_info_update(
135135
RuntimeInfo(
136136
requirements=get_requirements(),
137-
metadata=get_system_info(get_package_version(), self.__settings.log_dir),
137+
metadata=get_metadata(self.__settings.log_dir),
138138
)
139139
)
140140

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
"""
2+
@author: cunyue
3+
@file: __init__.py
4+
@time: 2024/11/18 15:02
5+
@description: 实验元信息采集
6+
"""
7+
8+
from swanlab.data.run.metadata.cooperation import get_cooperation_info
9+
from swanlab.data.run.metadata.hardware import get_hardware_info
10+
from swanlab.data.run.metadata.requirements import get_requirements
11+
from swanlab.data.run.metadata.runtime import get_runtime_info
12+
13+
14+
def get_metadata(logdir: str):
    """
    Collect all metadata of the experiment.

    The hardware info and runtime info dicts are merged at the top level, and a
    "swanlab" entry is added that holds the version read from the cooperation
    info, the given log directory, and the full cooperation info under "_coop".

    :param logdir: log directory path, stored as-is under "swanlab" -> "logdir"
    :return: dict with the merged metadata
    """
    cooperation = get_cooperation_info()
    swanlab_section = {
        "version": cooperation["swanlab"]["version"],
        "logdir": logdir,
        "_coop": cooperation,
    }
    metadata = {}
    # Later updates win on key collisions, mirroring dict-unpacking order
    metadata.update(get_hardware_info())
    metadata.update(get_runtime_info())
    metadata["swanlab"] = swanlab_section
    return metadata
28+
29+
30+
__all__ = ["get_metadata", "get_requirements", "get_cooperation_info"]
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""
2+
@author: cunyue
3+
@file: qing_cloud.py
4+
@time: 2024/11/18 15:14
5+
@description: 青云(https://www.qingcloud.com/)元信息采集
6+
"""
7+
8+
import os
9+
10+
11+
# Environment variable names that get_qing_cloud_info() reports at the top level
BASE_KEYS = [
    "AICP_PLATFORM",
    "AICP_TYPE",
    "AICP_NAME",
    "AICP_USER_NAME",
]
# Environment variable names that get_qing_cloud_info() reports under "resources"
RESOURCES_KEYS = [
    "AICP_SPEC_COUNT",
    "AICP_SPEC_GPU",
    "AICP_SPEC_CPU",
    "AICP_SPEC_MEMORY",
    "AICP_SPEC_GPU_NAME",
    "AICP_SPEC_GPU_TYPE",
    "AICP_SPEC_GPU_MEMORY",
    "AICP_HOSTNAME",
    "AICP_HOST_MACHINE",
]
23+
24+
25+
def get_qing_cloud_info():
    """
    Collect QingCloud platform information from environment variables.

    :return: None when the AICP_PLATFORM environment variable is unset or empty;
        otherwise a dict holding the BASE_KEYS variables (lower-cased names) plus a
        "resources" dict built from the RESOURCES_KEYS variables
    """
    # AICP_PLATFORM acts as the switch: without it nothing is reported
    if not os.getenv("AICP_PLATFORM"):
        return None
    info = dict(get_envs_by_keys(BASE_KEYS))
    info["resources"] = get_envs_by_keys(RESOURCES_KEYS)
    return info
30+
31+
32+
def get_envs_by_keys(keys: list):
    """
    Read environment variables by name and return them as a dict.

    :param keys: environment variable names to look up
    :return: dict whose keys are the lower-cased names and whose values are the
        environment values (None for variables that are not set)
    """
    envs = {}
    for name in keys:
        envs[name.lower()] = os.getenv(name)
    return envs
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""
2+
@author: cunyue
3+
@file: official.py
4+
@time: 2024/11/18 15:13
5+
@description: swanlab官方合作信息
6+
"""
7+
8+
import os
9+
10+
from swanlab.api import get_http
11+
from swanlab.data.run.metadata.coop.qing_cloud import get_qing_cloud_info
12+
from swanlab.env import SwanLabEnv
13+
from swanlab.package import get_experiment_url
14+
from swanlab.package import get_package_version
15+
16+
17+
def get_cooperation_info():
    """
    Collect cooperation (partner platform) information.

    :return: dict that always contains a "swanlab" entry and additionally a
        "qing_cloud" entry when QingCloud information is available
    """
    qing_cloud_info = get_qing_cloud_info()
    info = {"swanlab": get_swanlab_info()}
    if not qing_cloud_info:
        return info
    info["qing_cloud"] = qing_cloud_info
    return info
23+
24+
25+
def get_swanlab_info():
    """
    Collect swanlab's own information.

    :return: dict with the package version, the mode and the swanlog directory
        (both read from environment variables), plus "exp_url" when an
        experiment URL can be built from the HTTP client
    """
    info = {
        "version": get_package_version(),
        "mode": os.getenv(SwanLabEnv.MODE.value),
        "swanlog_dir": os.getenv(SwanLabEnv.SWANLOG_FOLDER.value),
    }
    try:
        client = get_http()
        exp_url = get_experiment_url(client.username, client.projname, client.exp_id)
    except ValueError:
        # NOTE(review): presumably raised when no HTTP client is available
        # (e.g. not running in cloud mode) -- confirm against get_http
        return info
    info["exp_url"] = exp_url
    return info
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
"""
2+
@author: cunyue
3+
@file: hardware.py
4+
@time: 2024/11/18 15:12
5+
@description: 硬件信息采集
6+
"""
7+
8+
import json
9+
import multiprocessing
10+
import platform
11+
import subprocess
12+
13+
import psutil
14+
import pynvml
15+
16+
17+
def get_hardware_info():
    """
    Collect hardware information: memory, CPU, NVIDIA GPU and Apple SoC.

    :return: dict with the keys "memory", "cpu", "gpu" (containing "nvidia")
        and "soc" (containing "apple"); each value is whatever the matching
        collector returned, which may be None
    """
    memory = get_memory_size()
    cpu = get_cpu_info()
    nvidia = get_nvidia_gpu_info()
    apple = get_apple_chip_info()
    return {
        "memory": memory,
        "cpu": cpu,
        "gpu": {"nvidia": nvidia},
        "soc": {"apple": apple},
    }
32+
33+
34+
# ---------------------------------- cpu信息 ----------------------------------
35+
36+
37+
def get_cpu_info():
    """
    Get CPU information.

    :return: dict with "brand" and "cores" on Windows and Linux; None on any
        other operating system (Apple chips are handled separately)
    """
    # Pick the brand lookup that matches the current operating system
    brand_getters = {
        "Windows": get_cpu_brand_windows,
        "Linux": get_cpu_brand_linux,
    }
    get_brand = brand_getters.get(platform.system())
    if get_brand is None:
        # Other systems are not supported for now
        return None
    brand = get_brand()
    try:
        cores = multiprocessing.cpu_count()
    except Exception:  # noqa
        cores = None
    return {"brand": brand, "cores": cores}
57+
58+
59+
def get_cpu_brand_windows():
    """
    Get the CPU brand on Windows via the WMIC command.

    :return: the last non-empty data row printed by ``wmic cpu get name``, or
        None when the command cannot be run or prints no data row
    """
    try:
        result = subprocess.run(["wmic", "cpu", "get", "name"], capture_output=True, text=True)
        rows = [row.strip() for row in result.stdout.splitlines() if row.strip()]
        # Drop the column header so it is never reported as the brand
        if rows and rows[0].lower() == "name":
            rows = rows[1:]
        # No data row (empty output or header only) means the brand is unknown
        return rows[-1] if rows else None
    except Exception:  # noqa
        return None
67+
68+
69+
def get_cpu_brand_linux():
    """
    Get the CPU brand on Linux via the lscpu command.

    :return: the value of the first line containing "Model name:", or None when
        the command cannot be run, no such line exists, or the value is empty
    """
    label = "Model name:"
    try:
        result = subprocess.run(["lscpu"], capture_output=True, text=True)
        for line in result.stdout.split("\n"):
            _, found, value = line.partition(label)
            if found:
                # Keep everything after the label so a colon inside the model
                # name does not truncate it
                return value.strip() or None
        return None
    except Exception:  # noqa
        return None
80+
81+
82+
# ---------------------------------- 内存信息 ----------------------------------
83+
84+
85+
def get_memory_size():
    """
    Get the total system memory size.

    :return: total memory rounded to a whole number of GB (1024**3 bytes),
        or None when it cannot be determined
    """
    try:
        total_bytes = psutil.virtual_memory().total
        return round(total_bytes / (1024**3))
    except Exception:  # noqa
        return None
94+
95+
96+
# ---------------------------------- gpu信息 ----------------------------------
97+
98+
99+
def get_nvidia_gpu_info():
    """
    Get NVIDIA GPU information through NVML.

    :return: None when NVML cannot be initialised; otherwise a dict with
        "driver" (driver version), "cores" (number of GPUs), "type" (list of
        GPU names), "memory" (list of total memory per GPU, rounded to GB) and
        "cuda" (CUDA version reported by nvcc, or None). If an NVML error
        occurs part-way, the fields collected so far are returned.
    """

    def get_cuda_version():
        """Get the CUDA version from ``nvcc --version``; None if unavailable."""
        try:
            nvcc_output = subprocess.check_output(["nvcc", "--version"]).decode("utf-8")
            for row in nvcc_output.split('\n'):
                if "release" not in row:
                    continue
                # Take the token after "release" and drop its last character
                return row.split("release")[-1].strip().split(" ")[0][:-1]
        except Exception:  # noqa
            return None

    def as_text(value):
        """Decode bytes returned by early pynvml versions (related issue: #605)."""
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    try:
        pynvml.nvmlInit()
    except Exception:  # noqa
        return None

    info = {"driver": None, "cores": None, "type": [], "memory": [], "cuda": None}
    try:
        # NVIDIA driver version
        info["driver"] = as_text(pynvml.nvmlSystemGetDriverVersion())
        # CUDA version
        info["cuda"] = get_cuda_version()
        # Number of NVIDIA GPUs
        gpu_count = pynvml.nvmlDeviceGetCount()
        info["cores"] = gpu_count
        for index in range(gpu_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
            # GPU model name
            info["type"].append(as_text(pynvml.nvmlDeviceGetName(handle)))
            # Total GPU memory in GB
            total_bytes = pynvml.nvmlDeviceGetMemoryInfo(handle).total
            info["memory"].append(round(total_bytes / (1024**3)))
    except pynvml.NVMLError:
        pass
    finally:
        # Always shut NVML down once it has been initialised
        pynvml.nvmlShutdown()
    return info
148+
149+
150+
# ---------------------------------- apple信息 ----------------------------------
151+
152+
153+
def get_apple_chip_info():
    """
    Get Apple chip information via ``system_profiler``.

    :return: None when the platform string does not contain "mac" or the
        hardware data cannot be read; otherwise a dict with "cpu" (CPU count or
        None), "gpu" (always None for now), "memory" (the reported physical
        memory, lower-cased with "gb" removed) and "type" (the chip type)
    """
    if "mac" not in platform.platform().lower():
        return None

    # Query the hardware data in JSON format
    try:
        completed = subprocess.run(["system_profiler", "SPHardwareDataType", "-json"], capture_output=True, text=True)
        hardware = json.loads(completed.stdout)["SPHardwareDataType"][0]
        chip_type = hardware["chip_type"]
        # Only the "gb" unit is removed; surrounding whitespace is left untouched
        memory = str(hardware["physical_memory"]).lower().replace("gb", "")
    except Exception:  # noqa
        return None

    try:
        cpu_count = multiprocessing.cpu_count()
    except Exception:  # noqa
        cpu_count = None

    # TODO: collect GPU information
    return {"cpu": cpu_count, "gpu": None, "memory": memory, "type": chip_type}

0 commit comments

Comments
 (0)