Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 44 additions & 2 deletions swanlab/data/run/metadata/hardware/dcu/hygon.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,12 @@ def get_hygon_dcu_info() -> HardwareFuncResult:
driver, dcu_map = map_hygon_dcu()
info["driver"] = driver
info["dcu"] = dcu_map
collector = DCUCollector(dcu_map)
max_mem_value = 0
for dcu_id in dcu_map:
mem_value = int(dcu_map[dcu_id]["memory"][:-2])
max_mem_value = max(max_mem_value, mem_value)
max_mem_value *= 1024
collector = DCUCollector(dcu_map, max_mem_value)
except Exception: # noqa
if all(v is None for v in info.values()):
return None, None
Expand Down Expand Up @@ -85,9 +90,10 @@ def map_hygon_dcu() -> Tuple[Optional[str], dict]:


class DCUCollector(H):
def __init__(self, dcu_map):
def __init__(self, dcu_map, max_mem_value):
super().__init__()
self.dcu_map = dcu_map
self.max_mem_value = max_mem_value

# DCU Utilization (%)
self.util_key = generate_key("dcu.{dcu_index}.pct")
Expand All @@ -107,6 +113,15 @@ def __init__(self, dcu_map):
)
self.per_memory_configs = {}

# DCU Memory Allocated (MB)
self.mem_value_key = generate_key("dcu.{dcu_index}.mem.value")
mem_value_config = HardwareConfig(
y_range=(0, self.max_mem_value),
chart_index=random_index(),
chart_name="DCU Memory Allocated (MB)",
)
self.per_mem_value_configs = {}

# DCU Temperature (°C)
self.temp_key = generate_key("dcu.{dcu_index}.temp")
temp_config = HardwareConfig(
Expand All @@ -129,6 +144,7 @@ def __init__(self, dcu_map):
metric_name = f"DCU {dcu_id}"
self.per_util_configs[metric_name] = util_config.clone(metric_name=metric_name)
self.per_memory_configs[metric_name] = memory_config.clone(metric_name=metric_name)
self.per_mem_value_configs[metric_name] = mem_value_config.clone(metric_name=metric_name)
self.per_temp_configs[metric_name] = temp_config.clone(metric_name=metric_name)
self.per_power_configs[metric_name] = power_config.clone(metric_name=metric_name)

Expand All @@ -137,6 +153,7 @@ def collect(self) -> HardwareInfoList:
usage_methods = [
self.get_utilization_usage,
self.get_memory_usage,
self.get_mem_value_usage,
self.get_temperature_usage,
self.get_power_usage,
]
Expand Down Expand Up @@ -194,6 +211,31 @@ def get_memory_usage(self) -> dict:
mem_infos[dcu_id]["value"] = float(dcu_mem_use)
return mem_infos

def get_mem_value_usage(self) -> dict:
    """
    Collect the amount of memory allocated on each DCU device.

    Runs ``hy-smi --showmemuse --json`` and converts every reported
    "DCU memory use (%)" reading into an absolute amount by scaling it with
    ``self.max_mem_value`` (the entries are named "... Memory Allocated (MB)").

    Returns:
        A dict keyed by the device position in the JSON output, as a string
        ("0", "1", ...). Each entry holds "key", "name", "value" and
        "config". "value" stays ``math.nan`` when the reading for that
        device is missing or not numeric, so one bad reading no longer
        discards the samples of the other devices.

    Raises:
        json.JSONDecodeError: if the tool output is not valid JSON.
        KeyError: if no chart config was prepared for a reported device.
    """
    output_str = subprocess.run(
        ["hy-smi", "--showmemuse", "--json"],
        capture_output=True,
        text=True,
    ).stdout
    output_json = json.loads(output_str)
    mem_value_infos = {}

    # Only the position of each entry is used as the device id; the JSON key
    # itself is not needed.
    for idx, dcu_info in enumerate(output_json.values()):
        dcu_id = str(idx)
        mem_value_infos[dcu_id] = {
            "key": self.mem_value_key.format(dcu_index=dcu_id),
            "name": f"DCU {dcu_id} Memory Allocated (MB)",
            "value": math.nan,
            "config": self.per_mem_value_configs[f"DCU {dcu_id}"],
        }

        try:
            dcu_mem_use_rate = float(dcu_info["DCU memory use (%)"])
        except (KeyError, TypeError, ValueError):
            # Reading absent or not a number: keep the NaN placeholder.
            continue
        # Percentage -> absolute value relative to the configured maximum.
        mem_value_infos[dcu_id]["value"] = dcu_mem_use_rate * 0.01 * self.max_mem_value
    return mem_value_infos

def get_temperature_usage(self) -> dict:
"""
获取指定DCU设备的温度(°C) (采集 Junction 核心温度)
Expand Down
66 changes: 54 additions & 12 deletions swanlab/data/run/metadata/hardware/gpu/metax.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,10 @@
import math
import platform
import subprocess
from typing import Tuple, Optional

from ..type import (
HardwareFuncResult,
HardwareInfoList,
HardwareConfig,
HardwareCollector as H,
)
from typing import Optional, Tuple

from ..type import HardwareCollector as H
from ..type import HardwareConfig, HardwareFuncResult, HardwareInfoList
from ..utils import generate_key, random_index


Expand All @@ -34,7 +30,13 @@ def get_metax_gpu_info() -> HardwareFuncResult:
info["driver"] = driver
info["maca"] = maca_version
info["gpu"] = gpu_map
collector = MetaxCollector(gpu_map)
max_mem_value: int = 0
for gpu_id in gpu_map:
mem_value = gpu_map[gpu_id]["memory"]
if mem_value > max_mem_value:
max_mem_value = mem_value
max_mem_value = max_mem_value * 1024
collector = MetaxCollector(gpu_map, max_mem_value)
except Exception: # noqa
if all(v is None for v in info.values()):
return None, None
Expand Down Expand Up @@ -80,9 +82,10 @@ def map_metax_gpu() -> Tuple[Optional[str], Optional[str], dict]:


class MetaxCollector(H):
def __init__(self, gpu_map):
def __init__(self, gpu_map, max_mem_value):
super().__init__()
self.gpu_map = gpu_map
self.max_mem_value = max_mem_value

# GPU Utilization (%)
self.util_key = generate_key("gpu.{gpu_index}.pct")
Expand All @@ -102,6 +105,15 @@ def __init__(self, gpu_map):
)
self.per_memory_configs = {}

# GPU Memory Allocated (MB)
self.memory_value_key = generate_key("gpu.{gpu_index}.mem.value")
memory_value_config = HardwareConfig(
y_range=(0, self.max_mem_value),
chart_index=random_index(),
chart_name="GPU Memory Allocated (MB)",
)
self.per_memory_value_configs = {}

# GPU Temperature (°C)
self.temp_key = generate_key("gpu.{gpu_index}.temp")
temp_config = HardwareConfig(
Expand All @@ -124,14 +136,16 @@ def __init__(self, gpu_map):
metric_name = f"GPU {gpu_id}"
self.per_util_configs[metric_name] = util_config.clone(metric_name=metric_name)
self.per_memory_configs[metric_name] = memory_config.clone(metric_name=metric_name)
self.per_memory_value_configs[metric_name] = memory_value_config.clone(metric_name=metric_name)
self.per_temp_configs[metric_name] = temp_config.clone(metric_name=metric_name)
self.per_power_configs[metric_name] = power_config.clone(metric_name=metric_name)

def collect(self) -> HardwareInfoList:
result: HardwareInfoList = []
usage_methods = [
self.get_utilization_usage,
self.get_memory_usage,
self.get_memory_rate_usage,
self.get_memory_value_usage,
self.get_temperature_usage,
self.get_power_usage,
]
Expand Down Expand Up @@ -169,7 +183,7 @@ def get_utilization_usage(self) -> dict:

return usage_infos

def get_memory_usage(self) -> dict:
def get_memory_rate_usage(self) -> dict:
"""
获取指定GPU设备的内存占用率
"""
Expand All @@ -195,12 +209,40 @@ def get_memory_usage(self) -> dict:
"value": math.nan,
"config": self.per_memory_configs[f"GPU {gpu_id}"],
}

gpu_mem_usage_rate = float(gpu_mem_used) / float(gpu_mem_total) * 100
mem_infos[gpu_id]["value"] = gpu_mem_usage_rate
index += 1

return mem_infos

def get_memory_value_usage(self) -> dict:
    """
    Collect the amount of memory in use on each GPU device.

    Runs ``mx-smi --show-memory`` and scans its text output for lines
    containing "vis_vram used". The n-th such line is attributed to GPU n.
    The second-to-last whitespace-separated token of the line is taken as
    the reading and divided by 1024 (presumably KBytes -> MB, matching the
    "Memory Allocated (MB)" name - confirm against real mx-smi output).

    Returns:
        A dict keyed by the integer GPU position (0, 1, ...). Each entry
        holds "key", "name", "value" and "config". "value" stays
        ``math.nan`` when the reading cannot be parsed, so one malformed
        line no longer aborts the whole collection.

    Raises:
        KeyError: if no chart config was prepared for a reported GPU.
    """
    output_str = subprocess.run(
        ["mx-smi", "--show-memory"],
        capture_output=True,
        text=True,
    ).stdout
    mem_value_infos = {}
    index = 0
    for line in output_str.splitlines():
        if "vis_vram used" not in line:
            continue
        gpu_id = index
        # Advance before parsing so a bad line still consumes its GPU slot.
        index += 1
        mem_value_infos[gpu_id] = {
            "key": self.memory_value_key.format(gpu_index=gpu_id),
            "name": f"GPU {gpu_id} Memory Allocated (MB)",
            "value": math.nan,
            "config": self.per_memory_value_configs[f"GPU {gpu_id}"],
        }
        # split() without an argument collapses runs of spaces and ignores
        # trailing whitespace; split(" ") would yield empty tokens there.
        fields = line.split()
        try:
            gpu_mem_used = float(fields[-2])
        except (IndexError, ValueError):
            # Reading absent or not a number: keep the NaN placeholder.
            continue
        mem_value_infos[gpu_id]["value"] = gpu_mem_used / 1024
    return mem_value_infos

def get_temperature_usage(self) -> dict:
"""
Get the temperature (°C) of the specified GPU devices
Expand Down
79 changes: 69 additions & 10 deletions swanlab/data/run/metadata/hardware/gpu/moorethreads.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,12 @@ def get_moorethreads_gpu_info() -> HardwareFuncResult:
driver, gpu_map = map_moorethreads_gpu()
info["driver"] = driver
info["gpu"] = gpu_map
collector = MTTCollector(gpu_map)
max_mem_value = 0
for gpu_id in gpu_map:
mem_value = int(gpu_map[gpu_id]["memory"])
max_mem_value = max(max_mem_value, mem_value)
max_mem_value *= 1024
collector = MTTCollector(gpu_map, max_mem_value)
except Exception: # noqa
if all(v is None for v in info.values()):
return None, None
Expand All @@ -41,9 +46,11 @@ def map_moorethreads_gpu() -> Tuple[Optional[str], dict]:
"""
获取 Moore Threads GPU信息,包括驱动版本、设备信息等,例如:
driver: '2.7.0'
gpu_map: {"0": { "name": "MTT S4000”, "memory": "48GB"}, "1": { "name": "MTT S4000", "memory": "48GB"}, ...}
gpu_map: {"0": { "name": "MTT S4000", "memory": "48"}, "1": { "name": "MTT S4000", "memory": "48"}, ...}
"""
output_str = subprocess.run(["mthreads-gmi", "-q", "--json"], capture_output=True, check=True, text=True).stdout
output_str = subprocess.run(
["mthreads-gmi", "-q", "--json"], capture_output=True, check=True, text=True
).stdout
output_json = json.loads(output_str)
driver = None
gpu_map = {}
Expand Down Expand Up @@ -71,9 +78,10 @@ def map_moorethreads_gpu() -> Tuple[Optional[str], dict]:


class MTTCollector(H):
def __init__(self, gpu_map):
def __init__(self, gpu_map, max_mem_value):
super().__init__()
self.gpu_map = gpu_map
self.max_mem_value = max_mem_value

# GPU Utilization (%)
self.util_key = generate_key("gpu.{gpu_index}.pct")
Expand All @@ -93,6 +101,15 @@ def __init__(self, gpu_map):
)
self.per_memory_configs = {}

# GPU Memory Allocated (MB)
self.mem_value_key = generate_key("gpu.{gpu_index}.mem.value")
mem_value_config = HardwareConfig(
y_range=(0, self.max_mem_value),
chart_index=random_index(),
chart_name="GPU Memory Allocated (MB)",
)
self.per_mem_value_configs = {}

# GPU Temperature (°C)
self.temp_key = generate_key("gpu.{gpu_index}.temp")
temp_config = HardwareConfig(
Expand All @@ -113,16 +130,28 @@ def __init__(self, gpu_map):

for gpu_id in self.gpu_map:
metric_name = f"GPU {gpu_id}"
self.per_util_configs[metric_name] = util_config.clone(metric_name=metric_name)
self.per_memory_configs[metric_name] = memory_config.clone(metric_name=metric_name)
self.per_temp_configs[metric_name] = temp_config.clone(metric_name=metric_name)
self.per_power_configs[metric_name] = power_config.clone(metric_name=metric_name)
self.per_util_configs[metric_name] = util_config.clone(
metric_name=metric_name
)
self.per_memory_configs[metric_name] = memory_config.clone(
metric_name=metric_name
)
self.per_mem_value_configs[metric_name] = mem_value_config.clone(
metric_name=metric_name
)
self.per_temp_configs[metric_name] = temp_config.clone(
metric_name=metric_name
)
self.per_power_configs[metric_name] = power_config.clone(
metric_name=metric_name
)

def collect(self) -> HardwareInfoList:
result: HardwareInfoList = []
usage_methods = [
self.get_utilization_usage,
self.get_memory_usage,
self.get_mem_value_usage,
self.get_temperature_usage,
self.get_power_usage,
]
Expand Down Expand Up @@ -189,6 +218,32 @@ def get_memory_usage(self) -> dict:
mem_infos[gpu_id]["value"] = gpu_mem_usage_rate
return mem_infos

def get_mem_value_usage(self) -> dict:
    """
    Collect the amount of framebuffer memory in use on each GPU device.

    Runs ``mthreads-gmi -q -d MEMORY --json`` and, for every entry of the
    "GPU" list, reads ``["FB Memory Usage"]["Used"]``, strips the "MiB"
    suffix and converts the rest to a float.

    Returns:
        A dict keyed by each entry's "Index" value. Each entry holds "key",
        "name", "value" and "config". "value" stays ``math.nan`` when the
        reading is missing or not numeric, so one bad reading no longer
        discards the samples of the other GPUs.

    Raises:
        json.JSONDecodeError: if the tool output is not valid JSON.
        KeyError: if the output has no "GPU" list, an entry has no "Index",
            or no chart config was prepared for a reported GPU.
    """
    output_str = subprocess.run(
        ["mthreads-gmi", "-q", "-d", "MEMORY", "--json"],
        capture_output=True,
        text=True,
    ).stdout
    output_json = json.loads(output_str)
    mem_infos = {}
    for gpu_info in output_json["GPU"]:
        gpu_id = gpu_info["Index"]

        mem_infos[gpu_id] = {
            "key": self.mem_value_key.format(gpu_index=gpu_id),
            "name": f"GPU {gpu_id} Memory Allocated (MB)",
            "value": math.nan,
            "config": self.per_mem_value_configs[f"GPU {gpu_id}"],
        }
        try:
            gpu_mem_used = gpu_info["FB Memory Usage"]["Used"]
            mem_infos[gpu_id]["value"] = float(
                gpu_mem_used.replace("MiB", "").strip()
            )
        except (AttributeError, KeyError, TypeError, ValueError):
            # Field absent or not a "<number>MiB" string: keep the NaN
            # placeholder for this GPU and carry on with the others.
            continue
    return mem_infos

def get_temperature_usage(self) -> dict:
"""
获取指定GPU设备的温度(°C)
Expand All @@ -210,7 +265,9 @@ def get_temperature_usage(self) -> dict:
"config": self.per_temp_configs[f"GPU {gpu_id}"],
}
# 解析温度
gpu_temp = gpu_info["Temperature"]["GPU Current Temp"].replace("C", "").strip()
gpu_temp = (
gpu_info["Temperature"]["GPU Current Temp"].replace("C", "").strip()
)
temp_infos[gpu_id]["value"] = float(gpu_temp)
return temp_infos

Expand All @@ -234,6 +291,8 @@ def get_power_usage(self) -> dict:
"value": math.nan,
"config": self.per_power_configs[f"GPU {gpu_id}"],
}
gpu_power = gpu_info["Power Readings"]["Power Draw "].strip().replace("W", "")
gpu_power = (
gpu_info["Power Readings"]["Power Draw "].strip().replace("W", "")
)
power_infos[gpu_id]["value"] = float(gpu_power)
return power_infos
Loading
Loading