| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167 |
- """
- GPU/NPU 监控工具模块
- 根据运行环境自动识别 NVIDIA GPU 或华为昇腾 NPU,采集显存与利用率等使用情况。
- """
- import re
- import subprocess
- import logging
- from typing import Optional, Dict, Any
- logger = logging.getLogger(__name__)
def _get_nvidia_gpu_info() -> Optional[Dict[str, Any]]:
    """Query ``nvidia-smi`` and describe the first GPU it lists.

    Runs ``nvidia-smi --query-gpu=... --format=csv,noheader,nounits`` with a
    5 second timeout and parses only the first output row.

    Returns:
        A dict in the module's unified format with the keys ``gpu_index``,
        ``gpu_name``, ``gpu_memory_total``, ``gpu_memory_used`` (both memory
        figures are the reported values multiplied by 1024 * 1024, i.e. the
        values are treated as MiB and converted to bytes) and
        ``gpu_utilization`` (float). ``None`` when the command exits non-zero,
        prints nothing, prints fewer than five comma-separated fields, or
        anything raises (missing binary, timeout, non-numeric field).
        Every failure is logged at debug level only.
    """
    query_fields = "index,name,memory.total,memory.used,utilization.gpu"
    bytes_per_mib = 1024 * 1024
    try:
        proc = subprocess.run(
            [
                "nvidia-smi",
                f"--query-gpu={query_fields}",
                "--format=csv,noheader,nounits",
            ],
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        )
        if proc.returncode != 0:
            logger.debug(f"nvidia-smi 执行失败: {proc.stderr}")
            return None

        # str.split always yields at least one element, so [0] is safe even
        # for empty output; an empty first row means no GPU was reported.
        first_row = proc.stdout.strip().split('\n')[0]
        if not first_row:
            logger.debug("nvidia-smi 未返回 GPU 信息")
            return None

        fields = [cell.strip() for cell in first_row.split(',')]
        if len(fields) < 5:
            logger.debug(f"nvidia-smi 输出格式异常: {first_row}")
            return None

        index_text, name, total_text, used_text, util_text = fields[:5]
        # Conversions run in field order; a non-numeric value raises and is
        # handled by the except clause below.
        index = int(index_text)
        total_mib = int(total_text)
        used_mib = int(used_text)
        return {
            "gpu_index": index,
            "gpu_name": name,
            "gpu_memory_total": total_mib * bytes_per_mib,
            "gpu_memory_used": used_mib * bytes_per_mib,
            "gpu_utilization": float(util_text),
        }
    except Exception as e:
        # Best-effort probe: any failure means "no usable NVIDIA GPU".
        logger.debug(f"获取 NVIDIA GPU 信息失败: {e}")
        return None
def _get_npu_info() -> Optional[Dict[str, Any]]:
    """Collect Huawei Ascend NPU usage via ``npu-smi info`` (unified format).

    The chip row of the ``npu-smi info`` table ends with a cell of the form
    ``AICore(%)  used / total`` (Memory-Usage, MB), optionally followed by a
    second ``used / total`` pair (HBM-Usage, MB), e.g.
    ``| 0  0 | 0000:03:00.0 | 0  1154 / 7767 |``.

    The match is anchored to the start of a table cell (or of the line, for
    output without ``|`` separators). An unanchored search would also accept
    the preceding device row on layouts that print a Hugepages-Usage column
    (``| 12.8  45  0 / 970 |``), reporting the temperature as utilization and
    the hugepage counters as memory.

    Returns:
        A dict with ``gpu_index`` (always 0), ``gpu_name`` (always ``"NPU"``),
        ``gpu_memory_total`` / ``gpu_memory_used`` in bytes and
        ``gpu_utilization`` (AICore percentage as float) for the first chip
        row found. When Memory-Usage reports a total of 0 and an HBM-Usage
        pair is present, the HBM figures are used instead. ``None`` when the
        command is missing, fails, times out, or no chip row can be parsed;
        failures are logged at debug level only.
    """
    # Cell start, then exactly one integer (AICore %), then "used / total",
    # then an optional second "used / total" pair (HBM).
    chip_cell = re.compile(
        r"(?:^|\|)\s*(\d+)\s+(\d+)\s*/\s*(\d+)(?:\s+(\d+)\s*/\s*(\d+))?"
    )
    bytes_per_mb = 1024 * 1024
    try:
        result = subprocess.run(
            ["npu-smi", "info"],
            capture_output=True,
            text=True,
            timeout=5,
            check=False
        )
        if result.returncode != 0:
            logger.debug(f"npu-smi info 执行失败: {result.stderr}")
            return None

        for line in result.stdout.splitlines():
            m = chip_cell.search(line)
            if m is None:
                continue
            aicore_pct = float(m.group(1))
            memory_used_mb = int(m.group(2))
            memory_total_mb = int(m.group(3))
            # Some devices report "0 / 0" for Memory-Usage and expose their
            # on-board memory only through the HBM-Usage pair.
            if memory_total_mb == 0 and m.group(5) is not None:
                memory_used_mb = int(m.group(4))
                memory_total_mb = int(m.group(5))
            return {
                "gpu_index": 0,
                "gpu_name": "NPU",
                "gpu_memory_total": memory_total_mb * bytes_per_mb,
                "gpu_memory_used": memory_used_mb * bytes_per_mb,
                "gpu_utilization": aicore_pct
            }

        logger.debug("npu-smi info 中未解析到 Memory-Usage 行")
        return None
    except FileNotFoundError:
        logger.debug("未找到 npu-smi 命令")
        return None
    except Exception as e:
        # Best-effort probe: timeouts and parse errors mean "no usable NPU".
        logger.debug(f"获取 NPU 信息失败: {e}")
        return None
def get_gpu_info() -> Optional[Dict[str, Any]]:
    """Return accelerator (GPU/NPU) usage for the current environment.

    The device kind comes from ``detect_device_kind`` in ``.device_env``
    (per the original note it is expected to honour the
    ``PDF_CONVERTER_DEVICE_KIND`` environment variable; that logic is not
    visible here). A kind of ``"nvi"`` uses the NVIDIA probe only and
    ``"npu"`` uses the NPU probe only. Any other kind (CPU or unknown) tries
    NVIDIA first and then NPU, so data is still collected when the
    environment is not configured.

    Returns:
        A dict in the unified format:
            - gpu_index: device index
            - gpu_name: device name (GPU model or "NPU")
            - gpu_memory_total: total device memory in bytes
            - gpu_memory_used: used device memory in bytes
            - gpu_utilization: utilization in percent
        ``None`` when no device is available or collection fails.
    """
    from .device_env import detect_device_kind

    # Ordered (kind tag, probe) pairs; the order is also the fallback order.
    probes = (
        ("nvi", _get_nvidia_gpu_info),
        ("npu", _get_npu_info),
    )

    detected = detect_device_kind()
    for tag, probe in probes:
        if detected == tag:
            # Explicit kind: trust it, no fallback to the other backend.
            return probe()

    # CPU or unknown kind: try each backend in turn.
    for _, probe in probes:
        info = probe()
        if info:
            return info
    return None
def get_gpu_info_delta(start_gpu_info: Optional[Dict], end_gpu_info: Optional[Dict]) -> Optional[Dict[str, Any]]:
    """Compute the GPU usage attributable to a task (e.g. an OCR run).

    Args:
        start_gpu_info: Snapshot taken before the task, or ``None``/empty.
        end_gpu_info: Snapshot taken after the task.

    Returns:
        ``None`` when ``end_gpu_info`` is missing or empty. Otherwise a dict with:
            - gpu_index: device index, copied from the end snapshot
            - gpu_name: device name, copied from the end snapshot
            - gpu_memory_total: total memory in bytes, from the end snapshot
            - gpu_utilization: utilization (%) at the end of the task
            - gpu_memory_used: memory growth in bytes during the task,
              clamped to be non-negative. Without a start snapshot this is
              the absolute usage at the end (a fallback, not a real delta).

    A ``gpu_memory_used`` value that is missing or ``None`` in either
    snapshot is treated as 0, so the subtraction cannot raise ``TypeError``.
    """
    if not end_gpu_info:
        return None

    # `or 0` covers both a missing key and a key that is present but None.
    end_used = end_gpu_info.get("gpu_memory_used") or 0

    if start_gpu_info:
        start_used = start_gpu_info.get("gpu_memory_used") or 0
        # Memory may have been released meanwhile; never report a negative delta.
        memory_used = max(0, end_used - start_used)
    else:
        # No baseline: report absolute usage so callers still see something.
        memory_used = end_used

    return {
        "gpu_index": end_gpu_info.get("gpu_index"),
        "gpu_name": end_gpu_info.get("gpu_name"),
        "gpu_memory_total": end_gpu_info.get("gpu_memory_total"),
        "gpu_utilization": end_gpu_info.get("gpu_utilization"),
        "gpu_memory_used": memory_used,
    }
|