gpu_monitor.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. """
  2. GPU/NPU 监控工具模块
  3. 根据运行环境自动识别 NVIDIA GPU 或华为昇腾 NPU,采集显存与利用率等使用情况。
  4. """
  5. import re
  6. import subprocess
  7. import logging
  8. from typing import Optional, Dict, Any
  9. logger = logging.getLogger(__name__)
  10. def _get_nvidia_gpu_info() -> Optional[Dict[str, Any]]:
  11. """通过 nvidia-smi 获取 NVIDIA GPU 信息(统一返回格式)。"""
  12. try:
  13. cmd = [
  14. "nvidia-smi",
  15. "--query-gpu=index,name,memory.total,memory.used,utilization.gpu",
  16. "--format=csv,noheader,nounits"
  17. ]
  18. result = subprocess.run(
  19. cmd,
  20. capture_output=True,
  21. text=True,
  22. timeout=5,
  23. check=False
  24. )
  25. if result.returncode != 0:
  26. logger.debug(f"nvidia-smi 执行失败: {result.stderr}")
  27. return None
  28. lines = result.stdout.strip().split('\n')
  29. if not lines or not lines[0]:
  30. logger.debug("nvidia-smi 未返回 GPU 信息")
  31. return None
  32. parts = [p.strip() for p in lines[0].split(',')]
  33. if len(parts) < 5:
  34. logger.debug(f"nvidia-smi 输出格式异常: {lines[0]}")
  35. return None
  36. gpu_index = int(parts[0])
  37. gpu_name = parts[1]
  38. memory_total_mb = int(parts[2])
  39. memory_used_mb = int(parts[3])
  40. utilization = float(parts[4])
  41. return {
  42. "gpu_index": gpu_index,
  43. "gpu_name": gpu_name,
  44. "gpu_memory_total": memory_total_mb * 1024 * 1024,
  45. "gpu_memory_used": memory_used_mb * 1024 * 1024,
  46. "gpu_utilization": utilization
  47. }
  48. except Exception as e:
  49. logger.debug(f"获取 NVIDIA GPU 信息失败: {e}")
  50. return None
def _get_npu_info() -> Optional[Dict[str, Any]]:
    """Query Huawei Ascend NPU stats via ``npu-smi info`` (unified return format).

    Scans the tool's table output for the first ``"<n> <used> / <total>"``
    triple — e.g. ``| 0 1154 / 7767 |`` — interpreting the fields as
    AICore(%) and Memory-Usage(MB) per the original author's note.

    Returns:
        Dict with keys ``gpu_index``, ``gpu_name``, ``gpu_memory_total``
        (bytes), ``gpu_memory_used`` (bytes) and ``gpu_utilization`` (%),
        or ``None`` when the tool is missing, fails, or no matching row
        is found.
    """
    try:
        result = subprocess.run(
            ["npu-smi", "info"],
            capture_output=True,
            text=True,
            timeout=5,
            check=False
        )
        if result.returncode != 0:
            logger.debug(f"npu-smi info 执行失败: {result.stderr}")
            return None
        # Match an in-row "number used / total" pattern (AICore% and Memory-Usage),
        # e.g.: "| 0 1154 / 7767 |" or " 0 1154 / 7767 "
        # NOTE(review): the first captured number is treated as AICore(%); on
        # some npu-smi layouts it may instead be a device/chip id — confirm
        # against real device output.
        pattern = re.compile(r"(\d+)\s+(\d+)\s*/\s*(\d+)")
        for line in result.stdout.splitlines():
            m = pattern.search(line)
            if m:
                aicore_pct = float(m.group(1))
                memory_used_mb = int(m.group(2))
                memory_total_mb = int(m.group(3))
                return {
                    # Only device 0 is reported by this parser.
                    "gpu_index": 0,
                    "gpu_name": "NPU",
                    # MB -> bytes, mirroring the NVIDIA collector's units.
                    "gpu_memory_total": memory_total_mb * 1024 * 1024,
                    "gpu_memory_used": memory_used_mb * 1024 * 1024,
                    "gpu_utilization": aicore_pct
                }
        logger.debug("npu-smi info 中未解析到 Memory-Usage 行")
        return None
    except FileNotFoundError:
        logger.debug("未找到 npu-smi 命令")
        return None
    except Exception as e:
        logger.debug(f"获取 NPU 信息失败: {e}")
        return None
  90. def get_gpu_info() -> Optional[Dict[str, Any]]:
  91. """
  92. 根据当前运行环境自动选择采集方式,获取加速卡(GPU/NPU)信息。
  93. 优先使用环境变量 PDF_CONVERTER_DEVICE_KIND,否则自动检测 nvidia-smi / npu-smi。
  94. Returns:
  95. 统一格式的字典:
  96. - gpu_index: 设备索引
  97. - gpu_name: 设备名称(如 GPU 型号或 "NPU")
  98. - gpu_memory_total: 总显存(字节)
  99. - gpu_memory_used: 已使用显存(字节)
  100. - gpu_utilization: 利用率(%)
  101. 无可用设备或采集失败时返回 None。
  102. """
  103. from .device_env import detect_device_kind
  104. kind = detect_device_kind()
  105. if kind == "nvi":
  106. return _get_nvidia_gpu_info()
  107. if kind == "npu":
  108. return _get_npu_info()
  109. # cpu 或未知:也可尝试按顺序探测,便于未设置环境变量时仍能采集
  110. info = _get_nvidia_gpu_info()
  111. if info:
  112. return info
  113. info = _get_npu_info()
  114. if info:
  115. return info
  116. return None
  117. def get_gpu_info_delta(start_gpu_info: Optional[Dict], end_gpu_info: Optional[Dict]) -> Optional[Dict[str, Any]]:
  118. """
  119. 计算GPU使用增量(OCR任务期间的GPU使用)
  120. Args:
  121. start_gpu_info: 开始时的GPU信息
  122. end_gpu_info: 结束时的GPU信息
  123. Returns:
  124. GPU增量信息,包含:
  125. - gpu_index: GPU索引
  126. - gpu_memory_used: 显存增量(字节),OCR任务期间增加的显存使用
  127. - gpu_utilization: GPU利用率(%),结束时的利用率
  128. - gpu_memory_total: 总显存(字节)
  129. - gpu_name: GPU名称
  130. 如果无法计算返回None
  131. """
  132. if not end_gpu_info:
  133. return None
  134. result = {
  135. "gpu_index": end_gpu_info.get("gpu_index"),
  136. "gpu_name": end_gpu_info.get("gpu_name"),
  137. "gpu_memory_total": end_gpu_info.get("gpu_memory_total"),
  138. "gpu_utilization": end_gpu_info.get("gpu_utilization")
  139. }
  140. # 计算显存增量
  141. if start_gpu_info and end_gpu_info:
  142. start_memory = start_gpu_info.get("gpu_memory_used", 0)
  143. end_memory = end_gpu_info.get("gpu_memory_used", 0)
  144. memory_delta = max(0, end_memory - start_memory) # 确保非负
  145. result["gpu_memory_used"] = memory_delta
  146. else:
  147. # 如果没有开始信息,使用结束时的绝对显存(不推荐,但作为后备)
  148. # 注意:这种情况下无法准确计算增量,但至少能知道GPU使用情况
  149. result["gpu_memory_used"] = end_gpu_info.get("gpu_memory_used", 0)
  150. return result