# resource_monitor.py (8.2 KB)
  1. """
  2. 资源监控采集器模块
  3. 在 OCR 任务期间,后台线程定期采集加速卡(NVIDIA GPU / 华为昇腾 NPU)和系统负载数据。
  4. 根据运行环境自动选择 nvidia-smi 或 npu-smi 进行采集。
  5. """
  6. import threading
  7. import time
  8. import logging
  9. import os
  10. from typing import Optional, Dict, Any, List
  11. logger = logging.getLogger(__name__)
  12. class ResourceMonitor:
  13. """资源监控采集器,在后台线程中定期采集 GPU/NPU 和系统负载数据"""
  14. def __init__(self, interval: float = 0.5):
  15. """
  16. 初始化资源监控采集器
  17. Args:
  18. interval: 采集间隔(秒),默认0.5秒
  19. """
  20. self.interval = interval
  21. self.monitoring = False
  22. self.monitor_thread: Optional[threading.Thread] = None
  23. self.samples: List[Dict[str, Any]] = []
  24. self.lock = threading.Lock()
  25. def start(self):
  26. """启动监控采集"""
  27. if self.monitoring:
  28. logger.warning("资源监控已在运行中")
  29. return
  30. self.monitoring = True
  31. self.samples.clear()
  32. self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
  33. self.monitor_thread.start()
  34. logger.info(f"资源监控采集器已启动,采集间隔: {self.interval}秒")
  35. def stop(self):
  36. """停止监控采集"""
  37. if not self.monitoring:
  38. logger.warning("资源监控未在运行")
  39. return
  40. self.monitoring = False
  41. if self.monitor_thread:
  42. self.monitor_thread.join(timeout=2.0)
  43. logger.info(f"资源监控采集器已停止,共采集 {len(self.samples)} 个样本")
  44. def _monitor_loop(self):
  45. """监控循环,定期采集数据"""
  46. while self.monitoring:
  47. try:
  48. sample = self._collect_sample()
  49. if sample:
  50. with self.lock:
  51. self.samples.append(sample)
  52. except Exception as e:
  53. logger.warning(f"采集资源数据时出错: {e}")
  54. time.sleep(self.interval)
  55. def _collect_sample(self) -> Optional[Dict[str, Any]]:
  56. """
  57. 采集一次资源数据样本
  58. Returns:
  59. 包含GPU和系统负载信息的字典,如果采集失败返回None
  60. """
  61. sample = {
  62. "timestamp": time.time(),
  63. "gpu_info": self._get_gpu_info(),
  64. "system_load": self._get_system_load()
  65. }
  66. return sample
  67. def _get_gpu_info(self) -> Optional[Dict[str, Any]]:
  68. """获取加速卡信息(根据环境自动使用 nvidia-smi 或 npu-smi)"""
  69. try:
  70. from .gpu_monitor import get_gpu_info
  71. return get_gpu_info()
  72. except Exception as e:
  73. logger.debug(f"获取加速卡信息失败: {e}")
  74. return None
  75. def _get_system_load(self) -> Optional[Dict[str, float]]:
  76. """获取系统负载"""
  77. try:
  78. # Linux系统使用os.getloadavg()
  79. if hasattr(os, 'getloadavg'):
  80. load_avg = os.getloadavg()
  81. return {
  82. "load_1min": load_avg[0],
  83. "load_5min": load_avg[1],
  84. "load_15min": load_avg[2]
  85. }
  86. except Exception as e:
  87. logger.debug(f"获取系统负载失败: {e}")
  88. return None
  89. def get_statistics(self) -> Optional[Dict[str, Any]]:
  90. """
  91. 对采集的数据进行统计分析
  92. Returns:
  93. 统计结果,包含:
  94. - gpu_index: GPU索引
  95. - gpu_name: GPU名称
  96. - gpu_memory_total: 总显存(字节)
  97. - gpu_memory_used: 期间最大显存使用量(字节),任务期间采集到的最大显存使用
  98. - gpu_memory_used_avg: 平均显存使用(字节)
  99. - gpu_memory_used_max: 最大显存使用(字节)
  100. - gpu_utilization_avg: 平均GPU利用率(%)
  101. - gpu_utilization_max: 最大GPU利用率(%)
  102. - system_load_avg_1min: 平均1分钟系统负载
  103. - system_load_max_1min: 最大1分钟系统负载
  104. - sample_count: 采集的样本数量
  105. - duration: 监控持续时间(秒)
  106. """
  107. with self.lock:
  108. if not self.samples:
  109. logger.warning("没有采集到任何数据样本")
  110. return None
  111. # 提取GPU信息
  112. gpu_samples = [s["gpu_info"] for s in self.samples if s.get("gpu_info")]
  113. if not gpu_samples:
  114. logger.warning("没有采集到GPU数据")
  115. return None
  116. # 提取系统负载信息
  117. load_samples = [s["system_load"] for s in self.samples if s.get("system_load")]
  118. # 计算GPU统计信息
  119. first_gpu = gpu_samples[0]
  120. last_gpu = gpu_samples[-1]
  121. # 计算平均值和最大值(用于统计)
  122. memory_values = [g.get("gpu_memory_used", 0) for g in gpu_samples]
  123. utilization_values = [g.get("gpu_utilization", 0) for g in gpu_samples]
  124. memory_avg = sum(memory_values) / len(memory_values) if memory_values else 0
  125. memory_max = max(memory_values) if memory_values else 0
  126. utilization_avg = sum(utilization_values) / len(utilization_values) if utilization_values else 0
  127. utilization_max = max(utilization_values) if utilization_values else 0
  128. # 使用期间最大显存值(不再计算增量)
  129. # 注意:这是采集期间的最大显存使用量,不是增量
  130. gpu_memory_used = int(memory_max)
  131. # 计算系统负载统计(1分钟、5分钟、15分钟)
  132. load_1min_values = [l.get("load_1min", 0) for l in load_samples if l]
  133. load_1min_avg = sum(load_1min_values) / len(load_1min_values) if load_1min_values else None
  134. load_1min_max = max(load_1min_values) if load_1min_values else None
  135. load_5min_values = [l.get("load_5min", 0) for l in load_samples if l]
  136. load_5min_avg = sum(load_5min_values) / len(load_5min_values) if load_5min_values else None
  137. load_5min_max = max(load_5min_values) if load_5min_values else None
  138. load_15min_values = [l.get("load_15min", 0) for l in load_samples if l]
  139. load_15min_avg = sum(load_15min_values) / len(load_15min_values) if load_15min_values else None
  140. load_15min_max = max(load_15min_values) if load_15min_values else None
  141. # 计算持续时间
  142. duration = self.samples[-1]["timestamp"] - self.samples[0]["timestamp"] if len(self.samples) > 1 else 0
  143. result = {
  144. "gpu_index": first_gpu.get("gpu_index"),
  145. "gpu_name": first_gpu.get("gpu_name"),
  146. "gpu_memory_total": first_gpu.get("gpu_memory_total"),
  147. "gpu_memory_used": gpu_memory_used, # 期间最大显存使用量(不是增量)
  148. "gpu_memory_used_avg": int(memory_avg),
  149. "gpu_memory_used_max": int(memory_max),
  150. "gpu_utilization": utilization_avg, # 平均利用率
  151. "gpu_utilization_avg": utilization_avg,
  152. "gpu_utilization_max": utilization_max,
  153. "system_load_avg_1min": load_1min_avg,
  154. "system_load_max_1min": load_1min_max,
  155. "system_load_avg_5min": load_5min_avg,
  156. "system_load_max_5min": load_5min_max,
  157. "system_load_avg_15min": load_15min_avg,
  158. "system_load_max_15min": load_15min_max,
  159. "sample_count": len(self.samples),
  160. "duration": duration
  161. }
  162. logger.info(f"资源统计计算完成 - 样本数: {len(self.samples)}, 持续时间: {duration:.2f}秒, "
  163. f"最大显存使用: {gpu_memory_used / 1024 / 1024:.2f}MB (平均: {memory_avg / 1024 / 1024:.2f}MB), "
  164. f"平均GPU利用率: {utilization_avg:.2f}%, 最大GPU利用率: {utilization_max:.2f}%")
  165. return result