# gpu_monitor.py

"""
GPU monitoring utilities.

Helpers for querying GPU usage and computing how it changed over a task.
"""
  5. import subprocess
  6. import logging
  7. from typing import Optional, Dict, Any
  8. logger = logging.getLogger(__name__)
  9. def get_gpu_info() -> Optional[Dict[str, Any]]:
  10. """
  11. 获取GPU信息(使用nvidia-smi)
  12. Returns:
  13. GPU信息字典,包含:
  14. - gpu_index: GPU索引
  15. - gpu_memory_used: 已使用显存(字节)
  16. - gpu_utilization: GPU利用率(%)
  17. - gpu_memory_total: 总显存(字节)
  18. - gpu_name: GPU名称
  19. 如果获取失败返回None
  20. """
  21. try:
  22. # 执行nvidia-smi命令
  23. cmd = [
  24. "nvidia-smi",
  25. "--query-gpu=index,name,memory.total,memory.used,utilization.gpu",
  26. "--format=csv,noheader,nounits"
  27. ]
  28. result = subprocess.run(
  29. cmd,
  30. capture_output=True,
  31. text=True,
  32. timeout=5,
  33. check=False
  34. )
  35. if result.returncode != 0:
  36. logger.debug(f"nvidia-smi命令执行失败: {result.stderr}")
  37. return None
  38. # 解析输出(取第一个GPU)
  39. lines = result.stdout.strip().split('\n')
  40. if not lines or not lines[0]:
  41. logger.debug("nvidia-smi未返回GPU信息")
  42. return None
  43. parts = [p.strip() for p in lines[0].split(',')]
  44. if len(parts) < 5:
  45. logger.debug(f"GPU信息格式不正确: {lines[0]}")
  46. return None
  47. gpu_index = int(parts[0])
  48. gpu_name = parts[1]
  49. memory_total_mb = int(parts[2])
  50. memory_used_mb = int(parts[3])
  51. utilization = float(parts[4])
  52. return {
  53. "gpu_index": gpu_index,
  54. "gpu_name": gpu_name,
  55. "gpu_memory_total": memory_total_mb * 1024 * 1024, # 转换为字节
  56. "gpu_memory_used": memory_used_mb * 1024 * 1024, # 转换为字节
  57. "gpu_utilization": utilization
  58. }
  59. except Exception as e:
  60. logger.debug(f"获取GPU信息失败: {e}")
  61. return None
  62. def get_gpu_info_delta(start_gpu_info: Optional[Dict], end_gpu_info: Optional[Dict]) -> Optional[Dict[str, Any]]:
  63. """
  64. 计算GPU使用增量(OCR任务期间的GPU使用)
  65. Args:
  66. start_gpu_info: 开始时的GPU信息
  67. end_gpu_info: 结束时的GPU信息
  68. Returns:
  69. GPU增量信息,包含:
  70. - gpu_index: GPU索引
  71. - gpu_memory_used: 显存增量(字节),OCR任务期间增加的显存使用
  72. - gpu_utilization: GPU利用率(%),结束时的利用率
  73. - gpu_memory_total: 总显存(字节)
  74. - gpu_name: GPU名称
  75. 如果无法计算返回None
  76. """
  77. if not end_gpu_info:
  78. return None
  79. result = {
  80. "gpu_index": end_gpu_info.get("gpu_index"),
  81. "gpu_name": end_gpu_info.get("gpu_name"),
  82. "gpu_memory_total": end_gpu_info.get("gpu_memory_total"),
  83. "gpu_utilization": end_gpu_info.get("gpu_utilization")
  84. }
  85. # 计算显存增量
  86. if start_gpu_info and end_gpu_info:
  87. start_memory = start_gpu_info.get("gpu_memory_used", 0)
  88. end_memory = end_gpu_info.get("gpu_memory_used", 0)
  89. memory_delta = max(0, end_memory - start_memory) # 确保非负
  90. result["gpu_memory_used"] = memory_delta
  91. else:
  92. # 如果没有开始信息,使用结束时的绝对显存(不推荐,但作为后备)
  93. # 注意:这种情况下无法准确计算增量,但至少能知道GPU使用情况
  94. result["gpu_memory_used"] = end_gpu_info.get("gpu_memory_used", 0)
  95. return result