Forráskód Böngészése

refactor: 优化配置文件并恢复部分底层环境变量读取逻辑

- 从 config.yaml 中移除了 default_model_name, server_url 以及 MinerU 服务管理相关配置
- 同步更新 config.py,将上述配置项恢复为硬编码默认值或通过 os.getenv 读取
- 恢复 api_server.py, mineru_service_manager.py, converter.py 等模块对底层服务/命令相关环境变量的读取逻辑
- 保持核心转换参数(api_url, backend, language 等)继续由配置文件管理

Co-authored-by: Cursor <cursoragent@cursor.com>
何文松 2 hete
szülő
commit
692a0a4103

+ 6 - 8
pdf_converter_v2/api_server.py

@@ -21,12 +21,10 @@ import uvicorn
 # 支持相对导入和绝对导入
 try:
     from pdf_converter_v2.api.main import app
-    from pdf_converter_v2.config import API_HOST, API_PORT, LOG_LEVEL
 except ImportError:
     # 如果绝对导入失败,尝试相对导入
     sys.path.insert(0, str(current_dir.parent))
     from pdf_converter_v2.api.main import app
-    from pdf_converter_v2.config import API_HOST, API_PORT, LOG_LEVEL
 
 
 def parse_args():
@@ -53,23 +51,23 @@ def parse_args():
     parser.add_argument(
         "--host",
         type=str,
-        default=API_HOST,
-        help=f"服务器监听地址 (默认: {API_HOST})"
+        default=os.getenv("API_HOST", "0.0.0.0"),
+        help="服务器监听地址 (默认: 0.0.0.0,可通过环境变量 API_HOST 设置)"
     )
     
     parser.add_argument(
         "--port",
         type=int,
-        default=API_PORT,
-        help=f"服务器监听端口 (默认: {API_PORT})"
+        default=int(os.getenv("API_PORT", "4214")),
+        help="服务器监听端口 (默认: 4214,可通过环境变量 API_PORT 设置)"
     )
     
     parser.add_argument(
         "--log-level",
         type=str,
-        default=LOG_LEVEL.lower(),
+        default=os.getenv("LOG_LEVEL", "info"),
         choices=["critical", "error", "warning", "info", "debug", "trace"],
-        help=f"日志级别 (默认: {LOG_LEVEL.lower()})"
+        help="日志级别 (默认: info,可通过环境变量 LOG_LEVEL 设置)"
     )
     
     parser.add_argument(

+ 3 - 20
pdf_converter_v2/config.py

@@ -4,6 +4,7 @@
 配置文件 v2 - 主要从配置文件读取配置(部分底层配置使用硬编码默认值或通过环境变量读取)
 """
 
+import os
 from .config_loader import get_config_loader
 from .utils.device_env import detect_device_kind
 
@@ -16,7 +17,7 @@ _device_kind_from_config = _config.get_str("device_kind", "")
 DEVICE_KIND = _device_kind_from_config if _device_kind_from_config else detect_device_kind()
 
 # 默认模型配置
-DEFAULT_MODEL_NAME = _config.get_str("default_model_name", "OpenDataLab/MinerU2.5-2509-1.2B")
+DEFAULT_MODEL_NAME = "OpenDataLab/MinerU2.5-2509-1.2B"
 DEFAULT_GPU_MEMORY_UTILIZATION = _config.get_float("default_gpu_memory_utilization", 0.9)
 DEFAULT_DPI = _config.get_int("default_dpi", 200)
 DEFAULT_MAX_PAGES = _config.get_int("default_max_pages", 10)
@@ -34,28 +35,10 @@ DEFAULT_RETURN_MODEL_OUTPUT = _config.get_bool("return_model_output", True)
 DEFAULT_RETURN_MD = _config.get_bool("return_md", True)
 DEFAULT_RETURN_IMAGES = _config.get_bool("return_images", False)
 DEFAULT_RETURN_CONTENT_LIST = _config.get_bool("return_content_list", False)
-DEFAULT_SERVER_URL = _config.get_str("server_url", "string")
-
-# API 服务启动配置
-API_HOST = _config.get_str("api_host", "0.0.0.0")
-API_PORT = _config.get_int("api_port", 4214)
-
-# MinerU 服务管理配置
-MINERU_API_HOST = _config.get_str("mineru_api_host", "127.0.0.1")
-MINERU_API_PORT = _config.get_int("mineru_api_port", 5282)
-MINERU_IDLE_TIMEOUT = _config.get_int("mineru_idle_timeout", 60)
-MINERU_CHECK_INTERVAL = _config.get_int("mineru_check_interval", 60)
-MINERU_START_TIMEOUT = _config.get_int("mineru_start_timeout", 120)
-
-# PaddleOCR 配置
-PADDLEOCR_CMD = _config.get_str("paddleocr_cmd", "paddleocr")
-PADDLE_OCR_DEVICE = _config.get_str("paddle_ocr_device", "")
-PADDLE_OCR_DEVICES = _config.get_str("paddle_ocr_devices", "")
-PADDLE_DOC_PARSER_CMD = _config.get_str("paddle_doc_parser_cmd", "paddleocr")
+DEFAULT_SERVER_URL = os.getenv("SERVER_URL", "string")
 
 # 日志配置(可选)
 LOG_DIR = _config.get_str("log_dir", "./logs")
 LOG_LEVEL = _config.get_str("log_level", "INFO")
 LOG_TO_FILE = _config.get_bool("log_to_file", True)
 LOG_TO_CONSOLE = _config.get_bool("log_to_console", True)
-

+ 3 - 27
pdf_converter_v2/config.yaml

@@ -11,9 +11,6 @@ device_kind: ""
 # =============================================================================
 # 默认模型配置
 # =============================================================================
-# 默认模型名称
-default_model_name: "OpenDataLab/MinerU2.5-2509-1.2B"
-
 # GPU 内存利用率(0.0-1.0)
 default_gpu_memory_utilization: 0.9
 
@@ -27,10 +24,10 @@ default_max_pages: 10
 # API 配置
 # =============================================================================
 # MinerU API 服务地址
-api_url: "http://127.0.0.1:5282"
+api_url: "http://172.17.0.1:5282"
 
 # 处理后端:vlm-vllm-async-engine / pipeline
-backend: "vlm-vllm-async-engine"
+backend: "pipeline"
 
 # 解析方法:auto / txt / ocr
 parse_method: "auto"
@@ -44,9 +41,6 @@ end_page_id: 99999
 # 识别语言:ch / en
 language: "ch"
 
-# 服务器URL
-server_url: "string"
-
 # =============================================================================
 # API 服务启动配置
 # =============================================================================
@@ -56,24 +50,6 @@ api_host: "0.0.0.0"
 # API 服务监听端口
 api_port: 4214
 
-# =============================================================================
-# MinerU 服务管理配置
-# =============================================================================
-# MinerU API 内部地址(用于健康检查)
-mineru_api_host: "127.0.0.1"
-
-# MinerU API 内部端口
-mineru_api_port: 5282
-
-# 空闲超时时间(秒),超过此时间无任务则停止服务
-mineru_idle_timeout: 60
-
-# 检查间隔(秒)
-mineru_check_interval: 60
-
-# 服务启动等待超时(秒)
-mineru_start_timeout: 120
-
 # =============================================================================
 # PaddleOCR 配置
 # =============================================================================
@@ -82,7 +58,7 @@ paddleocr_cmd: "paddleocr"
 
 # PaddleOCR 推理设备 (例如 "npu:0", "cuda:0", "cpu")
 # 留空则根据环境自动选择
-paddle_ocr_device: ""
+paddle_ocr_device: "npu:1"
 
 # PaddleOCR 多卡推理设备 (例如 "npu:0,npu:1")
 paddle_ocr_devices: ""

+ 1 - 2
pdf_converter_v2/processor/converter.py

@@ -26,10 +26,9 @@ from ..utils.paddleocr_fallback import (
     _paddle_ocr_device_args,
     _get_paddleocr_subprocess_env,
 )
-from ..config import PADDLE_DOC_PARSER_CMD
 
 logger = get_logger("pdf_converter_v2.processor")
-PADDLE_CMD = PADDLE_DOC_PARSER_CMD
+PADDLE_CMD = os.getenv("PADDLE_DOC_PARSER_CMD", "paddleocr")
 
 
 async def _run_paddle_doc_parser(cmd: Sequence[str]) -> tuple[int, str, str]:

+ 5 - 12
pdf_converter_v2/utils/mineru_service_manager.py

@@ -19,13 +19,6 @@ from typing import Optional
 from datetime import datetime
 
 from .logging_config import get_logger
-from ..config import (
-    MINERU_API_HOST as _MINERU_API_HOST,
-    MINERU_API_PORT as _MINERU_API_PORT,
-    MINERU_IDLE_TIMEOUT as _MINERU_IDLE_TIMEOUT,
-    MINERU_CHECK_INTERVAL as _MINERU_CHECK_INTERVAL,
-    MINERU_START_TIMEOUT as _MINERU_START_TIMEOUT,
-)
 
 logger = get_logger("pdf_converter_v2.mineru_manager")
 
@@ -33,17 +26,17 @@ logger = get_logger("pdf_converter_v2.mineru_manager")
 MINERU_SERVICE_NAME = "mineru-api.service"
 
 # MinerU API 地址和端口(用于健康检查)
-MINERU_API_HOST = _MINERU_API_HOST
-MINERU_API_PORT = _MINERU_API_PORT
+MINERU_API_HOST = os.getenv("MINERU_API_HOST", "127.0.0.1")
+MINERU_API_PORT = int(os.getenv("MINERU_API_PORT", "5282"))
 
 # 空闲超时时间(秒),超过此时间无任务则停止服务
-IDLE_TIMEOUT_SECONDS = _MINERU_IDLE_TIMEOUT
+IDLE_TIMEOUT_SECONDS = int(os.getenv("MINERU_IDLE_TIMEOUT", "60"))  # 默认 1 分钟
 
 # 检查间隔(秒)
-CHECK_INTERVAL_SECONDS = _MINERU_CHECK_INTERVAL
+CHECK_INTERVAL_SECONDS = int(os.getenv("MINERU_CHECK_INTERVAL", "60"))  # 默认 1 分钟
 
 # 服务启动等待超时(秒)
-SERVICE_START_TIMEOUT = _MINERU_START_TIMEOUT
+SERVICE_START_TIMEOUT = int(os.getenv("MINERU_START_TIMEOUT", "120"))  # 默认 2 分钟
 
 
 class MinerUServiceManager:

+ 4 - 10
pdf_converter_v2/utils/paddleocr_fallback.py

@@ -15,11 +15,6 @@ import ast
 import re
 
 from ..utils.logging_config import get_logger
-from ..config import (
-    PADDLEOCR_CMD as _PADDLEOCR_CMD,
-    PADDLE_OCR_DEVICE as _PADDLE_OCR_DEVICE,
-    PADDLE_OCR_DEVICES as _PADDLE_OCR_DEVICES_CONFIG,
-)
 
 logger = get_logger("pdf_converter_v2.utils.paddleocr")
 
@@ -48,8 +43,8 @@ except ImportError:
 def _get_paddleocr_executable() -> str:
     """返回 paddleocr 可执行文件路径或命令名,供 subprocess 使用。
     当以 systemd 等方式运行时 PATH 可能不包含 venv/bin,故优先使用当前 Python 同目录下的 paddleocr。
-    可通过配置 PADDLEOCR_CMD 显式指定(完整路径或命令名)。"""
-    cmd = _PADDLEOCR_CMD.strip()
+    可通过环境变量 PADDLEOCR_CMD 显式指定(完整路径或命令名)。"""
+    cmd = os.getenv("PADDLEOCR_CMD", "").strip()
     if cmd:
         return cmd
     # 与当前 Python 同目录(venv/bin)下的 paddleocr
@@ -75,11 +70,11 @@ def _get_paddle_ocr_devices() -> List[str]:
     with _PADDLE_OCR_DEVICE_LOCK:
         if _PADDLE_OCR_DEVICES:
             return _PADDLE_OCR_DEVICES
-        multi = _PADDLE_OCR_DEVICES_CONFIG.strip()
+        multi = os.getenv("PADDLE_OCR_DEVICES", "").strip()
         if multi:
             _PADDLE_OCR_DEVICES[:] = [d.strip() for d in multi.split(",") if d.strip()]
         if not _PADDLE_OCR_DEVICES:
-            single = _PADDLE_OCR_DEVICE.strip()
+            single = os.getenv("PADDLE_OCR_DEVICE", "").strip()
             if not single:
                 from .device_env import is_npu
                 if is_npu():
@@ -1971,4 +1966,3 @@ def extract_text_with_paragraphs_from_ocr_json(json_path: str, line_height_thres
     except Exception as e:
         logger.exception(f"[OCR文本提取] 处理失败: {e}")
         return ""
-