Browse Source

fix: converter.py 使用 config.yaml 的 backend/server_url 等配置,不再硬编码默认值

何文松 1 week ago
parent
commit
ace7956efd
1 changed file with 23 additions and 15 deletions
  1. 23 15
      pdf_converter_v2/processor/converter.py

+ 23 - 15
pdf_converter_v2/processor/converter.py

@@ -26,7 +26,15 @@ from ..utils.paddleocr_fallback import (
     _paddle_ocr_device_args,
     _get_paddleocr_subprocess_env,
 )
-from ..config import PADDLE_DOC_PARSER_CMD, VL_REC_BACKEND, VL_REC_SERVER_URL
+from ..config import (
+    PADDLE_DOC_PARSER_CMD, VL_REC_BACKEND, VL_REC_SERVER_URL,
+    DEFAULT_BACKEND, DEFAULT_SERVER_URL, DEFAULT_PARSE_METHOD,
+    DEFAULT_LANGUAGE, DEFAULT_START_PAGE_ID, DEFAULT_END_PAGE_ID,
+    DEFAULT_TABLE_ENABLE, DEFAULT_FORMULA_ENABLE,
+    DEFAULT_RESPONSE_FORMAT_ZIP, DEFAULT_RETURN_MIDDLE_JSON,
+    DEFAULT_RETURN_MODEL_OUTPUT, DEFAULT_RETURN_MD, DEFAULT_RETURN_IMAGES,
+    DEFAULT_RETURN_CONTENT_LIST,
+)
 
 logger = get_logger("pdf_converter_v2.processor")
 
@@ -225,23 +233,23 @@ async def convert_to_markdown(
     output_dir: str = "./output",
     max_pages: int = 10,
     is_ocr: bool = False,
-    formula_enable: bool = True,
-    table_enable: bool = True,
-    language: str = "ch",
-    backend: str = "vlm-vllm-async-engine",
+    formula_enable: bool = DEFAULT_FORMULA_ENABLE,
+    table_enable: bool = DEFAULT_TABLE_ENABLE,
+    language: str = DEFAULT_LANGUAGE,
+    backend: str = DEFAULT_BACKEND,
     url: str = "http://127.0.0.1:5282",
     embed_images: bool = True,
     output_json: bool = False,
-    start_page_id: int = 0,
-    end_page_id: int = 99999,
-    parse_method: str = "auto",
-    server_url: str = "string",
-    response_format_zip: bool = True,
-    return_middle_json: bool = False,
-    return_model_output: bool = True,
-    return_md: bool = True,
-    return_images: bool = True,  # 默认启用,以便PaddleOCR备用解析可以使用
-    return_content_list: bool = False,
+    start_page_id: int = DEFAULT_START_PAGE_ID,
+    end_page_id: int = DEFAULT_END_PAGE_ID if DEFAULT_END_PAGE_ID >= 0 else 99999,
+    parse_method: str = DEFAULT_PARSE_METHOD,
+    server_url: str = DEFAULT_SERVER_URL or "string",
+    response_format_zip: bool = DEFAULT_RESPONSE_FORMAT_ZIP,
+    return_middle_json: bool = DEFAULT_RETURN_MIDDLE_JSON,
+    return_model_output: bool = DEFAULT_RETURN_MODEL_OUTPUT,
+    return_md: bool = DEFAULT_RETURN_MD,
+    return_images: bool = DEFAULT_RETURN_IMAGES,
+    return_content_list: bool = DEFAULT_RETURN_CONTENT_LIST,
     forced_document_type: Optional[str] = None
 ):
     """将PDF/图片转换为Markdown的主要函数(使用新的API接口)"""