|
@@ -26,7 +26,15 @@ from ..utils.paddleocr_fallback import (
|
|
|
_paddle_ocr_device_args,
|
|
_paddle_ocr_device_args,
|
|
|
_get_paddleocr_subprocess_env,
|
|
_get_paddleocr_subprocess_env,
|
|
|
)
|
|
)
|
|
|
-from ..config import PADDLE_DOC_PARSER_CMD, VL_REC_BACKEND, VL_REC_SERVER_URL
|
|
|
|
|
|
|
+from ..config import (
|
|
|
|
|
+ PADDLE_DOC_PARSER_CMD, VL_REC_BACKEND, VL_REC_SERVER_URL,
|
|
|
|
|
+ DEFAULT_BACKEND, DEFAULT_SERVER_URL, DEFAULT_PARSE_METHOD,
|
|
|
|
|
+ DEFAULT_LANGUAGE, DEFAULT_START_PAGE_ID, DEFAULT_END_PAGE_ID,
|
|
|
|
|
+ DEFAULT_TABLE_ENABLE, DEFAULT_FORMULA_ENABLE,
|
|
|
|
|
+ DEFAULT_RESPONSE_FORMAT_ZIP, DEFAULT_RETURN_MIDDLE_JSON,
|
|
|
|
|
+ DEFAULT_RETURN_MODEL_OUTPUT, DEFAULT_RETURN_MD, DEFAULT_RETURN_IMAGES,
|
|
|
|
|
+ DEFAULT_RETURN_CONTENT_LIST,
|
|
|
|
|
+)
|
|
|
|
|
|
|
|
logger = get_logger("pdf_converter_v2.processor")
|
|
logger = get_logger("pdf_converter_v2.processor")
|
|
|
|
|
|
|
@@ -225,23 +233,23 @@ async def convert_to_markdown(
|
|
|
output_dir: str = "./output",
|
|
output_dir: str = "./output",
|
|
|
max_pages: int = 10,
|
|
max_pages: int = 10,
|
|
|
is_ocr: bool = False,
|
|
is_ocr: bool = False,
|
|
|
- formula_enable: bool = True,
|
|
|
|
|
- table_enable: bool = True,
|
|
|
|
|
- language: str = "ch",
|
|
|
|
|
- backend: str = "vlm-vllm-async-engine",
|
|
|
|
|
|
|
+ formula_enable: bool = DEFAULT_FORMULA_ENABLE,
|
|
|
|
|
+ table_enable: bool = DEFAULT_TABLE_ENABLE,
|
|
|
|
|
+ language: str = DEFAULT_LANGUAGE,
|
|
|
|
|
+ backend: str = DEFAULT_BACKEND,
|
|
|
url: str = "http://127.0.0.1:5282",
|
|
url: str = "http://127.0.0.1:5282",
|
|
|
embed_images: bool = True,
|
|
embed_images: bool = True,
|
|
|
output_json: bool = False,
|
|
output_json: bool = False,
|
|
|
- start_page_id: int = 0,
|
|
|
|
|
- end_page_id: int = 99999,
|
|
|
|
|
- parse_method: str = "auto",
|
|
|
|
|
- server_url: str = "string",
|
|
|
|
|
- response_format_zip: bool = True,
|
|
|
|
|
- return_middle_json: bool = False,
|
|
|
|
|
- return_model_output: bool = True,
|
|
|
|
|
- return_md: bool = True,
|
|
|
|
|
- return_images: bool = True, # 默认启用,以便PaddleOCR备用解析可以使用
|
|
|
|
|
- return_content_list: bool = False,
|
|
|
|
|
|
|
+ start_page_id: int = DEFAULT_START_PAGE_ID,
|
|
|
|
|
+ end_page_id: int = DEFAULT_END_PAGE_ID if DEFAULT_END_PAGE_ID >= 0 else 99999,
|
|
|
|
|
+ parse_method: str = DEFAULT_PARSE_METHOD,
|
|
|
|
|
+ server_url: str = DEFAULT_SERVER_URL or "string",
|
|
|
|
|
+ response_format_zip: bool = DEFAULT_RESPONSE_FORMAT_ZIP,
|
|
|
|
|
+ return_middle_json: bool = DEFAULT_RETURN_MIDDLE_JSON,
|
|
|
|
|
+ return_model_output: bool = DEFAULT_RETURN_MODEL_OUTPUT,
|
|
|
|
|
+ return_md: bool = DEFAULT_RETURN_MD,
|
|
|
|
|
+ return_images: bool = DEFAULT_RETURN_IMAGES,
|
|
|
|
|
+ return_content_list: bool = DEFAULT_RETURN_CONTENT_LIST,
|
|
|
forced_document_type: Optional[str] = None
|
|
forced_document_type: Optional[str] = None
|
|
|
):
|
|
):
|
|
|
"""将PDF/图片转换为Markdown的主要函数(使用新的API接口)"""
|
|
"""将PDF/图片转换为Markdown的主要函数(使用新的API接口)"""
|