|
@@ -150,6 +150,16 @@ class ConversionRequest(BaseModel):
|
|
|
"""转换请求模型(v2 精简版)"""
|
|
"""转换请求模型(v2 精简版)"""
|
|
|
# 新增:强制文档类型(正式全称)
|
|
# 新增:强制文档类型(正式全称)
|
|
|
doc_type: Optional[str] = None
|
|
doc_type: Optional[str] = None
|
|
|
|
|
+ # 新增:去水印参数
|
|
|
|
|
+ remove_watermark: Optional[bool] = False
|
|
|
|
|
+ watermark_light_threshold: Optional[int] = 200
|
|
|
|
|
+ watermark_saturation_threshold: Optional[int] = 30
|
|
|
|
|
+ crop_header_footer: Optional[bool] = False
|
|
|
|
|
+ header_ratio: Optional[float] = 0.05
|
|
|
|
|
+ footer_ratio: Optional[float] = 0.05
|
|
|
|
|
+ auto_detect_header_footer: Optional[bool] = False
|
|
|
|
|
+ # 新增:附件页切割参数
|
|
|
|
|
+ table_only: Optional[bool] = False # 是否只保留包含表格的附件页(默认False)
|
|
|
|
|
|
|
|
|
|
|
|
|
class ConversionResponse(BaseModel):
|
|
class ConversionResponse(BaseModel):
|
|
@@ -200,6 +210,13 @@ class OCRRequest(BaseModel):
|
|
|
"""OCR识别请求模型"""
|
|
"""OCR识别请求模型"""
|
|
|
image_base64: str # base64编码的图片数据
|
|
image_base64: str # base64编码的图片数据
|
|
|
image_format: Optional[str] = "png" # 图片格式:png, jpg, jpeg
|
|
image_format: Optional[str] = "png" # 图片格式:png, jpg, jpeg
|
|
|
|
|
+ remove_watermark: Optional[bool] = False # 是否去除水印
|
|
|
|
|
+ watermark_light_threshold: Optional[int] = 200 # 水印亮度阈值(0-255),高于此值的浅色像素可能是水印
|
|
|
|
|
+ watermark_saturation_threshold: Optional[int] = 30 # 水印饱和度阈值(0-255),低于此值的低饱和度像素可能是水印
|
|
|
|
|
+ crop_header_footer: Optional[bool] = False # 是否裁剪页眉页脚
|
|
|
|
|
+ header_ratio: Optional[float] = 0.05 # 页眉裁剪比例(0-1),默认5%
|
|
|
|
|
+ footer_ratio: Optional[float] = 0.05 # 页脚裁剪比例(0-1),默认5%
|
|
|
|
|
+ auto_detect_header_footer: Optional[bool] = False # 是否自动检测页眉页脚边界
|
|
|
|
|
|
|
|
|
|
|
|
|
class OCRResponse(BaseModel):
|
|
class OCRResponse(BaseModel):
|
|
@@ -308,11 +325,108 @@ async def process_conversion_task(
|
|
|
|
|
|
|
|
logger.info(f"[任务 {task_id}] 开始处理: {file_path}")
|
|
logger.info(f"[任务 {task_id}] 开始处理: {file_path}")
|
|
|
|
|
|
|
|
|
|
+ # 文件预处理(支持图片和PDF)
|
|
|
|
|
+ from pathlib import Path as PathLib
|
|
|
|
|
+ file_suffix = PathLib(file_path).suffix.lower()
|
|
|
|
|
+ is_image = file_suffix in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif']
|
|
|
|
|
+ is_pdf = file_suffix == '.pdf'
|
|
|
|
|
+
|
|
|
|
|
+ # 图片预处理:去水印或裁剪页眉页脚
|
|
|
|
|
+ if is_image and (request.remove_watermark or request.crop_header_footer):
|
|
|
|
|
+ logger.info(f"[任务 {task_id}] 检测到图片文件,开始预处理...")
|
|
|
|
|
+ preprocessed_path = file_path
|
|
|
|
|
+
|
|
|
|
|
+ # 裁剪页眉页脚
|
|
|
|
|
+ if request.crop_header_footer:
|
|
|
|
|
+ try:
|
|
|
|
|
+ from ..utils.image_preprocessor import crop_header_footer, check_opencv_available
|
|
|
|
|
+
|
|
|
|
|
+ if check_opencv_available():
|
|
|
|
|
+ if request.auto_detect_header_footer:
|
|
|
|
|
+ logger.info(f"[任务 {task_id}] 开始自动检测并裁剪页眉页脚")
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.info(f"[任务 {task_id}] 开始裁剪页眉页脚,顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
|
|
|
|
|
+
|
|
|
|
|
+ # 裁剪后的图片路径
|
|
|
|
|
+ cropped_path = str(PathLib(output_dir) / f"preprocessed_cropped{file_suffix}")
|
|
|
|
|
+
|
|
|
|
|
+ preprocessed_path = await asyncio.to_thread(
|
|
|
|
|
+ crop_header_footer,
|
|
|
|
|
+ preprocessed_path,
|
|
|
|
|
+ output_path=cropped_path,
|
|
|
|
|
+ header_ratio=request.header_ratio or 0.05,
|
|
|
|
|
+ footer_ratio=request.footer_ratio or 0.05,
|
|
|
|
|
+ auto_detect=request.auto_detect_header_footer or False
|
|
|
|
|
+ )
|
|
|
|
|
+ logger.info(f"[任务 {task_id}] 裁剪页眉页脚完成: {preprocessed_path}")
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.warning(f"[任务 {task_id}] OpenCV 未安装,跳过裁剪页眉页脚")
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.warning(f"[任务 {task_id}] 裁剪页眉页脚失败,使用原图继续: {e}")
|
|
|
|
|
+
|
|
|
|
|
+ # 去水印
|
|
|
|
|
+ if request.remove_watermark:
|
|
|
|
|
+ try:
|
|
|
|
|
+ from ..utils.image_preprocessor import remove_watermark, check_opencv_available
|
|
|
|
|
+
|
|
|
|
|
+ if check_opencv_available():
|
|
|
|
|
+ logger.info(f"[任务 {task_id}] 开始去水印处理,亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
|
|
|
|
|
+
|
|
|
|
|
+ # 去水印后的图片路径
|
|
|
|
|
+ nowm_path = str(PathLib(output_dir) / f"preprocessed_nowm{file_suffix}")
|
|
|
|
|
+
|
|
|
|
|
+ preprocessed_path = await asyncio.to_thread(
|
|
|
|
|
+ remove_watermark,
|
|
|
|
|
+ preprocessed_path,
|
|
|
|
|
+ output_path=nowm_path,
|
|
|
|
|
+ light_threshold=request.watermark_light_threshold or 200,
|
|
|
|
|
+ saturation_threshold=request.watermark_saturation_threshold or 30,
|
|
|
|
|
+ method="hsv"
|
|
|
|
|
+ )
|
|
|
|
|
+ logger.info(f"[任务 {task_id}] 去水印完成: {preprocessed_path}")
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.warning(f"[任务 {task_id}] OpenCV 未安装,跳过去水印处理")
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.warning(f"[任务 {task_id}] 去水印处理失败,使用原图继续: {e}")
|
|
|
|
|
+
|
|
|
|
|
+ # 更新文件路径为预处理后的路径
|
|
|
|
|
+ if preprocessed_path != file_path:
|
|
|
|
|
+ file_path = preprocessed_path
|
|
|
|
|
+ logger.info(f"[任务 {task_id}] 图片预处理完成,使用预处理后的文件: {file_path}")
|
|
|
|
|
+
|
|
|
|
|
+ # PDF预处理:去水印
|
|
|
|
|
+ elif is_pdf and request.remove_watermark:
|
|
|
|
|
+ logger.info(f"[任务 {task_id}] 检测到PDF文件,开始去水印预处理...")
|
|
|
|
|
+ try:
|
|
|
|
|
+ from ..utils.pdf_watermark_remover import remove_watermark_from_pdf
|
|
|
|
|
+
|
|
|
|
|
+ # 去水印后的PDF路径
|
|
|
|
|
+ nowm_pdf_path = str(PathLib(output_dir) / f"preprocessed_nowm.pdf")
|
|
|
|
|
+
|
|
|
|
|
+ # 执行去水印
|
|
|
|
|
+ logger.info(f"[任务 {task_id}] 开始PDF去水印处理,亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
|
|
|
|
|
+ success = await asyncio.to_thread(
|
|
|
|
|
+ remove_watermark_from_pdf,
|
|
|
|
|
+ input_pdf=file_path,
|
|
|
|
|
+ output_pdf=nowm_pdf_path,
|
|
|
|
|
+ light_threshold=request.watermark_light_threshold or 200,
|
|
|
|
|
+ saturation_threshold=request.watermark_saturation_threshold or 30,
|
|
|
|
|
+ dpi=200 # PDF转图片的DPI
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ if success and PathLib(nowm_pdf_path).exists():
|
|
|
|
|
+ file_path = nowm_pdf_path
|
|
|
|
|
+ logger.info(f"[任务 {task_id}] PDF去水印完成: {file_path}")
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.warning(f"[任务 {task_id}] PDF去水印失败,使用原PDF继续")
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.warning(f"[任务 {task_id}] PDF去水印处理失败,使用原PDF继续: {e}")
|
|
|
|
|
+
|
|
|
result = None
|
|
result = None
|
|
|
tables_info = None
|
|
tables_info = None
|
|
|
|
|
|
|
|
# 针对投资估算类型,需要先切割附件页
|
|
# 针对投资估算类型,需要先切割附件页
|
|
|
- if request.doc_type in ("fsApproval", "fsReview", "pdApproval"):
|
|
|
|
|
|
|
+ if request.doc_type in ("fsApproval", "fsReview", "pdApproval", "safetyFsApproval"):
|
|
|
logger.info(f"[任务 {task_id}] 文档类型 {request.doc_type},需要先切割附件页")
|
|
logger.info(f"[任务 {task_id}] 文档类型 {request.doc_type},需要先切割附件页")
|
|
|
|
|
|
|
|
# 导入附件页切割函数
|
|
# 导入附件页切割函数
|
|
@@ -328,18 +442,21 @@ async def process_conversion_task(
|
|
|
attachment_dir = PathLib(output_dir) / "attachments"
|
|
attachment_dir = PathLib(output_dir) / "attachments"
|
|
|
attachment_dir.mkdir(parents=True, exist_ok=True)
|
|
attachment_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
|
- # 切割附件页
|
|
|
|
|
- logger.info(f"[任务 {task_id}] 开始切割附件页,输出目录: {attachment_dir}")
|
|
|
|
|
|
|
+ # 切割附件页(根据 table_only 参数决定是否过滤非表格内容)
|
|
|
|
|
+ logger.info(f"[任务 {task_id}] 开始切割附件页(table_only={request.table_only}),输出目录: {attachment_dir}")
|
|
|
await asyncio.to_thread(
|
|
await asyncio.to_thread(
|
|
|
split_attachment_pages,
|
|
split_attachment_pages,
|
|
|
file_path,
|
|
file_path,
|
|
|
attachment_dir,
|
|
attachment_dir,
|
|
|
use_ocr=True,
|
|
use_ocr=True,
|
|
|
- debug=False
|
|
|
|
|
|
|
+ debug=False,
|
|
|
|
|
+ table_only=request.table_only # 是否只保留包含表格的附件页
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
- # 查找切割后的附件页PDF
|
|
|
|
|
- attachment_pdfs = list(attachment_dir.glob("*_附件页_*.pdf"))
|
|
|
|
|
|
|
+ # 查找切割后的附件页PDF(优先使用表格附件页,其次使用普通附件页)
|
|
|
|
|
+ attachment_pdfs = list(attachment_dir.glob("*_表格附件页_*.pdf"))
|
|
|
|
|
+ if not attachment_pdfs:
|
|
|
|
|
+ attachment_pdfs = list(attachment_dir.glob("*_附件页_*.pdf"))
|
|
|
logger.info(f"[任务 {task_id}] 附件页目录内容: {list(attachment_dir.iterdir()) if attachment_dir.exists() else '(目录不存在)'}")
|
|
logger.info(f"[任务 {task_id}] 附件页目录内容: {list(attachment_dir.iterdir()) if attachment_dir.exists() else '(目录不存在)'}")
|
|
|
|
|
|
|
|
if attachment_pdfs:
|
|
if attachment_pdfs:
|
|
@@ -532,11 +649,44 @@ async def process_conversion_task(
|
|
|
@app.post("/convert", response_model=ConversionResponse)
|
|
@app.post("/convert", response_model=ConversionResponse)
|
|
|
async def convert_file(
|
|
async def convert_file(
|
|
|
file: Annotated[UploadFile, File(description="上传的PDF或图片文件")],
|
|
file: Annotated[UploadFile, File(description="上传的PDF或图片文件")],
|
|
|
- # 新增:类型参数(英文传参) noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount
|
|
|
|
|
|
|
+ # 新增:类型参数(英文传参) noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount | safetyFsApproval
|
|
|
type: Annotated[
|
|
type: Annotated[
|
|
|
- Optional[Literal["noiseRec", "emRec", "opStatus", "settlementReport", "designReview", "fsApproval", "fsReview", "pdApproval", "finalAccount"]],
|
|
|
|
|
- Form(description="文档类型:noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount")
|
|
|
|
|
|
|
+ Optional[Literal["noiseRec", "emRec", "opStatus", "settlementReport", "designReview", "fsApproval", "fsReview", "pdApproval", "finalAccount", "safetyFsApproval"]],
|
|
|
|
|
+ Form(description="文档类型:noiseRec | emRec | opStatus | settlementReport | designReview | fsApproval | fsReview | pdApproval | finalAccount | safetyFsApproval")
|
|
|
] = None,
|
|
] = None,
|
|
|
|
|
+ # 新增:去水印参数
|
|
|
|
|
+ remove_watermark: Annotated[
|
|
|
|
|
+ Optional[bool],
|
|
|
|
|
+ Form(description="是否去除水印,默认为false")
|
|
|
|
|
+ ] = False,
|
|
|
|
|
+ watermark_light_threshold: Annotated[
|
|
|
|
|
+ Optional[int],
|
|
|
|
|
+ Form(description="水印亮度阈值(0-255),默认200,高于此值的浅色像素可能是水印")
|
|
|
|
|
+ ] = 200,
|
|
|
|
|
+ watermark_saturation_threshold: Annotated[
|
|
|
|
|
+ Optional[int],
|
|
|
|
|
+ Form(description="水印饱和度阈值(0-255),默认30,低于此值的低饱和度像素可能是水印")
|
|
|
|
|
+ ] = 30,
|
|
|
|
|
+ crop_header_footer: Annotated[
|
|
|
|
|
+ Optional[bool],
|
|
|
|
|
+ Form(description="是否裁剪页眉页脚,默认为false")
|
|
|
|
|
+ ] = False,
|
|
|
|
|
+ header_ratio: Annotated[
|
|
|
|
|
+ Optional[float],
|
|
|
|
|
+ Form(description="页眉裁剪比例(0-1),默认0.05表示裁剪顶部5%")
|
|
|
|
|
+ ] = 0.05,
|
|
|
|
|
+ footer_ratio: Annotated[
|
|
|
|
|
+ Optional[float],
|
|
|
|
|
+ Form(description="页脚裁剪比例(0-1),默认0.05表示裁剪底部5%")
|
|
|
|
|
+ ] = 0.05,
|
|
|
|
|
+ auto_detect_header_footer: Annotated[
|
|
|
|
|
+ Optional[bool],
|
|
|
|
|
+ Form(description="是否自动检测页眉页脚边界,默认为false(启用后忽略header_ratio和footer_ratio)")
|
|
|
|
|
+ ] = False,
|
|
|
|
|
+ table_only: Annotated[
|
|
|
|
|
+ Optional[bool],
|
|
|
|
|
+ Form(description="是否只保留包含表格的附件页,默认为false")
|
|
|
|
|
+ ] = False,
|
|
|
):
|
|
):
|
|
|
"""
|
|
"""
|
|
|
转换PDF/图片文件(异步处理)
|
|
转换PDF/图片文件(异步处理)
|
|
@@ -557,6 +707,16 @@ async def convert_file(
|
|
|
* fsApproval - 可研批复投资估算
|
|
* fsApproval - 可研批复投资估算
|
|
|
* fsReview - 可研评审投资估算
|
|
* fsReview - 可研评审投资估算
|
|
|
* pdApproval - 初设批复概算投资
|
|
* pdApproval - 初设批复概算投资
|
|
|
|
|
+ * finalAccount - 决算报告
|
|
|
|
|
+ * safetyFsApproval - 安评可研批复投资估算
|
|
|
|
|
+ - **remove_watermark**: 是否去除水印(仅对图片有效),默认为false
|
|
|
|
|
+ - **watermark_light_threshold**: 水印亮度阈值(0-255),默认200
|
|
|
|
|
+ - **watermark_saturation_threshold**: 水印饱和度阈值(0-255),默认30
|
|
|
|
|
+ - **crop_header_footer**: 是否裁剪页眉页脚(仅对图片有效),默认为false
|
|
|
|
|
+ - **header_ratio**: 页眉裁剪比例(0-1),默认0.05
|
|
|
|
|
+ - **footer_ratio**: 页脚裁剪比例(0-1),默认0.05
|
|
|
|
|
+ - **auto_detect_header_footer**: 是否自动检测页眉页脚边界,默认为false
|
|
|
|
|
+ - **table_only**: 是否只保留包含表格的附件页,默认为false
|
|
|
|
|
|
|
|
注意:v2 版本内部使用外部API进行转换,v2特有的配置参数(如API URL、backend等)
|
|
注意:v2 版本内部使用外部API进行转换,v2特有的配置参数(如API URL、backend等)
|
|
|
通过环境变量或配置文件设置,不通过API参数传入。
|
|
通过环境变量或配置文件设置,不通过API参数传入。
|
|
@@ -677,6 +837,8 @@ async def convert_file(
|
|
|
"pdApproval": "pdApproval",
|
|
"pdApproval": "pdApproval",
|
|
|
# 决算报告
|
|
# 决算报告
|
|
|
"finalAccount": "finalAccount",
|
|
"finalAccount": "finalAccount",
|
|
|
|
|
+ # 安评类
|
|
|
|
|
+ "safetyFsApproval": "safetyFsApproval",
|
|
|
}
|
|
}
|
|
|
doc_type = None
|
|
doc_type = None
|
|
|
if type:
|
|
if type:
|
|
@@ -692,6 +854,14 @@ async def convert_file(
|
|
|
# 创建请求对象(v2 精简)
|
|
# 创建请求对象(v2 精简)
|
|
|
request = ConversionRequest(
|
|
request = ConversionRequest(
|
|
|
doc_type=doc_type,
|
|
doc_type=doc_type,
|
|
|
|
|
+ remove_watermark=remove_watermark,
|
|
|
|
|
+ watermark_light_threshold=watermark_light_threshold,
|
|
|
|
|
+ watermark_saturation_threshold=watermark_saturation_threshold,
|
|
|
|
|
+ crop_header_footer=crop_header_footer,
|
|
|
|
|
+ header_ratio=header_ratio,
|
|
|
|
|
+ footer_ratio=footer_ratio,
|
|
|
|
|
+ auto_detect_header_footer=auto_detect_header_footer,
|
|
|
|
|
+ table_only=table_only,
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
# 使用 asyncio.create_task 创建后台任务,确保立即返回
|
|
# 使用 asyncio.create_task 创建后台任务,确保立即返回
|
|
@@ -875,6 +1045,13 @@ async def ocr_image(request: OCRRequest):
|
|
|
|
|
|
|
|
- **image_base64**: base64编码的图片数据(可以包含data:image/xxx;base64,前缀)
|
|
- **image_base64**: base64编码的图片数据(可以包含data:image/xxx;base64,前缀)
|
|
|
- **image_format**: 图片格式(png, jpg, jpeg),默认为png
|
|
- **image_format**: 图片格式(png, jpg, jpeg),默认为png
|
|
|
|
|
+ - **remove_watermark**: 是否去除水印,默认为false
|
|
|
|
|
+ - **watermark_light_threshold**: 水印亮度阈值(0-255),默认200,高于此值的浅色像素可能是水印
|
|
|
|
|
+ - **watermark_saturation_threshold**: 水印饱和度阈值(0-255),默认30,低于此值的低饱和度像素可能是水印
|
|
|
|
|
+ - **crop_header_footer**: 是否裁剪页眉页脚,默认为false
|
|
|
|
|
+ - **header_ratio**: 页眉裁剪比例(0-1),默认0.05表示裁剪顶部5%
|
|
|
|
|
+ - **footer_ratio**: 页脚裁剪比例(0-1),默认0.05表示裁剪底部5%
|
|
|
|
|
+ - **auto_detect_header_footer**: 是否自动检测页眉页脚边界,默认为false(启用后忽略header_ratio和footer_ratio)
|
|
|
|
|
|
|
|
返回识别出的文本列表和GPU监控信息
|
|
返回识别出的文本列表和GPU监控信息
|
|
|
"""
|
|
"""
|
|
@@ -934,6 +1111,57 @@ async def ocr_image(request: OCRRequest):
|
|
|
f.write(image_bytes)
|
|
f.write(image_bytes)
|
|
|
logger.info(f"[OCR] 图片已保存: {image_path}")
|
|
logger.info(f"[OCR] 图片已保存: {image_path}")
|
|
|
|
|
|
|
|
|
|
+ # 如果需要裁剪页眉页脚,先进行裁剪
|
|
|
|
|
+ if request.crop_header_footer:
|
|
|
|
|
+ try:
|
|
|
|
|
+ from ..utils.image_preprocessor import crop_header_footer, check_opencv_available
|
|
|
|
|
+
|
|
|
|
|
+ if check_opencv_available():
|
|
|
|
|
+ if request.auto_detect_header_footer:
|
|
|
|
|
+ logger.info("[OCR] 开始自动检测并裁剪页眉页脚")
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.info(f"[OCR] 开始裁剪页眉页脚,顶部比例: {request.header_ratio}, 底部比例: {request.footer_ratio}")
|
|
|
|
|
+
|
|
|
|
|
+ # 裁剪后的图片路径
|
|
|
|
|
+ cropped_image_path = os.path.join(temp_dir, f"ocr_image_cropped{ext}")
|
|
|
|
|
+
|
|
|
|
|
+ image_path = crop_header_footer(
|
|
|
|
|
+ image_path,
|
|
|
|
|
+ output_path=cropped_image_path,
|
|
|
|
|
+ header_ratio=request.header_ratio or 0.05,
|
|
|
|
|
+ footer_ratio=request.footer_ratio or 0.05,
|
|
|
|
|
+ auto_detect=request.auto_detect_header_footer or False
|
|
|
|
|
+ )
|
|
|
|
|
+ logger.info(f"[OCR] 裁剪页眉页脚完成: {image_path}")
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.warning("[OCR] OpenCV 未安装,跳过裁剪页眉页脚")
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.warning(f"[OCR] 裁剪页眉页脚失败,使用原图继续: {e}")
|
|
|
|
|
+
|
|
|
|
|
+ # 如果需要去水印,进行预处理
|
|
|
|
|
+ if request.remove_watermark:
|
|
|
|
|
+ try:
|
|
|
|
|
+ from ..utils.image_preprocessor import remove_watermark, check_opencv_available
|
|
|
|
|
+
|
|
|
|
|
+ if check_opencv_available():
|
|
|
|
|
+ logger.info(f"[OCR] 开始去水印处理,亮度阈值: {request.watermark_light_threshold}, 饱和度阈值: {request.watermark_saturation_threshold}")
|
|
|
|
|
+
|
|
|
|
|
+ # 去水印后的图片路径
|
|
|
|
|
+ nowm_image_path = os.path.join(temp_dir, f"ocr_image_nowm{ext}")
|
|
|
|
|
+
|
|
|
|
|
+ image_path = remove_watermark(
|
|
|
|
|
+ image_path,
|
|
|
|
|
+ output_path=nowm_image_path,
|
|
|
|
|
+ light_threshold=request.watermark_light_threshold or 200,
|
|
|
|
|
+ saturation_threshold=request.watermark_saturation_threshold or 30,
|
|
|
|
|
+ method="hsv"
|
|
|
|
|
+ )
|
|
|
|
|
+ logger.info(f"[OCR] 去水印完成: {image_path}")
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.warning("[OCR] OpenCV 未安装,跳过去水印处理")
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.warning(f"[OCR] 去水印处理失败,使用原图继续: {e}")
|
|
|
|
|
+
|
|
|
# 调用PaddleOCR进行识别(监控线程在此期间持续采集数据)
|
|
# 调用PaddleOCR进行识别(监控线程在此期间持续采集数据)
|
|
|
from ..utils.paddleocr_fallback import call_paddleocr_ocr
|
|
from ..utils.paddleocr_fallback import call_paddleocr_ocr
|
|
|
|
|
|