|
@@ -403,6 +403,52 @@ def has_recognition_garbage(text: str, min_repeat: int = 10) -> bool:
|
|
|
return bool(re.search(rf"(.)\1{{{min_repeat},}}", text))
|
|
return bool(re.search(rf"(.)\1{{{min_repeat},}}", text))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def is_paddleocr_number_garbage(
    text: str,
    *,
    min_digit_ratio: float = 0.8,
    min_number_count: int = 50,
    min_run_length: int = 20,
    max_number_digits: int = 4,
) -> bool:
    """Detect PaddleOCR output that is just a long run of counting numbers.

    This typically happens when an empty table is recognised: PaddleOCR
    misreads the table borders as a large amount of consecutive numbers
    (``1 2 3 4 5 ...``).

    The text is flagged only when all of the following hold:

    * digits make up more than ``min_digit_ratio`` of the non-whitespace
      characters;
    * more than ``min_number_count`` numbers of at most
      ``max_number_digits`` digits are present;
    * somewhere in that list there are more than ``min_run_length``
      back-to-back ``+1`` steps.

    Args:
        text: Text returned by PaddleOCR.
        min_digit_ratio: Digit share that must be exceeded (exclusive).
        min_number_count: Number count that must be exceeded (exclusive).
        min_run_length: Count of consecutive ``+1`` steps that must be
            exceeded (exclusive).
        max_number_digits: Longer digit runs are ignored when looking for
            the counting sequence (they still count towards the ratio).

    Returns:
        True when the text is a garbage recognition result that should be
        discarded, False otherwise (including for empty text).
    """
    if not text:
        return False

    digit_runs = re.findall(r"\d+", text)
    if not digit_runs:
        return False

    # str.split() without arguments splits on *every* whitespace character,
    # so tabs, carriage returns and full-width spaces are excluded from the
    # denominator just like ' ' and '\n'. At least one digit exists at this
    # point, so the count can never be zero.
    visible_chars = sum(len(chunk) for chunk in text.split())
    number_ratio = sum(len(run) for run in digit_runs) / visible_chars
    if number_ratio <= min_digit_ratio:
        return False

    # Only short numbers take part in the counting-sequence check.
    values = [int(run) for run in digit_runs if len(run) <= max_number_digits]
    if len(values) <= min_number_count:
        return False

    # Track the current streak of "next == previous + 1" steps.
    streak = 0
    for current, following in zip(values, values[1:]):
        if following != current + 1:
            streak = 0
            continue
        streak += 1
        if streak > min_run_length:
            logger.warning(f"[PaddleOCR 垃圾检测] 检测到大量连续递增数字,数字占比: {number_ratio:.2%}")
            return True

    return False
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def detect_file_type(file_path: str) -> Optional[str]:
|
|
def detect_file_type(file_path: str) -> Optional[str]:
|
|
|
"""通过文件内容(魔数)检测文件类型,不依赖扩展名
|
|
"""通过文件内容(魔数)检测文件类型,不依赖扩展名
|
|
|
|
|
|
|
@@ -659,6 +705,10 @@ def call_paddleocr(image_path: str) -> Optional[Dict[str, Any]]:
|
|
|
with open(md_file, 'r', encoding='utf-8') as f:
|
|
with open(md_file, 'r', encoding='utf-8') as f:
|
|
|
markdown_content = f.read()
|
|
markdown_content = f.read()
|
|
|
if markdown_content.strip():
|
|
if markdown_content.strip():
|
|
|
|
|
+ # 检测是否是垃圾识别结果(大量连续数字)
|
|
|
|
|
+ if is_paddleocr_number_garbage(markdown_content):
|
|
|
|
|
+ logger.warning("[PaddleOCR 图表识别] 检测到垃圾识别结果(大量连续数字),丢弃此结果")
|
|
|
|
|
+ return None
|
|
|
logger.info(f"[PaddleOCR 图表识别] 成功读取Markdown文件,内容长度: {len(markdown_content)} 字符")
|
|
logger.info(f"[PaddleOCR 图表识别] 成功读取Markdown文件,内容长度: {len(markdown_content)} 字符")
|
|
|
return {"markdown_content": markdown_content}
|
|
return {"markdown_content": markdown_content}
|
|
|
else:
|
|
else:
|