| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526 |
- """
- 图像预处理工具 - 包含去水印等功能
- 支持的预处理操作:
- - 去水印(颜色过滤法)
- - 灰度转换
- - 二值化
- - 去噪
- """
- import numpy as np
- from pathlib import Path
- from typing import Optional, Tuple
- from loguru import logger
- try:
- from PIL import Image
- PIL_AVAILABLE = True
- except ImportError:
- PIL_AVAILABLE = False
- logger.warning("[图像预处理] PIL 未安装,部分功能不可用")
- try:
- import cv2
- CV2_AVAILABLE = True
- except ImportError:
- CV2_AVAILABLE = False
- logger.warning("[图像预处理] OpenCV 未安装,部分功能不可用")
- def remove_watermark(
- image_path: str,
- output_path: Optional[str] = None,
- light_threshold: int = 200,
- saturation_threshold: int = 30,
- method: str = "auto"
- ) -> str:
- """
- 去除图片水印
-
- 原理:大多数水印是浅色或半透明的,通过以下方式去除:
- 1. 将浅色像素(亮度高、饱和度低)替换为白色
- 2. 保留深色文字内容
-
- Args:
- image_path: 输入图片路径
- output_path: 输出图片路径,默认在原文件名后加 _nowm
- light_threshold: 亮度阈值(0-255),高于此值的浅色像素可能是水印
- saturation_threshold: 饱和度阈值(0-255),低于此值的低饱和度像素可能是水印
- method: 去水印方法
- - "auto": 自动选择最佳方法
- - "light": 基于亮度的简单方法(快速)
- - "hsv": 基于HSV颜色空间的方法(更精确)
- - "adaptive": 自适应阈值方法
-
- Returns:
- 处理后的图片路径
- """
- if not CV2_AVAILABLE:
- logger.warning("[去水印] OpenCV 未安装,跳过去水印处理")
- return image_path
-
- logger.info(f"[去水印] 开始处理: {image_path}")
- logger.info(f"[去水印] 方法: {method}, 亮度阈值: {light_threshold}, 饱和度阈值: {saturation_threshold}")
-
- # 读取图片
- img = cv2.imread(image_path)
- if img is None:
- logger.error(f"[去水印] 无法读取图片: {image_path}")
- return image_path
-
- original_shape = img.shape
- logger.info(f"[去水印] 图片尺寸: {original_shape[1]}x{original_shape[0]}")
-
- # 根据方法选择处理逻辑
- if method == "auto":
- # 自动检测:先尝试 HSV 方法,如果效果不好则用 adaptive
- method = "hsv"
-
- if method == "light":
- # 简单亮度方法:将浅色像素替换为白色
- result = _remove_watermark_light(img, light_threshold)
- elif method == "hsv":
- # HSV 方法:基于亮度和饱和度
- result = _remove_watermark_hsv(img, light_threshold, saturation_threshold)
- elif method == "adaptive":
- # 自适应方法:使用自适应阈值
- result = _remove_watermark_adaptive(img)
- else:
- logger.warning(f"[去水印] 未知方法: {method},使用 hsv")
- result = _remove_watermark_hsv(img, light_threshold, saturation_threshold)
-
- # 确定输出路径
- if output_path is None:
- path = Path(image_path)
- output_path = str(path.parent / f"{path.stem}_nowm{path.suffix}")
-
- # 保存结果
- cv2.imwrite(output_path, result)
- logger.info(f"[去水印] 处理完成,保存到: {output_path}")
-
- return output_path
- def _remove_watermark_light(img: np.ndarray, threshold: int = 200) -> np.ndarray:
- """
- 简单亮度方法:将浅色像素替换为白色
-
- 适用于:浅色/灰色水印
- """
- # 转为灰度图
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- # 创建掩码:亮度高于阈值的区域
- mask = gray > threshold
-
- # 将掩码区域设为白色
- result = img.copy()
- result[mask] = [255, 255, 255]
-
- return result
- def _remove_watermark_hsv(
- img: np.ndarray,
- light_threshold: int = 200,
- saturation_threshold: int = 30
- ) -> np.ndarray:
- """
- HSV 方法:基于亮度和饱和度去除水印
-
- 原理:水印通常是高亮度、低饱和度的
- 适用于:彩色水印、半透明水印
- """
- # 转换到 HSV 颜色空间
- hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
-
- # 分离通道
- h, s, v = cv2.split(hsv)
-
- # 创建水印掩码:高亮度 AND 低饱和度
- watermark_mask = (v > light_threshold) & (s < saturation_threshold)
-
- # 将水印区域设为白色
- result = img.copy()
- result[watermark_mask] = [255, 255, 255]
-
- # 可选:对边缘进行平滑处理
- # kernel = np.ones((3, 3), np.uint8)
- # watermark_mask_dilated = cv2.dilate(watermark_mask.astype(np.uint8), kernel, iterations=1)
- # result[watermark_mask_dilated == 1] = [255, 255, 255]
-
- return result
- def _remove_watermark_adaptive(img: np.ndarray) -> np.ndarray:
- """
- 自适应阈值方法
-
- 适用于:复杂背景、不均匀光照
- """
- # 转为灰度图
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- # 使用自适应阈值
- # 这会根据局部区域计算阈值,保留文字,去除背景和水印
- binary = cv2.adaptiveThreshold(
- gray, 255,
- cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
- cv2.THRESH_BINARY,
- blockSize=15,
- C=10
- )
-
- # 转回 BGR(3通道)
- result = cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)
-
- return result
- def enhance_for_ocr(
- image_path: str,
- output_path: Optional[str] = None,
- remove_wm: bool = True,
- denoise: bool = True,
- sharpen: bool = False
- ) -> str:
- """
- OCR 预处理增强
-
- 组合多种预处理操作,优化 OCR 识别效果
-
- Args:
- image_path: 输入图片路径
- output_path: 输出图片路径
- remove_wm: 是否去除水印
- denoise: 是否去噪
- sharpen: 是否锐化
-
- Returns:
- 处理后的图片路径
- """
- if not CV2_AVAILABLE:
- logger.warning("[OCR预处理] OpenCV 未安装,跳过预处理")
- return image_path
-
- logger.info(f"[OCR预处理] 开始处理: {image_path}")
-
- # 读取图片
- img = cv2.imread(image_path)
- if img is None:
- logger.error(f"[OCR预处理] 无法读取图片: {image_path}")
- return image_path
-
- result = img.copy()
-
- # 1. 去水印
- if remove_wm:
- result = _remove_watermark_hsv(result)
- logger.info("[OCR预处理] 已去除水印")
-
- # 2. 去噪
- if denoise:
- result = cv2.fastNlMeansDenoisingColored(result, None, 10, 10, 7, 21)
- logger.info("[OCR预处理] 已去噪")
-
- # 3. 锐化
- if sharpen:
- kernel = np.array([[-1, -1, -1],
- [-1, 9, -1],
- [-1, -1, -1]])
- result = cv2.filter2D(result, -1, kernel)
- logger.info("[OCR预处理] 已锐化")
-
- # 确定输出路径
- if output_path is None:
- path = Path(image_path)
- output_path = str(path.parent / f"{path.stem}_enhanced{path.suffix}")
-
- # 保存结果
- cv2.imwrite(output_path, result)
- logger.info(f"[OCR预处理] 处理完成,保存到: {output_path}")
-
- return output_path
- def check_opencv_available() -> bool:
- """检查 OpenCV 是否可用"""
- return CV2_AVAILABLE
- def crop_header_footer(
- image_path: str,
- output_path: Optional[str] = None,
- header_ratio: float = 0.05,
- footer_ratio: float = 0.05,
- auto_detect: bool = False
- ) -> str:
- """
- 裁剪图片的页眉和页脚区域
-
- 通过按比例裁剪图片顶部和底部来去除页眉页脚
-
- Args:
- image_path: 输入图片路径
- output_path: 输出图片路径,默认在原文件名后加 _cropped
- header_ratio: 页眉裁剪比例(0-1),默认0.05表示裁剪顶部5%
- footer_ratio: 页脚裁剪比例(0-1),默认0.05表示裁剪底部5%
- auto_detect: 是否自动检测页眉页脚边界(忽略 header_ratio 和 footer_ratio)
-
- Returns:
- 处理后的图片路径
- """
- if not CV2_AVAILABLE:
- logger.warning("[裁剪页眉页脚] OpenCV 未安装,跳过处理")
- return image_path
-
- logger.info(f"[裁剪页眉页脚] 开始处理: {image_path}")
-
- # 读取图片
- img = cv2.imread(image_path)
- if img is None:
- logger.error(f"[裁剪页眉页脚] 无法读取图片: {image_path}")
- return image_path
-
- height, width = img.shape[:2]
- logger.info(f"[裁剪页眉页脚] 原始尺寸: {width}x{height}")
-
- if auto_detect:
- # 自动检测页眉页脚边界
- logger.info("[裁剪页眉页脚] 使用自动检测模式")
- header_pixels, footer_pixels = _detect_header_footer_boundaries(img)
- logger.info(f"[裁剪页眉页脚] 自动检测结果: 页眉={header_pixels}px, 页脚={footer_pixels}px")
- else:
- # 使用固定比例
- logger.info(f"[裁剪页眉页脚] 使用固定比例: 页眉={header_ratio}, 页脚={footer_ratio}")
- header_pixels = int(height * header_ratio)
- footer_pixels = int(height * footer_ratio)
-
- # 裁剪图片(保留中间部分)
- top = header_pixels
- bottom = height - footer_pixels
-
- if top >= bottom:
- logger.warning("[裁剪页眉页脚] 裁剪区域无效,跳过处理")
- return image_path
-
- result = img[top:bottom, :]
-
- new_height = result.shape[0]
- logger.info(f"[裁剪页眉页脚] 裁剪后尺寸: {width}x{new_height}")
- logger.info(f"[裁剪页眉页脚] 裁剪了顶部 {header_pixels}px,底部 {footer_pixels}px")
-
- # 确定输出路径
- if output_path is None:
- path = Path(image_path)
- output_path = str(path.parent / f"{path.stem}_cropped{path.suffix}")
-
- # 保存结果
- cv2.imwrite(output_path, result)
- logger.info(f"[裁剪页眉页脚] 处理完成,保存到: {output_path}")
-
- return output_path
- def _detect_header_footer_boundaries(img: np.ndarray) -> Tuple[int, int]:
- """
- 自动检测页眉页脚边界
-
- 使用多种方法综合判断:
- 1. 水平线检测 - 检测分隔线
- 2. 文本密度分析 - 页眉页脚通常文字较少
- 3. 空白区域检测 - 检测大面积空白
-
- Args:
- img: 输入图片(BGR格式)
-
- Returns:
- (header_pixels, footer_pixels): 页眉和页脚的像素高度
- """
- height, width = img.shape[:2]
-
- # 转为灰度图
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- # 定义搜索范围(页眉页脚通常在顶部/底部 15% 以内)
- search_range = int(height * 0.15)
- min_margin = int(height * 0.02) # 最小边距 2%
-
- # 方法1: 检测水平线
- header_line = _find_horizontal_line(gray, 0, search_range, from_top=True)
- footer_line = _find_horizontal_line(gray, height - search_range, height, from_top=False)
-
- # 方法2: 分析文本密度变化
- header_density = _find_content_boundary(gray, 0, search_range, from_top=True)
- footer_density = _find_content_boundary(gray, height - search_range, height, from_top=False)
-
- # 综合判断:取最可靠的结果
- # 优先使用水平线检测结果,其次使用密度分析结果
- if header_line > min_margin:
- header_pixels = header_line
- logger.debug(f"[自动检测] 页眉: 使用水平线检测结果 {header_pixels}px")
- elif header_density > min_margin:
- header_pixels = header_density
- logger.debug(f"[自动检测] 页眉: 使用密度分析结果 {header_pixels}px")
- else:
- header_pixels = min_margin
- logger.debug(f"[自动检测] 页眉: 使用最小边距 {header_pixels}px")
-
- if footer_line > min_margin:
- footer_pixels = footer_line
- logger.debug(f"[自动检测] 页脚: 使用水平线检测结果 {footer_pixels}px")
- elif footer_density > min_margin:
- footer_pixels = footer_density
- logger.debug(f"[自动检测] 页脚: 使用密度分析结果 {footer_pixels}px")
- else:
- footer_pixels = min_margin
- logger.debug(f"[自动检测] 页脚: 使用最小边距 {footer_pixels}px")
-
- return header_pixels, footer_pixels
- def _find_horizontal_line(
- gray: np.ndarray,
- start_y: int,
- end_y: int,
- from_top: bool = True
- ) -> int:
- """
- 在指定区域内查找水平分隔线
-
- Args:
- gray: 灰度图
- start_y: 搜索起始y坐标
- end_y: 搜索结束y坐标
- from_top: True表示从上往下找,False表示从下往上找
-
- Returns:
- 分隔线位置(像素),如果没找到返回0
- """
- height, width = gray.shape
-
- # 使用 Canny 边缘检测
- edges = cv2.Canny(gray[start_y:end_y, :], 50, 150)
-
- # 使用霍夫变换检测直线
- lines = cv2.HoughLinesP(
- edges,
- rho=1,
- theta=np.pi/180,
- threshold=int(width * 0.5), # 线长度至少为图片宽度的50%
- minLineLength=int(width * 0.4),
- maxLineGap=20
- )
-
- if lines is None:
- return 0
-
- # 筛选水平线(角度接近0或180度)
- horizontal_lines = []
- for line in lines:
- x1, y1, x2, y2 = line[0]
- # 计算角度
- angle = abs(np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi)
- # 水平线角度应该接近 0 或 180
- if angle < 5 or angle > 175:
- avg_y = (y1 + y2) // 2 + start_y
- horizontal_lines.append(avg_y)
-
- if not horizontal_lines:
- return 0
-
- # 根据方向返回最合适的线
- if from_top:
- # 从上往下,返回最下面的水平线(作为页眉下边界)
- return max(horizontal_lines)
- else:
- # 从下往上,返回距离底部的距离
- return height - min(horizontal_lines)
- def _find_content_boundary(
- gray: np.ndarray,
- start_y: int,
- end_y: int,
- from_top: bool = True
- ) -> int:
- """
- 通过分析文本/内容密度找到内容边界
-
- 原理:页眉页脚区域通常是空白或只有少量文字,
- 正文区域文字密度较高。通过检测密度突变点来确定边界。
-
- Args:
- gray: 灰度图
- start_y: 搜索起始y坐标
- end_y: 搜索结束y坐标
- from_top: True表示从上往下找,False表示从下往上找
-
- Returns:
- 内容边界位置(像素),如果没找到返回0
- """
- height, width = gray.shape
-
- # 二值化
- _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
-
- # 计算每一行的像素密度(黑色像素占比)
- row_densities = []
- for y in range(start_y, end_y):
- row = binary[y, :]
- density = np.sum(row > 0) / width
- row_densities.append((y, density))
-
- if not row_densities:
- return 0
-
- # 使用滑动窗口平滑密度曲线
- window_size = 10
- smoothed = []
- for i in range(len(row_densities)):
- start = max(0, i - window_size // 2)
- end = min(len(row_densities), i + window_size // 2)
- avg_density = sum(d[1] for d in row_densities[start:end]) / (end - start)
- smoothed.append((row_densities[i][0], avg_density))
-
- # 找到密度突变点
- # 定义阈值:当密度从低于 0.01 变化到高于 0.02 时,认为进入正文区域
- low_threshold = 0.005
- high_threshold = 0.02
-
- if from_top:
- # 从上往下,找到第一个连续高密度区域的起始位置
- in_content = False
- content_start = 0
- consecutive_high = 0
-
- for y, density in smoothed:
- if density > high_threshold:
- consecutive_high += 1
- if consecutive_high >= 5 and not in_content:
- # 连续5行高密度,认为进入正文
- in_content = True
- content_start = y - 5 # 往上回退一点
- break
- else:
- consecutive_high = 0
-
- return max(0, content_start - start_y)
- else:
- # 从下往上,找到最后一个连续高密度区域的结束位置
- in_content = False
- content_end = height
- consecutive_high = 0
-
- for y, density in reversed(smoothed):
- if density > high_threshold:
- consecutive_high += 1
- if consecutive_high >= 5 and not in_content:
- in_content = True
- content_end = y + 5
- break
- else:
- consecutive_high = 0
-
- return max(0, height - content_end)
|