|
@@ -1,8 +1,6 @@
|
|
|
package com.lingyue.parse.service;
|
|
package com.lingyue.parse.service;
|
|
|
|
|
|
|
|
-import com.fasterxml.jackson.databind.ObjectMapper;
|
|
|
|
|
import com.lingyue.common.exception.ServiceException;
|
|
import com.lingyue.common.exception.ServiceException;
|
|
|
-import lombok.Data;
|
|
|
|
|
import lombok.RequiredArgsConstructor;
|
|
import lombok.RequiredArgsConstructor;
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
@@ -16,7 +14,6 @@ import java.io.File;
|
|
|
import java.io.IOException;
|
|
import java.io.IOException;
|
|
|
import java.nio.file.Files;
|
|
import java.nio.file.Files;
|
|
|
import java.nio.file.Path;
|
|
import java.nio.file.Path;
|
|
|
-import java.nio.file.Paths;
|
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.ArrayList;
|
|
|
import java.util.List;
|
|
import java.util.List;
|
|
|
import java.util.UUID;
|
|
import java.util.UUID;
|
|
@@ -35,7 +32,6 @@ public class PdfTextExtractionService {
|
|
|
|
|
|
|
|
private final PaddleOcrClient paddleOcrClient;
|
|
private final PaddleOcrClient paddleOcrClient;
|
|
|
private final OcrResultParser ocrResultParser;
|
|
private final OcrResultParser ocrResultParser;
|
|
|
- private final ObjectMapper objectMapper;
|
|
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
* 文本阈值:每页至少需要这么多字符才认为有文本层
|
|
* 文本阈值:每页至少需要这么多字符才认为有文本层
|
|
@@ -209,170 +205,6 @@ public class PdfTextExtractionService {
|
|
|
return combinedText.toString();
|
|
return combinedText.toString();
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- /**
|
|
|
|
|
- * 提取PDF文本并生成页面索引
|
|
|
|
|
- *
|
|
|
|
|
- * @param pdfFilePath PDF文件路径
|
|
|
|
|
- * @param documentId 文档ID
|
|
|
|
|
- * @param indexOutputPath 索引文件输出路径(如果为null则不生成索引文件)
|
|
|
|
|
- * @return 提取结果(包含文本和索引)
|
|
|
|
|
- */
|
|
|
|
|
- public ExtractionResult extractTextWithIndex(String pdfFilePath, String documentId, String indexOutputPath) {
|
|
|
|
|
- File pdfFile = new File(pdfFilePath);
|
|
|
|
|
- if (!pdfFile.exists()) {
|
|
|
|
|
- throw new ServiceException("PDF文件不存在: " + pdfFilePath);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- List<PageTextResult> pageResults = new ArrayList<>();
|
|
|
|
|
-
|
|
|
|
|
- try (PDDocument document = org.apache.pdfbox.Loader.loadPDF(pdfFile)) {
|
|
|
|
|
- int totalPages = document.getNumberOfPages();
|
|
|
|
|
- log.info("开始处理PDF文件(带索引): {}, 总页数: {}", pdfFilePath, totalPages);
|
|
|
|
|
-
|
|
|
|
|
- PDFTextStripper textStripper = new PDFTextStripper();
|
|
|
|
|
-
|
|
|
|
|
- // 逐页处理
|
|
|
|
|
- for (int pageNum = 1; pageNum <= totalPages; pageNum++) {
|
|
|
|
|
- log.debug("处理第 {} 页/共 {} 页", pageNum, totalPages);
|
|
|
|
|
-
|
|
|
|
|
- try {
|
|
|
|
|
- textStripper.setStartPage(pageNum);
|
|
|
|
|
- textStripper.setEndPage(pageNum);
|
|
|
|
|
- String pageText = textStripper.getText(document);
|
|
|
|
|
-
|
|
|
|
|
- if (hasSufficientText(pageText)) {
|
|
|
|
|
- log.debug("第 {} 页有文本层,直接使用,文本长度: {}", pageNum, pageText.length());
|
|
|
|
|
- pageResults.add(new PageTextResult(pageNum, pageText, false));
|
|
|
|
|
- } else {
|
|
|
|
|
- log.debug("第 {} 页文本不足,使用OCR处理", pageNum);
|
|
|
|
|
- String ocrText = extractTextByOcr(pdfFilePath, pageNum);
|
|
|
|
|
- pageResults.add(new PageTextResult(pageNum, ocrText, true));
|
|
|
|
|
- }
|
|
|
|
|
- } catch (Exception e) {
|
|
|
|
|
- log.error("处理第 {} 页时出错,尝试使用OCR", pageNum, e);
|
|
|
|
|
- try {
|
|
|
|
|
- String ocrText = extractTextByOcr(pdfFilePath, pageNum);
|
|
|
|
|
- pageResults.add(new PageTextResult(pageNum, ocrText, true));
|
|
|
|
|
- } catch (Exception ocrException) {
|
|
|
|
|
- log.error("第 {} 页OCR也失败", pageNum, ocrException);
|
|
|
|
|
- pageResults.add(new PageTextResult(pageNum, "", true));
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- } catch (IOException e) {
|
|
|
|
|
- log.error("读取PDF文件失败: {}", pdfFilePath, e);
|
|
|
|
|
- throw new ServiceException("读取PDF文件失败: " + e.getMessage());
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- // 生成带索引的合并结果
|
|
|
|
|
- return combinePageTextsWithIndex(pageResults, documentId, indexOutputPath);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- /**
|
|
|
|
|
- * 合并页面文本并生成索引
|
|
|
|
|
- */
|
|
|
|
|
- private ExtractionResult combinePageTextsWithIndex(List<PageTextResult> pageResults,
|
|
|
|
|
- String documentId,
|
|
|
|
|
- String indexOutputPath) {
|
|
|
|
|
- StringBuilder combinedText = new StringBuilder();
|
|
|
|
|
- List<PageIndex> pageIndices = new ArrayList<>();
|
|
|
|
|
- List<LineIndex> lineIndices = new ArrayList<>();
|
|
|
|
|
-
|
|
|
|
|
- int currentCharPos = 0;
|
|
|
|
|
- int currentLine = 1;
|
|
|
|
|
-
|
|
|
|
|
- for (PageTextResult result : pageResults) {
|
|
|
|
|
- int pageCharStart = currentCharPos;
|
|
|
|
|
- int pageLineStart = currentLine;
|
|
|
|
|
-
|
|
|
|
|
- if (result.getText() != null && !result.getText().trim().isEmpty()) {
|
|
|
|
|
- // 添加页头标记
|
|
|
|
|
- String pageHeader = "=== 第 " + result.getPageNum() + " 页";
|
|
|
|
|
- if (result.isOcrUsed()) {
|
|
|
|
|
- pageHeader += " (OCR识别)";
|
|
|
|
|
- } else {
|
|
|
|
|
- pageHeader += " (文本层提取)";
|
|
|
|
|
- }
|
|
|
|
|
- pageHeader += " ===\n";
|
|
|
|
|
-
|
|
|
|
|
- // 记录页头行的索引
|
|
|
|
|
- LineIndex headerLineIndex = new LineIndex();
|
|
|
|
|
- headerLineIndex.setLine(currentLine);
|
|
|
|
|
- headerLineIndex.setCharStart(currentCharPos);
|
|
|
|
|
- headerLineIndex.setCharEnd(currentCharPos + pageHeader.length() - 2); // -2 去掉 \n
|
|
|
|
|
- lineIndices.add(headerLineIndex);
|
|
|
|
|
-
|
|
|
|
|
- combinedText.append(pageHeader);
|
|
|
|
|
- currentCharPos += pageHeader.length();
|
|
|
|
|
- currentLine++;
|
|
|
|
|
-
|
|
|
|
|
- // 添加页面内容并记录每行索引
|
|
|
|
|
- String pageText = result.getText();
|
|
|
|
|
- String[] lines = pageText.split("\n", -1);
|
|
|
|
|
- for (String line : lines) {
|
|
|
|
|
- LineIndex lineIndex = new LineIndex();
|
|
|
|
|
- lineIndex.setLine(currentLine);
|
|
|
|
|
- lineIndex.setCharStart(currentCharPos);
|
|
|
|
|
- lineIndex.setCharEnd(currentCharPos + line.length());
|
|
|
|
|
- lineIndices.add(lineIndex);
|
|
|
|
|
-
|
|
|
|
|
- currentCharPos += line.length() + 1; // +1 for \n
|
|
|
|
|
- currentLine++;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- // 添加额外的空行分隔符
|
|
|
|
|
- combinedText.append(pageText).append("\n\n");
|
|
|
|
|
- currentCharPos++; // 额外的 \n
|
|
|
|
|
- currentLine++;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- // 创建页面索引
|
|
|
|
|
- PageIndex pageIndex = new PageIndex();
|
|
|
|
|
- pageIndex.setPage(result.getPageNum());
|
|
|
|
|
- pageIndex.setCharStart(pageCharStart);
|
|
|
|
|
- pageIndex.setCharEnd(currentCharPos - 1);
|
|
|
|
|
- pageIndex.setLineStart(pageLineStart);
|
|
|
|
|
- pageIndex.setLineEnd(currentLine - 1);
|
|
|
|
|
- pageIndex.setOcrUsed(result.isOcrUsed());
|
|
|
|
|
- pageIndices.add(pageIndex);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- // 创建文档索引
|
|
|
|
|
- DocumentIndex documentIndex = new DocumentIndex();
|
|
|
|
|
- documentIndex.setDocumentId(documentId);
|
|
|
|
|
- documentIndex.setPages(pageIndices);
|
|
|
|
|
- documentIndex.setLines(lineIndices);
|
|
|
|
|
- documentIndex.setTotalChars(currentCharPos);
|
|
|
|
|
- documentIndex.setTotalLines(currentLine - 1);
|
|
|
|
|
- documentIndex.setTotalPages(pageResults.size());
|
|
|
|
|
-
|
|
|
|
|
- // 保存索引文件
|
|
|
|
|
- if (indexOutputPath != null) {
|
|
|
|
|
- saveIndexFile(documentIndex, indexOutputPath);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- ExtractionResult result = new ExtractionResult();
|
|
|
|
|
- result.setText(combinedText.toString());
|
|
|
|
|
- result.setIndex(documentIndex);
|
|
|
|
|
- return result;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- /**
|
|
|
|
|
- * 保存索引文件
|
|
|
|
|
- */
|
|
|
|
|
- private void saveIndexFile(DocumentIndex index, String outputPath) {
|
|
|
|
|
- try {
|
|
|
|
|
- Path path = Paths.get(outputPath);
|
|
|
|
|
- Files.createDirectories(path.getParent());
|
|
|
|
|
- String json = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(index);
|
|
|
|
|
- Files.writeString(path, json);
|
|
|
|
|
- log.info("页面索引文件已保存: {}", outputPath);
|
|
|
|
|
- } catch (Exception e) {
|
|
|
|
|
- log.error("保存页面索引文件失败: {}", outputPath, e);
|
|
|
|
|
- // 索引保存失败不影响主流程
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
/**
|
|
/**
|
|
|
* 页面文本结果
|
|
* 页面文本结果
|
|
|
*/
|
|
*/
|
|
@@ -399,49 +231,4 @@ public class PdfTextExtractionService {
|
|
|
return ocrUsed;
|
|
return ocrUsed;
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
-
|
|
|
|
|
- /**
|
|
|
|
|
- * 提取结果(包含文本和索引)
|
|
|
|
|
- */
|
|
|
|
|
- @Data
|
|
|
|
|
- public static class ExtractionResult {
|
|
|
|
|
- private String text;
|
|
|
|
|
- private DocumentIndex index;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- /**
|
|
|
|
|
- * 文档索引
|
|
|
|
|
- */
|
|
|
|
|
- @Data
|
|
|
|
|
- public static class DocumentIndex {
|
|
|
|
|
- private String documentId;
|
|
|
|
|
- private List<PageIndex> pages;
|
|
|
|
|
- private List<LineIndex> lines;
|
|
|
|
|
- private int totalChars;
|
|
|
|
|
- private int totalLines;
|
|
|
|
|
- private int totalPages;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- /**
|
|
|
|
|
- * 行索引
|
|
|
|
|
- */
|
|
|
|
|
- @Data
|
|
|
|
|
- public static class LineIndex {
|
|
|
|
|
- private int line;
|
|
|
|
|
- private int charStart;
|
|
|
|
|
- private int charEnd;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- /**
|
|
|
|
|
- * 页面索引
|
|
|
|
|
- */
|
|
|
|
|
- @Data
|
|
|
|
|
- public static class PageIndex {
|
|
|
|
|
- private int page;
|
|
|
|
|
- private int charStart;
|
|
|
|
|
- private int charEnd;
|
|
|
|
|
- private int lineStart;
|
|
|
|
|
- private int lineEnd;
|
|
|
|
|
- private boolean ocrUsed;
|
|
|
|
|
- }
|
|
|
|
|
}
|
|
}
|