|
|
@@ -1,10 +1,11 @@
|
|
|
package com.lingyue.parse.service;
|
|
|
|
|
|
+import com.fasterxml.jackson.databind.ObjectMapper;
|
|
|
import com.lingyue.common.exception.ServiceException;
|
|
|
+import lombok.Data;
|
|
|
import lombok.RequiredArgsConstructor;
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
-import org.apache.pdfbox.pdmodel.PDPage;
|
|
|
import org.apache.pdfbox.rendering.PDFRenderer;
|
|
|
import org.apache.pdfbox.text.PDFTextStripper;
|
|
|
import org.springframework.stereotype.Service;
|
|
|
@@ -34,6 +35,7 @@ public class PdfTextExtractionService {
|
|
|
|
|
|
private final PaddleOcrClient paddleOcrClient;
|
|
|
private final OcrResultParser ocrResultParser;
|
|
|
+ private final ObjectMapper objectMapper;
|
|
|
|
|
|
/**
|
|
|
* 文本阈值:每页至少需要这么多字符才认为有文本层
|
|
|
@@ -207,6 +209,167 @@ public class PdfTextExtractionService {
|
|
|
return combinedText.toString();
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * 提取PDF文本并生成页面索引
|
|
|
+ *
|
|
|
+ * @param pdfFilePath PDF文件路径
|
|
|
+ * @param documentId 文档ID
|
|
|
+ * @param indexOutputPath 索引文件输出路径(如果为null则不生成索引文件)
|
|
|
+ * @return 提取结果(包含文本和索引)
|
|
|
+ */
|
|
|
+ public ExtractionResult extractTextWithIndex(String pdfFilePath, String documentId, String indexOutputPath) {
|
|
|
+ File pdfFile = new File(pdfFilePath);
|
|
|
+ if (!pdfFile.exists()) {
|
|
|
+ throw new ServiceException("PDF文件不存在: " + pdfFilePath);
|
|
|
+ }
|
|
|
+
|
|
|
+ List<PageTextResult> pageResults = new ArrayList<>();
|
|
|
+
|
|
|
+ try (PDDocument document = org.apache.pdfbox.Loader.loadPDF(pdfFile)) {
|
|
|
+ int totalPages = document.getNumberOfPages();
|
|
|
+ log.info("开始处理PDF文件(带索引): {}, 总页数: {}", pdfFilePath, totalPages);
|
|
|
+
|
|
|
+ PDFTextStripper textStripper = new PDFTextStripper();
|
|
|
+
|
|
|
+ // 逐页处理
|
|
|
+ for (int pageNum = 1; pageNum <= totalPages; pageNum++) {
|
|
|
+ log.debug("处理第 {} 页/共 {} 页", pageNum, totalPages);
|
|
|
+
|
|
|
+ try {
|
|
|
+ textStripper.setStartPage(pageNum);
|
|
|
+ textStripper.setEndPage(pageNum);
|
|
|
+ String pageText = textStripper.getText(document);
|
|
|
+
|
|
|
+ if (hasSufficientText(pageText)) {
|
|
|
+ log.debug("第 {} 页有文本层,直接使用,文本长度: {}", pageNum, pageText.length());
|
|
|
+ pageResults.add(new PageTextResult(pageNum, pageText, false));
|
|
|
+ } else {
|
|
|
+ log.debug("第 {} 页文本不足,使用OCR处理", pageNum);
|
|
|
+ String ocrText = extractTextByOcr(pdfFilePath, pageNum);
|
|
|
+ pageResults.add(new PageTextResult(pageNum, ocrText, true));
|
|
|
+ }
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("处理第 {} 页时出错,尝试使用OCR", pageNum, e);
|
|
|
+ try {
|
|
|
+ String ocrText = extractTextByOcr(pdfFilePath, pageNum);
|
|
|
+ pageResults.add(new PageTextResult(pageNum, ocrText, true));
|
|
|
+ } catch (Exception ocrException) {
|
|
|
+ log.error("第 {} 页OCR也失败", pageNum, ocrException);
|
|
|
+ pageResults.add(new PageTextResult(pageNum, "", true));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } catch (IOException e) {
|
|
|
+ log.error("读取PDF文件失败: {}", pdfFilePath, e);
|
|
|
+ throw new ServiceException("读取PDF文件失败: " + e.getMessage());
|
|
|
+ }
|
|
|
+
|
|
|
+ // 生成带索引的合并结果
|
|
|
+ return combinePageTextsWithIndex(pageResults, documentId, indexOutputPath);
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 合并页面文本并生成索引
|
|
|
+ */
|
|
|
+ private ExtractionResult combinePageTextsWithIndex(List<PageTextResult> pageResults,
|
|
|
+ String documentId,
|
|
|
+ String indexOutputPath) {
|
|
|
+ StringBuilder combinedText = new StringBuilder();
|
|
|
+ List<PageIndex> pageIndices = new ArrayList<>();
|
|
|
+
|
|
|
+ int currentCharPos = 0;
|
|
|
+ int currentLine = 1;
|
|
|
+
|
|
|
+ for (PageTextResult result : pageResults) {
|
|
|
+ int pageCharStart = currentCharPos;
|
|
|
+ int pageLineStart = currentLine;
|
|
|
+
|
|
|
+ if (result.getText() != null && !result.getText().trim().isEmpty()) {
|
|
|
+ // 添加页头标记
|
|
|
+ String pageHeader = "=== 第 " + result.getPageNum() + " 页";
|
|
|
+ if (result.isOcrUsed()) {
|
|
|
+ pageHeader += " (OCR识别)";
|
|
|
+ } else {
|
|
|
+ pageHeader += " (文本层提取)";
|
|
|
+ }
|
|
|
+ pageHeader += " ===\n";
|
|
|
+
|
|
|
+ combinedText.append(pageHeader);
|
|
|
+ currentCharPos += pageHeader.length();
|
|
|
+ currentLine++;
|
|
|
+
|
|
|
+ // 添加页面内容
|
|
|
+ String pageText = result.getText();
|
|
|
+ combinedText.append(pageText).append("\n\n");
|
|
|
+
|
|
|
+ // 计算行数
|
|
|
+ int pageLines = countLines(pageText);
|
|
|
+ currentLine += pageLines + 1; // +1 for the empty line separator
|
|
|
+ currentCharPos += pageText.length() + 2; // +2 for "\n\n"
|
|
|
+ }
|
|
|
+
|
|
|
+ // 创建页面索引
|
|
|
+ PageIndex pageIndex = new PageIndex();
|
|
|
+ pageIndex.setPage(result.getPageNum());
|
|
|
+ pageIndex.setCharStart(pageCharStart);
|
|
|
+ pageIndex.setCharEnd(currentCharPos - 1);
|
|
|
+ pageIndex.setLineStart(pageLineStart);
|
|
|
+ pageIndex.setLineEnd(currentLine - 1);
|
|
|
+ pageIndex.setOcrUsed(result.isOcrUsed());
|
|
|
+ pageIndices.add(pageIndex);
|
|
|
+ }
|
|
|
+
|
|
|
+ // 创建文档索引
|
|
|
+ DocumentIndex documentIndex = new DocumentIndex();
|
|
|
+ documentIndex.setDocumentId(documentId);
|
|
|
+ documentIndex.setPages(pageIndices);
|
|
|
+ documentIndex.setTotalChars(currentCharPos);
|
|
|
+ documentIndex.setTotalLines(currentLine - 1);
|
|
|
+ documentIndex.setTotalPages(pageResults.size());
|
|
|
+
|
|
|
+ // 保存索引文件
|
|
|
+ if (indexOutputPath != null) {
|
|
|
+ saveIndexFile(documentIndex, indexOutputPath);
|
|
|
+ }
|
|
|
+
|
|
|
+ ExtractionResult result = new ExtractionResult();
|
|
|
+ result.setText(combinedText.toString());
|
|
|
+ result.setIndex(documentIndex);
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 计算文本行数
|
|
|
+ */
|
|
|
+ private int countLines(String text) {
|
|
|
+ if (text == null || text.isEmpty()) {
|
|
|
+ return 0;
|
|
|
+ }
|
|
|
+ int lines = 1;
|
|
|
+ for (char c : text.toCharArray()) {
|
|
|
+ if (c == '\n') {
|
|
|
+ lines++;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return lines;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 保存索引文件
|
|
|
+ */
|
|
|
+ private void saveIndexFile(DocumentIndex index, String outputPath) {
|
|
|
+ try {
|
|
|
+ Path path = Paths.get(outputPath);
|
|
|
+ Files.createDirectories(path.getParent());
|
|
|
+ String json = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(index);
|
|
|
+ Files.writeString(path, json);
|
|
|
+ log.info("页面索引文件已保存: {}", outputPath);
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("保存页面索引文件失败: {}", outputPath, e);
|
|
|
+ // 索引保存失败不影响主流程
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
/**
|
|
|
* 页面文本结果
|
|
|
*/
|
|
|
@@ -233,4 +396,38 @@ public class PdfTextExtractionService {
|
|
|
return ocrUsed;
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 提取结果(包含文本和索引)
|
|
|
+ */
|
|
|
+ @Data
|
|
|
+ public static class ExtractionResult {
|
|
|
+ private String text;
|
|
|
+ private DocumentIndex index;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 文档索引
|
|
|
+ */
|
|
|
+ @Data
|
|
|
+ public static class DocumentIndex {
|
|
|
+ private String documentId;
|
|
|
+ private List<PageIndex> pages;
|
|
|
+ private int totalChars;
|
|
|
+ private int totalLines;
|
|
|
+ private int totalPages;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 页面索引
|
|
|
+ */
|
|
|
+ @Data
|
|
|
+ public static class PageIndex {
|
|
|
+ private int page;
|
|
|
+ private int charStart;
|
|
|
+ private int charEnd;
|
|
|
+ private int lineStart;
|
|
|
+ private int lineEnd;
|
|
|
+ private boolean ocrUsed;
|
|
|
+ }
|
|
|
}
|