package com.lingyue.parse.service;

import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.Data;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

/**
 * Document index service.
 *
 * <p>Builds and loads positional indexes (character offsets mapped to page and line
 * numbers) for extracted document text.
 *
 * @author lingyue
 * @since 2026-01-20
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class DocumentIndexService {

    private final ObjectMapper objectMapper;

    /**
     * Page-break character (Form Feed).
     */
    private static final char FORM_FEED = '\f';

    /**
     * Generates a line index for plain text (for example text extracted from Word/Excel documents).
     *
     * <p>If the text contains form-feed characters ({@code \f}), a page index is derived from
     * them; otherwise the whole text is treated as a single page. For {@code null} or empty text
     * an empty index is returned and no index file is written.
     *
     * @param text            text content
     * @param documentId      document ID
     * @param indexOutputPath path the index JSON is written to; {@code null} skips writing
     * @return the document index (never {@code null})
     */
    public DocumentIndex generateLineIndex(String text, String documentId, String indexOutputPath) {
        if (text == null || text.isEmpty()) {
            return createEmptyIndex(documentId);
        }

        // Check whether the text contains any page break.
        boolean hasPageBreaks = text.indexOf(FORM_FEED) >= 0;

        if (hasPageBreaks) {
            log.info("检测到分页符,生成分页索引: documentId={}", documentId);
            return generateIndexWithPageBreaks(text, documentId, indexOutputPath);
        } else {
            log.debug("无分页符,生成单页索引: documentId={}", documentId);
            return generateSinglePageIndex(text, documentId, indexOutputPath);
        }
    }

    /**
     * Generates a multi-page index, closing a page at every form feed.
     *
     * <p>Offsets are positions in {@code text}; line and page {@code charEnd} values are the
     * offset just past the last content character (for a page, the offset of the form feed that
     * closes it). Content that follows a form feed on the same physical line is counted as a new
     * logical line. A physical line may contain any number of form feeds; each one closes a page.
     */
    private DocumentIndex generateIndexWithPageBreaks(String text, String documentId, String indexOutputPath) {
        List<PageIndex> pageIndices = new ArrayList<>();
        List<LineIndex> lineIndices = new ArrayList<>();

        int charPos = 0;
        int lineNum = 1;
        int pageNum = 1;
        int pageCharStart = 0;
        int pageLineStart = 1;

        String[] lines = text.split("\n", -1);

        for (String line : lines) {
            int ffIndex = line.indexOf(FORM_FEED);

            if (ffIndex < 0) {
                // Ordinary line: always indexed, even when empty.
                lineIndices.add(newLineIndex(lineNum, charPos, charPos + line.length()));
            } else {
                int segmentStart = 0;

                while (ffIndex >= 0) {
                    // Content before this page break (skipped when the segment is empty).
                    if (ffIndex > segmentStart) {
                        lineIndices.add(newLineIndex(lineNum, charPos + segmentStart, charPos + ffIndex));
                    }

                    // Close the current page at the page break.
                    pageIndices.add(newPageIndex(pageNum, pageCharStart, charPos + ffIndex, pageLineStart, lineNum));

                    // Start a new page; +1 skips the form feed itself.
                    pageNum++;
                    pageCharStart = charPos + ffIndex + 1;
                    pageLineStart = lineNum + 1;

                    segmentStart = ffIndex + 1;
                    if (segmentStart < line.length()) {
                        // Whatever follows the page break starts a new logical line.
                        lineNum++;
                    }
                    ffIndex = line.indexOf(FORM_FEED, segmentStart);
                }

                // Content after the last page break on this physical line, if any.
                if (segmentStart < line.length()) {
                    lineIndices.add(newLineIndex(lineNum, charPos + segmentStart, charPos + line.length()));
                }
            }

            charPos += line.length() + 1; // +1 for \n
            lineNum++;
        }

        // Add the trailing page when there is text after the last page break.
        if (pageCharStart < text.length()) {
            pageIndices.add(newPageIndex(pageNum, pageCharStart, text.length(), pageLineStart, lineNum - 1));
        }

        DocumentIndex index = newDocumentIndex(documentId, text.length(), lineNum - 1, pageIndices, lineIndices);

        log.info("分页索引生成完成: documentId={}, pages={}, lines={}",
                documentId, pageIndices.size(), lineIndices.size());

        // Persist the index file.
        if (indexOutputPath != null) {
            saveIndexFile(index, indexOutputPath);
        }

        return index;
    }

    /**
     * Generates a single-page index for text without page breaks.
     *
     * <p>Every {@code \n}-separated line gets a line index, and one synthetic page covers the
     * whole text.
     */
    private DocumentIndex generateSinglePageIndex(String text, String documentId, String indexOutputPath) {
        List<LineIndex> lineIndices = new ArrayList<>();

        int charPos = 0;
        int lineNum = 1;

        String[] lines = text.split("\n", -1);

        for (String line : lines) {
            lineIndices.add(newLineIndex(lineNum, charPos, charPos + line.length()));
            charPos += line.length() + 1; // +1 for \n
            lineNum++;
        }

        // Synthetic single page spanning the entire text.
        List<PageIndex> pages = new ArrayList<>();
        pages.add(newPageIndex(1, 0, text.length(), 1, lines.length));

        DocumentIndex index = newDocumentIndex(documentId, text.length(), lines.length, pages, lineIndices);

        // Persist the index file.
        if (indexOutputPath != null) {
            saveIndexFile(index, indexOutputPath);
        }

        return index;
    }

    /**
     * Reads an index file.
     *
     * @param indexFilePath path of the index JSON file
     * @return the parsed index, or {@code null} when the file does not exist or cannot be read
     *         or parsed (the failure is logged)
     */
    public DocumentIndex loadIndex(String indexFilePath) {
        try {
            Path path = Paths.get(indexFilePath);
            if (!Files.exists(path)) {
                log.warn("索引文件不存在: {}", indexFilePath);
                return null;
            }

            String json = Files.readString(path);
            return objectMapper.readValue(json, DocumentIndex.class);
        } catch (Exception e) {
            log.error("读取索引文件失败: {}", indexFilePath, e);
            return null;
        }
    }

    /**
     * Finds the page number and line number for a character position.
     *
     * <p>Range checks are inclusive on both ends and the first matching entry wins. When no page
     * (or no line) range contains the position, that component falls back to {@code 1}.
     *
     * @param index        document index
     * @param charPosition character position
     * @return {@code [page, line]}, or {@code null} when {@code index} or its page list is
     *         {@code null}
     */
    public int[] findPageAndLine(DocumentIndex index, int charPosition) {
        if (index == null || index.getPages() == null) {
            return null;
        }

        // 1. Locate the page.
        int page = 1;
        for (PageIndex pageIndex : index.getPages()) {
            if (charPosition >= pageIndex.getCharStart() && charPosition <= pageIndex.getCharEnd()) {
                page = pageIndex.getPage();
                break;
            }
        }

        // 2. Locate the line.
        int line = 1;
        if (index.getLines() != null) {
            for (LineIndex lineIndex : index.getLines()) {
                if (charPosition >= lineIndex.getCharStart() && charPosition <= lineIndex.getCharEnd()) {
                    line = lineIndex.getLine();
                    break;
                }
            }
        }

        return new int[]{page, line};
    }

    /**
     * Creates an index with zero totals and empty page and line lists.
     */
    private DocumentIndex createEmptyIndex(String documentId) {
        return newDocumentIndex(documentId, 0, 0, new ArrayList<>(), new ArrayList<>());
    }

    /**
     * Writes the index as pretty-printed JSON to {@code outputPath}, creating parent directories
     * when the path has a parent. Failures are logged and not propagated (best effort).
     */
    private void saveIndexFile(DocumentIndex index, String outputPath) {
        try {
            Path path = Paths.get(outputPath);

            // A bare file name has no parent; there is nothing to create in that case.
            Path parent = path.getParent();
            if (parent != null) {
                Files.createDirectories(parent);
            }

            String json = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(index);
            Files.writeString(path, json);

            log.info("文档索引文件已保存: {}", outputPath);
        } catch (Exception e) {
            log.error("保存文档索引文件失败: {}", outputPath, e);
        }
    }

    /** Builds a {@link LineIndex} for the given line number and character range. */
    private static LineIndex newLineIndex(int line, int charStart, int charEnd) {
        LineIndex lineIndex = new LineIndex();
        lineIndex.setLine(line);
        lineIndex.setCharStart(charStart);
        lineIndex.setCharEnd(charEnd);
        return lineIndex;
    }

    /** Builds a {@link PageIndex} for the given page number, character range and line range. */
    private static PageIndex newPageIndex(int page, int charStart, int charEnd, int lineStart, int lineEnd) {
        PageIndex pageIndex = new PageIndex();
        pageIndex.setPage(page);
        pageIndex.setCharStart(charStart);
        pageIndex.setCharEnd(charEnd);
        pageIndex.setLineStart(lineStart);
        pageIndex.setLineEnd(lineEnd);
        return pageIndex;
    }

    /** Builds a {@link DocumentIndex}; the page total is taken from the size of {@code pages}. */
    private static DocumentIndex newDocumentIndex(String documentId, int totalChars, int totalLines,
                                                  List<PageIndex> pages, List<LineIndex> lines) {
        DocumentIndex index = new DocumentIndex();
        index.setDocumentId(documentId);
        index.setTotalChars(totalChars);
        index.setTotalLines(totalLines);
        index.setTotalPages(pages.size());
        index.setPages(pages);
        index.setLines(lines);
        return index;
    }

    /**
     * Document index.
     */
    @Data
    public static class DocumentIndex {
        private String documentId;
        private List<PageIndex> pages;
        private List<LineIndex> lines;
        private int totalChars;
        private int totalLines;
        private int totalPages;
    }

    /**
     * Page index.
     */
    @Data
    public static class PageIndex {
        private int page;
        private int charStart;
        private int charEnd;
        private int lineStart;
        private int lineEnd;
        private boolean ocrUsed;
    }

    /**
     * Line index.
     */
    @Data
    public static class LineIndex {
        private int line;
        private int charStart;
        private int charEnd;
    }
}