| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313 |
- package com.lingyue.parse.service;
- import com.fasterxml.jackson.databind.ObjectMapper;
- import lombok.Data;
- import lombok.RequiredArgsConstructor;
- import lombok.extern.slf4j.Slf4j;
- import org.springframework.stereotype.Service;
- import java.nio.file.Files;
- import java.nio.file.Path;
- import java.nio.file.Paths;
- import java.util.ArrayList;
- import java.util.List;
- /**
- * 文档索引服务
- * 负责生成和管理文档的位置索引
- *
- * @author lingyue
- * @since 2026-01-20
- */
- @Slf4j
- @Service
- @RequiredArgsConstructor
- public class DocumentIndexService {
-
- private final ObjectMapper objectMapper;
-
- /**
- * 分页符字符(Form Feed)
- */
- private static final char FORM_FEED = '\f';
-
- /**
- * 为纯文本生成行索引(Word/Excel等文档)
- * 如果文本中包含分页符(\f),则根据分页符生成页面索引
- *
- * @param text 文本内容
- * @param documentId 文档ID
- * @param indexOutputPath 索引输出路径
- * @return 文档索引
- */
- public DocumentIndex generateLineIndex(String text, String documentId, String indexOutputPath) {
- if (text == null || text.isEmpty()) {
- return createEmptyIndex(documentId);
- }
-
- // 检查是否包含分页符
- boolean hasPageBreaks = text.indexOf(FORM_FEED) >= 0;
-
- if (hasPageBreaks) {
- log.info("检测到分页符,生成分页索引: documentId={}", documentId);
- return generateIndexWithPageBreaks(text, documentId, indexOutputPath);
- } else {
- log.debug("无分页符,生成单页索引: documentId={}", documentId);
- return generateSinglePageIndex(text, documentId, indexOutputPath);
- }
- }
-
- /**
- * 根据分页符生成多页索引
- */
- private DocumentIndex generateIndexWithPageBreaks(String text, String documentId, String indexOutputPath) {
- List<PageIndex> pageIndices = new ArrayList<>();
- List<LineIndex> lineIndices = new ArrayList<>();
-
- int charPos = 0;
- int lineNum = 1;
- int pageNum = 1;
- int pageCharStart = 0;
- int pageLineStart = 1;
-
- String[] lines = text.split("\n", -1);
-
- for (String line : lines) {
- // 检查这一行是否包含分页符
- int ffIndex = line.indexOf(FORM_FEED);
-
- if (ffIndex >= 0) {
- // 处理分页符之前的内容
- if (ffIndex > 0) {
- LineIndex lineIndex = new LineIndex();
- lineIndex.setLine(lineNum);
- lineIndex.setCharStart(charPos);
- lineIndex.setCharEnd(charPos + ffIndex);
- lineIndices.add(lineIndex);
- }
-
- // 结束当前页
- PageIndex pageIndex = new PageIndex();
- pageIndex.setPage(pageNum);
- pageIndex.setCharStart(pageCharStart);
- pageIndex.setCharEnd(charPos + ffIndex);
- pageIndex.setLineStart(pageLineStart);
- pageIndex.setLineEnd(lineNum);
- pageIndices.add(pageIndex);
-
- // 开始新页
- pageNum++;
- pageCharStart = charPos + ffIndex + 1; // +1 跳过分页符
- pageLineStart = lineNum + 1;
-
- // 处理分页符之后的内容(如果有)
- if (ffIndex + 1 < line.length()) {
- lineNum++;
- LineIndex afterLineIndex = new LineIndex();
- afterLineIndex.setLine(lineNum);
- afterLineIndex.setCharStart(charPos + ffIndex + 1);
- afterLineIndex.setCharEnd(charPos + line.length());
- lineIndices.add(afterLineIndex);
- }
- } else {
- // 普通行
- LineIndex lineIndex = new LineIndex();
- lineIndex.setLine(lineNum);
- lineIndex.setCharStart(charPos);
- lineIndex.setCharEnd(charPos + line.length());
- lineIndices.add(lineIndex);
- }
-
- charPos += line.length() + 1; // +1 for \n
- lineNum++;
- }
-
- // 添加最后一页
- if (pageCharStart < text.length()) {
- PageIndex lastPage = new PageIndex();
- lastPage.setPage(pageNum);
- lastPage.setCharStart(pageCharStart);
- lastPage.setCharEnd(text.length());
- lastPage.setLineStart(pageLineStart);
- lastPage.setLineEnd(lineNum - 1);
- pageIndices.add(lastPage);
- }
-
- // 创建文档索引
- DocumentIndex index = new DocumentIndex();
- index.setDocumentId(documentId);
- index.setTotalChars(text.length());
- index.setTotalLines(lineNum - 1);
- index.setTotalPages(pageIndices.size());
- index.setPages(pageIndices);
- index.setLines(lineIndices);
-
- log.info("分页索引生成完成: documentId={}, pages={}, lines={}",
- documentId, pageIndices.size(), lineIndices.size());
-
- // 保存索引文件
- if (indexOutputPath != null) {
- saveIndexFile(index, indexOutputPath);
- }
-
- return index;
- }
-
- /**
- * 生成单页索引(无分页符的文档)
- */
- private DocumentIndex generateSinglePageIndex(String text, String documentId, String indexOutputPath) {
- List<LineIndex> lineIndices = new ArrayList<>();
- int charPos = 0;
- int lineNum = 1;
-
- String[] lines = text.split("\n", -1);
- for (String line : lines) {
- LineIndex lineIndex = new LineIndex();
- lineIndex.setLine(lineNum);
- lineIndex.setCharStart(charPos);
- lineIndex.setCharEnd(charPos + line.length());
- lineIndices.add(lineIndex);
-
- charPos += line.length() + 1; // +1 for \n
- lineNum++;
- }
-
- // 创建文档索引(无分页,只有行索引)
- DocumentIndex index = new DocumentIndex();
- index.setDocumentId(documentId);
- index.setTotalChars(text.length());
- index.setTotalLines(lines.length);
- index.setTotalPages(1); // 非分页文档统一为1页
- index.setLines(lineIndices);
-
- // 创建虚拟的单页索引
- List<PageIndex> pages = new ArrayList<>();
- PageIndex singlePage = new PageIndex();
- singlePage.setPage(1);
- singlePage.setCharStart(0);
- singlePage.setCharEnd(text.length());
- singlePage.setLineStart(1);
- singlePage.setLineEnd(lines.length);
- pages.add(singlePage);
- index.setPages(pages);
-
- // 保存索引文件
- if (indexOutputPath != null) {
- saveIndexFile(index, indexOutputPath);
- }
-
- return index;
- }
-
- /**
- * 读取索引文件
- */
- public DocumentIndex loadIndex(String indexFilePath) {
- try {
- Path path = Paths.get(indexFilePath);
- if (!Files.exists(path)) {
- log.warn("索引文件不存在: {}", indexFilePath);
- return null;
- }
- String json = Files.readString(path);
- return objectMapper.readValue(json, DocumentIndex.class);
- } catch (Exception e) {
- log.error("读取索引文件失败: {}", indexFilePath, e);
- return null;
- }
- }
-
- /**
- * 根据字符位置查找页码和行号
- *
- * @param index 文档索引
- * @param charPosition 字符位置
- * @return [页码, 行号] 或 null(未找到)
- */
- public int[] findPageAndLine(DocumentIndex index, int charPosition) {
- if (index == null || index.getPages() == null) {
- return null;
- }
-
- // 1. 找到所在页
- int page = 1;
- for (PageIndex pageIndex : index.getPages()) {
- if (charPosition >= pageIndex.getCharStart() && charPosition <= pageIndex.getCharEnd()) {
- page = pageIndex.getPage();
- break;
- }
- }
-
- // 2. 找到所在行
- int line = 1;
- if (index.getLines() != null) {
- for (LineIndex lineIndex : index.getLines()) {
- if (charPosition >= lineIndex.getCharStart() && charPosition <= lineIndex.getCharEnd()) {
- line = lineIndex.getLine();
- break;
- }
- }
- }
-
- return new int[]{page, line};
- }
-
- private DocumentIndex createEmptyIndex(String documentId) {
- DocumentIndex index = new DocumentIndex();
- index.setDocumentId(documentId);
- index.setTotalChars(0);
- index.setTotalLines(0);
- index.setTotalPages(0);
- index.setPages(new ArrayList<>());
- return index;
- }
-
- private void saveIndexFile(DocumentIndex index, String outputPath) {
- try {
- Path path = Paths.get(outputPath);
- Files.createDirectories(path.getParent());
- String json = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(index);
- Files.writeString(path, json);
- log.info("文档索引文件已保存: {}", outputPath);
- } catch (Exception e) {
- log.error("保存文档索引文件失败: {}", outputPath, e);
- }
- }
-
- /**
- * 文档索引
- */
- @Data
- public static class DocumentIndex {
- private String documentId;
- private List<PageIndex> pages;
- private List<LineIndex> lines;
- private int totalChars;
- private int totalLines;
- private int totalPages;
- }
-
- /**
- * 页面索引
- */
- @Data
- public static class PageIndex {
- private int page;
- private int charStart;
- private int charEnd;
- private int lineStart;
- private int lineEnd;
- private boolean ocrUsed;
- }
-
- /**
- * 行索引
- */
- @Data
- public static class LineIndex {
- private int line;
- private int charStart;
- private int charEnd;
- }
- }
|