DocumentIndexService.java 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313
  1. package com.lingyue.parse.service;
  2. import com.fasterxml.jackson.databind.ObjectMapper;
  3. import lombok.Data;
  4. import lombok.RequiredArgsConstructor;
  5. import lombok.extern.slf4j.Slf4j;
  6. import org.springframework.stereotype.Service;
  7. import java.nio.file.Files;
  8. import java.nio.file.Path;
  9. import java.nio.file.Paths;
  10. import java.util.ArrayList;
  11. import java.util.List;
  12. /**
  13. * 文档索引服务
  14. * 负责生成和管理文档的位置索引
  15. *
  16. * @author lingyue
  17. * @since 2026-01-20
  18. */
  19. @Slf4j
  20. @Service
  21. @RequiredArgsConstructor
  22. public class DocumentIndexService {
  23. private final ObjectMapper objectMapper;
  24. /**
  25. * 分页符字符(Form Feed)
  26. */
  27. private static final char FORM_FEED = '\f';
  28. /**
  29. * 为纯文本生成行索引(Word/Excel等文档)
  30. * 如果文本中包含分页符(\f),则根据分页符生成页面索引
  31. *
  32. * @param text 文本内容
  33. * @param documentId 文档ID
  34. * @param indexOutputPath 索引输出路径
  35. * @return 文档索引
  36. */
  37. public DocumentIndex generateLineIndex(String text, String documentId, String indexOutputPath) {
  38. if (text == null || text.isEmpty()) {
  39. return createEmptyIndex(documentId);
  40. }
  41. // 检查是否包含分页符
  42. boolean hasPageBreaks = text.indexOf(FORM_FEED) >= 0;
  43. if (hasPageBreaks) {
  44. log.info("检测到分页符,生成分页索引: documentId={}", documentId);
  45. return generateIndexWithPageBreaks(text, documentId, indexOutputPath);
  46. } else {
  47. log.debug("无分页符,生成单页索引: documentId={}", documentId);
  48. return generateSinglePageIndex(text, documentId, indexOutputPath);
  49. }
  50. }
  51. /**
  52. * 根据分页符生成多页索引
  53. */
  54. private DocumentIndex generateIndexWithPageBreaks(String text, String documentId, String indexOutputPath) {
  55. List<PageIndex> pageIndices = new ArrayList<>();
  56. List<LineIndex> lineIndices = new ArrayList<>();
  57. int charPos = 0;
  58. int lineNum = 1;
  59. int pageNum = 1;
  60. int pageCharStart = 0;
  61. int pageLineStart = 1;
  62. String[] lines = text.split("\n", -1);
  63. for (String line : lines) {
  64. // 检查这一行是否包含分页符
  65. int ffIndex = line.indexOf(FORM_FEED);
  66. if (ffIndex >= 0) {
  67. // 处理分页符之前的内容
  68. if (ffIndex > 0) {
  69. LineIndex lineIndex = new LineIndex();
  70. lineIndex.setLine(lineNum);
  71. lineIndex.setCharStart(charPos);
  72. lineIndex.setCharEnd(charPos + ffIndex);
  73. lineIndices.add(lineIndex);
  74. }
  75. // 结束当前页
  76. PageIndex pageIndex = new PageIndex();
  77. pageIndex.setPage(pageNum);
  78. pageIndex.setCharStart(pageCharStart);
  79. pageIndex.setCharEnd(charPos + ffIndex);
  80. pageIndex.setLineStart(pageLineStart);
  81. pageIndex.setLineEnd(lineNum);
  82. pageIndices.add(pageIndex);
  83. // 开始新页
  84. pageNum++;
  85. pageCharStart = charPos + ffIndex + 1; // +1 跳过分页符
  86. pageLineStart = lineNum + 1;
  87. // 处理分页符之后的内容(如果有)
  88. if (ffIndex + 1 < line.length()) {
  89. lineNum++;
  90. LineIndex afterLineIndex = new LineIndex();
  91. afterLineIndex.setLine(lineNum);
  92. afterLineIndex.setCharStart(charPos + ffIndex + 1);
  93. afterLineIndex.setCharEnd(charPos + line.length());
  94. lineIndices.add(afterLineIndex);
  95. }
  96. } else {
  97. // 普通行
  98. LineIndex lineIndex = new LineIndex();
  99. lineIndex.setLine(lineNum);
  100. lineIndex.setCharStart(charPos);
  101. lineIndex.setCharEnd(charPos + line.length());
  102. lineIndices.add(lineIndex);
  103. }
  104. charPos += line.length() + 1; // +1 for \n
  105. lineNum++;
  106. }
  107. // 添加最后一页
  108. if (pageCharStart < text.length()) {
  109. PageIndex lastPage = new PageIndex();
  110. lastPage.setPage(pageNum);
  111. lastPage.setCharStart(pageCharStart);
  112. lastPage.setCharEnd(text.length());
  113. lastPage.setLineStart(pageLineStart);
  114. lastPage.setLineEnd(lineNum - 1);
  115. pageIndices.add(lastPage);
  116. }
  117. // 创建文档索引
  118. DocumentIndex index = new DocumentIndex();
  119. index.setDocumentId(documentId);
  120. index.setTotalChars(text.length());
  121. index.setTotalLines(lineNum - 1);
  122. index.setTotalPages(pageIndices.size());
  123. index.setPages(pageIndices);
  124. index.setLines(lineIndices);
  125. log.info("分页索引生成完成: documentId={}, pages={}, lines={}",
  126. documentId, pageIndices.size(), lineIndices.size());
  127. // 保存索引文件
  128. if (indexOutputPath != null) {
  129. saveIndexFile(index, indexOutputPath);
  130. }
  131. return index;
  132. }
  133. /**
  134. * 生成单页索引(无分页符的文档)
  135. */
  136. private DocumentIndex generateSinglePageIndex(String text, String documentId, String indexOutputPath) {
  137. List<LineIndex> lineIndices = new ArrayList<>();
  138. int charPos = 0;
  139. int lineNum = 1;
  140. String[] lines = text.split("\n", -1);
  141. for (String line : lines) {
  142. LineIndex lineIndex = new LineIndex();
  143. lineIndex.setLine(lineNum);
  144. lineIndex.setCharStart(charPos);
  145. lineIndex.setCharEnd(charPos + line.length());
  146. lineIndices.add(lineIndex);
  147. charPos += line.length() + 1; // +1 for \n
  148. lineNum++;
  149. }
  150. // 创建文档索引(无分页,只有行索引)
  151. DocumentIndex index = new DocumentIndex();
  152. index.setDocumentId(documentId);
  153. index.setTotalChars(text.length());
  154. index.setTotalLines(lines.length);
  155. index.setTotalPages(1); // 非分页文档统一为1页
  156. index.setLines(lineIndices);
  157. // 创建虚拟的单页索引
  158. List<PageIndex> pages = new ArrayList<>();
  159. PageIndex singlePage = new PageIndex();
  160. singlePage.setPage(1);
  161. singlePage.setCharStart(0);
  162. singlePage.setCharEnd(text.length());
  163. singlePage.setLineStart(1);
  164. singlePage.setLineEnd(lines.length);
  165. pages.add(singlePage);
  166. index.setPages(pages);
  167. // 保存索引文件
  168. if (indexOutputPath != null) {
  169. saveIndexFile(index, indexOutputPath);
  170. }
  171. return index;
  172. }
  173. /**
  174. * 读取索引文件
  175. */
  176. public DocumentIndex loadIndex(String indexFilePath) {
  177. try {
  178. Path path = Paths.get(indexFilePath);
  179. if (!Files.exists(path)) {
  180. log.warn("索引文件不存在: {}", indexFilePath);
  181. return null;
  182. }
  183. String json = Files.readString(path);
  184. return objectMapper.readValue(json, DocumentIndex.class);
  185. } catch (Exception e) {
  186. log.error("读取索引文件失败: {}", indexFilePath, e);
  187. return null;
  188. }
  189. }
  190. /**
  191. * 根据字符位置查找页码和行号
  192. *
  193. * @param index 文档索引
  194. * @param charPosition 字符位置
  195. * @return [页码, 行号] 或 null(未找到)
  196. */
  197. public int[] findPageAndLine(DocumentIndex index, int charPosition) {
  198. if (index == null || index.getPages() == null) {
  199. return null;
  200. }
  201. // 1. 找到所在页
  202. int page = 1;
  203. for (PageIndex pageIndex : index.getPages()) {
  204. if (charPosition >= pageIndex.getCharStart() && charPosition <= pageIndex.getCharEnd()) {
  205. page = pageIndex.getPage();
  206. break;
  207. }
  208. }
  209. // 2. 找到所在行
  210. int line = 1;
  211. if (index.getLines() != null) {
  212. for (LineIndex lineIndex : index.getLines()) {
  213. if (charPosition >= lineIndex.getCharStart() && charPosition <= lineIndex.getCharEnd()) {
  214. line = lineIndex.getLine();
  215. break;
  216. }
  217. }
  218. }
  219. return new int[]{page, line};
  220. }
  221. private DocumentIndex createEmptyIndex(String documentId) {
  222. DocumentIndex index = new DocumentIndex();
  223. index.setDocumentId(documentId);
  224. index.setTotalChars(0);
  225. index.setTotalLines(0);
  226. index.setTotalPages(0);
  227. index.setPages(new ArrayList<>());
  228. return index;
  229. }
  230. private void saveIndexFile(DocumentIndex index, String outputPath) {
  231. try {
  232. Path path = Paths.get(outputPath);
  233. Files.createDirectories(path.getParent());
  234. String json = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(index);
  235. Files.writeString(path, json);
  236. log.info("文档索引文件已保存: {}", outputPath);
  237. } catch (Exception e) {
  238. log.error("保存文档索引文件失败: {}", outputPath, e);
  239. }
  240. }
  241. /**
  242. * 文档索引
  243. */
  244. @Data
  245. public static class DocumentIndex {
  246. private String documentId;
  247. private List<PageIndex> pages;
  248. private List<LineIndex> lines;
  249. private int totalChars;
  250. private int totalLines;
  251. private int totalPages;
  252. }
  253. /**
  254. * 页面索引
  255. */
  256. @Data
  257. public static class PageIndex {
  258. private int page;
  259. private int charStart;
  260. private int charEnd;
  261. private int lineStart;
  262. private int lineEnd;
  263. private boolean ocrUsed;
  264. }
  265. /**
  266. * 行索引
  267. */
  268. @Data
  269. public static class LineIndex {
  270. private int line;
  271. private int charStart;
  272. private int charEnd;
  273. }
  274. }