|
|
@@ -0,0 +1,334 @@
|
|
|
+package com.lingyue.graph.service;
|
|
|
+
|
|
|
+import com.fasterxml.jackson.databind.JsonNode;
|
|
|
+import com.fasterxml.jackson.databind.ObjectMapper;
|
|
|
+import lombok.Data;
|
|
|
+import lombok.extern.slf4j.Slf4j;
|
|
|
+import org.springframework.beans.factory.annotation.Value;
|
|
|
+import org.springframework.stereotype.Service;
|
|
|
+
|
|
|
+import java.nio.file.Files;
|
|
|
+import java.nio.file.Path;
|
|
|
+import java.util.HashMap;
|
|
|
+import java.util.Map;
|
|
|
+import java.util.concurrent.ConcurrentHashMap;
|
|
|
+
|
|
|
+/**
|
|
|
+ * 位置映射服务
|
|
|
+ * 根据文档索引将字符位置映射到页码和行号
|
|
|
+ *
|
|
|
+ * @author lingyue
|
|
|
+ * @since 2026-01-20
|
|
|
+ */
|
|
|
+@Slf4j
|
|
|
+@Service
|
|
|
+public class PositionMappingService {
|
|
|
+
|
|
|
+ private final ObjectMapper objectMapper;
|
|
|
+
|
|
|
+ @Value("${file.storage.text-path:/data/lingyue/texts}")
|
|
|
+ private String textStoragePath;
|
|
|
+
|
|
|
+ // 索引缓存,避免重复读取文件
|
|
|
+ private final Map<String, DocumentIndex> indexCache = new ConcurrentHashMap<>();
|
|
|
+
|
|
|
+ public PositionMappingService(ObjectMapper objectMapper) {
|
|
|
+ this.objectMapper = objectMapper;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 根据字符位置获取完整位置信息(包含页码和行号)
|
|
|
+ *
|
|
|
+ * @param documentId 文档ID
|
|
|
+ * @param charStart 字符起始位置
|
|
|
+ * @param charEnd 字符结束位置
|
|
|
+ * @return 位置信息 Map(包含 charStart, charEnd, page, line)
|
|
|
+ */
|
|
|
+ public Map<String, Object> mapCharToPosition(String documentId, int charStart, int charEnd) {
|
|
|
+ Map<String, Object> position = new HashMap<>();
|
|
|
+ position.put("charStart", charStart);
|
|
|
+ position.put("charEnd", charEnd);
|
|
|
+
|
|
|
+ // 尝试加载文档索引
|
|
|
+ DocumentIndex index = loadDocumentIndex(documentId);
|
|
|
+ if (index == null) {
|
|
|
+ log.debug("未找到文档索引,返回仅包含字符位置的信息: documentId={}", documentId);
|
|
|
+ return position;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 查找页码
|
|
|
+ int page = findPage(index, charStart);
|
|
|
+ position.put("page", page);
|
|
|
+
|
|
|
+ // 查找行号
|
|
|
+ int line = findLine(index, charStart);
|
|
|
+ position.put("line", line);
|
|
|
+
|
|
|
+ // 计算全局行号(如果有行索引)
|
|
|
+ if (index.getLines() != null && index.getLines().length > 0) {
|
|
|
+ position.put("globalLine", line);
|
|
|
+ }
|
|
|
+
|
|
|
+ return position;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 根据页码和行号获取字符位置范围
|
|
|
+ *
|
|
|
+ * @param documentId 文档ID
|
|
|
+ * @param page 页码
|
|
|
+ * @param line 行号(页内行号)
|
|
|
+ * @return 字符位置范围 [charStart, charEnd] 或 null
|
|
|
+ */
|
|
|
+ public int[] mapPageLineToChar(String documentId, int page, int line) {
|
|
|
+ DocumentIndex index = loadDocumentIndex(documentId);
|
|
|
+ if (index == null || index.getPages() == null) {
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 找到对应页
|
|
|
+ PageIndex pageIndex = null;
|
|
|
+ for (PageIndex p : index.getPages()) {
|
|
|
+ if (p.getPage() == page) {
|
|
|
+ pageIndex = p;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (pageIndex == null) {
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 如果有行索引,找到具体行
|
|
|
+ if (index.getLines() != null) {
|
|
|
+ // 计算目标全局行号
|
|
|
+ int targetGlobalLine = pageIndex.getLineStart() + line - 1;
|
|
|
+ for (LineIndex lineIndex : index.getLines()) {
|
|
|
+ if (lineIndex.getLine() == targetGlobalLine) {
|
|
|
+ return new int[]{lineIndex.getCharStart(), lineIndex.getCharEnd()};
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 没有行索引,返回页的范围
|
|
|
+ return new int[]{pageIndex.getCharStart(), pageIndex.getCharEnd()};
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 批量映射位置信息
|
|
|
+ * 用于一次性处理多个实体的位置
|
|
|
+ *
|
|
|
+ * @param documentId 文档ID
|
|
|
+ * @param charPositions 字符位置列表 [[charStart1, charEnd1], [charStart2, charEnd2], ...]
|
|
|
+ * @return 完整位置信息列表
|
|
|
+ */
|
|
|
+ public Map<String, Object>[] mapCharToPositionBatch(String documentId, int[][] charPositions) {
|
|
|
+ // 预加载索引
|
|
|
+ DocumentIndex index = loadDocumentIndex(documentId);
|
|
|
+
|
|
|
+ @SuppressWarnings("unchecked")
|
|
|
+ Map<String, Object>[] results = new Map[charPositions.length];
|
|
|
+
|
|
|
+ for (int i = 0; i < charPositions.length; i++) {
|
|
|
+ int charStart = charPositions[i][0];
|
|
|
+ int charEnd = charPositions[i][1];
|
|
|
+
|
|
|
+ Map<String, Object> position = new HashMap<>();
|
|
|
+ position.put("charStart", charStart);
|
|
|
+ position.put("charEnd", charEnd);
|
|
|
+
|
|
|
+ if (index != null) {
|
|
|
+ position.put("page", findPage(index, charStart));
|
|
|
+ position.put("line", findLine(index, charStart));
|
|
|
+ }
|
|
|
+
|
|
|
+ results[i] = position;
|
|
|
+ }
|
|
|
+
|
|
|
+ return results;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 清除缓存的索引
|
|
|
+ */
|
|
|
+ public void clearCache(String documentId) {
|
|
|
+ indexCache.remove(documentId);
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 清除所有缓存
|
|
|
+ */
|
|
|
+ public void clearAllCache() {
|
|
|
+ indexCache.clear();
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 加载文档索引
|
|
|
+ */
|
|
|
+ private DocumentIndex loadDocumentIndex(String documentId) {
|
|
|
+ // 先检查缓存
|
|
|
+ if (indexCache.containsKey(documentId)) {
|
|
|
+ return indexCache.get(documentId);
|
|
|
+ }
|
|
|
+
|
|
|
+ // 构建索引文件路径
|
|
|
+ String indexFilePath = buildIndexFilePath(documentId);
|
|
|
+ Path path = Path.of(indexFilePath);
|
|
|
+
|
|
|
+ if (!Files.exists(path)) {
|
|
|
+ log.debug("索引文件不存在: {}", indexFilePath);
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+
|
|
|
+ try {
|
|
|
+ String json = Files.readString(path);
|
|
|
+ JsonNode root = objectMapper.readTree(json);
|
|
|
+
|
|
|
+ DocumentIndex index = new DocumentIndex();
|
|
|
+ index.setDocumentId(root.path("documentId").asText(documentId));
|
|
|
+ index.setTotalChars(root.path("totalChars").asInt(0));
|
|
|
+ index.setTotalLines(root.path("totalLines").asInt(0));
|
|
|
+ index.setTotalPages(root.path("totalPages").asInt(0));
|
|
|
+
|
|
|
+ // 解析页面索引
|
|
|
+ JsonNode pagesNode = root.path("pages");
|
|
|
+ if (pagesNode.isArray()) {
|
|
|
+ PageIndex[] pages = new PageIndex[pagesNode.size()];
|
|
|
+ for (int i = 0; i < pagesNode.size(); i++) {
|
|
|
+ JsonNode pageNode = pagesNode.get(i);
|
|
|
+ PageIndex pageIndex = new PageIndex();
|
|
|
+ pageIndex.setPage(pageNode.path("page").asInt(i + 1));
|
|
|
+ pageIndex.setCharStart(pageNode.path("charStart").asInt(0));
|
|
|
+ pageIndex.setCharEnd(pageNode.path("charEnd").asInt(0));
|
|
|
+ pageIndex.setLineStart(pageNode.path("lineStart").asInt(1));
|
|
|
+ pageIndex.setLineEnd(pageNode.path("lineEnd").asInt(1));
|
|
|
+ pages[i] = pageIndex;
|
|
|
+ }
|
|
|
+ index.setPages(pages);
|
|
|
+ }
|
|
|
+
|
|
|
+ // 解析行索引
|
|
|
+ JsonNode linesNode = root.path("lines");
|
|
|
+ if (linesNode.isArray()) {
|
|
|
+ LineIndex[] lines = new LineIndex[linesNode.size()];
|
|
|
+ for (int i = 0; i < linesNode.size(); i++) {
|
|
|
+ JsonNode lineNode = linesNode.get(i);
|
|
|
+ LineIndex lineIndex = new LineIndex();
|
|
|
+ lineIndex.setLine(lineNode.path("line").asInt(i + 1));
|
|
|
+ lineIndex.setCharStart(lineNode.path("charStart").asInt(0));
|
|
|
+ lineIndex.setCharEnd(lineNode.path("charEnd").asInt(0));
|
|
|
+ lines[i] = lineIndex;
|
|
|
+ }
|
|
|
+ index.setLines(lines);
|
|
|
+ }
|
|
|
+
|
|
|
+ // 缓存
|
|
|
+ indexCache.put(documentId, index);
|
|
|
+ log.debug("已加载并缓存文档索引: documentId={}, pages={}, lines={}",
|
|
|
+ documentId,
|
|
|
+ index.getPages() != null ? index.getPages().length : 0,
|
|
|
+ index.getLines() != null ? index.getLines().length : 0);
|
|
|
+
|
|
|
+ return index;
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("加载文档索引失败: documentId={}", documentId, e);
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 构建索引文件路径
|
|
|
+ */
|
|
|
+ private String buildIndexFilePath(String documentId) {
|
|
|
+ return Path.of(
|
|
|
+ textStoragePath,
|
|
|
+ documentId.substring(0, 2),
|
|
|
+ documentId + "_index.json"
|
|
|
+ ).toString();
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 使用二分查找页码
|
|
|
+ */
|
|
|
+ private int findPage(DocumentIndex index, int charPosition) {
|
|
|
+ if (index.getPages() == null || index.getPages().length == 0) {
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ for (PageIndex page : index.getPages()) {
|
|
|
+ if (charPosition >= page.getCharStart() && charPosition <= page.getCharEnd()) {
|
|
|
+ return page.getPage();
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 如果未找到,返回最后一页
|
|
|
+ return index.getPages()[index.getPages().length - 1].getPage();
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 使用二分查找行号
|
|
|
+ */
|
|
|
+ private int findLine(DocumentIndex index, int charPosition) {
|
|
|
+ if (index.getLines() == null || index.getLines().length == 0) {
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 二分查找
|
|
|
+ int left = 0;
|
|
|
+ int right = index.getLines().length - 1;
|
|
|
+
|
|
|
+ while (left <= right) {
|
|
|
+ int mid = (left + right) / 2;
|
|
|
+ LineIndex line = index.getLines()[mid];
|
|
|
+
|
|
|
+ if (charPosition < line.getCharStart()) {
|
|
|
+ right = mid - 1;
|
|
|
+ } else if (charPosition > line.getCharEnd()) {
|
|
|
+ left = mid + 1;
|
|
|
+ } else {
|
|
|
+ return line.getLine();
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 如果未找到,返回最近的行
|
|
|
+ if (left >= index.getLines().length) {
|
|
|
+ return index.getLines()[index.getLines().length - 1].getLine();
|
|
|
+ }
|
|
|
+ return index.getLines()[left].getLine();
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 文档索引
|
|
|
+ */
|
|
|
+ @Data
|
|
|
+ public static class DocumentIndex {
|
|
|
+ private String documentId;
|
|
|
+ private PageIndex[] pages;
|
|
|
+ private LineIndex[] lines;
|
|
|
+ private int totalChars;
|
|
|
+ private int totalLines;
|
|
|
+ private int totalPages;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 页面索引
|
|
|
+ */
|
|
|
+ @Data
|
|
|
+ public static class PageIndex {
|
|
|
+ private int page;
|
|
|
+ private int charStart;
|
|
|
+ private int charEnd;
|
|
|
+ private int lineStart;
|
|
|
+ private int lineEnd;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 行索引
|
|
|
+ */
|
|
|
+ @Data
|
|
|
+ public static class LineIndex {
|
|
|
+ private int line;
|
|
|
+ private int charStart;
|
|
|
+ private int charEnd;
|
|
|
+ }
|
|
|
+}
|