|
|
@@ -1,337 +0,0 @@
|
|
|
-package com.lingyue.graph.service;
|
|
|
-
|
|
|
-import com.fasterxml.jackson.databind.JsonNode;
|
|
|
-import com.fasterxml.jackson.databind.ObjectMapper;
|
|
|
-import lombok.Data;
|
|
|
-import lombok.extern.slf4j.Slf4j;
|
|
|
-import org.springframework.beans.factory.annotation.Value;
|
|
|
-import org.springframework.stereotype.Service;
|
|
|
-
|
|
|
-import java.nio.file.Files;
|
|
|
-import java.nio.file.Path;
|
|
|
-import java.util.HashMap;
|
|
|
-import java.util.Map;
|
|
|
-import java.util.concurrent.ConcurrentHashMap;
|
|
|
-
|
|
|
-/**
|
|
|
- * 位置映射服务
|
|
|
- * 根据文档索引将字符位置映射到页码和行号
|
|
|
- *
|
|
|
- * @author lingyue
|
|
|
- * @since 2026-01-20
|
|
|
- */
|
|
|
-@Slf4j
|
|
|
-@Service
|
|
|
-public class PositionMappingService {
|
|
|
-
|
|
|
- private final ObjectMapper objectMapper;
|
|
|
-
|
|
|
- @Value("${file.storage.text-path:/data/lingyue/texts}")
|
|
|
- private String textStoragePath;
|
|
|
-
|
|
|
- // 索引缓存,避免重复读取文件
|
|
|
- private final Map<String, DocumentIndex> indexCache = new ConcurrentHashMap<>();
|
|
|
-
|
|
|
- public PositionMappingService(ObjectMapper objectMapper) {
|
|
|
- this.objectMapper = objectMapper;
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * 根据字符位置获取完整位置信息(包含页码和行号)
|
|
|
- *
|
|
|
- * @param documentId 文档ID
|
|
|
- * @param charStart 字符起始位置
|
|
|
- * @param charEnd 字符结束位置
|
|
|
- * @return 位置信息 Map(包含 charStart, charEnd, page, line)
|
|
|
- */
|
|
|
- public Map<String, Object> mapCharToPosition(String documentId, int charStart, int charEnd) {
|
|
|
- Map<String, Object> position = new HashMap<>();
|
|
|
- position.put("charStart", charStart);
|
|
|
- position.put("charEnd", charEnd);
|
|
|
-
|
|
|
- // 尝试加载文档索引
|
|
|
- DocumentIndex index = loadDocumentIndex(documentId);
|
|
|
- if (index == null) {
|
|
|
- log.debug("未找到文档索引,返回仅包含字符位置的信息: documentId={}", documentId);
|
|
|
- return position;
|
|
|
- }
|
|
|
-
|
|
|
- // 查找页码
|
|
|
- int page = findPage(index, charStart);
|
|
|
- position.put("page", page);
|
|
|
-
|
|
|
- // 查找行号
|
|
|
- int line = findLine(index, charStart);
|
|
|
- position.put("line", line);
|
|
|
-
|
|
|
- // 计算全局行号(如果有行索引)
|
|
|
- if (index.getLines() != null && index.getLines().length > 0) {
|
|
|
- position.put("globalLine", line);
|
|
|
- }
|
|
|
-
|
|
|
- return position;
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * 根据页码和行号获取字符位置范围
|
|
|
- *
|
|
|
- * @param documentId 文档ID
|
|
|
- * @param page 页码
|
|
|
- * @param line 行号(页内行号)
|
|
|
- * @return 字符位置范围 [charStart, charEnd] 或 null
|
|
|
- */
|
|
|
- public int[] mapPageLineToChar(String documentId, int page, int line) {
|
|
|
- DocumentIndex index = loadDocumentIndex(documentId);
|
|
|
- if (index == null || index.getPages() == null) {
|
|
|
- return null;
|
|
|
- }
|
|
|
-
|
|
|
- // 找到对应页
|
|
|
- PageIndex pageIndex = null;
|
|
|
- for (PageIndex p : index.getPages()) {
|
|
|
- if (p.getPage() == page) {
|
|
|
- pageIndex = p;
|
|
|
- break;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- if (pageIndex == null) {
|
|
|
- return null;
|
|
|
- }
|
|
|
-
|
|
|
- // 如果有行索引,找到具体行
|
|
|
- if (index.getLines() != null) {
|
|
|
- // 计算目标全局行号
|
|
|
- int targetGlobalLine = pageIndex.getLineStart() + line - 1;
|
|
|
- for (LineIndex lineIndex : index.getLines()) {
|
|
|
- if (lineIndex.getLine() == targetGlobalLine) {
|
|
|
- return new int[]{lineIndex.getCharStart(), lineIndex.getCharEnd()};
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- // 没有行索引,返回页的范围
|
|
|
- return new int[]{pageIndex.getCharStart(), pageIndex.getCharEnd()};
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * 批量映射位置信息
|
|
|
- * 用于一次性处理多个实体的位置
|
|
|
- *
|
|
|
- * @param documentId 文档ID
|
|
|
- * @param charPositions 字符位置列表 [[charStart1, charEnd1], [charStart2, charEnd2], ...]
|
|
|
- * @return 完整位置信息列表
|
|
|
- */
|
|
|
- public Map<String, Object>[] mapCharToPositionBatch(String documentId, int[][] charPositions) {
|
|
|
- // 预加载索引
|
|
|
- DocumentIndex index = loadDocumentIndex(documentId);
|
|
|
-
|
|
|
- @SuppressWarnings("unchecked")
|
|
|
- Map<String, Object>[] results = new Map[charPositions.length];
|
|
|
-
|
|
|
- for (int i = 0; i < charPositions.length; i++) {
|
|
|
- int charStart = charPositions[i][0];
|
|
|
- int charEnd = charPositions[i][1];
|
|
|
-
|
|
|
- Map<String, Object> position = new HashMap<>();
|
|
|
- position.put("charStart", charStart);
|
|
|
- position.put("charEnd", charEnd);
|
|
|
-
|
|
|
- if (index != null) {
|
|
|
- position.put("page", findPage(index, charStart));
|
|
|
- position.put("line", findLine(index, charStart));
|
|
|
- }
|
|
|
-
|
|
|
- results[i] = position;
|
|
|
- }
|
|
|
-
|
|
|
- return results;
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * 清除缓存的索引
|
|
|
- */
|
|
|
- public void clearCache(String documentId) {
|
|
|
- indexCache.remove(documentId);
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * 清除所有缓存
|
|
|
- */
|
|
|
- public void clearAllCache() {
|
|
|
- indexCache.clear();
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * 加载文档索引
|
|
|
- */
|
|
|
- private DocumentIndex loadDocumentIndex(String documentId) {
|
|
|
- // 先检查缓存
|
|
|
- if (indexCache.containsKey(documentId)) {
|
|
|
- log.debug("从缓存加载索引: documentId={}", documentId);
|
|
|
- return indexCache.get(documentId);
|
|
|
- }
|
|
|
-
|
|
|
- // 构建索引文件路径
|
|
|
- String indexFilePath = buildIndexFilePath(documentId);
|
|
|
- Path path = Path.of(indexFilePath);
|
|
|
-
|
|
|
- log.debug("尝试加载索引文件: {}", indexFilePath);
|
|
|
-
|
|
|
- if (!Files.exists(path)) {
|
|
|
- log.info("索引文件不存在,无法补充页码信息: {}", indexFilePath);
|
|
|
- return null;
|
|
|
- }
|
|
|
-
|
|
|
- try {
|
|
|
- String json = Files.readString(path);
|
|
|
- JsonNode root = objectMapper.readTree(json);
|
|
|
-
|
|
|
- DocumentIndex index = new DocumentIndex();
|
|
|
- index.setDocumentId(root.path("documentId").asText(documentId));
|
|
|
- index.setTotalChars(root.path("totalChars").asInt(0));
|
|
|
- index.setTotalLines(root.path("totalLines").asInt(0));
|
|
|
- index.setTotalPages(root.path("totalPages").asInt(0));
|
|
|
-
|
|
|
- // 解析页面索引
|
|
|
- JsonNode pagesNode = root.path("pages");
|
|
|
- if (pagesNode.isArray()) {
|
|
|
- PageIndex[] pages = new PageIndex[pagesNode.size()];
|
|
|
- for (int i = 0; i < pagesNode.size(); i++) {
|
|
|
- JsonNode pageNode = pagesNode.get(i);
|
|
|
- PageIndex pageIndex = new PageIndex();
|
|
|
- pageIndex.setPage(pageNode.path("page").asInt(i + 1));
|
|
|
- pageIndex.setCharStart(pageNode.path("charStart").asInt(0));
|
|
|
- pageIndex.setCharEnd(pageNode.path("charEnd").asInt(0));
|
|
|
- pageIndex.setLineStart(pageNode.path("lineStart").asInt(1));
|
|
|
- pageIndex.setLineEnd(pageNode.path("lineEnd").asInt(1));
|
|
|
- pages[i] = pageIndex;
|
|
|
- }
|
|
|
- index.setPages(pages);
|
|
|
- }
|
|
|
-
|
|
|
- // 解析行索引
|
|
|
- JsonNode linesNode = root.path("lines");
|
|
|
- if (linesNode.isArray()) {
|
|
|
- LineIndex[] lines = new LineIndex[linesNode.size()];
|
|
|
- for (int i = 0; i < linesNode.size(); i++) {
|
|
|
- JsonNode lineNode = linesNode.get(i);
|
|
|
- LineIndex lineIndex = new LineIndex();
|
|
|
- lineIndex.setLine(lineNode.path("line").asInt(i + 1));
|
|
|
- lineIndex.setCharStart(lineNode.path("charStart").asInt(0));
|
|
|
- lineIndex.setCharEnd(lineNode.path("charEnd").asInt(0));
|
|
|
- lines[i] = lineIndex;
|
|
|
- }
|
|
|
- index.setLines(lines);
|
|
|
- }
|
|
|
-
|
|
|
- // 缓存
|
|
|
- indexCache.put(documentId, index);
|
|
|
- log.debug("已加载并缓存文档索引: documentId={}, pages={}, lines={}",
|
|
|
- documentId,
|
|
|
- index.getPages() != null ? index.getPages().length : 0,
|
|
|
- index.getLines() != null ? index.getLines().length : 0);
|
|
|
-
|
|
|
- return index;
|
|
|
- } catch (Exception e) {
|
|
|
- log.error("加载文档索引失败: documentId={}", documentId, e);
|
|
|
- return null;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * 构建索引文件路径
|
|
|
- */
|
|
|
- private String buildIndexFilePath(String documentId) {
|
|
|
- return Path.of(
|
|
|
- textStoragePath,
|
|
|
- documentId.substring(0, 2),
|
|
|
- documentId + "_index.json"
|
|
|
- ).toString();
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * 使用二分查找页码
|
|
|
- */
|
|
|
- private int findPage(DocumentIndex index, int charPosition) {
|
|
|
- if (index.getPages() == null || index.getPages().length == 0) {
|
|
|
- return 1;
|
|
|
- }
|
|
|
-
|
|
|
- for (PageIndex page : index.getPages()) {
|
|
|
- if (charPosition >= page.getCharStart() && charPosition <= page.getCharEnd()) {
|
|
|
- return page.getPage();
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- // 如果未找到,返回最后一页
|
|
|
- return index.getPages()[index.getPages().length - 1].getPage();
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * 使用二分查找行号
|
|
|
- */
|
|
|
- private int findLine(DocumentIndex index, int charPosition) {
|
|
|
- if (index.getLines() == null || index.getLines().length == 0) {
|
|
|
- return 1;
|
|
|
- }
|
|
|
-
|
|
|
- // 二分查找
|
|
|
- int left = 0;
|
|
|
- int right = index.getLines().length - 1;
|
|
|
-
|
|
|
- while (left <= right) {
|
|
|
- int mid = (left + right) / 2;
|
|
|
- LineIndex line = index.getLines()[mid];
|
|
|
-
|
|
|
- if (charPosition < line.getCharStart()) {
|
|
|
- right = mid - 1;
|
|
|
- } else if (charPosition > line.getCharEnd()) {
|
|
|
- left = mid + 1;
|
|
|
- } else {
|
|
|
- return line.getLine();
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- // 如果未找到,返回最近的行
|
|
|
- if (left >= index.getLines().length) {
|
|
|
- return index.getLines()[index.getLines().length - 1].getLine();
|
|
|
- }
|
|
|
- return index.getLines()[left].getLine();
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * 文档索引
|
|
|
- */
|
|
|
- @Data
|
|
|
- public static class DocumentIndex {
|
|
|
- private String documentId;
|
|
|
- private PageIndex[] pages;
|
|
|
- private LineIndex[] lines;
|
|
|
- private int totalChars;
|
|
|
- private int totalLines;
|
|
|
- private int totalPages;
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * 页面索引
|
|
|
- */
|
|
|
- @Data
|
|
|
- public static class PageIndex {
|
|
|
- private int page;
|
|
|
- private int charStart;
|
|
|
- private int charEnd;
|
|
|
- private int lineStart;
|
|
|
- private int lineEnd;
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * 行索引
|
|
|
- */
|
|
|
- @Data
|
|
|
- public static class LineIndex {
|
|
|
- private int line;
|
|
|
- private int charStart;
|
|
|
- private int charEnd;
|
|
|
- }
|
|
|
-}
|