Преглед изворни кода

feat: 添加位置映射服务并自动补充实体页码行号

- 新增 PositionMappingService 读取文档索引并映射字符位置到页码/行号
- 支持索引缓存,提高批量处理性能
- GraphNerService 保存实体时自动调用 enrichPosition 补充页码行号
- 支持二分查找优化行号查找效率
何文松 пре 1 месец
родитељ
комит
9ac992d618

+ 65 - 4
backend/graph-service/src/main/java/com/lingyue/graph/service/GraphNerService.java

@@ -1,7 +1,6 @@
 package com.lingyue.graph.service;
 
 import com.lingyue.common.exception.ServiceException;
-import com.lingyue.graph.dto.*;
 import com.lingyue.graph.entity.GraphNode;
 import com.lingyue.graph.entity.GraphRelation;
 import com.lingyue.graph.entity.TextStorage;
@@ -35,6 +34,7 @@ public class GraphNerService {
     private final TextStorageRepository textStorageRepository;
     private final GraphNodeRepository graphNodeRepository;
     private final GraphRelationRepository graphRelationRepository;
+    private final PositionMappingService positionMappingService;
 
     /**
      * 获取文档的文本内容
@@ -99,13 +99,17 @@ public class GraphNerService {
             node.setCreateTime(new Date());
             node.setUpdateTime(new Date());
             
-            // 转换位置信息
+            // 转换位置信息并补充页码/行号
             Object positionObj = entity.get("position");
             if (positionObj instanceof Map) {
                 @SuppressWarnings("unchecked")
                 Map<String, Object> posMap = (Map<String, Object>) positionObj;
-                log.debug("实体位置信息: name={}, position={}", node.getName(), posMap);
-                node.setPosition(posMap);
+                
+                // 使用 PositionMappingService 补充页码和行号
+                Map<String, Object> enrichedPosition = enrichPosition(documentId, posMap);
+                
+                log.debug("实体位置信息: name={}, position={}", node.getName(), enrichedPosition);
+                node.setPosition(enrichedPosition);
             } else {
                 log.debug("实体无位置信息: name={}, positionObj={}", node.getName(), positionObj);
             }
@@ -290,6 +294,63 @@ public class GraphNerService {
         }
     }
 
+    /**
+     * Enriches an entity position with page and line numbers.
+     *
+     * <p>If {@code posMap} already carries non-null {@code page} and {@code line}
+     * values, or lacks a usable {@code charStart}/{@code charEnd}, it is returned
+     * unchanged. Otherwise the character offsets are passed to
+     * {@code positionMappingService.mapCharToPosition} and the result is merged over
+     * a copy of {@code posMap}; the caller's map is never mutated.
+     *
+     * <p>Mapping is best-effort: any exception is logged and the original map is
+     * returned.
+     *
+     * @param documentId document ID
+     * @param posMap original position info (expected to contain charStart, charEnd)
+     * @return the enriched copy, or {@code posMap} itself when nothing was added
+     */
+    private Map<String, Object> enrichPosition(String documentId, Map<String, Object> posMap) {
+        // Map.get already yields null for a missing key, so no containsKey check is needed
+        if (posMap.get("page") != null && posMap.get("line") != null) {
+            return posMap;
+        }
+
+        Integer charStart = getIntValue(posMap, "charStart");
+        Integer charEnd = getIntValue(posMap, "charEnd");
+        if (charStart == null || charEnd == null) {
+            return posMap;
+        }
+
+        try {
+            Map<String, Object> mappedPosition =
+                    positionMappingService.mapCharToPosition(documentId, charStart, charEnd);
+
+            // Mapped values win over the original ones on key collisions
+            Map<String, Object> enrichedPosition = new HashMap<>(posMap);
+            enrichedPosition.putAll(mappedPosition);
+            return enrichedPosition;
+        } catch (Exception e) {
+            // The trailing throwable keeps the stack trace; e.getMessage() alone may be null
+            log.warn("位置映射失败: documentId={}, charStart={}, charEnd={}, error={}",
+                    documentId, charStart, charEnd, e.getMessage(), e);
+            return posMap;
+        }
+    }
+    
+    /**
+     * Reads {@code key} from {@code map} as an {@link Integer}.
+     *
+     * <p>Numbers are narrowed with {@link Number#intValue()}; any other value is parsed
+     * from its {@code toString()} form.
+     *
+     * @return the integer value, or {@code null} when the key is absent, maps to
+     *         {@code null}, or cannot be parsed as an integer
+     */
+    private Integer getIntValue(Map<String, Object> map, String key) {
+        Object raw = map.get(key);
+        if (raw instanceof Number) {
+            return ((Number) raw).intValue();
+        }
+        if (raw == null) {
+            return null;
+        }
+        try {
+            return Integer.valueOf(raw.toString());
+        } catch (NumberFormatException ignored) {
+            // Non-numeric text is treated as "no value"
+            return null;
+        }
+    }
+    
     /**
      * 从 Map 中获取字符串值
      */

+ 334 - 0
backend/graph-service/src/main/java/com/lingyue/graph/service/PositionMappingService.java

@@ -0,0 +1,334 @@
+package com.lingyue.graph.service;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import lombok.Data;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Service;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * 位置映射服务
+ * 根据文档索引将字符位置映射到页码和行号
+ * 
+ * @author lingyue
+ * @since 2026-01-20
+ */
+@Slf4j
+@Service
+public class PositionMappingService {
+    
+    private final ObjectMapper objectMapper; // Jackson mapper used to parse index JSON files
+    
+    @Value("${file.storage.text-path:/data/lingyue/texts}")
+    private String textStoragePath; // root directory under which index files are looked up
+    
+    // Index cache keyed by document ID, so an index file is not re-read on every lookup
+    private final Map<String, DocumentIndex> indexCache = new ConcurrentHashMap<>();
+    
+    public PositionMappingService(ObjectMapper objectMapper) {
+        this.objectMapper = objectMapper;
+    }
+    
+    /**
+     * Resolves a character range to a position map with page and line numbers.
+     *
+     * <p>The result always echoes {@code charStart} and {@code charEnd}. When an index
+     * is available for the document, {@code page} and {@code line} are added (both
+     * looked up from {@code charStart} only), plus {@code globalLine} when the index
+     * has line entries. Without an index only the two character offsets are returned.
+     *
+     * <p>NOTE(review): {@code line} and {@code globalLine} currently carry the same
+     * value, whereas {@code mapPageLineToChar} treats its {@code line} argument as
+     * page-relative. Confirm which meaning consumers of {@code line} expect.
+     *
+     * @param documentId document ID
+     * @param charStart start character offset
+     * @param charEnd end character offset
+     * @return a new map with charStart, charEnd and, if resolvable, page, line, globalLine
+     */
+    public Map<String, Object> mapCharToPosition(String documentId, int charStart, int charEnd) {
+        Map<String, Object> result = new HashMap<>();
+        result.put("charStart", charStart);
+        result.put("charEnd", charEnd);
+
+        DocumentIndex docIndex = loadDocumentIndex(documentId);
+        if (docIndex != null) {
+            int lineNumber = findLine(docIndex, charStart);
+            result.put("page", findPage(docIndex, charStart));
+            result.put("line", lineNumber);
+
+            LineIndex[] lineEntries = docIndex.getLines();
+            boolean hasLineIndex = lineEntries != null && lineEntries.length > 0;
+            if (hasLineIndex) {
+                result.put("globalLine", lineNumber);
+            }
+            return result;
+        }
+
+        log.debug("未找到文档索引,返回仅包含字符位置的信息: documentId={}", documentId);
+        return result;
+    }
+    
+    /**
+     * Resolves a page number and a page-relative line number to a character range.
+     *
+     * <p>The first page entry whose number equals {@code page} is used. The wanted
+     * global line number is {@code lineStart + line - 1}; if a line entry with that
+     * number exists its range is returned, otherwise the range of the whole page.
+     * The line number is not checked against the page's {@code lineEnd}.
+     *
+     * @param documentId document ID
+     * @param page page number
+     * @param line line number within the page
+     * @return {@code [charStart, charEnd]}, or {@code null} when the document has no
+     *         index, the index has no page table, or the page is not listed
+     */
+    public int[] mapPageLineToChar(String documentId, int page, int line) {
+        DocumentIndex docIndex = loadDocumentIndex(documentId);
+        if (docIndex == null) {
+            return null;
+        }
+        PageIndex[] pageEntries = docIndex.getPages();
+        if (pageEntries == null) {
+            return null;
+        }
+
+        // Locate the first entry carrying the requested page number
+        PageIndex matchedPage = null;
+        for (int i = 0; i < pageEntries.length && matchedPage == null; i++) {
+            if (pageEntries[i].getPage() == page) {
+                matchedPage = pageEntries[i];
+            }
+        }
+        if (matchedPage == null) {
+            return null;
+        }
+
+        LineIndex[] lineEntries = docIndex.getLines();
+        if (lineEntries != null) {
+            // Translate the page-relative line into the numbering used by the line table
+            int wantedLine = matchedPage.getLineStart() + line - 1;
+            for (LineIndex candidate : lineEntries) {
+                if (candidate.getLine() == wantedLine) {
+                    return new int[]{candidate.getCharStart(), candidate.getCharEnd()};
+                }
+            }
+        }
+
+        // No matching line entry: fall back to the range of the whole page
+        return new int[]{matchedPage.getCharStart(), matchedPage.getCharEnd()};
+    }
+    
+    /**
+     * 批量映射位置信息
+     * 用于一次性处理多个实体的位置
+     * 
+     * @param documentId 文档ID
+     * @param charPositions 字符位置列表 [[charStart1, charEnd1], [charStart2, charEnd2], ...]
+     * @return 完整位置信息列表
+     */
+    public Map<String, Object>[] mapCharToPositionBatch(String documentId, int[][] charPositions) {
+        // 预加载索引
+        DocumentIndex index = loadDocumentIndex(documentId);
+        
+        @SuppressWarnings("unchecked")
+        Map<String, Object>[] results = new Map[charPositions.length];
+        
+        for (int i = 0; i < charPositions.length; i++) {
+            int charStart = charPositions[i][0];
+            int charEnd = charPositions[i][1];
+            
+            Map<String, Object> position = new HashMap<>();
+            position.put("charStart", charStart);
+            position.put("charEnd", charEnd);
+            
+            if (index != null) {
+                position.put("page", findPage(index, charStart));
+                position.put("line", findLine(index, charStart));
+            }
+            
+            results[i] = position;
+        }
+        
+        return results;
+    }
+    
+    /**
+     * Evicts the cached index of one document.
+     *
+     * @param documentId document ID; {@code null} is ignored
+     */
+    public void clearCache(String documentId) {
+        // ConcurrentHashMap rejects null keys with a NullPointerException
+        if (documentId != null) {
+            indexCache.remove(documentId);
+        }
+    }
+    
+    /**
+     * Removes every cached document index.
+     */
+    public void clearAllCache() {
+        indexCache.clear();
+    }
+    
+    /**
+     * Loads the index of a document, serving it from the cache when possible.
+     *
+     * <p>On a cache miss the index JSON file is read from disk, parsed and cached.
+     * Missing files and load failures are not cached, so they are retried on the
+     * next call.
+     *
+     * @param documentId document ID
+     * @return the index, or {@code null} when the ID is null, the index file does not
+     *         exist, or it cannot be located, read or parsed
+     */
+    private DocumentIndex loadDocumentIndex(String documentId) {
+        if (documentId == null) {
+            // ConcurrentHashMap rejects null keys; a null ID simply has no index
+            return null;
+        }
+
+        // Single lookup: containsKey() followed by get() can observe an eviction in between
+        DocumentIndex cached = indexCache.get(documentId);
+        if (cached != null) {
+            return cached;
+        }
+
+        try {
+            // Path construction stays inside the try so a malformed ID is logged and
+            // reported as "no index" instead of escaping to the caller
+            String indexFilePath = buildIndexFilePath(documentId);
+            Path path = Path.of(indexFilePath);
+
+            if (!Files.exists(path)) {
+                log.debug("索引文件不存在: {}", indexFilePath);
+                return null;
+            }
+
+            JsonNode root = objectMapper.readTree(Files.readString(path));
+
+            DocumentIndex index = new DocumentIndex();
+            index.setDocumentId(root.path("documentId").asText(documentId));
+            index.setTotalChars(root.path("totalChars").asInt(0));
+            index.setTotalLines(root.path("totalLines").asInt(0));
+            index.setTotalPages(root.path("totalPages").asInt(0));
+
+            JsonNode pagesNode = root.path("pages");
+            if (pagesNode.isArray()) {
+                index.setPages(parsePages(pagesNode));
+            }
+
+            JsonNode linesNode = root.path("lines");
+            if (linesNode.isArray()) {
+                index.setLines(parseLines(linesNode));
+            }
+
+            indexCache.put(documentId, index);
+            log.debug("已加载并缓存文档索引: documentId={}, pages={}, lines={}",
+                    documentId,
+                    index.getPages() != null ? index.getPages().length : 0,
+                    index.getLines() != null ? index.getLines().length : 0);
+
+            return index;
+        } catch (Exception e) {
+            log.error("加载文档索引失败: documentId={}", documentId, e);
+            return null;
+        }
+    }
+
+    /**
+     * Converts the {@code pages} JSON array into page entries; a missing page number
+     * defaults to the array position + 1, offsets to 0 and line bounds to 1.
+     */
+    private PageIndex[] parsePages(JsonNode pagesNode) {
+        PageIndex[] pages = new PageIndex[pagesNode.size()];
+        for (int i = 0; i < pages.length; i++) {
+            JsonNode pageNode = pagesNode.get(i);
+            PageIndex pageIndex = new PageIndex();
+            pageIndex.setPage(pageNode.path("page").asInt(i + 1));
+            pageIndex.setCharStart(pageNode.path("charStart").asInt(0));
+            pageIndex.setCharEnd(pageNode.path("charEnd").asInt(0));
+            pageIndex.setLineStart(pageNode.path("lineStart").asInt(1));
+            pageIndex.setLineEnd(pageNode.path("lineEnd").asInt(1));
+            pages[i] = pageIndex;
+        }
+        return pages;
+    }
+
+    /**
+     * Converts the {@code lines} JSON array into line entries; a missing line number
+     * defaults to the array position + 1 and offsets to 0.
+     */
+    private LineIndex[] parseLines(JsonNode linesNode) {
+        LineIndex[] lines = new LineIndex[linesNode.size()];
+        for (int i = 0; i < lines.length; i++) {
+            JsonNode lineNode = linesNode.get(i);
+            LineIndex lineIndex = new LineIndex();
+            lineIndex.setLine(lineNode.path("line").asInt(i + 1));
+            lineIndex.setCharStart(lineNode.path("charStart").asInt(0));
+            lineIndex.setCharEnd(lineNode.path("charEnd").asInt(0));
+            lines[i] = lineIndex;
+        }
+        return lines;
+    }
+    
+    /**
+     * Builds the index file path
+     * {@code <textStoragePath>/<first two characters of the ID>/<ID>_index.json}.
+     *
+     * @param documentId document ID; must have at least two characters
+     * @return the index file path as a string
+     * @throws IllegalArgumentException if the ID is null, shorter than two characters,
+     *         or would resolve to a location outside the storage root
+     */
+    private String buildIndexFilePath(String documentId) {
+        if (documentId == null || documentId.length() < 2) {
+            // substring(0, 2) would otherwise fail with an opaque StringIndexOutOfBoundsException
+            throw new IllegalArgumentException("Invalid documentId for index lookup: " + documentId);
+        }
+
+        Path indexFile = Path.of(
+                textStoragePath,
+                documentId.substring(0, 2),
+                documentId + "_index.json"
+        );
+
+        // The ID becomes part of a file path: refuse IDs such as "../x" that leave the root.
+        // Both sides are made absolute so the check also works for a relative or empty root.
+        Path root = Path.of(textStoragePath).toAbsolutePath().normalize();
+        if (!indexFile.toAbsolutePath().normalize().startsWith(root)) {
+            throw new IllegalArgumentException("documentId escapes the storage root: " + documentId);
+        }
+
+        return indexFile.toString();
+    }
+    
+    /**
+     * Finds the page that contains a character offset.
+     *
+     * <p>Pages are scanned linearly (this is not a binary search), so the page table
+     * does not have to be sorted. The first page whose inclusive
+     * {@code [charStart, charEnd]} range contains the offset wins. When no page
+     * contains it, the page with the smallest {@code charStart} above the offset is
+     * returned, matching how {@code findLine} resolves an offset that falls between
+     * entries; an offset past every page resolves to the last page entry.
+     *
+     * @param index document index
+     * @param charPosition character offset to locate
+     * @return the page number, or 1 when the index has no page entries
+     */
+    private int findPage(DocumentIndex index, int charPosition) {
+        PageIndex[] pages = index.getPages();
+        if (pages == null || pages.length == 0) {
+            return 1;
+        }
+
+        PageIndex nextPage = null;
+        for (PageIndex page : pages) {
+            if (charPosition >= page.getCharStart() && charPosition <= page.getCharEnd()) {
+                return page.getPage();
+            }
+            // Remember the closest page that starts after the offset, for the gap case
+            if (page.getCharStart() > charPosition
+                    && (nextPage == null || page.getCharStart() < nextPage.getCharStart())) {
+                nextPage = page;
+            }
+        }
+
+        if (nextPage != null) {
+            return nextPage.getPage();
+        }
+        // The offset lies beyond every page: use the last entry
+        return pages[pages.length - 1].getPage();
+    }
+    
+    /**
+     * Finds the line that contains a character offset by binary search over the line
+     * table, comparing against each entry's inclusive {@code [charStart, charEnd]} range.
+     *
+     * <p>When no entry contains the offset, the entry at the final search position is
+     * used, clamped to the last entry. NOTE(review): the search presumes the line
+     * table is ordered by ascending character offset; confirm the index writer
+     * guarantees that.
+     *
+     * @param index document index
+     * @param charPosition character offset to locate
+     * @return the line number, or 1 when the index has no line entries
+     */
+    private int findLine(DocumentIndex index, int charPosition) {
+        LineIndex[] lines = index.getLines();
+        if (lines == null || lines.length == 0) {
+            return 1;
+        }
+
+        int lo = 0;
+        int hi = lines.length - 1;
+        while (lo <= hi) {
+            int mid = (lo + hi) >>> 1;
+            LineIndex candidate = lines[mid];
+
+            if (charPosition < candidate.getCharStart()) {
+                hi = mid - 1;
+                continue;
+            }
+            if (charPosition > candidate.getCharEnd()) {
+                lo = mid + 1;
+                continue;
+            }
+            return candidate.getLine();
+        }
+
+        // Not inside any entry: take the entry at the insertion point, or the last one
+        int nearest = Math.min(lo, lines.length - 1);
+        return lines[nearest].getLine();
+    }
+    
+    /**
+     * Parsed contents of one document's index file: the page table, the line table
+     * and the document totals.
+     */
+    @Data
+    public static class DocumentIndex {
+        private String documentId; // "documentId" from the file, else the requested ID
+        private PageIndex[] pages; // page table; stays null when the file has no "pages" array
+        private LineIndex[] lines; // line table; stays null when the file has no "lines" array
+        private int totalChars; // "totalChars" from the file, 0 when absent
+        private int totalLines; // "totalLines" from the file, 0 when absent
+        private int totalPages; // "totalPages" from the file, 0 when absent
+    }
+    
+    /**
+     * One entry of the page table: a page number with its character range and
+     * line-number range.
+     */
+    @Data
+    public static class PageIndex {
+        private int page; // page number; defaults to array position + 1 when absent
+        private int charStart; // first character offset of the page (compared inclusively)
+        private int charEnd; // last character offset of the page (compared inclusively)
+        private int lineStart; // line number of the page's first line; base for page-relative lines
+        private int lineEnd; // presumably the line number of the page's last line; not read in this class
+    }
+    
+    /**
+     * One entry of the line table: a line number with its character range.
+     */
+    @Data
+    public static class LineIndex {
+        private int line; // line number; defaults to array position + 1 when absent
+        private int charStart; // first character offset of the line (compared inclusively)
+        private int charEnd; // last character offset of the line (compared inclusively)
+    }
+}