1 ay önce · fe3056fbb3
--- a/backend/graph-service/src/main/java/com/lingyue/graph/service/GraphNerService.java
+++ b/backend/graph-service/src/main/java/com/lingyue/graph/service/GraphNerService.java
@@ -34,7 +34,6 @@ public class GraphNerService {
 
															     private final TextStorageRepository textStorageRepository;
														
 
															     private final GraphNodeRepository graphNodeRepository;
														
 
															     private final GraphRelationRepository graphRelationRepository;
														
 
															-    private final PositionMappingService positionMappingService;
														
 
															     /**
														
 
															      * 获取文档的文本内容
														
@@ -99,17 +98,13 @@ public class GraphNerService {
 
															             node.setCreateTime(new Date());
														
 
															             node.setUpdateTime(new Date());
														
 
															-            // 转换位置信息并补充页码/行号
														
 
															+            // 转换位置信息（直接使用字符偏移）
														
 
															             Object positionObj = entity.get("position");
														
 
															             if (positionObj instanceof Map) {
														
 
															                 @SuppressWarnings("unchecked")
														
 
															                 Map<String, Object> posMap = (Map<String, Object>) positionObj;
														
 
															-                
														
 
															-                // 使用 PositionMappingService 补充页码和行号
														
 
															-                Map<String, Object> enrichedPosition = enrichPosition(documentId, posMap);
														
 
															-                
														
 
															-                log.debug("实体位置信息: name={}, position={}", node.getName(), enrichedPosition);
														
 
															-                node.setPosition(enrichedPosition);
														
 
															+                log.debug("实体位置信息: name={}, position={}", node.getName(), posMap);
														
 
															+                node.setPosition(posMap);
														
 
															             } else {
														
 
															                 log.debug("实体无位置信息: name={}, positionObj={}", node.getName(), positionObj);
														
 
															             }
														
@@ -294,67 +289,6 @@ public class GraphNerService {
 
															         }
														
 
															     }
														
 
															-    /**
														
 
															-     * 丰富位置信息，补充页码和行号
														
 
															-     * 
														
 
															-     * @param documentId 文档ID
														
 
															-     * @param posMap 原始位置信息（包含 charStart, charEnd）
														
 
															-     * @return 丰富后的位置信息（包含 charStart, charEnd, page, line）
														
 
															-     */
														
 
															-    private Map<String, Object> enrichPosition(String documentId, Map<String, Object> posMap) {
														
 
															-        // 如果已经有页码和行号，直接返回
														
 
															-        if (posMap.containsKey("page") && posMap.get("page") != null 
														
 
															-            && posMap.containsKey("line") && posMap.get("line") != null) {
														
 
															-            log.debug("位置信息已完整，跳过映射: documentId={}", documentId);
														
 
															-            return posMap;
														
 
															-        }
														
 
															-        
														
 
															-        // 获取字符位置
														
 
															-        Integer charStart = getIntValue(posMap, "charStart");
														
 
															-        Integer charEnd = getIntValue(posMap, "charEnd");
														
 
															-        
														
 
															-        if (charStart == null || charEnd == null) {
														
 
															-            log.debug("缺少字符位置信息，跳过映射: documentId={}, posMap={}", documentId, posMap);
														
 
															-            return posMap;
														
 
															-        }
														
 
															-        
														
 
															-        try {
														
 
															-            // 使用 PositionMappingService 映射页码和行号
														
 
															-            log.debug("开始位置映射: documentId={}, charStart={}, charEnd={}", documentId, charStart, charEnd);
														
 
															-            Map<String, Object> mappedPosition = positionMappingService.mapCharToPosition(
														
 
															-                documentId, charStart, charEnd);
														
 
															-            
														
 
															-            // 合并原始位置信息和映射结果
														
 
															-            Map<String, Object> enrichedPosition = new HashMap<>(posMap);
														
 
															-            enrichedPosition.putAll(mappedPosition);
														
 
															-            
														
 
															-            log.debug("位置映射完成: documentId={}, result={}", documentId, enrichedPosition);
														
 
															-            return enrichedPosition;
														
 
															-        } catch (Exception e) {
														
 
															-            log.warn("位置映射失败: documentId={}, charStart={}, charEnd={}, error={}", 
														
 
															-                    documentId, charStart, charEnd, e.getMessage());
														
 
															-            return posMap;
														
 
															-        }
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 从 Map 中获取整数值
														
 
															-     */
														
 
															-    private Integer getIntValue(Map<String, Object> map, String key) {
														
 
															-        Object value = map.get(key);
														
 
															-        if (value == null) {
														
 
															-            return null;
														
 
															-        }
														
 
															-        if (value instanceof Number) {
														
 
															-            return ((Number) value).intValue();
														
 
															-        }
														
 
															-        try {
														
 
															-            return Integer.parseInt(value.toString());
														
 
															-        } catch (NumberFormatException e) {
														
 
															-            return null;
														
 
															-        }
														
 
															-    }
														
 
															-    
														
 
															     /**
														
 
															      * 从 Map 中获取字符串值
														
 
															      */
														
--- a/backend/graph-service/src/main/java/com/lingyue/graph/service/PositionMappingService.java
+++ b/backend/graph-service/src/main/java/com/lingyue/graph/service/PositionMappingService.java
@@ -1,337 +0,0 @@
 
															-package com.lingyue.graph.service;
														
 
															-
														
 
															-import com.fasterxml.jackson.databind.JsonNode;
														
 
															-import com.fasterxml.jackson.databind.ObjectMapper;
														
 
															-import lombok.Data;
														
 
															-import lombok.extern.slf4j.Slf4j;
														
 
															-import org.springframework.beans.factory.annotation.Value;
														
 
															-import org.springframework.stereotype.Service;
														
 
															-
														
 
															-import java.nio.file.Files;
														
 
															-import java.nio.file.Path;
														
 
															-import java.util.HashMap;
														
 
															-import java.util.Map;
														
 
															-import java.util.concurrent.ConcurrentHashMap;
														
 
															-
														
 
															-/**
														
 
															- * 位置映射服务
														
 
															- * 根据文档索引将字符位置映射到页码和行号
														
 
															- * 
														
 
															- * @author lingyue
														
 
															- * @since 2026-01-20
														
 
															- */
														
 
															-@Slf4j
														
 
															-@Service
														
 
															-public class PositionMappingService {
														
 
															-    
														
 
															-    private final ObjectMapper objectMapper;
														
 
															-    
														
 
															-    @Value("${file.storage.text-path:/data/lingyue/texts}")
														
 
															-    private String textStoragePath;
														
 
															-    
														
 
															-    // 索引缓存，避免重复读取文件
														
 
															-    private final Map<String, DocumentIndex> indexCache = new ConcurrentHashMap<>();
														
 
															-    
														
 
															-    public PositionMappingService(ObjectMapper objectMapper) {
														
 
															-        this.objectMapper = objectMapper;
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 根据字符位置获取完整位置信息（包含页码和行号）
														
 
															-     * 
														
 
															-     * @param documentId 文档ID
														
 
															-     * @param charStart 字符起始位置
														
 
															-     * @param charEnd 字符结束位置
														
 
															-     * @return 位置信息 Map（包含 charStart, charEnd, page, line）
														
 
															-     */
														
 
															-    public Map<String, Object> mapCharToPosition(String documentId, int charStart, int charEnd) {
														
 
															-        Map<String, Object> position = new HashMap<>();
														
 
															-        position.put("charStart", charStart);
														
 
															-        position.put("charEnd", charEnd);
														
 
															-        
														
 
															-        // 尝试加载文档索引
														
 
															-        DocumentIndex index = loadDocumentIndex(documentId);
														
 
															-        if (index == null) {
														
 
															-            log.debug("未找到文档索引，返回仅包含字符位置的信息: documentId={}", documentId);
														
 
															-            return position;
														
 
															-        }
														
 
															-        
														
 
															-        // 查找页码
														
 
															-        int page = findPage(index, charStart);
														
 
															-        position.put("page", page);
														
 
															-        
														
 
															-        // 查找行号
														
 
															-        int line = findLine(index, charStart);
														
 
															-        position.put("line", line);
														
 
															-        
														
 
															-        // 计算全局行号（如果有行索引）
														
 
															-        if (index.getLines() != null && index.getLines().length > 0) {
														
 
															-            position.put("globalLine", line);
														
 
															-        }
														
 
															-        
														
 
															-        return position;
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 根据页码和行号获取字符位置范围
														
 
															-     * 
														
 
															-     * @param documentId 文档ID
														
 
															-     * @param page 页码
														
 
															-     * @param line 行号（页内行号）
														
 
															-     * @return 字符位置范围 [charStart, charEnd] 或 null
														
 
															-     */
														
 
															-    public int[] mapPageLineToChar(String documentId, int page, int line) {
														
 
															-        DocumentIndex index = loadDocumentIndex(documentId);
														
 
															-        if (index == null || index.getPages() == null) {
														
 
															-            return null;
														
 
															-        }
														
 
															-        
														
 
															-        // 找到对应页
														
 
															-        PageIndex pageIndex = null;
														
 
															-        for (PageIndex p : index.getPages()) {
														
 
															-            if (p.getPage() == page) {
														
 
															-                pageIndex = p;
														
 
															-                break;
														
 
															-            }
														
 
															-        }
														
 
															-        
														
 
															-        if (pageIndex == null) {
														
 
															-            return null;
														
 
															-        }
														
 
															-        
														
 
															-        // 如果有行索引，找到具体行
														
 
															-        if (index.getLines() != null) {
														
 
															-            // 计算目标全局行号
														
 
															-            int targetGlobalLine = pageIndex.getLineStart() + line - 1;
														
 
															-            for (LineIndex lineIndex : index.getLines()) {
														
 
															-                if (lineIndex.getLine() == targetGlobalLine) {
														
 
															-                    return new int[]{lineIndex.getCharStart(), lineIndex.getCharEnd()};
														
 
															-                }
														
 
															-            }
														
 
															-        }
														
 
															-        
														
 
															-        // 没有行索引，返回页的范围
														
 
															-        return new int[]{pageIndex.getCharStart(), pageIndex.getCharEnd()};
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 批量映射位置信息
														
 
															-     * 用于一次性处理多个实体的位置
														
 
															-     * 
														
 
															-     * @param documentId 文档ID
														
 
															-     * @param charPositions 字符位置列表 [[charStart1, charEnd1], [charStart2, charEnd2], ...]
														
 
															-     * @return 完整位置信息列表
														
 
															-     */
														
 
															-    public Map<String, Object>[] mapCharToPositionBatch(String documentId, int[][] charPositions) {
														
 
															-        // 预加载索引
														
 
															-        DocumentIndex index = loadDocumentIndex(documentId);
														
 
															-        
														
 
															-        @SuppressWarnings("unchecked")
														
 
															-        Map<String, Object>[] results = new Map[charPositions.length];
														
 
															-        
														
 
															-        for (int i = 0; i < charPositions.length; i++) {
														
 
															-            int charStart = charPositions[i][0];
														
 
															-            int charEnd = charPositions[i][1];
														
 
															-            
														
 
															-            Map<String, Object> position = new HashMap<>();
														
 
															-            position.put("charStart", charStart);
														
 
															-            position.put("charEnd", charEnd);
														
 
															-            
														
 
															-            if (index != null) {
														
 
															-                position.put("page", findPage(index, charStart));
														
 
															-                position.put("line", findLine(index, charStart));
														
 
															-            }
														
 
															-            
														
 
															-            results[i] = position;
														
 
															-        }
														
 
															-        
														
 
															-        return results;
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 清除缓存的索引
														
 
															-     */
														
 
															-    public void clearCache(String documentId) {
														
 
															-        indexCache.remove(documentId);
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 清除所有缓存
														
 
															-     */
														
 
															-    public void clearAllCache() {
														
 
															-        indexCache.clear();
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 加载文档索引
														
 
															-     */
														
 
															-    private DocumentIndex loadDocumentIndex(String documentId) {
														
 
															-        // 先检查缓存
														
 
															-        if (indexCache.containsKey(documentId)) {
														
 
															-            log.debug("从缓存加载索引: documentId={}", documentId);
														
 
															-            return indexCache.get(documentId);
														
 
															-        }
														
 
															-        
														
 
															-        // 构建索引文件路径
														
 
															-        String indexFilePath = buildIndexFilePath(documentId);
														
 
															-        Path path = Path.of(indexFilePath);
														
 
															-        
														
 
															-        log.debug("尝试加载索引文件: {}", indexFilePath);
														
 
															-        
														
 
															-        if (!Files.exists(path)) {
														
 
															-            log.info("索引文件不存在，无法补充页码信息: {}", indexFilePath);
														
 
															-            return null;
														
 
															-        }
														
 
															-        
														
 
															-        try {
														
 
															-            String json = Files.readString(path);
														
 
															-            JsonNode root = objectMapper.readTree(json);
														
 
															-            
														
 
															-            DocumentIndex index = new DocumentIndex();
														
 
															-            index.setDocumentId(root.path("documentId").asText(documentId));
														
 
															-            index.setTotalChars(root.path("totalChars").asInt(0));
														
 
															-            index.setTotalLines(root.path("totalLines").asInt(0));
														
 
															-            index.setTotalPages(root.path("totalPages").asInt(0));
														
 
															-            
														
 
															-            // 解析页面索引
														
 
															-            JsonNode pagesNode = root.path("pages");
														
 
															-            if (pagesNode.isArray()) {
														
 
															-                PageIndex[] pages = new PageIndex[pagesNode.size()];
														
 
															-                for (int i = 0; i < pagesNode.size(); i++) {
														
 
															-                    JsonNode pageNode = pagesNode.get(i);
														
 
															-                    PageIndex pageIndex = new PageIndex();
														
 
															-                    pageIndex.setPage(pageNode.path("page").asInt(i + 1));
														
 
															-                    pageIndex.setCharStart(pageNode.path("charStart").asInt(0));
														
 
															-                    pageIndex.setCharEnd(pageNode.path("charEnd").asInt(0));
														
 
															-                    pageIndex.setLineStart(pageNode.path("lineStart").asInt(1));
														
 
															-                    pageIndex.setLineEnd(pageNode.path("lineEnd").asInt(1));
														
 
															-                    pages[i] = pageIndex;
														
 
															-                }
														
 
															-                index.setPages(pages);
														
 
															-            }
														
 
															-            
														
 
															-            // 解析行索引
														
 
															-            JsonNode linesNode = root.path("lines");
														
 
															-            if (linesNode.isArray()) {
														
 
															-                LineIndex[] lines = new LineIndex[linesNode.size()];
														
 
															-                for (int i = 0; i < linesNode.size(); i++) {
														
 
															-                    JsonNode lineNode = linesNode.get(i);
														
 
															-                    LineIndex lineIndex = new LineIndex();
														
 
															-                    lineIndex.setLine(lineNode.path("line").asInt(i + 1));
														
 
															-                    lineIndex.setCharStart(lineNode.path("charStart").asInt(0));
														
 
															-                    lineIndex.setCharEnd(lineNode.path("charEnd").asInt(0));
														
 
															-                    lines[i] = lineIndex;
														
 
															-                }
														
 
															-                index.setLines(lines);
														
 
															-            }
														
 
															-            
														
 
															-            // 缓存
														
 
															-            indexCache.put(documentId, index);
														
 
															-            log.debug("已加载并缓存文档索引: documentId={}, pages={}, lines={}", 
														
 
															-                    documentId, 
														
 
															-                    index.getPages() != null ? index.getPages().length : 0,
														
 
															-                    index.getLines() != null ? index.getLines().length : 0);
														
 
															-            
														
 
															-            return index;
														
 
															-        } catch (Exception e) {
														
 
															-            log.error("加载文档索引失败: documentId={}", documentId, e);
														
 
															-            return null;
														
 
															-        }
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 构建索引文件路径
														
 
															-     */
														
 
															-    private String buildIndexFilePath(String documentId) {
														
 
															-        return Path.of(
														
 
															-                textStoragePath,
														
 
															-                documentId.substring(0, 2),
														
 
															-                documentId + "_index.json"
														
 
															-        ).toString();
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 使用二分查找页码
														
 
															-     */
														
 
															-    private int findPage(DocumentIndex index, int charPosition) {
														
 
															-        if (index.getPages() == null || index.getPages().length == 0) {
														
 
															-            return 1;
														
 
															-        }
														
 
															-        
														
 
															-        for (PageIndex page : index.getPages()) {
														
 
															-            if (charPosition >= page.getCharStart() && charPosition <= page.getCharEnd()) {
														
 
															-                return page.getPage();
														
 
															-            }
														
 
															-        }
														
 
															-        
														
 
															-        // 如果未找到，返回最后一页
														
 
															-        return index.getPages()[index.getPages().length - 1].getPage();
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 使用二分查找行号
														
 
															-     */
														
 
															-    private int findLine(DocumentIndex index, int charPosition) {
														
 
															-        if (index.getLines() == null || index.getLines().length == 0) {
														
 
															-            return 1;
														
 
															-        }
														
 
															-        
														
 
															-        // 二分查找
														
 
															-        int left = 0;
														
 
															-        int right = index.getLines().length - 1;
														
 
															-        
														
 
															-        while (left <= right) {
														
 
															-            int mid = (left + right) / 2;
														
 
															-            LineIndex line = index.getLines()[mid];
														
 
															-            
														
 
															-            if (charPosition < line.getCharStart()) {
														
 
															-                right = mid - 1;
														
 
															-            } else if (charPosition > line.getCharEnd()) {
														
 
															-                left = mid + 1;
														
 
															-            } else {
														
 
															-                return line.getLine();
														
 
															-            }
														
 
															-        }
														
 
															-        
														
 
															-        // 如果未找到，返回最近的行
														
 
															-        if (left >= index.getLines().length) {
														
 
															-            return index.getLines()[index.getLines().length - 1].getLine();
														
 
															-        }
														
 
															-        return index.getLines()[left].getLine();
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 文档索引
														
 
															-     */
														
 
															-    @Data
														
 
															-    public static class DocumentIndex {
														
 
															-        private String documentId;
														
 
															-        private PageIndex[] pages;
														
 
															-        private LineIndex[] lines;
														
 
															-        private int totalChars;
														
 
															-        private int totalLines;
														
 
															-        private int totalPages;
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 页面索引
														
 
															-     */
														
 
															-    @Data
														
 
															-    public static class PageIndex {
														
 
															-        private int page;
														
 
															-        private int charStart;
														
 
															-        private int charEnd;
														
 
															-        private int lineStart;
														
 
															-        private int lineEnd;
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 行索引
														
 
															-     */
														
 
															-    @Data
														
 
															-    public static class LineIndex {
														
 
															-        private int line;
														
 
															-        private int charStart;
														
 
															-        private int charEnd;
														
 
															-    }
														
 
															-}
														
--- a/backend/parse-service/src/main/java/com/lingyue/parse/service/DocumentIndexService.java
+++ b/backend/parse-service/src/main/java/com/lingyue/parse/service/DocumentIndexService.java
@@ -1,313 +0,0 @@
 
															-package com.lingyue.parse.service;
														
 
															-
														
 
															-import com.fasterxml.jackson.databind.ObjectMapper;
														
 
															-import lombok.Data;
														
 
															-import lombok.RequiredArgsConstructor;
														
 
															-import lombok.extern.slf4j.Slf4j;
														
 
															-import org.springframework.stereotype.Service;
														
 
															-
														
 
															-import java.nio.file.Files;
														
 
															-import java.nio.file.Path;
														
 
															-import java.nio.file.Paths;
														
 
															-import java.util.ArrayList;
														
 
															-import java.util.List;
														
 
															-
														
 
															-/**
														
 
															- * 文档索引服务
														
 
															- * 负责生成和管理文档的位置索引
														
 
															- * 
														
 
															- * @author lingyue
														
 
															- * @since 2026-01-20
														
 
															- */
														
 
															-@Slf4j
														
 
															-@Service
														
 
															-@RequiredArgsConstructor
														
 
															-public class DocumentIndexService {
														
 
															-    
														
 
															-    private final ObjectMapper objectMapper;
														
 
															-    
														
 
															-    /**
														
 
															-     * 分页符字符（Form Feed）
														
 
															-     */
														
 
															-    private static final char FORM_FEED = '\f';
														
 
															-    
														
 
															-    /**
														
 
															-     * 为纯文本生成行索引（Word/Excel等文档）
														
 
															-     * 如果文本中包含分页符(\f)，则根据分页符生成页面索引
														
 
															-     * 
														
 
															-     * @param text 文本内容
														
 
															-     * @param documentId 文档ID
														
 
															-     * @param indexOutputPath 索引输出路径
														
 
															-     * @return 文档索引
														
 
															-     */
														
 
															-    public DocumentIndex generateLineIndex(String text, String documentId, String indexOutputPath) {
														
 
															-        if (text == null || text.isEmpty()) {
														
 
															-            return createEmptyIndex(documentId);
														
 
															-        }
														
 
															-        
														
 
															-        // 检查是否包含分页符
														
 
															-        boolean hasPageBreaks = text.indexOf(FORM_FEED) >= 0;
														
 
															-        
														
 
															-        if (hasPageBreaks) {
														
 
															-            log.info("检测到分页符，生成分页索引: documentId={}", documentId);
														
 
															-            return generateIndexWithPageBreaks(text, documentId, indexOutputPath);
														
 
															-        } else {
														
 
															-            log.debug("无分页符，生成单页索引: documentId={}", documentId);
														
 
															-            return generateSinglePageIndex(text, documentId, indexOutputPath);
														
 
															-        }
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 根据分页符生成多页索引
														
 
															-     */
														
 
															-    private DocumentIndex generateIndexWithPageBreaks(String text, String documentId, String indexOutputPath) {
														
 
															-        List<PageIndex> pageIndices = new ArrayList<>();
														
 
															-        List<LineIndex> lineIndices = new ArrayList<>();
														
 
															-        
														
 
															-        int charPos = 0;
														
 
															-        int lineNum = 1;
														
 
															-        int pageNum = 1;
														
 
															-        int pageCharStart = 0;
														
 
															-        int pageLineStart = 1;
														
 
															-        
														
 
															-        String[] lines = text.split("\n", -1);
														
 
															-        
														
 
															-        for (String line : lines) {
														
 
															-            // 检查这一行是否包含分页符
														
 
															-            int ffIndex = line.indexOf(FORM_FEED);
														
 
															-            
														
 
															-            if (ffIndex >= 0) {
														
 
															-                // 处理分页符之前的内容
														
 
															-                if (ffIndex > 0) {
														
 
															-                    LineIndex lineIndex = new LineIndex();
														
 
															-                    lineIndex.setLine(lineNum);
														
 
															-                    lineIndex.setCharStart(charPos);
														
 
															-                    lineIndex.setCharEnd(charPos + ffIndex);
														
 
															-                    lineIndices.add(lineIndex);
														
 
															-                }
														
 
															-                
														
 
															-                // 结束当前页
														
 
															-                PageIndex pageIndex = new PageIndex();
														
 
															-                pageIndex.setPage(pageNum);
														
 
															-                pageIndex.setCharStart(pageCharStart);
														
 
															-                pageIndex.setCharEnd(charPos + ffIndex);
														
 
															-                pageIndex.setLineStart(pageLineStart);
														
 
															-                pageIndex.setLineEnd(lineNum);
														
 
															-                pageIndices.add(pageIndex);
														
 
															-                
														
 
															-                // 开始新页
														
 
															-                pageNum++;
														
 
															-                pageCharStart = charPos + ffIndex + 1; // +1 跳过分页符
														
 
															-                pageLineStart = lineNum + 1;
														
 
															-                
														
 
															-                // 处理分页符之后的内容（如果有）
														
 
															-                if (ffIndex + 1 < line.length()) {
														
 
															-                    lineNum++;
														
 
															-                    LineIndex afterLineIndex = new LineIndex();
														
 
															-                    afterLineIndex.setLine(lineNum);
														
 
															-                    afterLineIndex.setCharStart(charPos + ffIndex + 1);
														
 
															-                    afterLineIndex.setCharEnd(charPos + line.length());
														
 
															-                    lineIndices.add(afterLineIndex);
														
 
															-                }
														
 
															-            } else {
														
 
															-                // 普通行
														
 
															-                LineIndex lineIndex = new LineIndex();
														
 
															-                lineIndex.setLine(lineNum);
														
 
															-                lineIndex.setCharStart(charPos);
														
 
															-                lineIndex.setCharEnd(charPos + line.length());
														
 
															-                lineIndices.add(lineIndex);
														
 
															-            }
														
 
															-            
														
 
															-            charPos += line.length() + 1; // +1 for \n
														
 
															-            lineNum++;
														
 
															-        }
														
 
															-        
														
 
															-        // 添加最后一页
														
 
															-        if (pageCharStart < text.length()) {
														
 
															-            PageIndex lastPage = new PageIndex();
														
 
															-            lastPage.setPage(pageNum);
														
 
															-            lastPage.setCharStart(pageCharStart);
														
 
															-            lastPage.setCharEnd(text.length());
														
 
															-            lastPage.setLineStart(pageLineStart);
														
 
															-            lastPage.setLineEnd(lineNum - 1);
														
 
															-            pageIndices.add(lastPage);
														
 
															-        }
														
 
															-        
														
 
															-        // 创建文档索引
														
 
															-        DocumentIndex index = new DocumentIndex();
														
 
															-        index.setDocumentId(documentId);
														
 
															-        index.setTotalChars(text.length());
														
 
															-        index.setTotalLines(lineNum - 1);
														
 
															-        index.setTotalPages(pageIndices.size());
														
 
															-        index.setPages(pageIndices);
														
 
															-        index.setLines(lineIndices);
														
 
															-        
														
 
															-        log.info("分页索引生成完成: documentId={}, pages={}, lines={}", 
														
 
															-                documentId, pageIndices.size(), lineIndices.size());
														
 
															-        
														
 
															-        // 保存索引文件
														
 
															-        if (indexOutputPath != null) {
														
 
															-            saveIndexFile(index, indexOutputPath);
														
 
															-        }
														
 
															-        
														
 
															-        return index;
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 生成单页索引（无分页符的文档）
														
 
															-     */
														
 
															-    private DocumentIndex generateSinglePageIndex(String text, String documentId, String indexOutputPath) {
														
 
															-        List<LineIndex> lineIndices = new ArrayList<>();
														
 
															-        int charPos = 0;
														
 
															-        int lineNum = 1;
														
 
															-        
														
 
															-        String[] lines = text.split("\n", -1);
														
 
															-        for (String line : lines) {
														
 
															-            LineIndex lineIndex = new LineIndex();
														
 
															-            lineIndex.setLine(lineNum);
														
 
															-            lineIndex.setCharStart(charPos);
														
 
															-            lineIndex.setCharEnd(charPos + line.length());
														
 
															-            lineIndices.add(lineIndex);
														
 
															-            
														
 
															-            charPos += line.length() + 1; // +1 for \n
														
 
															-            lineNum++;
														
 
															-        }
														
 
															-        
														
 
															-        // 创建文档索引（无分页，只有行索引）
														
 
															-        DocumentIndex index = new DocumentIndex();
														
 
															-        index.setDocumentId(documentId);
														
 
															-        index.setTotalChars(text.length());
														
 
															-        index.setTotalLines(lines.length);
														
 
															-        index.setTotalPages(1); // 非分页文档统一为1页
														
 
															-        index.setLines(lineIndices);
														
 
															-        
														
 
															-        // 创建虚拟的单页索引
														
 
															-        List<PageIndex> pages = new ArrayList<>();
														
 
															-        PageIndex singlePage = new PageIndex();
														
 
															-        singlePage.setPage(1);
														
 
															-        singlePage.setCharStart(0);
														
 
															-        singlePage.setCharEnd(text.length());
														
 
															-        singlePage.setLineStart(1);
														
 
															-        singlePage.setLineEnd(lines.length);
														
 
															-        pages.add(singlePage);
														
 
															-        index.setPages(pages);
														
 
															-        
														
 
															-        // 保存索引文件
														
 
															-        if (indexOutputPath != null) {
														
 
															-            saveIndexFile(index, indexOutputPath);
														
 
															-        }
														
 
															-        
														
 
															-        return index;
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 读取索引文件
														
 
															-     */
														
 
															-    public DocumentIndex loadIndex(String indexFilePath) {
														
 
															-        try {
														
 
															-            Path path = Paths.get(indexFilePath);
														
 
															-            if (!Files.exists(path)) {
														
 
															-                log.warn("索引文件不存在: {}", indexFilePath);
														
 
															-                return null;
														
 
															-            }
														
 
															-            String json = Files.readString(path);
														
 
															-            return objectMapper.readValue(json, DocumentIndex.class);
														
 
															-        } catch (Exception e) {
														
 
															-            log.error("读取索引文件失败: {}", indexFilePath, e);
														
 
															-            return null;
														
 
															-        }
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 根据字符位置查找页码和行号
														
 
															-     * 
														
 
															-     * @param index 文档索引
														
 
															-     * @param charPosition 字符位置
														
 
															-     * @return [页码, 行号] 或 null（未找到）
														
 
															-     */
														
 
															-    public int[] findPageAndLine(DocumentIndex index, int charPosition) {
														
 
															-        if (index == null || index.getPages() == null) {
														
 
															-            return null;
														
 
															-        }
														
 
															-        
														
 
															-        // 1. 找到所在页
														
 
															-        int page = 1;
														
 
															-        for (PageIndex pageIndex : index.getPages()) {
														
 
															-            if (charPosition >= pageIndex.getCharStart() && charPosition <= pageIndex.getCharEnd()) {
														
 
															-                page = pageIndex.getPage();
														
 
															-                break;
														
 
															-            }
														
 
															-        }
														
 
															-        
														
 
															-        // 2. 找到所在行
														
 
															-        int line = 1;
														
 
															-        if (index.getLines() != null) {
														
 
															-            for (LineIndex lineIndex : index.getLines()) {
														
 
															-                if (charPosition >= lineIndex.getCharStart() && charPosition <= lineIndex.getCharEnd()) {
														
 
															-                    line = lineIndex.getLine();
														
 
															-                    break;
														
 
															-                }
														
 
															-            }
														
 
															-        }
														
 
															-        
														
 
															-        return new int[]{page, line};
														
 
															-    }
														
 
															-    
														
 
															-    private DocumentIndex createEmptyIndex(String documentId) {
														
 
															-        DocumentIndex index = new DocumentIndex();
														
 
															-        index.setDocumentId(documentId);
														
 
															-        index.setTotalChars(0);
														
 
															-        index.setTotalLines(0);
														
 
															-        index.setTotalPages(0);
														
 
															-        index.setPages(new ArrayList<>());
														
 
															-        return index;
														
 
															-    }
														
 
															-    
														
 
															-    private void saveIndexFile(DocumentIndex index, String outputPath) {
														
 
															-        try {
														
 
															-            Path path = Paths.get(outputPath);
														
 
															-            Files.createDirectories(path.getParent());
														
 
															-            String json = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(index);
														
 
															-            Files.writeString(path, json);
														
 
															-            log.info("文档索引文件已保存: {}", outputPath);
														
 
															-        } catch (Exception e) {
														
 
															-            log.error("保存文档索引文件失败: {}", outputPath, e);
														
 
															-        }
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 文档索引
														
 
															-     */
														
 
															-    @Data
														
 
															-    public static class DocumentIndex {
														
 
															-        private String documentId;
														
 
															-        private List<PageIndex> pages;
														
 
															-        private List<LineIndex> lines;
														
 
															-        private int totalChars;
														
 
															-        private int totalLines;
														
 
															-        private int totalPages;
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 页面索引
														
 
															-     */
														
 
															-    @Data
														
 
															-    public static class PageIndex {
														
 
															-        private int page;
														
 
															-        private int charStart;
														
 
															-        private int charEnd;
														
 
															-        private int lineStart;
														
 
															-        private int lineEnd;
														
 
															-        private boolean ocrUsed;
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 行索引
														
 
															-     */
														
 
															-    @Data
														
 
															-    public static class LineIndex {
														
 
															-        private int line;
														
 
															-        private int charStart;
														
 
															-        private int charEnd;
														
 
															-    }
														
 
															-}
														
--- a/backend/parse-service/src/main/java/com/lingyue/parse/service/ParseService.java
+++ b/backend/parse-service/src/main/java/com/lingyue/parse/service/ParseService.java
@@ -32,7 +32,6 @@ public class ParseService {
 
															     private final ExcelTextExtractionService excelTextExtractionService;
														
 
															     private final OcrResultParser ocrResultParser;
														
 
															     private final LayoutAnalysisService layoutAnalysisService;
														
 
															-    private final DocumentIndexService documentIndexService;
														
 
															     private final FileStorageProperties fileStorageProperties;
														
 
															     // 单体应用直接注入 Service，不使用 Feign Client
														
 
															     private final com.lingyue.graph.service.TextStorageService textStorageService;
														
@@ -97,12 +96,9 @@ public class ParseService {
 
															                 task.setProgress(20);
														
 
															                 saveParseTask(task);
														
 
															-                // PDF使用分页判断逻辑，并生成页面索引
														
 
															-                String indexFilePath = buildIndexFilePath(documentId);
														
 
															-                PdfTextExtractionService.ExtractionResult extractionResult = 
														
 
															-                    pdfTextExtractionService.extractTextWithIndex(sourceFilePath, documentId, indexFilePath);
														
 
															-                plainText = extractionResult.getText();
														
 
															-                log.info("PDF提取完成，索引文件: {}", indexFilePath);
														
 
															+                // PDF使用分页判断逻辑
														
 
															+                plainText = pdfTextExtractionService.extractText(sourceFilePath);
														
 
															+                log.info("PDF提取完成，文本长度: {}", plainText.length());
														
 
															             } else if (fileType == FileType.WORD || fileType == FileType.WORD_OLD) {
														
 
															                 log.info("处理Word文件: {}", sourceFilePath);
														
 
															                 task.setCurrentStep("word_extraction");
														
@@ -111,11 +107,7 @@ public class ParseService {
 
															                 // Word文档直接提取文本
														
 
															                 plainText = wordTextExtractionService.extractText(sourceFilePath);
														
 
															-                
														
 
															-                // 为Word生成行索引
														
 
															-                String indexFilePath = buildIndexFilePath(documentId);
														
 
															-                documentIndexService.generateLineIndex(plainText, documentId, indexFilePath);
														
 
															-                log.info("Word提取完成，索引文件: {}", indexFilePath);
														
 
															+                log.info("Word提取完成，文本长度: {}", plainText.length());
														
 
															             } else if (fileType == FileType.EXCEL || fileType == FileType.EXCEL_OLD) {
														
 
															                 log.info("处理Excel文件: {}", sourceFilePath);
														
 
															                 task.setCurrentStep("excel_extraction");
														
@@ -124,11 +116,7 @@ public class ParseService {
 
															                 // Excel表格直接提取文本
														
 
															                 plainText = excelTextExtractionService.extractText(sourceFilePath);
														
 
															-                
														
 
															-                // 为Excel生成行索引
														
 
															-                String indexFilePath = buildIndexFilePath(documentId);
														
 
															-                documentIndexService.generateLineIndex(plainText, documentId, indexFilePath);
														
 
															-                log.info("Excel提取完成，索引文件: {}", indexFilePath);
														
 
															+                log.info("Excel提取完成，文本长度: {}", plainText.length());
														
 
															             } else if (fileType.isImage()) {
														
 
															                 log.info("处理图片文件: {}", sourceFilePath);
														
 
															                 task.setCurrentStep("ocr");
														
@@ -283,18 +271,6 @@ public class ParseService {
 
															         return path.toString();
														
 
															     }
														
 
															-    /**
														
 
															-     * 根据文档ID构建索引文件存储路径
														
 
															-     */
														
 
															-    private String buildIndexFilePath(String documentId) {
														
 
															-        Path path = Path.of(
														
 
															-                fileStorageProperties.getTextPath(),
														
 
															-                documentId.substring(0, 2),
														
 
															-                documentId + "_index.json"
														
 
															-        );
														
 
															-        return path.toString();
														
 
															-    }
														
 
															-
														
 
															     /**
														
 
															      * 将纯文本写入 TXT 文件
														
 
															      * 对于大文件使用分块写入，避免内存溢出
														
--- a/backend/parse-service/src/main/java/com/lingyue/parse/service/PdfTextExtractionService.java
+++ b/backend/parse-service/src/main/java/com/lingyue/parse/service/PdfTextExtractionService.java
@@ -1,8 +1,6 @@
 
															 package com.lingyue.parse.service;
														
 
															-import com.fasterxml.jackson.databind.ObjectMapper;
														
 
															 import com.lingyue.common.exception.ServiceException;
														
 
															-import lombok.Data;
														
 
															 import lombok.RequiredArgsConstructor;
														
 
															 import lombok.extern.slf4j.Slf4j;
														
 
															 import org.apache.pdfbox.pdmodel.PDDocument;
														
@@ -16,7 +14,6 @@ import java.io.File;
 
															 import java.io.IOException;
														
 
															 import java.nio.file.Files;
														
 
															 import java.nio.file.Path;
														
 
															-import java.nio.file.Paths;
														
 
															 import java.util.ArrayList;
														
 
															 import java.util.List;
														
 
															 import java.util.UUID;
														
@@ -35,7 +32,6 @@ public class PdfTextExtractionService {
 
															     private final PaddleOcrClient paddleOcrClient;
														
 
															     private final OcrResultParser ocrResultParser;
														
 
															-    private final ObjectMapper objectMapper;
														
 
															     /**
														
 
															      * 文本阈值：每页至少需要这么多字符才认为有文本层
														
@@ -209,170 +205,6 @@ public class PdfTextExtractionService {
 
															         return combinedText.toString();
														
 
															     }
														
 
															-    /**
														
 
															-     * 提取PDF文本并生成页面索引
														
 
															-     * 
														
 
															-     * @param pdfFilePath PDF文件路径
														
 
															-     * @param documentId 文档ID
														
 
															-     * @param indexOutputPath 索引文件输出路径（如果为null则不生成索引文件）
														
 
															-     * @return 提取结果（包含文本和索引）
														
 
															-     */
														
 
															-    public ExtractionResult extractTextWithIndex(String pdfFilePath, String documentId, String indexOutputPath) {
														
 
															-        File pdfFile = new File(pdfFilePath);
														
 
															-        if (!pdfFile.exists()) {
														
 
															-            throw new ServiceException("PDF文件不存在: " + pdfFilePath);
														
 
															-        }
														
 
															-        
														
 
															-        List<PageTextResult> pageResults = new ArrayList<>();
														
 
															-        
														
 
															-        try (PDDocument document = org.apache.pdfbox.Loader.loadPDF(pdfFile)) {
														
 
															-            int totalPages = document.getNumberOfPages();
														
 
															-            log.info("开始处理PDF文件（带索引）: {}, 总页数: {}", pdfFilePath, totalPages);
														
 
															-            
														
 
															-            PDFTextStripper textStripper = new PDFTextStripper();
														
 
															-            
														
 
															-            // 逐页处理
														
 
															-            for (int pageNum = 1; pageNum <= totalPages; pageNum++) {
														
 
															-                log.debug("处理第 {} 页/共 {} 页", pageNum, totalPages);
														
 
															-                
														
 
															-                try {
														
 
															-                    textStripper.setStartPage(pageNum);
														
 
															-                    textStripper.setEndPage(pageNum);
														
 
															-                    String pageText = textStripper.getText(document);
														
 
															-                    
														
 
															-                    if (hasSufficientText(pageText)) {
														
 
															-                        log.debug("第 {} 页有文本层，直接使用，文本长度: {}", pageNum, pageText.length());
														
 
															-                        pageResults.add(new PageTextResult(pageNum, pageText, false));
														
 
															-                    } else {
														
 
															-                        log.debug("第 {} 页文本不足，使用OCR处理", pageNum);
														
 
															-                        String ocrText = extractTextByOcr(pdfFilePath, pageNum);
														
 
															-                        pageResults.add(new PageTextResult(pageNum, ocrText, true));
														
 
															-                    }
														
 
															-                } catch (Exception e) {
														
 
															-                    log.error("处理第 {} 页时出错，尝试使用OCR", pageNum, e);
														
 
															-                    try {
														
 
															-                        String ocrText = extractTextByOcr(pdfFilePath, pageNum);
														
 
															-                        pageResults.add(new PageTextResult(pageNum, ocrText, true));
														
 
															-                    } catch (Exception ocrException) {
														
 
															-                        log.error("第 {} 页OCR也失败", pageNum, ocrException);
														
 
															-                        pageResults.add(new PageTextResult(pageNum, "", true));
														
 
															-                    }
														
 
															-                }
														
 
															-            }
														
 
															-        } catch (IOException e) {
														
 
															-            log.error("读取PDF文件失败: {}", pdfFilePath, e);
														
 
															-            throw new ServiceException("读取PDF文件失败: " + e.getMessage());
														
 
															-        }
														
 
															-        
														
 
															-        // 生成带索引的合并结果
														
 
															-        return combinePageTextsWithIndex(pageResults, documentId, indexOutputPath);
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 合并页面文本并生成索引
														
 
															-     */
														
 
															-    private ExtractionResult combinePageTextsWithIndex(List<PageTextResult> pageResults, 
														
 
															-                                                        String documentId, 
														
 
															-                                                        String indexOutputPath) {
														
 
															-        StringBuilder combinedText = new StringBuilder();
														
 
															-        List<PageIndex> pageIndices = new ArrayList<>();
														
 
															-        List<LineIndex> lineIndices = new ArrayList<>();
														
 
															-        
														
 
															-        int currentCharPos = 0;
														
 
															-        int currentLine = 1;
														
 
															-        
														
 
															-        for (PageTextResult result : pageResults) {
														
 
															-            int pageCharStart = currentCharPos;
														
 
															-            int pageLineStart = currentLine;
														
 
															-            
														
 
															-            if (result.getText() != null && !result.getText().trim().isEmpty()) {
														
 
															-                // 添加页头标记
														
 
															-                String pageHeader = "=== 第 " + result.getPageNum() + " 页";
														
 
															-                if (result.isOcrUsed()) {
														
 
															-                    pageHeader += " (OCR识别)";
														
 
															-                } else {
														
 
															-                    pageHeader += " (文本层提取)";
														
 
															-                }
														
 
															-                pageHeader += " ===\n";
														
 
															-                
														
 
															-                // 记录页头行的索引
														
 
															-                LineIndex headerLineIndex = new LineIndex();
														
 
															-                headerLineIndex.setLine(currentLine);
														
 
															-                headerLineIndex.setCharStart(currentCharPos);
														
 
															-                headerLineIndex.setCharEnd(currentCharPos + pageHeader.length() - 2); // -2 去掉 \n
														
 
															-                lineIndices.add(headerLineIndex);
														
 
															-                
														
 
															-                combinedText.append(pageHeader);
														
 
															-                currentCharPos += pageHeader.length();
														
 
															-                currentLine++;
														
 
															-                
														
 
															-                // 添加页面内容并记录每行索引
														
 
															-                String pageText = result.getText();
														
 
															-                String[] lines = pageText.split("\n", -1);
														
 
															-                for (String line : lines) {
														
 
															-                    LineIndex lineIndex = new LineIndex();
														
 
															-                    lineIndex.setLine(currentLine);
														
 
															-                    lineIndex.setCharStart(currentCharPos);
														
 
															-                    lineIndex.setCharEnd(currentCharPos + line.length());
														
 
															-                    lineIndices.add(lineIndex);
														
 
															-                    
														
 
															-                    currentCharPos += line.length() + 1; // +1 for \n
														
 
															-                    currentLine++;
														
 
															-                }
														
 
															-                
														
 
															-                // 添加额外的空行分隔符
														
 
															-                combinedText.append(pageText).append("\n\n");
														
 
															-                currentCharPos++; // 额外的 \n
														
 
															-                currentLine++;
														
 
															-            }
														
 
															-            
														
 
															-            // 创建页面索引
														
 
															-            PageIndex pageIndex = new PageIndex();
														
 
															-            pageIndex.setPage(result.getPageNum());
														
 
															-            pageIndex.setCharStart(pageCharStart);
														
 
															-            pageIndex.setCharEnd(currentCharPos - 1);
														
 
															-            pageIndex.setLineStart(pageLineStart);
														
 
															-            pageIndex.setLineEnd(currentLine - 1);
														
 
															-            pageIndex.setOcrUsed(result.isOcrUsed());
														
 
															-            pageIndices.add(pageIndex);
														
 
															-        }
														
 
															-        
														
 
															-        // 创建文档索引
														
 
															-        DocumentIndex documentIndex = new DocumentIndex();
														
 
															-        documentIndex.setDocumentId(documentId);
														
 
															-        documentIndex.setPages(pageIndices);
														
 
															-        documentIndex.setLines(lineIndices);
														
 
															-        documentIndex.setTotalChars(currentCharPos);
														
 
															-        documentIndex.setTotalLines(currentLine - 1);
														
 
															-        documentIndex.setTotalPages(pageResults.size());
														
 
															-        
														
 
															-        // 保存索引文件
														
 
															-        if (indexOutputPath != null) {
														
 
															-            saveIndexFile(documentIndex, indexOutputPath);
														
 
															-        }
														
 
															-        
														
 
															-        ExtractionResult result = new ExtractionResult();
														
 
															-        result.setText(combinedText.toString());
														
 
															-        result.setIndex(documentIndex);
														
 
															-        return result;
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 保存索引文件
														
 
															-     */
														
 
															-    private void saveIndexFile(DocumentIndex index, String outputPath) {
														
 
															-        try {
														
 
															-            Path path = Paths.get(outputPath);
														
 
															-            Files.createDirectories(path.getParent());
														
 
															-            String json = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(index);
														
 
															-            Files.writeString(path, json);
														
 
															-            log.info("页面索引文件已保存: {}", outputPath);
														
 
															-        } catch (Exception e) {
														
 
															-            log.error("保存页面索引文件失败: {}", outputPath, e);
														
 
															-            // 索引保存失败不影响主流程
														
 
															-        }
														
 
															-    }
														
 
															-    
														
 
															     /**
														
 
															      * 页面文本结果
														
 
															      */
														
@@ -399,49 +231,4 @@ public class PdfTextExtractionService {
 
															             return ocrUsed;
														
 
															         }
														
 
															     }
														
 
															-    
														
 
															-    /**
														
 
															-     * 提取结果（包含文本和索引）
														
 
															-     */
														
 
															-    @Data
														
 
															-    public static class ExtractionResult {
														
 
															-        private String text;
														
 
															-        private DocumentIndex index;
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 文档索引
														
 
															-     */
														
 
															-    @Data
														
 
															-    public static class DocumentIndex {
														
 
															-        private String documentId;
														
 
															-        private List<PageIndex> pages;
														
 
															-        private List<LineIndex> lines;
														
 
															-        private int totalChars;
														
 
															-        private int totalLines;
														
 
															-        private int totalPages;
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 行索引
														
 
															-     */
														
 
															-    @Data
														
 
															-    public static class LineIndex {
														
 
															-        private int line;
														
 
															-        private int charStart;
														
 
															-        private int charEnd;
														
 
															-    }
														
 
															-    
														
 
															-    /**
														
 
															-     * 页面索引
														
 
															-     */
														
 
															-    @Data
														
 
															-    public static class PageIndex {
														
 
															-        private int page;
														
 
															-        private int charStart;
														
 
															-        private int charEnd;
														
 
															-        private int lineStart;
														
 
															-        private int lineEnd;
														
 
															-        private boolean ocrUsed;
														
 
															-    }
														
 
															 }
														
--- a/backend/parse-service/src/main/java/com/lingyue/parse/service/WordTextExtractionService.java
+++ b/backend/parse-service/src/main/java/com/lingyue/parse/service/WordTextExtractionService.java
@@ -4,8 +4,8 @@ import com.lingyue.common.exception.ServiceException;
 
															 import lombok.extern.slf4j.Slf4j;
														
 
															 import org.apache.poi.hwpf.HWPFDocument;
														
 
															 import org.apache.poi.hwpf.extractor.WordExtractor;
														
 
															-import org.apache.poi.xwpf.usermodel.*;
														
 
															-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
														
 
															+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
														
 
															+import org.apache.poi.xwpf.usermodel.XWPFDocument;
														
 
															 import org.springframework.stereotype.Service;
														
 
															 import java.io.File;
														
@@ -15,7 +15,6 @@ import java.io.IOException;
 
															 /**
														
 
															  * Word文档文本提取服务
														
 
															  * 支持.docx和.doc格式
														
 
															- * 在分页符位置插入 \f 字符，便于后续识别页码
														
 
															  * 
														
 
															  * @author lingyue
														
 
															  * @since 2026-01-14
														
@@ -24,16 +23,11 @@ import java.io.IOException;
 
															 @Service
														
 
															 public class WordTextExtractionService {
														
 
															-    /**
														
 
															-     * 分页符字符
														
 
															-     */
														
 
															-    private static final char PAGE_BREAK = '\f';
														
 
															-    
														
 
															     /**
														
 
															      * 提取Word文档文本
														
 
															      * 
														
 
															      * @param wordFilePath Word文件路径
														
 
															-     * @return 提取的文本内容（分页符位置插入 \f）
														
 
															+     * @return 提取的文本内容
														
 
															      */
														
 
															     public String extractText(String wordFilePath) {
														
 
															         File wordFile = new File(wordFilePath);
														
@@ -45,7 +39,7 @@ public class WordTextExtractionService {
 
															         try {
														
 
															             if (fileName.endsWith(".docx")) {
														
 
															-                return extractFromDocxWithPageBreaks(wordFilePath);
														
 
															+                return extractFromDocx(wordFilePath);
														
 
															             } else if (fileName.endsWith(".doc")) {
														
 
															                 return extractFromDoc(wordFilePath);
														
 
															             } else {
														
@@ -58,88 +52,23 @@ public class WordTextExtractionService {
 
															     }
														
 
															     /**
														
 
															-     * 从.docx文件提取文本，保留分页符
														
 
															+     * 从.docx文件提取文本
														
 
															      */
														
 
															-    private String extractFromDocxWithPageBreaks(String filePath) throws IOException {
														
 
															-        log.info("提取.docx文件文本（含分页符）: {}", filePath);
														
 
															-        
														
 
															-        StringBuilder sb = new StringBuilder();
														
 
															-        int pageBreakCount = 0;
														
 
															+    private String extractFromDocx(String filePath) throws IOException {
														
 
															+        log.info("提取.docx文件文本: {}", filePath);
														
 
															         try (FileInputStream fis = new FileInputStream(filePath);
														
 
															-             XWPFDocument document = new XWPFDocument(fis)) {
														
 
															-            
														
 
															-            // 遍历文档主体的所有元素
														
 
															-            for (IBodyElement element : document.getBodyElements()) {
														
 
															-                if (element instanceof XWPFParagraph) {
														
 
															-                    XWPFParagraph paragraph = (XWPFParagraph) element;
														
 
															-                    
														
 
															-                    // 检查段落中的分页符
														
 
															-                    for (XWPFRun run : paragraph.getRuns()) {
														
 
															-                        // 检查 run 中是否有分页符
														
 
															-                        CTR ctr = run.getCTR();
														
 
															-                        if (ctr != null) {
														
 
															-                            // 检查是否有硬分页符 (page break)
														
 
															-                            if (ctr.getBrList() != null) {
														
 
															-                                for (org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBr br : ctr.getBrList()) {
														
 
															-                                    if (br.getType() == org.openxmlformats.schemas.wordprocessingml.x2006.main.STBrType.PAGE) {
														
 
															-                                        sb.append(PAGE_BREAK);
														
 
															-                                        pageBreakCount++;
														
 
															-                                    }
														
 
															-                                }
														
 
															-                            }
														
 
															-                        }
														
 
															-                        
														
 
															-                        // 添加文本内容
														
 
															-                        String text = run.getText(0);
														
 
															-                        if (text != null) {
														
 
															-                            sb.append(text);
														
 
															-                        }
														
 
															-                    }
														
 
															-                    
														
 
															-                    // 检查段落后是否有分页符（通过段落属性）
														
 
															-                    if (paragraph.getCTP() != null && paragraph.getCTP().getPPr() != null) {
														
 
															-                        var pPr = paragraph.getCTP().getPPr();
														
 
															-                        // 检查分节符带来的分页
														
 
															-                        if (pPr.getSectPr() != null) {
														
 
															-                            var sectPr = pPr.getSectPr();
														
 
															-                            if (sectPr.getType() != null) {
														
 
															-                                String type = sectPr.getType().getVal().toString();
														
 
															-                                if ("nextPage".equals(type) || "oddPage".equals(type) || "evenPage".equals(type)) {
														
 
															-                                    sb.append(PAGE_BREAK);
														
 
															-                                    pageBreakCount++;
														
 
															-                                }
														
 
															-                            }
														
 
															-                        }
														
 
															-                    }
														
 
															-                    
														
 
															-                    sb.append("\n");
														
 
															-                    
														
 
															-                } else if (element instanceof XWPFTable) {
														
 
															-                    XWPFTable table = (XWPFTable) element;
														
 
															-                    for (XWPFTableRow row : table.getRows()) {
														
 
															-                        for (XWPFTableCell cell : row.getTableCells()) {
														
 
															-                            sb.append(cell.getText()).append("\t");
														
 
															-                        }
														
 
															-                        sb.append("\n");
														
 
															-                    }
														
 
															-                }
														
 
															-            }
														
 
															+             XWPFDocument document = new XWPFDocument(fis);
														
 
															+             XWPFWordExtractor extractor = new XWPFWordExtractor(document)) {
														
 
															-            // 检查文档末尾的分节符
														
 
															-            if (document.getDocument().getBody().getSectPr() != null) {
														
 
															-                // 文档末尾的分节符不需要额外处理
														
 
															-            }
														
 
															+            String text = extractor.getText();
														
 
															+            log.debug("提取到文本长度: {}", text != null ? text.length() : 0);
														
 
															+            return text != null ? text : "";
														
 
															         }
														
 
															-        
														
 
															-        String result = sb.toString();
														
 
															-        log.info("提取完成: 文本长度={}, 分页符数量={}", result.length(), pageBreakCount);
														
 
															-        return result;
														
 
															     }
														
 
															     /**
														
 
															      * 从.doc文件提取文本
														
 
															-     * .doc 格式的分页符通常会被 WordExtractor 保留为 \f
														
 
															      */
														
 
															     private String extractFromDoc(String filePath) throws IOException {
														
 
															         log.info("提取.doc文件文本: {}", filePath);
														
@@ -149,14 +78,7 @@ public class WordTextExtractionService {
 
															              WordExtractor extractor = new WordExtractor(document)) {
														
 
															             String text = extractor.getText();
														
 
															-            int pageBreakCount = 0;
														
 
															-            if (text != null) {
														
 
															-                for (char c : text.toCharArray()) {
														
 
															-                    if (c == PAGE_BREAK) pageBreakCount++;
														
 
															-                }
														
 
															-            }
														
 
															-            log.info("提取完成: 文本长度={}, 分页符数量={}", 
														
 
															-                    text != null ? text.length() : 0, pageBreakCount);
														
 
															+            log.debug("提取到文本长度: {}", text != null ? text.length() : 0);
														
 
															             return text != null ? text : "";
														
 
															         }
														
 
															     }